From 08a8fb8d05a933b5daa95aed713a7ea838b9dcd2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= Date: Thu, 20 Aug 2020 13:41:54 +0200 Subject: [PATCH 01/53] Improvemnt of documentation DenseMaterix/DenseMatrixView::vectorProduct. --- .../Tutorials/Matrices/CMakeLists.txt | 26 ++++++ .../Tutorials/Matrices/tutorial_Matrices.md | 86 +++++++++++++++++++ src/TNL/Matrices/DenseMatrix.h | 4 + src/TNL/Matrices/DenseMatrixView.h | 4 + 4 files changed, 120 insertions(+) create mode 100644 Documentation/Tutorials/Matrices/CMakeLists.txt create mode 100644 Documentation/Tutorials/Matrices/tutorial_Matrices.md diff --git a/Documentation/Tutorials/Matrices/CMakeLists.txt b/Documentation/Tutorials/Matrices/CMakeLists.txt new file mode 100644 index 000000000..0535e8fd5 --- /dev/null +++ b/Documentation/Tutorials/Matrices/CMakeLists.txt @@ -0,0 +1,26 @@ +IF( BUILD_CUDA ) + CUDA_ADD_EXECUTABLE( UniquePointerExample UniquePointerExample.cu ) + ADD_CUSTOM_COMMAND( COMMAND UniquePointerExample > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/UniquePointerExample.out OUTPUT UniquePointerExample.out ) + CUDA_ADD_EXECUTABLE( SharedPointerExample SharedPointerExample.cu ) + ADD_CUSTOM_COMMAND( COMMAND SharedPointerExample > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/SharedPointerExample.out OUTPUT SharedPointerExample.out ) + CUDA_ADD_EXECUTABLE( DevicePointerExample DevicePointerExample.cu ) + ADD_CUSTOM_COMMAND( COMMAND DevicePointerExample > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/DevicePointerExample.out OUTPUT DevicePointerExample.out ) +ELSE() + ADD_EXECUTABLE( UniquePointerExample UniquePointerExample.cpp ) + ADD_CUSTOM_COMMAND( COMMAND UniquePointerExample > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/UniquePointerExample.out OUTPUT UniquePointerExample.out ) +ENDIF() + +ADD_EXECUTABLE( UniquePointerHostExample UniquePointerHostExample.cpp ) +ADD_CUSTOM_COMMAND( COMMAND UniquePointerHostExample > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/UniquePointerHostExample.out 
OUTPUT UniquePointerHostExample.out ) + + +IF( BUILD_CUDA ) +ADD_CUSTOM_TARGET( TutorialsPointersCuda ALL DEPENDS + UniquePointerExample.out + SharedPointerExample.out + DevicePointerExample.out ) +ENDIF() + +ADD_CUSTOM_TARGET( TutorialsPointers ALL DEPENDS + UniquePointerHostExample.out +) \ No newline at end of file diff --git a/Documentation/Tutorials/Matrices/tutorial_Matrices.md b/Documentation/Tutorials/Matrices/tutorial_Matrices.md new file mode 100644 index 000000000..f9ef457e4 --- /dev/null +++ b/Documentation/Tutorials/Matrices/tutorial_Matrices.md @@ -0,0 +1,86 @@ +\page tutorial_Pointers Cross-device pointers tutorial + +## Introduction + +Smart pointers in TNL are motivated by the smart pointers in the STL library. In addition, they can manage image of the object they hold on different devices which is supposed to make objects offloading easier. + +## Table of Contents +1. [Unique pointers](#unique_pointers) +2. [Shared pointers](#shared_pointers) +3. [Device pointers](#device_pointers) + + +## Unique pointers + +Simillar to STL unique smart pointer `std::unique_ptr`, `UniquePointer` manages certain dynamicaly allocated object. The object is automatically deallocated when the pointer goes out of scope. The definition of `UniquePointer` reads as: + +\include codeSnippetUniquePointer.cpp + +It takes two template parameters: + +1. `Object` is a type of object managed by the pointer. +2. `Device` is a device where the object is to be allocated. + +If the device type is `Devices::Host`, `UniquePointer` behaves as usual unique smart pointer. See the following example: + +\include UniquePointerHostExample.cpp + +The result is: + +\include UniquePointerHostExample.out + + +If the device is different, `Devices::Cuda` for example, the unique pointer creates an image of the object even in the host memory. It allows one to manipulate the object on the host. 
All smart pointers are registered in a special register by means of which they can be synchronised with the host images before calling a CUDA kernel - all at once. This means that all modified images of the objects in the host memory are transferred to the GPU. See the following example: + +\include UniquePointerExample.cpp + +The result looks as: + +\include UniquePointerExample.out + +A disadvantage of `UniquePointer` is that it cannot be passed to the CUDA kernel since it requires making a copy of itself. This is, however, from the nature of this object, prohibited. For this reason we have to dereference the pointer on the host. This is done by a method `getData`. Its template parameter tells what object image we want to dereference - the one on the host or the one on the device. When passing the object on the device, we need to get the device image. The method `getData` returns a constant reference to the object. A non-constant reference is accessible via a method `modifyData`. When this method is used to get the reference on the host image, the pointer is marked as **potentially modified**. Note that we need to have a non-const reference even when we need to change the data (array elements for example) but not the meta-data (array size for example). If the meta-data do not change, there is no need to synchronize the object image with the one on the device. To distinguish between these two situations, the smart pointer keeps one more object image which stores the meta-data state since the last synchronization. Before the device image is synchronised, the host image and the last-synchronization-state image are compared. If they do not differ, no synchronization is required. One can see that TNL cross-device smart pointers are really meant only for small objects, otherwise the smart pointers overhead might be significant. 
+ +## Shared pointers + +One of the main goals of the TNL library is to make the development of HPC code, including GPU kernels, as easy and efficient as possible. One way to do this is to profit from object-oriented programming even in CUDA kernels. Let us explain it on arrays. From a certain point of view, `Array` can be understood as an object consisting of data and metadata. The data part means the elements that we insert into the array. The metadata is a pointer to the data but also the size of the array. This information makes use of the class easier, for example by checking array bounds when accessing the array elements. It is something that, when it is performed even in CUDA kernels, may help significantly with finding bugs in a code. To do this, we need to transfer not only pointers to the data but also complete metadata to the device. It is simple if the structure which is supposed to be transferred to the GPU does not have pointers to metadata. See the following example: + + +\include codeSnippetSharedPointer-1.cpp + +If the pointer `data` points to a memory on GPU, this array can be passed to a kernel like this: + +\include codeSnippetSharedPointer-2.cpp + +The kernel `cudaKernel` can access the data as follows: + +\include codeSnippetSharedPointer-3.cpp + +But what if we have an object like this: + +\include codeSnippetSharedPointer-4.cpp + +Assume that there is an instance of `ArrayTuple`, let's say `tuple`, containing pointers to instances `a1` and `a2` of `Array`. The instances must be allocated on the GPU if one wants to simply pass the `tuple` to the CUDA kernel. Indeed, the CUDA kernel needs the arrays `a1` and `a2` to be on the GPU. See the following example: + +\include codeSnippetSharedPointer-5.cpp + +See that the kernel needs to dereference `tuple.a1` and `tuple.a2`. 
Therefore these pointers must point to the global memory of the GPU, which means that the arrays `a1` and `a2` must be allocated there using, let's say, [cudaMalloc](http://developer.download.nvidia.com/compute/cuda/2_3/toolkit/docs/online/group__CUDART__MEMORY_gc63ffd93e344b939d6399199d8b12fef.html). It means, however, that the arrays `a1` and `a2` cannot be managed (for example resizing them requires changing `a1->size` and `a2->size`) on the host system by the CPU. The only solution to this is to have images of `a1` and `a2` in the host memory and to copy them to the GPU before calling the CUDA kernel. One must not forget to modify the pointers in the `tuple` to point to the array copies on the GPU. To simplify this, TNL offers *cross-device shared smart pointers*. In addition to common smart pointers, they can manage images of an object on different devices. Note that [CUDA Unified Memory](https://devblogs.nvidia.com/unified-memory-cuda-beginners/) is an answer to this problem as well. TNL cross-device smart pointers can be more efficient in some situations. (TODO: Prove this with a benchmark problem.) + +The previous example could be implemented in TNL as follows: + +\include SharedPointerExample.cpp + +The result looks as: + +\include SharedPointerExample.out + +One of the differences between `UniquePointer` and `SharedPointer` is that the `SharedPointer` can be passed to the CUDA kernel. Dereferencing by the operators `*` and `->` can be done in kernels as well and the result is a reference to the proper object image, i.e. on the host or the device. When these operators are used on a constant smart pointer, a constant reference is returned, which is the same as calling the method `getData` with the appropriate explicitly stated `Device` template parameter. In case of a non-constant `SharedPointer`, a non-constant reference is obtained. It has the same effect as calling the `modifyData` method. 
On the host system, everything what was mentioned in the section about `UniquePointer` holds even for the `SharedPointer`. In addition, `modifyData` method call or non-constant dereferencing can be done in kernel on the device. In this case, the programmer gets non-constant reference to an object which is however meant to be used to change the data managed by the object but not the metadata. There is no way to synchronize objects managed by the smart pointers from the device to the host. **It means that the metadata should not be changed on the device!** In fact, it would not make sense. Imagine changing array size or re-allocating the array within a CUDA kernel. This is something one should never do. + +## Device pointers + +The last type of the smart pointer implemented in TNL is `DevicePointer`. It works the same way as `SharedPointer` but it does not create new object on the host system. `DevicePointer` is therefore useful in situation when there is already an object created in the host memory and we want to create its image even on the device. Both images are linked one with each other and so one can just manipulate the one on the host and then synchronize it on the device. The following listing is a modification of the previous example with tuple: + +\include DevicePointerExample.cpp + +The result looks the same: + +\include DevicePointerExample.out diff --git a/src/TNL/Matrices/DenseMatrix.h b/src/TNL/Matrices/DenseMatrix.h index d77b821be..32c4678d0 100644 --- a/src/TNL/Matrices/DenseMatrix.h +++ b/src/TNL/Matrices/DenseMatrix.h @@ -616,6 +616,10 @@ class DenseMatrix : public Matrix< Real, Device, Index, RealAllocator > * is computed. It is zero by default. * \param end is the end of the rows range for which the vector product * is computed. It is number if the matrix rows by default. + * + * Note that the ouput vector dimension must be the same as the number of matrix rows + * no matter how we set `begin` and `end` parameters. 
These parameters just say that + * some matrix rows and the output vector elements are omitted. */ template< typename InVector, typename OutVector > void vectorProduct( const InVector& inVector, diff --git a/src/TNL/Matrices/DenseMatrixView.h b/src/TNL/Matrices/DenseMatrixView.h index b28817a20..5f565734c 100644 --- a/src/TNL/Matrices/DenseMatrixView.h +++ b/src/TNL/Matrices/DenseMatrixView.h @@ -564,6 +564,10 @@ class DenseMatrixView : public MatrixView< Real, Device, Index > * is computed. It is zero by default. * \param end is the end of the rows range for which the vector product * is computed. It is number if the matrix rows by default. + * + * Note that the ouput vector dimension must be the same as the number of matrix rows + * no matter how we set `begin` and `end` parameters. These parameters just say that + * some matrix rows and the output vector elements are omitted. */ template< typename InVector, typename OutVector > void vectorProduct( const InVector& inVector, -- GitLab From 1e40e617ff7b9542d9bd6f393710f1564223035e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= Date: Thu, 20 Aug 2020 13:42:27 +0200 Subject: [PATCH 02/53] Fixed DenseMatrixView::print. 
--- src/TNL/Matrices/DenseMatrixView.hpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/TNL/Matrices/DenseMatrixView.hpp b/src/TNL/Matrices/DenseMatrixView.hpp index d7a781e20..c8645b13b 100644 --- a/src/TNL/Matrices/DenseMatrixView.hpp +++ b/src/TNL/Matrices/DenseMatrixView.hpp @@ -678,7 +678,8 @@ void DenseMatrixView< Real, Device, Index, Organization >::print( std::ostream& str_ << std::setw( 4 ) << std::right << column << ":" << std::setw( 4 ) << std::left << this->getElement( row, column ); str << std::setw( 10 ) << str_.str(); } - str << std::endl; + if( row < this->getRows() - 1 ) + str << std::endl; } } -- GitLab From 1867a7ec8959011cca07d65556ac623f733823c3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= Date: Thu, 20 Aug 2020 13:43:18 +0200 Subject: [PATCH 03/53] Added tutorial for DenseMatrix. --- Documentation/Tutorials/CMakeLists.txt | 1 + .../Tutorials/Matrices/CMakeLists.txt | 60 +++++--- ...nseMatrixExample_Constructor_init_list.cpp | 1 + ...enseMatrixExample_Constructor_init_list.cu | 1 + .../DenseMatrixExample_addElement.cpp | 1 + .../Matrices/DenseMatrixExample_addElement.cu | 1 + .../Matrices/DenseMatrixExample_forRows.cpp | 1 + .../Matrices/DenseMatrixExample_forRows.cu | 1 + ...nseMatrixExample_rowsReduction_maxNorm.cpp | 66 +++++++++ ...enseMatrixExample_rowsReduction_maxNorm.cu | 1 + ...rixExample_rowsReduction_vectorProduct.cpp | 76 ++++++++++ ...trixExample_rowsReduction_vectorProduct.cu | 1 + .../DenseMatrixExample_setElement.cpp | 1 + .../Matrices/DenseMatrixExample_setElement.cu | 1 + .../DenseMatrixViewExample_setElement.cpp | 1 + .../DenseMatrixViewExample_setElement.cu | 1 + .../Tutorials/Matrices/tutorial_Matrices.md | 140 +++++++++++++----- Documentation/Tutorials/index.md | 12 +- 18 files changed, 301 insertions(+), 66 deletions(-) create mode 120000 Documentation/Tutorials/Matrices/DenseMatrixExample_Constructor_init_list.cpp create mode 120000 
Documentation/Tutorials/Matrices/DenseMatrixExample_Constructor_init_list.cu create mode 120000 Documentation/Tutorials/Matrices/DenseMatrixExample_addElement.cpp create mode 120000 Documentation/Tutorials/Matrices/DenseMatrixExample_addElement.cu create mode 120000 Documentation/Tutorials/Matrices/DenseMatrixExample_forRows.cpp create mode 120000 Documentation/Tutorials/Matrices/DenseMatrixExample_forRows.cu create mode 100644 Documentation/Tutorials/Matrices/DenseMatrixExample_rowsReduction_maxNorm.cpp create mode 120000 Documentation/Tutorials/Matrices/DenseMatrixExample_rowsReduction_maxNorm.cu create mode 100644 Documentation/Tutorials/Matrices/DenseMatrixExample_rowsReduction_vectorProduct.cpp create mode 120000 Documentation/Tutorials/Matrices/DenseMatrixExample_rowsReduction_vectorProduct.cu create mode 120000 Documentation/Tutorials/Matrices/DenseMatrixExample_setElement.cpp create mode 120000 Documentation/Tutorials/Matrices/DenseMatrixExample_setElement.cu create mode 120000 Documentation/Tutorials/Matrices/DenseMatrixViewExample_setElement.cpp create mode 120000 Documentation/Tutorials/Matrices/DenseMatrixViewExample_setElement.cu diff --git a/Documentation/Tutorials/CMakeLists.txt b/Documentation/Tutorials/CMakeLists.txt index 7f6aea702..98734f50c 100644 --- a/Documentation/Tutorials/CMakeLists.txt +++ b/Documentation/Tutorials/CMakeLists.txt @@ -3,3 +3,4 @@ add_subdirectory( Vectors ) add_subdirectory( ReductionAndScan ) add_subdirectory( ForLoops ) add_subdirectory( Pointers ) +add_subdirectory( Matrices ) diff --git a/Documentation/Tutorials/Matrices/CMakeLists.txt b/Documentation/Tutorials/Matrices/CMakeLists.txt index 0535e8fd5..13140ac05 100644 --- a/Documentation/Tutorials/Matrices/CMakeLists.txt +++ b/Documentation/Tutorials/Matrices/CMakeLists.txt @@ -1,26 +1,46 @@ IF( BUILD_CUDA ) - CUDA_ADD_EXECUTABLE( UniquePointerExample UniquePointerExample.cu ) - ADD_CUSTOM_COMMAND( COMMAND UniquePointerExample > 
${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/UniquePointerExample.out OUTPUT UniquePointerExample.out ) - CUDA_ADD_EXECUTABLE( SharedPointerExample SharedPointerExample.cu ) - ADD_CUSTOM_COMMAND( COMMAND SharedPointerExample > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/SharedPointerExample.out OUTPUT SharedPointerExample.out ) - CUDA_ADD_EXECUTABLE( DevicePointerExample DevicePointerExample.cu ) - ADD_CUSTOM_COMMAND( COMMAND DevicePointerExample > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/DevicePointerExample.out OUTPUT DevicePointerExample.out ) -ELSE() - ADD_EXECUTABLE( UniquePointerExample UniquePointerExample.cpp ) - ADD_CUSTOM_COMMAND( COMMAND UniquePointerExample > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/UniquePointerExample.out OUTPUT UniquePointerExample.out ) -ENDIF() + CUDA_ADD_EXECUTABLE( DenseMatrixExample_Constructor_init_list DenseMatrixExample_Constructor_init_list.cu ) + ADD_CUSTOM_COMMAND( COMMAND DenseMatrixExample_Constructor_init_list > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/DenseMatrixExample_Constructor_init_list.out OUTPUT DenseMatrixExample_Constructor_init_list.out ) + + CUDA_ADD_EXECUTABLE( DenseMatrixExample_addElement DenseMatrixExample_addElement.cu ) + ADD_CUSTOM_COMMAND( COMMAND DenseMatrixExample_addElement > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/DenseMatrixExample_addElement.out OUTPUT DenseMatrixExample_addElement.out ) + + CUDA_ADD_EXECUTABLE( DenseMatrixExample_setElement DenseMatrixExample_setElement.cu ) + ADD_CUSTOM_COMMAND( COMMAND DenseMatrixExample_setElement > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/DenseMatrixExample_setElement.out OUTPUT DenseMatrixExample_setElement.out ) -ADD_EXECUTABLE( UniquePointerHostExample UniquePointerHostExample.cpp ) -ADD_CUSTOM_COMMAND( COMMAND UniquePointerHostExample > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/UniquePointerHostExample.out OUTPUT UniquePointerHostExample.out ) + CUDA_ADD_EXECUTABLE( DenseMatrixExample_forRows DenseMatrixExample_forRows.cu ) + ADD_CUSTOM_COMMAND( 
COMMAND DenseMatrixExample_forRows > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/DenseMatrixExample_forRows.out OUTPUT DenseMatrixExample_forRows.out ) + + CUDA_ADD_EXECUTABLE( DenseMatrixExample_rowsReduction_vectorProduct DenseMatrixExample_rowsReduction_vectorProduct.cu ) + ADD_CUSTOM_COMMAND( COMMAND DenseMatrixExample_rowsReduction_vectorProduct > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/DenseMatrixExample_rowsReduction_vectorProduct.out OUTPUT DenseMatrixExample_rowsReduction_vectorProduct.out ) + + CUDA_ADD_EXECUTABLE( DenseMatrixExample_rowsReduction_maxNorm DenseMatrixExample_rowsReduction_maxNorm.cu ) + ADD_CUSTOM_COMMAND( COMMAND DenseMatrixExample_rowsReduction_maxNorm > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/DenseMatrixExample_rowsReduction_maxNorm.out OUTPUT DenseMatrixExample_rowsReduction_maxNorm.out ) + CUDA_ADD_EXECUTABLE( DenseMatrixViewExample_setElement DenseMatrixViewExample_setElement.cu ) + ADD_CUSTOM_COMMAND( COMMAND DenseMatrixViewExample_setElement > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/DenseMatrixViewExample_setElement.out OUTPUT DenseMatrixViewExample_setElement.out ) +ELSE() +# ADD_EXECUTABLE( UniquePointerExample UniquePointerExample.cpp ) +# ADD_CUSTOM_COMMAND( COMMAND UniquePointerExample > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/UniquePointerExample.out OUTPUT UniquePointerExample.out ) +ENDIF() +# +#ADD_EXECUTABLE( UniquePointerHostExample UniquePointerHostExample.cpp ) +#ADD_CUSTOM_COMMAND( COMMAND UniquePointerHostExample > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/UniquePointerHostExample.out OUTPUT UniquePointerHostExample.out ) +# +# IF( BUILD_CUDA ) -ADD_CUSTOM_TARGET( TutorialsPointersCuda ALL DEPENDS - UniquePointerExample.out - SharedPointerExample.out - DevicePointerExample.out ) +ADD_CUSTOM_TARGET( TutorialsMatricesCuda ALL DEPENDS + DenseMatrixExample_Constructor_init_list.out + DenseMatrixExample_addElement.out + DenseMatrixExample_setElement.out + DenseMatrixExample_forRows.out + 
DenseMatrixExample_rowsReduction_vectorProduct.out + DenseMatrixExample_rowsReduction_maxNorm.out + DenseMatrixViewExample_setElement.out + ) ENDIF() - -ADD_CUSTOM_TARGET( TutorialsPointers ALL DEPENDS - UniquePointerHostExample.out -) \ No newline at end of file +# +#ADD_CUSTOM_TARGET( TutorialsPointers ALL DEPENDS +# UniquePointerHostExample.out +#) \ No newline at end of file diff --git a/Documentation/Tutorials/Matrices/DenseMatrixExample_Constructor_init_list.cpp b/Documentation/Tutorials/Matrices/DenseMatrixExample_Constructor_init_list.cpp new file mode 120000 index 000000000..faa270f15 --- /dev/null +++ b/Documentation/Tutorials/Matrices/DenseMatrixExample_Constructor_init_list.cpp @@ -0,0 +1 @@ +../../Examples/Matrices/DenseMatrix/DenseMatrixExample_Constructor_init_list.cpp \ No newline at end of file diff --git a/Documentation/Tutorials/Matrices/DenseMatrixExample_Constructor_init_list.cu b/Documentation/Tutorials/Matrices/DenseMatrixExample_Constructor_init_list.cu new file mode 120000 index 000000000..e633e76a9 --- /dev/null +++ b/Documentation/Tutorials/Matrices/DenseMatrixExample_Constructor_init_list.cu @@ -0,0 +1 @@ +../../Examples/Matrices/DenseMatrix/DenseMatrixExample_Constructor_init_list.cu \ No newline at end of file diff --git a/Documentation/Tutorials/Matrices/DenseMatrixExample_addElement.cpp b/Documentation/Tutorials/Matrices/DenseMatrixExample_addElement.cpp new file mode 120000 index 000000000..c471b0ce3 --- /dev/null +++ b/Documentation/Tutorials/Matrices/DenseMatrixExample_addElement.cpp @@ -0,0 +1 @@ +../../Examples/Matrices/DenseMatrix/DenseMatrixExample_addElement.cpp \ No newline at end of file diff --git a/Documentation/Tutorials/Matrices/DenseMatrixExample_addElement.cu b/Documentation/Tutorials/Matrices/DenseMatrixExample_addElement.cu new file mode 120000 index 000000000..67dd6dced --- /dev/null +++ b/Documentation/Tutorials/Matrices/DenseMatrixExample_addElement.cu @@ -0,0 +1 @@ 
+../../Examples/Matrices/DenseMatrix/DenseMatrixExample_addElement.cu \ No newline at end of file diff --git a/Documentation/Tutorials/Matrices/DenseMatrixExample_forRows.cpp b/Documentation/Tutorials/Matrices/DenseMatrixExample_forRows.cpp new file mode 120000 index 000000000..690bdbf92 --- /dev/null +++ b/Documentation/Tutorials/Matrices/DenseMatrixExample_forRows.cpp @@ -0,0 +1 @@ +../../Examples/Matrices/DenseMatrix/DenseMatrixExample_forRows.cpp \ No newline at end of file diff --git a/Documentation/Tutorials/Matrices/DenseMatrixExample_forRows.cu b/Documentation/Tutorials/Matrices/DenseMatrixExample_forRows.cu new file mode 120000 index 000000000..0783daede --- /dev/null +++ b/Documentation/Tutorials/Matrices/DenseMatrixExample_forRows.cu @@ -0,0 +1 @@ +../../Examples/Matrices/DenseMatrix/DenseMatrixExample_forRows.cu \ No newline at end of file diff --git a/Documentation/Tutorials/Matrices/DenseMatrixExample_rowsReduction_maxNorm.cpp b/Documentation/Tutorials/Matrices/DenseMatrixExample_rowsReduction_maxNorm.cpp new file mode 100644 index 000000000..a1837ebc7 --- /dev/null +++ b/Documentation/Tutorials/Matrices/DenseMatrixExample_rowsReduction_maxNorm.cpp @@ -0,0 +1,66 @@ +#include +#include +#include +#include +#include + +template< typename Device > +void rowsReduction() +{ + TNL::Matrices::DenseMatrix< double, Device > matrix { + { 1, 0, 0, 0, 0 }, + { 1, 2, 0, 0, 0 }, + { 0, 1, 8, 0, 0 }, + { 0, 0, 1, 9, 0 }, + { 0, 0, 0, 0, 1 } }; + + /*** + * Find largest element in each row. + */ + TNL::Containers::Vector< double, Device > rowMax( matrix.getRows() ); + + /*** + * Prepare vector view for lambdas. + */ + auto rowMaxView = rowMax.getView(); + + /*** + * Fetch lambda just returns absolute value of matrix elements. + */ + auto fetch = [=] __cuda_callable__ ( int rowIdx, int columnIdx, const double& value ) -> double { + return TNL::abs( value ); + }; + + /*** + * Reduce lambda return maximum of given values. 
+ */ + auto reduce = [=] __cuda_callable__ ( double& a, const double& b ) -> double { + return TNL::max( a, b ); + }; + + /*** + * Keep lambda store the largest value in each row to the vector rowMax. + */ + auto keep = [=] __cuda_callable__ ( int rowIdx, const double& value ) mutable { + rowMaxView[ rowIdx ] = value; + }; + + /*** + * Compute the largest values in each row. + */ + matrix.rowsReduction( 0, matrix.getRows(), fetch, reduce, keep, std::numeric_limits< double >::lowest() ); + + std::cout << "Max. elements in rows are: " << rowMax << std::endl; + std::cout << "Max. matrix norm is: " << TNL::max( rowMax ) << std::endl; +} + +int main( int argc, char* argv[] ) +{ + std::cout << "Rows reduction on host:" << std::endl; + rowsReduction< TNL::Devices::Host >(); + +#ifdef HAVE_CUDA + std::cout << "Rows reduction on CUDA device:" << std::endl; + rowsReduction< TNL::Devices::Cuda >(); +#endif +} diff --git a/Documentation/Tutorials/Matrices/DenseMatrixExample_rowsReduction_maxNorm.cu b/Documentation/Tutorials/Matrices/DenseMatrixExample_rowsReduction_maxNorm.cu new file mode 120000 index 000000000..04b5e78e1 --- /dev/null +++ b/Documentation/Tutorials/Matrices/DenseMatrixExample_rowsReduction_maxNorm.cu @@ -0,0 +1 @@ +DenseMatrixExample_rowsReduction_maxNorm.cpp \ No newline at end of file diff --git a/Documentation/Tutorials/Matrices/DenseMatrixExample_rowsReduction_vectorProduct.cpp b/Documentation/Tutorials/Matrices/DenseMatrixExample_rowsReduction_vectorProduct.cpp new file mode 100644 index 000000000..8f0f99cf4 --- /dev/null +++ b/Documentation/Tutorials/Matrices/DenseMatrixExample_rowsReduction_vectorProduct.cpp @@ -0,0 +1,76 @@ +#include +#include +#include +#include +#include + +template< typename Device > +void rowsReduction() +{ + TNL::Matrices::DenseMatrix< double, Device > matrix { + { 1, 0, 0, 0, 0 }, + { 1, 2, 0, 0, 0 }, + { 0, 1, 8, 0, 0 }, + { 0, 0, 1, 9, 0 }, + { 0, 0, 0, 0, 1 } }; + + /*** + * Allocate input and output vectors for matrix-vector 
product + */ + TNL::Containers::Vector< double, Device > x( matrix.getColumns() ), + y( matrix.getRows() ); + + /*** + * Fill the input vectors with ones. + */ + x = 1.0; + + /*** + * Prepare vector view for lambdas. + */ + auto xView = x.getView(); + auto yView = y.getView(); + + /*** + * Fetch lambda just returns product of appropriate matrix elements and the + * input vector elements. + */ + auto fetch = [=] __cuda_callable__ ( int rowIdx, int columnIdx, const double& value ) -> double { + return xView[ columnIdx ] * value; + }; + + /*** + * Reduce lambda return sum of given values. + */ + auto reduce = [=] __cuda_callable__ ( double& a, const double& b ) -> double { + return a + b; + }; + + /*** + * Keep lambda store the result of matrix-vector product to output vector y. + */ + auto keep = [=] __cuda_callable__ ( int rowIdx, const double& value ) mutable { + yView[ rowIdx ] = value; + }; + + /*** + * Compute matrix-vector product. + */ + matrix.rowsReduction( 0, matrix.getRows(), fetch, reduce, keep, 0.0 ); + + std::cout << "The matrix reads as:" << std::endl << matrix << std::endl; + std::cout << "The input vector is:" << x << std::endl; + std::cout << "Result of matrix-vector multiplication is: " << y << std::endl; +} + +int main( int argc, char* argv[] ) +{ + std::cout << "Rows reduction on host:" << std::endl; + rowsReduction< TNL::Devices::Host >(); + +#ifdef HAVE_CUDA + std::cout << std::endl; + std::cout << "Rows reduction on CUDA device:" << std::endl; + rowsReduction< TNL::Devices::Cuda >(); +#endif +} diff --git a/Documentation/Tutorials/Matrices/DenseMatrixExample_rowsReduction_vectorProduct.cu b/Documentation/Tutorials/Matrices/DenseMatrixExample_rowsReduction_vectorProduct.cu new file mode 120000 index 000000000..36e05a773 --- /dev/null +++ b/Documentation/Tutorials/Matrices/DenseMatrixExample_rowsReduction_vectorProduct.cu @@ -0,0 +1 @@ +DenseMatrixExample_rowsReduction_vectorProduct.cpp \ No newline at end of file diff --git 
a/Documentation/Tutorials/Matrices/DenseMatrixExample_setElement.cpp b/Documentation/Tutorials/Matrices/DenseMatrixExample_setElement.cpp new file mode 120000 index 000000000..cb68721bb --- /dev/null +++ b/Documentation/Tutorials/Matrices/DenseMatrixExample_setElement.cpp @@ -0,0 +1 @@ +../../Examples/Matrices/DenseMatrix/DenseMatrixExample_setElement.cpp \ No newline at end of file diff --git a/Documentation/Tutorials/Matrices/DenseMatrixExample_setElement.cu b/Documentation/Tutorials/Matrices/DenseMatrixExample_setElement.cu new file mode 120000 index 000000000..79539e197 --- /dev/null +++ b/Documentation/Tutorials/Matrices/DenseMatrixExample_setElement.cu @@ -0,0 +1 @@ +../../Examples/Matrices/DenseMatrix/DenseMatrixExample_setElement.cu \ No newline at end of file diff --git a/Documentation/Tutorials/Matrices/DenseMatrixViewExample_setElement.cpp b/Documentation/Tutorials/Matrices/DenseMatrixViewExample_setElement.cpp new file mode 120000 index 000000000..a3832e2e8 --- /dev/null +++ b/Documentation/Tutorials/Matrices/DenseMatrixViewExample_setElement.cpp @@ -0,0 +1 @@ +../../Examples/Matrices/DenseMatrix/DenseMatrixViewExample_setElement.cpp \ No newline at end of file diff --git a/Documentation/Tutorials/Matrices/DenseMatrixViewExample_setElement.cu b/Documentation/Tutorials/Matrices/DenseMatrixViewExample_setElement.cu new file mode 120000 index 000000000..9d1266dd3 --- /dev/null +++ b/Documentation/Tutorials/Matrices/DenseMatrixViewExample_setElement.cu @@ -0,0 +1 @@ +../../Examples/Matrices/DenseMatrix/DenseMatrixViewExample_setElement.cu \ No newline at end of file diff --git a/Documentation/Tutorials/Matrices/tutorial_Matrices.md b/Documentation/Tutorials/Matrices/tutorial_Matrices.md index f9ef457e4..022daf6ae 100644 --- a/Documentation/Tutorials/Matrices/tutorial_Matrices.md +++ b/Documentation/Tutorials/Matrices/tutorial_Matrices.md @@ -1,86 +1,144 @@ -\page tutorial_Pointers Cross-device pointers tutorial +\page tutorial_Matrices Matrices tutorial ## 
Introduction -Smart pointers in TNL are motivated by the smart pointers in the STL library. In addition, they can manage image of the object they hold on different devices which is supposed to make objects offloading easier. +TNL offers the following type of matrices: dense matrices, sparse matrices, tridiagonal matrices, multidiagonal matrices and lambda matrices. The sparse matrices can be marked as symmetric to optimize memory requirements. The interfaces of given matrix types are designed to be as unified as possible to ensure that the user can easily switch between different matrix types while making no or only a little changes in the source code. All matrix types allows traversing all matrix elements and manipulate them using a lambda function as well as performing flexible reduction in matrix rows. The following text describes particular matrix types in details. + ## Table of Contents -1. [Unique pointers](#unique_pointers) -2. [Shared pointers](#shared_pointers) -3. [Device pointers](#device_pointers) +1. [Dense matrices](#dense_matrices) +2. [Sparse matrices](#sparse_matrices) +3. [Tridiagonal matrices](#tridiagonal_matrices) +4. [Multidiagonal matrices](#multidiagonal_matrices) +5. [Lambda matrices](#lambda_matrices) -## Unique pointers +## Dense matrices -Simillar to STL unique smart pointer `std::unique_ptr`, `UniquePointer` manages certain dynamicaly allocated object. The object is automatically deallocated when the pointer goes out of scope. The definition of `UniquePointer` reads as: +Dense matrix is a templated class defined in namespace \ref TNL::Matrices. It has five template parameters: -\include codeSnippetUniquePointer.cpp +* `Real` is a type of the matrix elements. It is `double` by default. +* `Device` is a device where the matrix shall be allocated. Currently it can be either \ref TNL::Devices::Host for CPU or \ref TNL::Devices::Cuda for GPU supporting CUDA. It is \ref TNL::Devices::Host by default. 
+* `Index` is a type to be used for indexing of the matrix elements. It is `int` by default. +* `ElementsOrganization` defines the organization of the matrix elements in memory. It can be \ref TNL::Algorithms::Segments::ColumnMajorOrder or \ref TNL::Algorithms::Segments::RowMajorOrder for column-major and row-major organization respectively. By default it is the row-major order if the matrix is allocated in the host system and column-major order if it is allocated on GPU. +* `RealAllocator` is a memory allocator (one from \ref TNL::Allocators) which shall be used for allocation of the matrix elements. By default, it is the default allocator for given `Real` type and `Device` type -- see \ref TNL::Allocators::Default. -It takes two template parameters: ### Dense matrix allocation and initialization -1. `Object` is a type of object managed by the pointer. -2. `Device` is a device where the object is to be allocated. +The following examples show how to allocate the dense matrix and how to initialize the matrix elements. Small matrices can be created simply by the constructor with an initializer list. -If the device type is `Devices::Host`, `UniquePointer` behaves as usual unique smart pointer. See the following example: +\include Matrices/DenseMatrix/DenseMatrixExample_Constructor_init_list.cpp -\include UniquePointerHostExample.cpp +In fact, the constructor takes a list of initializer lists. Each embedded list defines one matrix row and so the number of matrix rows is given by the size of the outer initializer list. The number of matrix columns is given by the longest inner initializer lists. Shorter inner lists are filled with zeros from the right side. The result looks as follows: -The result is: +\include DenseMatrixExample_Constructor_init_list.out -\include UniquePointerHostExample.out +Larger matrices can be set up with methods `setElement` and `addElement` (\ref TNL::Matrices::DenseMatrix::setElement, \ref TNL::Matrices::DenseMatrix::addElement).
The following example shows how to call these methods from the host. +\include DenseMatrixExample_addElement.cpp -If the device is different, `Devices::Cuda` for example, the unique pointer creates an image of the object even in the host memory. It allows one to manipulate the object on the host. All smart pointers are registered in a special register using which they can be synchronised with the host images before calling a CUDA kernel - all at once. This means that all modified images of the objects in the host memory are transferred on the GPU. See the following example: +As we can see, both methods can be called from the host no matter where the matrix is allocated. If it is on GPU, each call of `setElement` or `addElement` (\ref TNL::Matrices::DenseMatrix::setElement, \ref TNL::Matrices::DenseMatrix::addElement) causes slow transfer of the data between CPU and GPU. Use this approach only if the performance is not a priority, for example for matrices which are set only once this way. The result looks as follows: -\include UniquePointerExample.cpp +\include DenseMatrixExample_addElement.out -The result looks as: +A more efficient way of the matrix initialization on GPU consists in calling the methods `setElement` and `addElement` (\ref TNL::Matrices::DenseMatrix::setElement, \ref TNL::Matrices::DenseMatrix::addElement) directly from GPU. It is demonstrated in the following example (of course it works even for CPU): -\include UniquePointerExample.out +\include DenseMatrixExample_setElement.cpp -A disadventage of `UniquePointer` is that it cannot be passed to the CUDA kernel since it requires making a copy of itself. This is, however, from the nature of this object, prohibited. For this reason we have to derreference the pointer on the host. This is done by a method `getData`. Its template parameter tells what object image we want to dereference - the one on the host or the one on the device. When we passing the object on the device, we need to get the device image.
The method `getData` returns constant reference on the object. Non-constant reference is accessible via a method `modifyData`. When this method is used to get the reference on the host image, the pointer is marked as **potentialy modified**. Note that we need to have non-const reference even when we need to change the data (array elements for example) but not the meta-data (array size for example). If meta-data do not change there is no need to synchronize the object image with the one on the device. To distinguish between these two situations, the smart pointer keeps one more object image which stores the meta-data state since the last synchronization. Before the device image is synchronised, the host image and the last-synchronization-state image are compared. If they do not change no synchronization is required. One can see that TNL cross-device smart pointers are really meant only for small objects, otherwise the smart pointers overhead might be significant. +Here we use `SharedPointer` (\ref TNL::Pointers::SharedPointer) to make the matrix accessible in lambda functions even on GPU. We first call the `setElement` method from CPU to set the `i`-th diagonal element to `i`. Next we iterate over the matrix rows with `ParallelFor`and for each row we call a lambda function `f`. This is done on the same device where the matrix is allocated and so it is more efficient for matrices allocated on GPU. In the lambda function we just set the `i`-th diagonal element to `-i`. The result looks as follows: -## Shared pointers +\include DenseMatrixExample_setElement.out -One of the main goals of the TNL library is to make the development of the HPC code, including GPU kernels, as easy and efficient as possible. One way to do this is to profit from the object opriented programming even in CUDA kernels. Let us explain it on arrays. From certain point of view `Array` can be understood as an object consisting of data and metadata. 
Data part means elements that we insert into the array. Metadata is a pointer to the data but also size of the array. This information makes use of the class easier for example by checking array bounds when accessing the array elements. It is something that, when it is performed even in CUDA kernels, may help significantly with finding bugs in a code. To do this, we need to transfer not only pointers to the data but also complete metadata on the device. It is simple if the structure which is supposed to be transfered on the GPU does not have pointers to metadata. See the following example: +If we want to set more matrix elements in each row, we can use inner for-loop in the lambda function `f`. This, however, is limiting the parallelization and it can be inefficient for larger matrices. The next example demonstrates a method `forRows` (\ref TNL::Matrices::DenseMatrix::forRows) which iterates over all matrix elements in parallel and it calls a lambda function defining an operation we want to do on the matrix elements. +\include DenseMatrixExample_forRows.cpp -\include codeSnippetSharedPointer-1.cpp +Firstly note, that this is simpler since we do not need any `SharedPointer`. The lambda function `f` requires the following parameters: -If the pointer `data` points to a memory on GPU, this array can be passed to a kernel like this: +* `rowIdx` is the row index of given matrix element. +* `columnIdx` is the column index of given matrix element. +* `value` is a reference on the matrix element value and so by changing this value we can modify the matrix element. +* `compute` is a boolean which, when set to `false`, indicates that we can skip the rest of the matrix row. This is, however, only a hint and it does not guarantee that the rest of the matrix row is really skipped. 
-\include codeSnippetSharedPointer-2.cpp +The result looks as follows: -The kernel `cudaKernel` can access the data as follows: +\include DenseMatrixExample_forRows.out -\include codeSnippetSharedPointer-3.cpp +### Flexible reduction in matrix rows -But what if we have an object like this: +Similar operation to `forRows` is `rowsReduction` (\ref TNL::Matrices::DenseMatrix::rowsReduction) which performs a given reduction in each matrix row. For example, a matrix-vector product can be seen as a reduction of products of matrix elements and input vector in particular matrix rows. The first element of the result vector is obtained as: -\include codeSnippetSharedPointer-4.cpp +\f[ +y_1 = a_{11} x_1 + a_{12} x_2 + \ldots + a_{1n} x_n = \sum_{j=1}^n a_{1j}x_j +\f] -Assume that there is an instance of `ArrayTuple` lets say `tuple` containing pointers to instances `a1` and `a2` of `Array`. The instances must be allocated on the GPU if one wants to simply pass the `tuple` to the CUDA kernel. Indeed, the CUDA kernels needs the arrays `a1` and `a2` to be on the GPU. See the following example: +and in general the i-th element of the result vector is computed as -\include codeSnippetSharedPointer-5.cpp +\f[ +y_i = a_{i1} x_1 + a_{i2} x_2 + \ldots + a_{in} x_n = \sum_{j=1}^n a_{ij}x_j. +\f] -See, that the kernel needs to dereference `tuple.a1` and `tuple.a2`. Therefore these pointers must point to the global memoty of the GPU which means that arrays `a1` and `a2` must be allocated there using [cudaMalloc](http://developer.download.nvidia.com/compute/cuda/2_3/toolkit/docs/online/group__CUDART__MEMORY_gc63ffd93e344b939d6399199d8b12fef.html) lets say. It means, however, that the arrays `a1` and `a2` cannot be managed (for example resizing them requires changing `a1->size` and `a2->size`) on the host system by the CPU. The only solution to this is to have images of `a1` and `a2` and in the host memory and to copy them on the GPU before calling the CUDA kernel.
One must not forget to modify the pointers in the `tuple` to point to the array copies on the GPU. To simplify this, TNL offers *cross-device shared smart pointers*. In addition to common smart pointers thay can manage an images of an object on different devices. Note that [CUDA Unified Memory](https://devblogs.nvidia.com/unified-memory-cuda-beginners/) is an answer to this problem as well. TNL cross-device smart pointers can be more efficient in some situations. (TODO: Prove this with benchmark problem.) +We see that in the i-th matrix row we have to compute the sum \f$\sum_{j=1}^n a_{ij}x_j\f$ which is a reduction of the products \f$ a_{ij}x_j\f$. Similar to *flexible parallel reduction* (\ref TNL::Algorithms::Reduction) we just need to design proper lambda functions. See the following example: -The previous example could be implemented in TNL as follows: -\include SharedPointerExample.cpp +\include DenseMatrixExample_rowsReduction_vectorProduct.cpp +The `fetch` lambda function computes the product \f$ a_{ij}x_j\f$ where \f$ a_{ij} \f$ is represented by `value` and \f$x_j \f$ is represented by `xView[columnIdx]`. The reduction is just a sum of the particular products and it is represented by the lambda function `reduce`. Finally, the lambda function `keep` is responsible for storing the results of reduction in each matrix row (which is represented by the variable `value`) into the output vector `y`. The result looks as: -\include SharedPointerExample.out +\include DenseMatrixExample_rowsReduction_vectorProduct.out + +We will show one more example which is computation of the maximal absolute value in each matrix row. The results will be stored in a vector: + +\f[ +y_i = \max_{j=1,\ldots,n} |a_{ij}|. +\f] + +See the following example: + +\include DenseMatrixExample_rowsReduction_maxNorm.cpp + + +The `fetch` lambda function just returns the absolute value of \f$a_{ij} \f$ which is represented again by the variable `value`.
The `reduce` lambda function returns the larger of the given values and the lambda function 'keep' stores the results to the output vector the same way as in the previous example. Of course, if we compute the maximum of all output vector elements we get some kind of max matrix norm. The output looks as: + +\include DenseMatrixExample_rowsReduction_maxNorm.out + +### Dense-matrix vector product + +One of the most important matrix operations is the matrix-vector multiplication. It is represented by a method `vectorProduct` (\ref TNL::Matrices::DenseMatrix::vectorProduct). It is a templated method with two template parameters `InVector` and `OutVector` telling the types of input and output vector respectively. Usually one will substitute some of \ref TNL::Containers::Array, \ref TNL::Containers::ArrayView, \ref TNL::Containers::Vector or \ref TNL::Containers::VectorView for these types. The method accepts the following parameters: + +* `inVector` is the input vector having the same number of elements as the number of matrix columns. +* `outVector` is the output vector having the same number of elements as the number of matrix rows. +* `matrixMultiplicator` is a number by which the result of matrix-vector product is multiplied. +* `outVectorMultiplicator` is a number by which the output vector is multiplied before being added to the result of matrix-vector product. +* `begin` is an index of the first matrix row that is involved in the multiplication. It is zero by default. +* `end` is an index of the last matrix row that is involved in the multiplication. It is the last matrix row by default. + +Note that the output vector dimension must be the same as the number of matrix rows no matter how we set `begin` and `end` parameters. These parameters just say that some matrix rows and the output vector elements are omitted.
+ +To summarize, this method computes the following formula: + +`outVector = matrixMultiplicator * ( *this ) * inVector + outVectorMultiplicator * outVector.` + +### Dense matrix IO + +The dense matrix can be saved to a file using a method `save` (\ref TNL::Matrices::DenseMatrix::save) and restored with a method `load` (\ref TNL::Matrices::DenseMatrix::load). To print the matrix, a method `print` (\ref TNL::Matrices::DenseMatrix::print) can be used. + +### Dense matrix view + +Similar to array view (\ref TNL::Containers::ArrayView) and vector view (\ref TNL::Containers::VectorView), matrices also offer their view for easier use with lambda functions. For the dense matrix there is a `DenseMatrixView` (\ref TNL::Matrices::DenseMatrixView). We will demonstrate it on the example showing the method `setElement` (\ref TNL::Matrices::DenseMatrix::setElement). However, the `SharedPointer` will be replaced with the `DenseMatrixView`. The code looks as follows: + +\include DenseMatrixViewExample_setElement.cpp + +And the result is:
In this case, the programmer gets non-constant reference to an object which is however meant to be used to change the data managed by the object but not the metadata. There is no way to synchronize objects managed by the smart pointers from the device to the host. **It means that the metadata should not be changed on the device!** In fact, it would not make sense. Imagine changing array size or re-allocating the array within a CUDA kernel. This is something one should never do. +\include DenseMatrixViewExample_setElement.out -## Device pointers -The last type of the smart pointer implemented in TNL is `DevicePointer`. It works the same way as `SharedPointer` but it does not create new object on the host system. `DevicePointer` is therefore useful in situation when there is already an object created in the host memory and we want to create its image even on the device. Both images are linked one with each other and so one can just manipulate the one on the host and then synchronize it on the device. The following listing is a modification of the previous example with tuple: +## Sparse matrices -\include DevicePointerExample.cpp +## Tridiagonal matrices -The result looks the same: +## Multidiagonal matrices -\include DevicePointerExample.out +## Lambda matrices diff --git a/Documentation/Tutorials/index.md b/Documentation/Tutorials/index.md index 132f30799..d517faa3b 100644 --- a/Documentation/Tutorials/index.md +++ b/Documentation/Tutorials/index.md @@ -2,8 +2,10 @@ ## Tutorials -1. [Arrays](tutorial_Arrays.html) -2. [Vectors](tutorial_Vectors.html) -3. [Flexible parallel reduction and scan](tutorial_ReductionAndScan.html) -4. [For loops](tutorial_ForLoops.html) -5. [Cross-device pointers](tutorial_Pointers.html) +1. [Building applications with TNL](tutorial_building_applications_with_tnl.html) +2. [Arrays](tutorial_Arrays.html) +3. [Vectors](tutorial_Vectors.html) +4. [Flexible parallel reduction and scan](tutorial_ReductionAndScan.html) +5. 
[For loops](tutorial_ForLoops.html) +6. [Cross-device pointers](tutorial_Pointers.html) +7. [Matrices](tutorial_Matrices.html) -- GitLab From 619b455b5411e419336829e0074e9d6f64c27394 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= Date: Fri, 21 Aug 2020 14:19:36 +0200 Subject: [PATCH 04/53] Added SparseMatrix constructor with row capacities vector. --- src/TNL/Matrices/SparseMatrix.h | 37 ++++++++++++++++++++++++++++--- src/TNL/Matrices/SparseMatrix.hpp | 24 ++++++++++++++++++-- 2 files changed, 56 insertions(+), 5 deletions(-) diff --git a/src/TNL/Matrices/SparseMatrix.h b/src/TNL/Matrices/SparseMatrix.h index 3bb7a3e58..6d068f370 100644 --- a/src/TNL/Matrices/SparseMatrix.h +++ b/src/TNL/Matrices/SparseMatrix.h @@ -41,7 +41,7 @@ namespace Matrices { * \tparam RealAllocator is allocator for the matrix elements values. * \tparam IndexAllocator is allocator for the matrix elements column indexes. */ -template< typename Real, +template< typename Real = double, typename Device = Devices::Host, typename Index = int, typename MatrixType = GeneralMatrix, @@ -202,14 +202,15 @@ class SparseMatrix : public Matrix< Real, Device, Index, RealAllocator > * \param realAllocator is used for allocation of matrix elements values. * \param indexAllocator is used for allocation of matrix elements column indexes. */ - SparseMatrix( const IndexType rows, - const IndexType columns, + template< typename Index_t, std::enable_if_t< std::is_integral< Index_t >::value, int > = 0 > + SparseMatrix( const Index_t rows, + const Index_t columns, const RealAllocatorType& realAllocator = RealAllocatorType(), const IndexAllocatorType& indexAllocator = IndexAllocatorType() ); /** - * \brief Constructor with matrix rows capacities and number of columns. - * + * \brief Constructor with matrix rows capacities given as an initializer list and a number of columns. + *
* The number of matrix rows is given by the size of \e rowCapacities list. * * \tparam ListIndex is the initializer list values type. @@ -230,6 +231,31 @@ class SparseMatrix : public Matrix< Real, Device, Index, RealAllocator > const RealAllocatorType& realAllocator = RealAllocatorType(), const IndexAllocatorType& indexAllocator = IndexAllocatorType() ); + /** + * \brief Constructor with matrix rows capacities given as a vector and number of columns. + * + * The number of matrix rows is given by the size of \e rowCapacities vector. + * + * \tparam RowCapacitiesVector is the row capacities vector type. Usually it is some of + * \ref TNL::Containers::Array, \ref TNL::Containers::ArrayView, \ref TNL::Containers::Vector or + * \ref TNL::Containers::VectorView. + * \param rowCapacities is a vector telling how many matrix elements must be + * allocated in each row. + * \param columns is the number of matrix columns. + * \param realAllocator is used for allocation of matrix elements values. + * \param indexAllocator is used for allocation of matrix elements column indexes. + * + * \par Example + * \include Matrices/SparseMatrix/SparseMatrixExample_Constructor_rowCapacities_vector.cpp + * \par Output + * \include SparseMatrixExample_Constructor_rowCapacities_vector.out + */ + template< typename RowCapacitiesVector, std::enable_if_t< TNL::IsArrayType< RowCapacitiesVector >::value, int > = 0 > + explicit SparseMatrix( const RowCapacitiesVector& rowCapacities, + const IndexType columns, + const RealAllocatorType& realAllocator = RealAllocatorType(), + const IndexAllocatorType& indexAllocator = IndexAllocatorType() ); + /** * \brief Constructor with matrix dimensions and data in initializer list.
* diff --git a/src/TNL/Matrices/SparseMatrix.hpp b/src/TNL/Matrices/SparseMatrix.hpp index 1371fc27b..e2086d0eb 100644 --- a/src/TNL/Matrices/SparseMatrix.hpp +++ b/src/TNL/Matrices/SparseMatrix.hpp @@ -41,9 +41,10 @@ template< typename Real, typename ComputeReal, typename RealAllocator, typename IndexAllocator > + template< typename Index_t, std::enable_if_t< std::is_integral< Index_t >::value, int > > SparseMatrix< Real, Device, Index, MatrixType, Segments, ComputeReal, RealAllocator, IndexAllocator >:: -SparseMatrix( const IndexType rows, - const IndexType columns, +SparseMatrix( const Index_t rows, + const Index_t columns, const RealAllocatorType& realAllocator, const IndexAllocatorType& indexAllocator ) : BaseType( rows, columns, realAllocator ), columnIndexes( indexAllocator ), @@ -71,6 +72,25 @@ SparseMatrix( const std::initializer_list< ListIndex >& rowCapacities, this->setRowCapacities( RowsCapacitiesType( rowCapacities ) ); } +template< typename Real, + typename Device, + typename Index, + typename MatrixType, + template< typename, typename, typename > class Segments, + typename ComputeReal, + typename RealAllocator, + typename IndexAllocator > + template< typename RowCapacitiesVector, std::enable_if_t< TNL::IsArrayType< RowCapacitiesVector >::value, int > > +SparseMatrix< Real, Device, Index, MatrixType, Segments, ComputeReal, RealAllocator, IndexAllocator >:: +SparseMatrix( const RowCapacitiesVector& rowCapacities, + const IndexType columns, + const RealAllocatorType& realAllocator, + const IndexAllocatorType& indexAllocator ) +: BaseType( rowCapacities.getSize(), columns, realAllocator ), columnIndexes( indexAllocator ) +{ + this->setRowCapacities( rowCapacities ); +} + template< typename Real, typename Device, typename Index, -- GitLab From 1efbddf2cd95531c2937b6b24904a2e394edc18d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= Date: Fri, 21 Aug 2020 14:20:28 +0200 Subject: [PATCH 05/53] Writting documentation on sparse 
matrices. --- .../Matrices/SparseMatrix/CMakeLists.txt | 11 ++ ...ample_Constructor_rowCapacities_vector.cpp | 30 +++++ ...xample_Constructor_rowCapacities_vector.cu | 1 + .../SparseMatrixExample_setRowCapacities.cpp | 4 +- .../Tutorials/Matrices/CMakeLists.txt | 45 +++++-- ...eMatrixExample_Constructor_init_list_2.cpp | 1 + ...seMatrixExample_Constructor_init_list_2.cu | 1 + ...ample_Constructor_rowCapacities_vector.cpp | 1 + ...xample_Constructor_rowCapacities_vector.cu | 1 + .../SparseMatrixExample_setRowCapacities.cpp | 1 + .../SparseMatrixExample_setRowCapacities.cu | 1 + .../Tutorials/Matrices/tutorial_Matrices.md | 118 +++++++++++++++++- 12 files changed, 204 insertions(+), 11 deletions(-) create mode 100644 Documentation/Examples/Matrices/SparseMatrix/SparseMatrixExample_Constructor_rowCapacities_vector.cpp create mode 120000 Documentation/Examples/Matrices/SparseMatrix/SparseMatrixExample_Constructor_rowCapacities_vector.cu create mode 120000 Documentation/Tutorials/Matrices/SparseMatrixExample_Constructor_init_list_2.cpp create mode 120000 Documentation/Tutorials/Matrices/SparseMatrixExample_Constructor_init_list_2.cu create mode 120000 Documentation/Tutorials/Matrices/SparseMatrixExample_Constructor_rowCapacities_vector.cpp create mode 120000 Documentation/Tutorials/Matrices/SparseMatrixExample_Constructor_rowCapacities_vector.cu create mode 120000 Documentation/Tutorials/Matrices/SparseMatrixExample_setRowCapacities.cpp create mode 120000 Documentation/Tutorials/Matrices/SparseMatrixExample_setRowCapacities.cu diff --git a/Documentation/Examples/Matrices/SparseMatrix/CMakeLists.txt b/Documentation/Examples/Matrices/SparseMatrix/CMakeLists.txt index 3f0410315..e4000dec8 100644 --- a/Documentation/Examples/Matrices/SparseMatrix/CMakeLists.txt +++ b/Documentation/Examples/Matrices/SparseMatrix/CMakeLists.txt @@ -9,6 +9,11 @@ IF( BUILD_CUDA ) ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/SparseMatrixExample_Constructor_init_list_2.out OUTPUT 
SparseMatrixExample_Constructor_init_list_2.out ) + CUDA_ADD_EXECUTABLE( SparseMatrixExample_Constructor_rowCapacities_vector_cuda SparseMatrixExample_Constructor_rowCapacities_vector.cu ) + ADD_CUSTOM_COMMAND( COMMAND SparseMatrixExample_Constructor_rowCapacities_vector_cuda > + ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/SparseMatrixExample_Constructor_rowCapacities_vector.out + OUTPUT SparseMatrixExample_Constructor_rowCapacities_vector.out ) + CUDA_ADD_EXECUTABLE( SparseMatrixExample_Constructor_std_map_cuda SparseMatrixExample_Constructor_std_map.cu ) ADD_CUSTOM_COMMAND( COMMAND SparseMatrixExample_Constructor_std_map_cuda > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/SparseMatrixExample_Constructor_std_map.out @@ -150,6 +155,11 @@ ELSE() ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/SparseMatrixExample_Constructor_init_list_2.out OUTPUT SparseMatrixExample_Constructor_init_list_2.out ) + ADD_EXECUTABLE( SparseMatrixExample_Constructor_rowCapacities_vector SparseMatrixExample_Constructor_rowCapacities_vector.cpp ) + ADD_CUSTOM_COMMAND( COMMAND SparseMatrixExample_Constructor_rowCapacities_vector > + ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/SparseMatrixExample_Constructor_rowCapacities_vector.out + OUTPUT SparseMatrixExample_Constructor_rowCapacities_vector.out ) + ADD_EXECUTABLE( SparseMatrixExample_Constructor_std_map SparseMatrixExample_Constructor_std_map.cpp ) ADD_CUSTOM_COMMAND( COMMAND SparseMatrixExample_Constructor_std_map > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/SparseMatrixExample_Constructor_std_map.out @@ -285,6 +295,7 @@ ENDIF() ADD_CUSTOM_TARGET( RunSparseMatricesExamples ALL DEPENDS SparseMatrixExample_Constructor_init_list_1.out SparseMatrixExample_Constructor_init_list_2.out + SparseMatrixExample_Constructor_rowCapacities_vector.out SparseMatrixExample_Constructor_std_map.out SparseMatrixExample_getSerializationType.out SparseMatrixExample_setRowCapacities.out diff --git 
a/Documentation/Examples/Matrices/SparseMatrix/SparseMatrixExample_Constructor_rowCapacities_vector.cpp b/Documentation/Examples/Matrices/SparseMatrix/SparseMatrixExample_Constructor_rowCapacities_vector.cpp new file mode 100644 index 000000000..0ad6e6c4c --- /dev/null +++ b/Documentation/Examples/Matrices/SparseMatrix/SparseMatrixExample_Constructor_rowCapacities_vector.cpp @@ -0,0 +1,30 @@ +#include +#include +#include +#include + + +template< typename Device > +void initializerListExample() +{ + TNL::Containers::Vector< int, Device > rowCapacities{ 1, 2, 3, 4, 5 }; + TNL::Matrices::SparseMatrix< double, Device > matrix { + rowCapacities, // row capacities + 6 }; // number of matrix columns + + for( int row = 0; row < matrix.getRows(); row++ ) + for( int column = 0; column <= row; column++ ) + matrix.setElement( row, column, row - column + 1 ); + std::cout << "General sparse matrix: " << std::endl << matrix << std::endl; +} + +int main( int argc, char* argv[] ) +{ + std::cout << "Creating matrices on CPU ... " << std::endl; + initializerListExample< TNL::Devices::Host >(); + +#ifdef HAVE_CUDA + std::cout << "Creating matrices on CUDA GPU ... 
" << std::endl; + initializerListExample< TNL::Devices::Cuda >(); +#endif +} diff --git a/Documentation/Examples/Matrices/SparseMatrix/SparseMatrixExample_Constructor_rowCapacities_vector.cu b/Documentation/Examples/Matrices/SparseMatrix/SparseMatrixExample_Constructor_rowCapacities_vector.cu new file mode 120000 index 000000000..e409998b7 --- /dev/null +++ b/Documentation/Examples/Matrices/SparseMatrix/SparseMatrixExample_Constructor_rowCapacities_vector.cu @@ -0,0 +1 @@ +SparseMatrixExample_Constructor_rowCapacities_vector.cpp \ No newline at end of file diff --git a/Documentation/Examples/Matrices/SparseMatrix/SparseMatrixExample_setRowCapacities.cpp b/Documentation/Examples/Matrices/SparseMatrix/SparseMatrixExample_setRowCapacities.cpp index f282aee6d..fbf87e57a 100644 --- a/Documentation/Examples/Matrices/SparseMatrix/SparseMatrixExample_setRowCapacities.cpp +++ b/Documentation/Examples/Matrices/SparseMatrix/SparseMatrixExample_setRowCapacities.cpp @@ -19,11 +19,11 @@ void setRowCapacitiesExample() int main( int argc, char* argv[] ) { - std::cout << "Creating matrices on CPU ... " << std::endl; + std::cout << "Creating matrix on CPU ... " << std::endl; setRowCapacitiesExample< TNL::Devices::Host >(); #ifdef HAVE_CUDA - std::cout << "Creating matrices on CUDA GPU ... " << std::endl; + std::cout << "Creating matrix on CUDA GPU ... 
" << std::endl; setRowCapacitiesExample< TNL::Devices::Cuda >(); #endif } diff --git a/Documentation/Tutorials/Matrices/CMakeLists.txt b/Documentation/Tutorials/Matrices/CMakeLists.txt index 13140ac05..2ec5496a0 100644 --- a/Documentation/Tutorials/Matrices/CMakeLists.txt +++ b/Documentation/Tutorials/Matrices/CMakeLists.txt @@ -1,24 +1,53 @@ IF( BUILD_CUDA ) CUDA_ADD_EXECUTABLE( DenseMatrixExample_Constructor_init_list DenseMatrixExample_Constructor_init_list.cu ) - ADD_CUSTOM_COMMAND( COMMAND DenseMatrixExample_Constructor_init_list > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/DenseMatrixExample_Constructor_init_list.out OUTPUT DenseMatrixExample_Constructor_init_list.out ) + ADD_CUSTOM_COMMAND( COMMAND DenseMatrixExample_Constructor_init_list > + ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/DenseMatrixExample_Constructor_init_list.out + OUTPUT DenseMatrixExample_Constructor_init_list.out ) CUDA_ADD_EXECUTABLE( DenseMatrixExample_addElement DenseMatrixExample_addElement.cu ) - ADD_CUSTOM_COMMAND( COMMAND DenseMatrixExample_addElement > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/DenseMatrixExample_addElement.out OUTPUT DenseMatrixExample_addElement.out ) + ADD_CUSTOM_COMMAND( COMMAND DenseMatrixExample_addElement > + ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/DenseMatrixExample_addElement.out + OUTPUT DenseMatrixExample_addElement.out ) CUDA_ADD_EXECUTABLE( DenseMatrixExample_setElement DenseMatrixExample_setElement.cu ) - ADD_CUSTOM_COMMAND( COMMAND DenseMatrixExample_setElement > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/DenseMatrixExample_setElement.out OUTPUT DenseMatrixExample_setElement.out ) + ADD_CUSTOM_COMMAND( COMMAND DenseMatrixExample_setElement > + ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/DenseMatrixExample_setElement.out + OUTPUT DenseMatrixExample_setElement.out ) CUDA_ADD_EXECUTABLE( DenseMatrixExample_forRows DenseMatrixExample_forRows.cu ) - ADD_CUSTOM_COMMAND( COMMAND DenseMatrixExample_forRows > 
${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/DenseMatrixExample_forRows.out OUTPUT DenseMatrixExample_forRows.out ) + ADD_CUSTOM_COMMAND( COMMAND DenseMatrixExample_forRows > + ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/DenseMatrixExample_forRows.out + OUTPUT DenseMatrixExample_forRows.out ) CUDA_ADD_EXECUTABLE( DenseMatrixExample_rowsReduction_vectorProduct DenseMatrixExample_rowsReduction_vectorProduct.cu ) - ADD_CUSTOM_COMMAND( COMMAND DenseMatrixExample_rowsReduction_vectorProduct > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/DenseMatrixExample_rowsReduction_vectorProduct.out OUTPUT DenseMatrixExample_rowsReduction_vectorProduct.out ) + ADD_CUSTOM_COMMAND( COMMAND DenseMatrixExample_rowsReduction_vectorProduct > + ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/DenseMatrixExample_rowsReduction_vectorProduct.out + OUTPUT DenseMatrixExample_rowsReduction_vectorProduct.out ) CUDA_ADD_EXECUTABLE( DenseMatrixExample_rowsReduction_maxNorm DenseMatrixExample_rowsReduction_maxNorm.cu ) - ADD_CUSTOM_COMMAND( COMMAND DenseMatrixExample_rowsReduction_maxNorm > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/DenseMatrixExample_rowsReduction_maxNorm.out OUTPUT DenseMatrixExample_rowsReduction_maxNorm.out ) + ADD_CUSTOM_COMMAND( COMMAND DenseMatrixExample_rowsReduction_maxNorm > + ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/DenseMatrixExample_rowsReduction_maxNorm.out + OUTPUT DenseMatrixExample_rowsReduction_maxNorm.out ) CUDA_ADD_EXECUTABLE( DenseMatrixViewExample_setElement DenseMatrixViewExample_setElement.cu ) - ADD_CUSTOM_COMMAND( COMMAND DenseMatrixViewExample_setElement > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/DenseMatrixViewExample_setElement.out OUTPUT DenseMatrixViewExample_setElement.out ) + ADD_CUSTOM_COMMAND( COMMAND DenseMatrixViewExample_setElement > + ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/DenseMatrixViewExample_setElement.out OUTPUT + DenseMatrixViewExample_setElement.out ) + + CUDA_ADD_EXECUTABLE( SparseMatrixExample_Constructor_init_list_2 
SparseMatrixExample_Constructor_init_list_2.cu ) + ADD_CUSTOM_COMMAND( COMMAND SparseMatrixExample_Constructor_init_list_2 > + ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/SparseMatrixExample_Constructor_init_list_2.out + OUTPUT SparseMatrixExample_Constructor_init_list_2.out ) + + CUDA_ADD_EXECUTABLE( SparseMatrixExample_setRowCapacities SparseMatrixExample_setRowCapacities.cu ) + ADD_CUSTOM_COMMAND( COMMAND SparseMatrixExample_setRowCapacities > + ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/SparseMatrixExample_setRowCapacities.out + OUTPUT SparseMatrixExample_setRowCapacities.out ) + + CUDA_ADD_EXECUTABLE( SparseMatrixExample_Constructor_rowCapacities_vector SparseMatrixExample_Constructor_rowCapacities_vector.cu ) + ADD_CUSTOM_COMMAND( COMMAND SparseMatrixExample_Constructor_rowCapacities_vector > + ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/SparseMatrixExample_Constructor_rowCapacities_vector.out + OUTPUT SparseMatrixExample_Constructor_rowCapacities_vector.out ) ELSE() # ADD_EXECUTABLE( UniquePointerExample UniquePointerExample.cpp ) @@ -38,6 +67,8 @@ ADD_CUSTOM_TARGET( TutorialsMatricesCuda ALL DEPENDS DenseMatrixExample_rowsReduction_vectorProduct.out DenseMatrixExample_rowsReduction_maxNorm.out DenseMatrixViewExample_setElement.out + SparseMatrixExample_Constructor_init_list_2.out + SparseMatrixExample_setRowCapacities.out ) ENDIF() # diff --git a/Documentation/Tutorials/Matrices/SparseMatrixExample_Constructor_init_list_2.cpp b/Documentation/Tutorials/Matrices/SparseMatrixExample_Constructor_init_list_2.cpp new file mode 120000 index 000000000..9d23bbb1c --- /dev/null +++ b/Documentation/Tutorials/Matrices/SparseMatrixExample_Constructor_init_list_2.cpp @@ -0,0 +1 @@ +../../Examples/Matrices/SparseMatrix/SparseMatrixExample_Constructor_init_list_2.cpp \ No newline at end of file diff --git a/Documentation/Tutorials/Matrices/SparseMatrixExample_Constructor_init_list_2.cu b/Documentation/Tutorials/Matrices/SparseMatrixExample_Constructor_init_list_2.cu new file 
mode 120000 index 000000000..759f1a1ca --- /dev/null +++ b/Documentation/Tutorials/Matrices/SparseMatrixExample_Constructor_init_list_2.cu @@ -0,0 +1 @@ +../../Examples/Matrices/SparseMatrix/SparseMatrixExample_Constructor_init_list_2.cu \ No newline at end of file diff --git a/Documentation/Tutorials/Matrices/SparseMatrixExample_Constructor_rowCapacities_vector.cpp b/Documentation/Tutorials/Matrices/SparseMatrixExample_Constructor_rowCapacities_vector.cpp new file mode 120000 index 000000000..ddeed9a7b --- /dev/null +++ b/Documentation/Tutorials/Matrices/SparseMatrixExample_Constructor_rowCapacities_vector.cpp @@ -0,0 +1 @@ +../../Examples/Matrices/SparseMatrix/SparseMatrixExample_Constructor_rowCapacities_vector.cpp \ No newline at end of file diff --git a/Documentation/Tutorials/Matrices/SparseMatrixExample_Constructor_rowCapacities_vector.cu b/Documentation/Tutorials/Matrices/SparseMatrixExample_Constructor_rowCapacities_vector.cu new file mode 120000 index 000000000..5957448c6 --- /dev/null +++ b/Documentation/Tutorials/Matrices/SparseMatrixExample_Constructor_rowCapacities_vector.cu @@ -0,0 +1 @@ +../../Examples/Matrices/SparseMatrix/SparseMatrixExample_Constructor_rowCapacities_vector.cu \ No newline at end of file diff --git a/Documentation/Tutorials/Matrices/SparseMatrixExample_setRowCapacities.cpp b/Documentation/Tutorials/Matrices/SparseMatrixExample_setRowCapacities.cpp new file mode 120000 index 000000000..973b2f3a8 --- /dev/null +++ b/Documentation/Tutorials/Matrices/SparseMatrixExample_setRowCapacities.cpp @@ -0,0 +1 @@ +../../Examples/Matrices/SparseMatrix/SparseMatrixExample_setRowCapacities.cpp \ No newline at end of file diff --git a/Documentation/Tutorials/Matrices/SparseMatrixExample_setRowCapacities.cu b/Documentation/Tutorials/Matrices/SparseMatrixExample_setRowCapacities.cu new file mode 120000 index 000000000..ef674e0f0 --- /dev/null +++ b/Documentation/Tutorials/Matrices/SparseMatrixExample_setRowCapacities.cu @@ -0,0 +1 @@ 
+../../Examples/Matrices/SparseMatrix/SparseMatrixExample_setRowCapacities.cu \ No newline at end of file diff --git a/Documentation/Tutorials/Matrices/tutorial_Matrices.md b/Documentation/Tutorials/Matrices/tutorial_Matrices.md index 022daf6ae..d4fa61d97 100644 --- a/Documentation/Tutorials/Matrices/tutorial_Matrices.md +++ b/Documentation/Tutorials/Matrices/tutorial_Matrices.md @@ -19,7 +19,7 @@ Dense matrix is a templated class defined in namespace \ref TNL::Matrices. It ha * `Real` is a type of the matrix elements. It is `double` by default. * `Device` is a device where the matrix shall be allocated. Currently it can be either \ref TNL::Devices::Host for CPU or \ref TNL::Devices::Cuda for GPU supporting CUDA. It is \ref TNL::Devices::Host by default. -* `Index` is a type to be used for indexing of the matrix elements. It `int` by default. +* `Index` is a type to be used for indexing of the matrix elements. It is `int` by default. * `ElementsOrganization` defines the organization of the matrix elements in memory. It can be \ref TNL::Algorithms::Segments::ColumnMajorOrder or \ref TNL::Algorithms::Segments::RowMajorOrder for column-major and row-major organization respectively. Be default it is the row-major order if the matrix is allocated in the host system and column major order if it is allocated on GPU. * `RealAllocator` is a memory allocator (one from \ref TNL::Allocators) which shall be used for allocation of the matrix elements. By default, it is the default allocator for given `Real` type and `Device` type -- see \ref TNL::Allocators::Default. @@ -126,7 +126,11 @@ The dense matrix can be saved to a file using a method `save` (\ref TNL::Matrice ### Dense matrix view -Similar to array view (\ref TNL::Containers::ArayView) and vector view (\ref TNL::Containers::VectorView), matrices also offer their view for easier use with lambda functions. For the dense matrix there is a `DenseMatrixView` (\ref TNL::Matrcioes::DenseMatrixView). 
We will demonstrate it on the example showing the method `setElement` (\ref TNL::Matrices::DenseMatrix::setElement). However, the `SharedPointer` will be replaced with the `DenseMatrixView`. The code looks as follows: +Similar to array view (\ref TNL::Containers::ArayView) and vector view (\ref TNL::Containers::VectorView), matrices also offer their view for easier use with lambda functions. For the dense matrix there is a `DenseMatrixView` (\ref TNL::Matrcioes::DenseMatrixView). + +TODO: Template parameters description + +We will demonstrate it on the example showing the method `setElement` (\ref TNL::Matrices::DenseMatrix::setElement). However, the `SharedPointer` will be replaced with the `DenseMatrixView`. The code looks as follows: \include DenseMatrixViewExample_setElement.cpp @@ -134,11 +138,121 @@ And the result is: \include DenseMatrixViewExample_setElement.out +The dense matrix view offers almost all methods which the dense matrix does. So it can be easily used at almost any situation the same way as the dense matrix itself. + + +TODO: Using DenseMatrixView for data encapsulation ## Sparse matrices +[Sparse matrices](https://en.wikipedia.org/wiki/Sparse_matrix) arte extremely important in a lot of numerical algorithms. They are used at situations when we need to operate with matrices having majority of the matrix elements equal to zero. In this case, only the non-zero matrix elements are stored with possible some *padding zeros* used for memory alignment. This is necessary mainly on GPUs. Consider just matrix having 50,000 rows and columns whih is 2,500,000,000 matrix elements. If we store each matrix element in double precision (it means eight bytes per element) we need 20,000,000,000 bytes which is nearly 20 GB of memory. If there are only five non-zero elements in each row we need only \f$8 \times 5 \times 50,000=2,000,000\f$ bytes and so nearly 200 MB. It is really great difference. 
+ +Major disadventage of sparse matrices is that there are a lot of different formats for storing such matrices. Though [CSR - Compressed Sparse Row](https://en.wikipedia.org/wiki/Sparse_matrix#Compressed_sparse_row_(CSR,_CRS_or_Yale_format)) format is the most popular of all, especially for GPUs there are many other formats which perform differently on various matrices. So it is a good idea to test several sparse matrix formats if you want to get the best performance. In TNL, there is one templated class \ref TNL::Matrices::SparseMatrix representing the sparse matrices. The change of underlying matrix format can be done just by changing one template parameter. The list of the template paramaters is as follows: + + +* `Real` is type if the matrix elements. It is `double` by default. +* `Device` is a device where the matrix is allocated. Currently it can be either \ref TNL::Devices::Host for CPU or \ref TNL::Devices::Cuda for GPU supporting CUDA. It is \ref TNL::Devices::Host by default. +* `Index` is a type to be used for indexing of the matrix elements. It is `int` by default. +* `MatrixType` tells if the matrix is symmetric (\ref TNL::Matrices::SymmetricMatrix) or general (\ref TNL::Matrices::GeneralMatrix). It is a \ref TNL::Matrices::GeneralMatrix by default. +* `Segments` define the format of the sparse matrix. It can be (by default, it is \ref TNL::Algorithms::Segments::CSR): + * \ref TNL::Algorithms::Segments::CSR for [CSR format](https://en.wikipedia.org/wiki/Sparse_matrix#Compressed_sparse_row_(CSR,_CRS_or_Yale_format)). + * \ref TNL::Algorithms::Segments::Ellpack for [Ellpack format](http://mgarland.org/files/papers/nvr-2008-004.pdf). + * \ref TNL::Algorithms::Segments::SlicedEllpack for [SlicedEllpack format](https://link.springer.com/chapter/10.1007/978-3-642-11515-8_10) which was also presented as [Row-grouped CSR format](https://arxiv.org/abs/1012.2270). 
+ * \ref TNL::Algorithms::Segments::ChunkedEllpack for [ChunkedEllpack format](http://geraldine.fjfi.cvut.cz/~oberhuber/data/vyzkum/publikace/12-heller-oberhuber-improved-rgcsr-format.pdf) which we reffered as Improved Row-grouped CSR and we renamed it to Ellpack format since it uses padding zeros. + * \ref TNL::Algorithms::Segments::BiEllpack for [BiEllpack format](https://www.sciencedirect.com/science/article/pii/S0743731514000458?casa_token=2phrEj0Ef1gAAAAA:Lgf6rMBUN6T7TJne6mAgI_CSUJ-jR8jz7Eghdv6L0SJeGm4jfso-x6Wh8zgERk3Si7nFtTAJngg). +* `ComputeReal` is type which is used for internal computations. By default it is the same as `Real` if `Real` is not `bool`. If `Real` is `bool`, `ComputeReal` is set to `Index` type. This can be changed, of course, by the user. +* `RealAllocator` is a memory allocator (one from \ref TNL::Allocators) which shall be used for allocation of the matrix elements. By default, it is the default allocator for given `Real` type and `Device` type – see TNL::Allocators::Default. +* `IndexAllocator` is a memory allocator (one from \ref TNL::Allocators) which shall be used for allocation of the column indexes of the matrix elements. By default, it is the default allocator for given `Index` type and `Device` type – see \ref TNL::Allocators::Default. + +If `Real` is set to `bool`, we get *a binary matrix* for which the non-zero elements can be equal only to one and so the matrix elements values are not stored explicitly in the memory. + +### Sparse matrix allocation and initiation + +Small matrices can be initialized by a constructor with initializer list. We assume having the follwong sparse matrix + +\f[ +\left( +\begin{array}{ccccc} + 1 & 0 & 0 & 0 & 0 \\ +-1 & 2 & -1 & 0 & 0 \\ + 0 & -1 & 2 & -1 & 0 \\ + 0 & 0 & -1 & 2 & -1 \\ + 0 & 0 & 0 & -1 & 0 +\end{array} +\right). 
+\f] + +The following example shows how to create it using the initializer list constructor: + +\include SparseMatrixExample_Constructor_init_list_2.cpp + +The constructor accepts the following parameters: + +* `rows` is a number of matrix rows. +* `columns` is a number of matrix columns. +* `data` is definition of non-zero matrix elements. It is a initializer list of triples having a form `{ row_index, column_index, value }`. In fact, it is very much like the coordinates format - [COO](https://en.wikipedia.org/wiki/Sparse_matrix#Coordinate_list_(COO)). + +The constructor also accepts `Real` and `Index` allocators (\ref TNL::Allocators) but their are not important for this example. The result looks as follows: + +\include SparseMatrixExample_Constructor_init_list_2.out + +Larger matrices are created in two steps: + +1. We use a method \ref TNL::Matrices::SparseMatrix::setRowCapacities to initialize the underlying matrix format and to allocate memory for the matrix elements. This method only needs to know how many non-zero elements are supposed to be in each row. Once this is set, it cannot be changed only by reseting the whole matrix. In most situations, this is not an issue to compute the number of non-zero elements in each row. Note, however, that we do not tell the positions of the non-zeto elements. If some matrix format needs this information it cannot be used with this implementation of the sparse matrix. +2. The non-zero matrix elements can be set-up. We insert one non-zero element after another by telling its coordinates and a value. Since probably all sparse matrix formats are designed to allow quick acces to particular matrix rows, this insertion is usualy quite efficient and can by done in parallel by mapping different threads to different matrix rows. 
+ +See the following example which creates lower triangular matrix like this one + +\f[ +\left( +\begin{array}{ccccc} + 1 & 0 & 0 & 0 & 0 \\ + 2 & 1 & 0 & 0 & 0 \\ + 3 & 2 & 1 & 0 & 0 \\ + 4 & 3 & 2 & 1 & 0 \\ + 5 & 4 & 3 & 2 & 1 +\end{array} +\right). +\f] + + +\include SparseMatrixExample_setRowCapacities.cpp + +The method \ref TNL::Matrices::SparseMatrix::setRowCapacities reads the required capacities of the matrix rows from a vector (or simmilar container - \ref TNL::Containers::Array, \ref TNL::Containers::ArrayView, \ref TNL::Containers::Vector and \ref TNL::Containers::VectorView) which has the same number of elements as the number of matrix rows and each element defines the capacity of the related row. The result looks as follows: + +\include SparseMatrixExample_setRowCapacities.out + +There are constructors which also set the row capacities, one uses a vector ... + +\include SparseMatrixExample_Constructor_rowCapacities_vector.cpp + +... the result looks as follows ... + +\include SparseMatrixExample_Constructor_rowCapacities_vector.out + +while the other uses an initializer list ... + +\include SparseMatrixExample_Constructor_init_list_1.cpp + +... the result looks as follows. + +\include SparseMatrixExample_Constructor_init_list_1.out. + + + +### Flexible reduction in matrix rows +### Sparse-matrix vector product +### Sparse matrix IO +### Sparse matrix view + ## Tridiagonal matrices +### Dense matrix allocation and initiation +### Flexible reduction in matrix rows +### Dense-matrix vector product +### Dense matrix IO +### Dense matrix view + ## Multidiagonal matrices ## Lambda matrices -- GitLab From aa1dbbc285a95a0d45a75c5de5aabc0256aac3f5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= Date: Fri, 28 Aug 2020 13:04:58 +0200 Subject: [PATCH 06/53] Writting documentation on sparse matrices. 
--- .../SparseMatrixExample_setElement.cpp | 7 ++ .../Tutorials/Matrices/CMakeLists.txt | 38 +++++++- ...parseMatrixExample_Constructor_std_map.cpp | 1 + ...SparseMatrixExample_Constructor_std_map.cu | 1 + .../SparseMatrixExample_addElement.cpp | 1 + .../SparseMatrixExample_addElement.cu | 1 + .../Matrices/SparseMatrixExample_forRows.cpp | 1 + .../Matrices/SparseMatrixExample_forRows.cu | 1 + .../SparseMatrixExample_setElement.cpp | 1 + .../SparseMatrixExample_setElement.cu | 1 + .../SparseMatrixExample_setElements.cpp | 1 + .../SparseMatrixExample_setElements.cu | 1 + .../SparseMatrixExample_setElements_map.cpp | 1 + .../SparseMatrixExample_setElements_map.cu | 1 + .../Tutorials/Matrices/tutorial_Matrices.md | 94 +++++++++++++++---- 15 files changed, 130 insertions(+), 21 deletions(-) create mode 120000 Documentation/Tutorials/Matrices/SparseMatrixExample_Constructor_std_map.cpp create mode 120000 Documentation/Tutorials/Matrices/SparseMatrixExample_Constructor_std_map.cu create mode 120000 Documentation/Tutorials/Matrices/SparseMatrixExample_addElement.cpp create mode 120000 Documentation/Tutorials/Matrices/SparseMatrixExample_addElement.cu create mode 120000 Documentation/Tutorials/Matrices/SparseMatrixExample_forRows.cpp create mode 120000 Documentation/Tutorials/Matrices/SparseMatrixExample_forRows.cu create mode 120000 Documentation/Tutorials/Matrices/SparseMatrixExample_setElement.cpp create mode 120000 Documentation/Tutorials/Matrices/SparseMatrixExample_setElement.cu create mode 120000 Documentation/Tutorials/Matrices/SparseMatrixExample_setElements.cpp create mode 120000 Documentation/Tutorials/Matrices/SparseMatrixExample_setElements.cu create mode 120000 Documentation/Tutorials/Matrices/SparseMatrixExample_setElements_map.cpp create mode 120000 Documentation/Tutorials/Matrices/SparseMatrixExample_setElements_map.cu diff --git a/Documentation/Examples/Matrices/SparseMatrix/SparseMatrixExample_setElement.cpp 
b/Documentation/Examples/Matrices/SparseMatrix/SparseMatrixExample_setElement.cpp index 178e502dc..d9a3ae463 100644 --- a/Documentation/Examples/Matrices/SparseMatrix/SparseMatrixExample_setElement.cpp +++ b/Documentation/Examples/Matrices/SparseMatrix/SparseMatrixExample_setElement.cpp @@ -11,12 +11,19 @@ void setElements() { auto rowCapacities = { 1, 1, 1, 1, 1 }; TNL::Pointers::SharedPointer< TNL::Matrices::SparseMatrix< double, Device > > matrix( rowCapacities, 5 ); + + /*** + * Calling the method setElements from host (CPU). + */ for( int i = 0; i < 5; i++ ) matrix->setElement( i, i, i ); std::cout << "Matrix set from the host:" << std::endl; std::cout << *matrix << std::endl; + /*** + * This lambda function will run on the native device of the matrix which can be CPU or GPU. + */ auto f = [=] __cuda_callable__ ( int i ) mutable { matrix->setElement( i, i, -i ); }; diff --git a/Documentation/Tutorials/Matrices/CMakeLists.txt b/Documentation/Tutorials/Matrices/CMakeLists.txt index 2ec5496a0..50e34ce73 100644 --- a/Documentation/Tutorials/Matrices/CMakeLists.txt +++ b/Documentation/Tutorials/Matrices/CMakeLists.txt @@ -18,7 +18,7 @@ IF( BUILD_CUDA ) ADD_CUSTOM_COMMAND( COMMAND DenseMatrixExample_forRows > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/DenseMatrixExample_forRows.out OUTPUT DenseMatrixExample_forRows.out ) - + CUDA_ADD_EXECUTABLE( DenseMatrixExample_rowsReduction_vectorProduct DenseMatrixExample_rowsReduction_vectorProduct.cu ) ADD_CUSTOM_COMMAND( COMMAND DenseMatrixExample_rowsReduction_vectorProduct > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/DenseMatrixExample_rowsReduction_vectorProduct.out @@ -49,6 +49,37 @@ IF( BUILD_CUDA ) ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/SparseMatrixExample_Constructor_rowCapacities_vector.out OUTPUT SparseMatrixExample_Constructor_rowCapacities_vector.out ) + CUDA_ADD_EXECUTABLE( SparseMatrixExample_Constructor_std_map SparseMatrixExample_Constructor_std_map.cu ) + ADD_CUSTOM_COMMAND( COMMAND 
SparseMatrixExample_Constructor_std_map > + ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/SparseMatrixExample_Constructor_std_map.out + OUTPUT SparseMatrixExample_Constructor_std_map.out ) + + CUDA_ADD_EXECUTABLE( SparseMatrixExample_setElements SparseMatrixExample_setElements.cu ) + ADD_CUSTOM_COMMAND( COMMAND SparseMatrixExample_setElements > + ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/SparseMatrixExample_setElements.out + OUTPUT SparseMatrixExample_setElements.out ) + + CUDA_ADD_EXECUTABLE( SparseMatrixExample_setElements_map SparseMatrixExample_setElements_map.cu ) + ADD_CUSTOM_COMMAND( COMMAND SparseMatrixExample_setElements_map > + ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/SparseMatrixExample_setElements_map.out + OUTPUT SparseMatrixExample_setElements_map.out ) + + CUDA_ADD_EXECUTABLE( SparseMatrixExample_setElement SparseMatrixExample_setElement.cu ) + ADD_CUSTOM_COMMAND( COMMAND SparseMatrixExample_setElement > + ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/SparseMatrixExample_setElement.out + OUTPUT SparseMatrixExample_setElement.out ) + + CUDA_ADD_EXECUTABLE( SparseMatrixExample_addElement SparseMatrixExample_addElement.cu ) + ADD_CUSTOM_COMMAND( COMMAND SparseMatrixExample_addElement > + ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/SparseMatrixExample_addElement.out + OUTPUT SparseMatrixExample_addElement.out ) + + CUDA_ADD_EXECUTABLE( SparseMatrixExample_forRows SparseMatrixExample_forRows.cu ) + ADD_CUSTOM_COMMAND( COMMAND SparseMatrixExample_forRows > + ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/SparseMatrixExample_forRows.out + OUTPUT SparseMatrixExample_forRows.out ) + + ELSE() # ADD_EXECUTABLE( UniquePointerExample UniquePointerExample.cpp ) # ADD_CUSTOM_COMMAND( COMMAND UniquePointerExample > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/UniquePointerExample.out OUTPUT UniquePointerExample.out ) @@ -69,6 +100,11 @@ ADD_CUSTOM_TARGET( TutorialsMatricesCuda ALL DEPENDS DenseMatrixViewExample_setElement.out SparseMatrixExample_Constructor_init_list_2.out 
SparseMatrixExample_setRowCapacities.out + SparseMatrixExample_Constructor_std_map.out + SparseMatrixExample_setElements.out + SparseMatrixExample_setElements_map.out + SparseMatrixExample_setElement.out + SparseMatrixExample_forRows.out ) ENDIF() # diff --git a/Documentation/Tutorials/Matrices/SparseMatrixExample_Constructor_std_map.cpp b/Documentation/Tutorials/Matrices/SparseMatrixExample_Constructor_std_map.cpp new file mode 120000 index 000000000..dcc4ec9ae --- /dev/null +++ b/Documentation/Tutorials/Matrices/SparseMatrixExample_Constructor_std_map.cpp @@ -0,0 +1 @@ +../../Examples/Matrices/SparseMatrix/SparseMatrixExample_Constructor_std_map.cpp \ No newline at end of file diff --git a/Documentation/Tutorials/Matrices/SparseMatrixExample_Constructor_std_map.cu b/Documentation/Tutorials/Matrices/SparseMatrixExample_Constructor_std_map.cu new file mode 120000 index 000000000..75a75befb --- /dev/null +++ b/Documentation/Tutorials/Matrices/SparseMatrixExample_Constructor_std_map.cu @@ -0,0 +1 @@ +../../Examples/Matrices/SparseMatrix/SparseMatrixExample_Constructor_std_map.cu \ No newline at end of file diff --git a/Documentation/Tutorials/Matrices/SparseMatrixExample_addElement.cpp b/Documentation/Tutorials/Matrices/SparseMatrixExample_addElement.cpp new file mode 120000 index 000000000..215dde5df --- /dev/null +++ b/Documentation/Tutorials/Matrices/SparseMatrixExample_addElement.cpp @@ -0,0 +1 @@ +../../Examples/Matrices/SparseMatrix/SparseMatrixExample_addElement.cpp \ No newline at end of file diff --git a/Documentation/Tutorials/Matrices/SparseMatrixExample_addElement.cu b/Documentation/Tutorials/Matrices/SparseMatrixExample_addElement.cu new file mode 120000 index 000000000..c2425241f --- /dev/null +++ b/Documentation/Tutorials/Matrices/SparseMatrixExample_addElement.cu @@ -0,0 +1 @@ +../../Examples/Matrices/SparseMatrix/SparseMatrixExample_addElement.cu \ No newline at end of file diff --git a/Documentation/Tutorials/Matrices/SparseMatrixExample_forRows.cpp 
b/Documentation/Tutorials/Matrices/SparseMatrixExample_forRows.cpp new file mode 120000 index 000000000..6115ba227 --- /dev/null +++ b/Documentation/Tutorials/Matrices/SparseMatrixExample_forRows.cpp @@ -0,0 +1 @@ +../../Examples/Matrices/SparseMatrix/SparseMatrixExample_forRows.cpp \ No newline at end of file diff --git a/Documentation/Tutorials/Matrices/SparseMatrixExample_forRows.cu b/Documentation/Tutorials/Matrices/SparseMatrixExample_forRows.cu new file mode 120000 index 000000000..b6d3f1732 --- /dev/null +++ b/Documentation/Tutorials/Matrices/SparseMatrixExample_forRows.cu @@ -0,0 +1 @@ +../../Examples/Matrices/SparseMatrix/SparseMatrixExample_forRows.cu \ No newline at end of file diff --git a/Documentation/Tutorials/Matrices/SparseMatrixExample_setElement.cpp b/Documentation/Tutorials/Matrices/SparseMatrixExample_setElement.cpp new file mode 120000 index 000000000..1507393de --- /dev/null +++ b/Documentation/Tutorials/Matrices/SparseMatrixExample_setElement.cpp @@ -0,0 +1 @@ +../../Examples/Matrices/SparseMatrix/SparseMatrixExample_setElement.cpp \ No newline at end of file diff --git a/Documentation/Tutorials/Matrices/SparseMatrixExample_setElement.cu b/Documentation/Tutorials/Matrices/SparseMatrixExample_setElement.cu new file mode 120000 index 000000000..2f13c04ed --- /dev/null +++ b/Documentation/Tutorials/Matrices/SparseMatrixExample_setElement.cu @@ -0,0 +1 @@ +../../Examples/Matrices/SparseMatrix/SparseMatrixExample_setElement.cu \ No newline at end of file diff --git a/Documentation/Tutorials/Matrices/SparseMatrixExample_setElements.cpp b/Documentation/Tutorials/Matrices/SparseMatrixExample_setElements.cpp new file mode 120000 index 000000000..0f5e5d1dd --- /dev/null +++ b/Documentation/Tutorials/Matrices/SparseMatrixExample_setElements.cpp @@ -0,0 +1 @@ +../../Examples/Matrices/SparseMatrix/SparseMatrixExample_setElements.cpp \ No newline at end of file diff --git a/Documentation/Tutorials/Matrices/SparseMatrixExample_setElements.cu 
b/Documentation/Tutorials/Matrices/SparseMatrixExample_setElements.cu new file mode 120000 index 000000000..120be6659 --- /dev/null +++ b/Documentation/Tutorials/Matrices/SparseMatrixExample_setElements.cu @@ -0,0 +1 @@ +../../Examples/Matrices/SparseMatrix/SparseMatrixExample_setElements.cu \ No newline at end of file diff --git a/Documentation/Tutorials/Matrices/SparseMatrixExample_setElements_map.cpp b/Documentation/Tutorials/Matrices/SparseMatrixExample_setElements_map.cpp new file mode 120000 index 000000000..5206fcc2e --- /dev/null +++ b/Documentation/Tutorials/Matrices/SparseMatrixExample_setElements_map.cpp @@ -0,0 +1 @@ +../../Examples/Matrices/SparseMatrix/SparseMatrixExample_setElements_map.cpp \ No newline at end of file diff --git a/Documentation/Tutorials/Matrices/SparseMatrixExample_setElements_map.cu b/Documentation/Tutorials/Matrices/SparseMatrixExample_setElements_map.cu new file mode 120000 index 000000000..9c5f7c0f5 --- /dev/null +++ b/Documentation/Tutorials/Matrices/SparseMatrixExample_setElements_map.cu @@ -0,0 +1 @@ +../../Examples/Matrices/SparseMatrix/SparseMatrixExample_setElements_map.cu \ No newline at end of file diff --git a/Documentation/Tutorials/Matrices/tutorial_Matrices.md b/Documentation/Tutorials/Matrices/tutorial_Matrices.md index d4fa61d97..2f1591088 100644 --- a/Documentation/Tutorials/Matrices/tutorial_Matrices.md +++ b/Documentation/Tutorials/Matrices/tutorial_Matrices.md @@ -27,7 +27,7 @@ Dense matrix is a templated class defined in namespace \ref TNL::Matrices. It ha The following examples show how to allocate the dense matrix and how to initialize the matrix elements. Small matrices can be created simply by the constructor with an initializer list. -\include Matrices/DenseMatrix/DenseMatrixExample_Constructor_init_list.cpp +\includelineno Matrices/DenseMatrix/DenseMatrixExample_Constructor_init_list.cpp In fact, the constructor takes a list of initializer lists. 
Each embedded list defines one matrix row and so the number of matrix rows is given by the size of the outer initializer list. The number of matrix columns is given by the longest inner initializer lists. Shorter inner lists are filled with zeros from the right side. The result looks as follows: @@ -35,7 +35,7 @@ In fact, the constructor takes a list of initializer lists. Each embedded list d Larger matrices can be set-up with methods `setElement` and `addElement` (\ref TNL::Matrices::DenseMatrix::setElement, \ref TNL::Matrices::DenseMatrix::addElement). The following example shows how to call these methods from the host. -\include DenseMatrixExample_addElement.cpp +\includelineno DenseMatrixExample_addElement.cpp As we can see, both methods can be called from the host no matter where the matrix is allocated. If it is on GPU, each call of `setElement` or `addElement` (\ref TNL::Matrices::DenseMatrix::setElement, \ref TNL::Matrices::DenseMatrix::addElement) causes slow transfer of tha data between CPU and GPU. Use this approach only if the performance is not a priority for example for matrices which are set only once this way. The result looks as follows: @@ -43,7 +43,7 @@ As we can see, both methods can be called from the host no matter where the matr More efficient way of the matrix initialization on GPU consists in calling the methods `setElement` and `addElement` (\ref TNL::Matrices::DenseMatrix::setElement, \ref TNL::Matrices::DenseMatrix::addElement) directly from GPU. It is demonstrated in the following example (of course it works even for CPU): -\include DenseMatrixExample_setElement.cpp +\includelineno DenseMatrixExample_setElement.cpp Here we use `SharedPointer` (\ref TNL::Pointers::SharedPointer) to make the matrix accessible in lambda functions even on GPU. We first call the `setElement` method from CPU to set the `i`-th diagonal element to `i`. Next we iterate over the matrix rows with `ParallelFor`and for each row we call a lambda function `f`. 
This is done on the same device where the matrix is allocated and so it is more efficient for matrices allocated on GPU. In the lambda function we just set the `i`-th diagonal element to `-i`. The result looks as follows: @@ -51,7 +51,7 @@ Here we use `SharedPointer` (\ref TNL::Pointers::SharedPointer) to make the matr If we want to set more matrix elements in each row, we can use inner for-loop in the lambda function `f`. This, however, is limiting the parallelization and it can be inefficient for larger matrices. The next example demonstrates a method `forRows` (\ref TNL::Matrices::DenseMatrix::forRows) which iterates over all matrix elements in parallel and it calls a lambda function defining an operation we want to do on the matrix elements. -\include DenseMatrixExample_forRows.cpp +\includelineno DenseMatrixExample_forRows.cpp Firstly note, that this is simpler since we do not need any `SharedPointer`. The lambda function `f` requires the following parameters: @@ -81,7 +81,7 @@ y_i = a_{i1} x_1 + a_{i2} x_2 + \ldots + a_{in} x_n = \sum_{j=1}^n a_{ij}x_j. We see that in i-th matrix row we have to compute the sum \f$\sum_{j=1}^n a_{ij}x_j\f$ which is reduction of products \f$ a_{ij}x_j\f$. Similar to *flexible parallel reduction* (\ref TNL::Algorithms::Reduction) we just need to design proper lambda functions. See the following example: -\include DenseMatrixExample_rowsReduction_vectorProduct.cpp +\includelineno DenseMatrixExample_rowsReduction_vectorProduct.cpp The `fetch` lambda function computes the product \f$ a_{ij}x_j\f$ where \f$ a_{ij} \f$ is represented by `value` and \f$x_j \f$ is represented by `xView[columnIdx]`. The reduction is just sum of results particular products and it is represented by by the lambda function `reduce`. Finaly, the lambda function `keep` is responsible for storing the results of reduction in each matrix row (which is represented by the variable `value`) into the output vector `y`. 
The result looks as: @@ -96,7 +96,7 @@ y_i = \max_{j=1,\ldots,n} |a_{ij}|. See the following example: -\include DenseMatrixExample_rowsReduction_maxNorm.cpp +\includelineno DenseMatrixExample_rowsReduction_maxNorm.cpp The `fetch` lambda function just returns absolute value of \f$a_{ij} \f$ which is represented again by the varibale `value`. The `reduce` lambda function returns larger of given values and the lambda fuction 'keep' stores the results to the output vectro the same way as in the previous example. Of course, if we compute the maximum of all output vector elements we get some kined of max matrix norm. The output looks as: @@ -132,7 +132,7 @@ TODO: Template parameters description We will demonstrate it on the example showing the method `setElement` (\ref TNL::Matrices::DenseMatrix::setElement). However, the `SharedPointer` will be replaced with the `DenseMatrixView`. The code looks as follows: -\include DenseMatrixViewExample_setElement.cpp +\includelineno DenseMatrixViewExample_setElement.cpp And the result is: @@ -145,7 +145,7 @@ TODO: Using DenseMatrixView for data encapsulation ## Sparse matrices -[Sparse matrices](https://en.wikipedia.org/wiki/Sparse_matrix) arte extremely important in a lot of numerical algorithms. They are used at situations when we need to operate with matrices having majority of the matrix elements equal to zero. In this case, only the non-zero matrix elements are stored with possible some *padding zeros* used for memory alignment. This is necessary mainly on GPUs. Consider just matrix having 50,000 rows and columns whih is 2,500,000,000 matrix elements. If we store each matrix element in double precision (it means eight bytes per element) we need 20,000,000,000 bytes which is nearly 20 GB of memory. If there are only five non-zero elements in each row we need only \f$8 \times 5 \times 50,000=2,000,000\f$ bytes and so nearly 200 MB. It is really great difference. 
+[Sparse matrices](https://en.wikipedia.org/wiki/Sparse_matrix) are extremely important in a lot of numerical algorithms. They are used in situations when we need to operate with matrices having majority of the matrix elements equal to zero. In this case, only the non-zero matrix elements are stored, possibly with some *padding zeros* used for memory alignment. This is necessary mainly on GPUs. Consider just a matrix having 50,000 rows and columns which is 2,500,000,000 matrix elements. If we store each matrix element in double precision (it means eight bytes per element) we need 20,000,000,000 bytes which is nearly 20 GB of memory. If there are only five non-zero elements in each row we need only \f$8 \times 5 \times 50,000=2,000,000\f$ bytes and so nearly 200 MB. It is a really great difference. Major disadventage of sparse matrices is that there are a lot of different formats for storing such matrices. Though [CSR - Compressed Sparse Row](https://en.wikipedia.org/wiki/Sparse_matrix#Compressed_sparse_row_(CSR,_CRS_or_Yale_format)) format is the most popular of all, especially for GPUs there are many other formats which perform differently on various matrices. So it is a good idea to test several sparse matrix formats if you want to get the best performance. In TNL, there is one templated class \ref TNL::Matrices::SparseMatrix representing the sparse matrices. The change of underlying matrix format can be done just by changing one template parameter. The list of the template paramaters is as follows: @@ -164,7 +164,7 @@ Major disadventage of sparse matrices is that there are a lot of different forma * `RealAllocator` is a memory allocator (one from \ref TNL::Allocators) which shall be used for allocation of the matrix elements. By default, it is the default allocator for given `Real` type and `Device` type – see TNL::Allocators::Default.
* `IndexAllocator` is a memory allocator (one from \ref TNL::Allocators) which shall be used for allocation of the column indexes of the matrix elements. By default, it is the default allocator for given `Index` type and `Device` type – see \ref TNL::Allocators::Default. -If `Real` is set to `bool`, we get *a binary matrix* for which the non-zero elements can be equal only to one and so the matrix elements values are not stored explicitly in the memory. +**If `Real` is set to `bool`, we get *a binary matrix* for which the non-zero elements can be equal only to one and so the matrix elements values are not stored explicitly in the memory.** ### Sparse matrix allocation and initiation @@ -184,7 +184,7 @@ Small matrices can be initialized by a constructor with initializer list. We ass The following example shows how to create it using the initializer list constructor: -\include SparseMatrixExample_Constructor_init_list_2.cpp +\includelineno SparseMatrixExample_Constructor_init_list_2.cpp The constructor accepts the following parameters: @@ -192,7 +192,11 @@ The constructor accepts the following parameters: * `columns` is a number of matrix columns. * `data` is definition of non-zero matrix elements. It is a initializer list of triples having a form `{ row_index, column_index, value }`. In fact, it is very much like the coordinates format - [COO](https://en.wikipedia.org/wiki/Sparse_matrix#Coordinate_list_(COO)). -The constructor also accepts `Real` and `Index` allocators (\ref TNL::Allocators) but their are not important for this example. The result looks as follows: +The constructor also accepts `Real` and `Index` allocators (\ref TNL::Allocators) but their are not important for this example. 
A method `setElements` works the same way: + +\includelineno SparseMatrixExample_setElements.cpp + +The result of both examples looks as follows: \include SparseMatrixExample_Constructor_init_list_2.out @@ -216,31 +220,81 @@ See the following example which creates lower triangular matrix like this one \f] -\include SparseMatrixExample_setRowCapacities.cpp +\includelineno SparseMatrixExample_setRowCapacities.cpp The method \ref TNL::Matrices::SparseMatrix::setRowCapacities reads the required capacities of the matrix rows from a vector (or simmilar container - \ref TNL::Containers::Array, \ref TNL::Containers::ArrayView, \ref TNL::Containers::Vector and \ref TNL::Containers::VectorView) which has the same number of elements as the number of matrix rows and each element defines the capacity of the related row. The result looks as follows: \include SparseMatrixExample_setRowCapacities.out -There are constructors which also set the row capacities, one uses a vector ... +There are constructors which also set the row capacities. The first one uses a vector: + +\includelineno SparseMatrixExample_Constructor_rowCapacities_vector.cpp + +The second one uses an initializer list: + +\includelineno SparseMatrixExample_Constructor_init_list_1.cpp + +The result of both examples looks as follows: + +\include SparseMatrixExample_Constructor_init_list_1.out + +Finally, there is a constructor which creates the sparse matrix from `std::map`. It is useful especially in situations when you cannot compute the matrix elements by rows but rather in random order. You can do it on CPU and store the matrix elements in `std::map` data structure in a [COO](https://en.wikipedia.org/wiki/Sparse_matrix#Coordinate_list_(COO)) format manner. It means that each entry of the `map` is the following pair: + +``` +std::pair( std::pair( row_index, column_index ), element_value ) +``` + +which defines one matrix element at given coordinates with given value.
Of course, you can insert such entries in any order into the `map`. When it is complete you can pass it to the sparse matrix. See the following example: + +\includelineno SparseMatrixExample_Constructor_std_map.cpp + +A method `setElements` works the same way for already existing instances of sparse matrix: + + -\include SparseMatrixExample_Constructor_rowCapacities_vector.cpp +\includelineno SparseMatrixExample_setElements_map.cpp -... the result looks as follows ... +The result of both examples looks as follows: -\include SparseMatrixExample_Constructor_rowCapacities_vector.out +\include SparseMatrixExample_setElements_map.out -while the other uses an initializer list ... +Another way of setting the sparse matrix is via the methods `setElement` and `addElement` (\ref TNL::Matrices::SparseMatrix::setElement, \ref TNL::Matrices::SparseMatrix::addElement). The procedure is as follows: -\include SparseMatrixExample_Constructor_init_list_1.cpp +1. Setup the matrix dimensions. +2. Setup the row capacities. +3. Setup the matrix elements. -... the result looks as follows. +The method can be called from both host (CPU) and device (GPU) if the matrix is allocated there. Note, however, that if the matrix is allocated on GPU and the method is called from CPU there will be significant performance drop because the matrix elements will be transferred one after another. However, if the matrix elements setup is not a critical part of your algorithm this can be an easy way how to do it. See the following example: -\include SparseMatrixExample_Constructor_init_list_1.out. +\includelineno SparseMatrixExample_setElement.cpp +Note that we use `SharedPointer` (\ref TNL::Pointers::SharedPointer) to pass the matrix easily into the lambda function when it runs on GPU. The first for-loop runs on CPU no matter where the matrix is allocated. Next we call the lambda function `f` from `ParallelFor` which is device sensitive and so it runs on CPU or GPU depending on where the matrix is allocated.
To avoid use of `SharedPointer`, which requires explicit synchronization of smart pointers, you may use `SparseMatrixView` (\ref TNL::Matrices::SparseMatrixView) to achieve the same. The result looks as follows: + +\include SparseMatrixExample_setElement.out + +The method `addElement` adds a value to a specific matrix element. Otherwise, it behaves the same as `setElement`. See the following example: + +\includelineno SparseMatrixExample_addElement.cpp + +The result looks as follows: + +\include SparseMatrixExample_addElement.out + + +Finaly, for the most efficient way of setting the non-zero matrix elements, is use of a method `forRows`. It requires indexes of the range of rows (`begin` and `end`) to be processed and a lambda function `function` which is called for each non-zero element. The lambda functions provides the following data: + +* `rowIdx` is a row index of the matrix element. +* `localIdx` is an index of the non-zero matrix element within the matrix row. +* `columnIdx` is a column index of the matrix element. If the matrix element is supposed to be changed, this parameter can be a reference and so its value can be changed. +* `value` is a value of the matrix element. If the matrix element is supposed to be changed, this parameter can be a reference as well and so the element value can be changed. +* `compute` is a bool reference. When it is set to `false` the rest of the row can be omitted. This is, however, only a hint and it depends on the underlying matrix format if it is taken into account. + +See the following example: ### Flexible reduction in matrix rows + +In the same way as with the dense matrices, we can perform *flexible parallel reduction* in rows even with sparse matrices.
+ ### Sparse-matrix vector product ### Sparse matrix IO ### Sparse matrix view -- GitLab From e409125737640fc5db91e5f7d238b6fe91a0bb80 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= Date: Mon, 31 Aug 2020 16:35:44 +0200 Subject: [PATCH 07/53] Writting documentation on sparse matrices. --- .../SparseMatrixExample_forAllRows.cpp | 6 +- .../SparseMatrixExample_forRows.cpp | 6 +- .../SparseMatrixViewExample_forAllRows.cpp | 6 +- .../SparseMatrixViewExample_forRows.cpp | 6 +- .../Tutorials/Matrices/CMakeLists.txt | 5 + ...rixExample_rowsReduction_vectorProduct.cpp | 10 +- ...rixExample_rowsReduction_vectorProduct.cpp | 70 +++++++++ ...trixExample_rowsReduction_vectorProduct.cu | 1 + .../Tutorials/Matrices/tutorial_Matrices.md | 140 +++++++++++++++++- 9 files changed, 228 insertions(+), 22 deletions(-) create mode 100644 Documentation/Tutorials/Matrices/SparseMatrixExample_rowsReduction_vectorProduct.cpp create mode 120000 Documentation/Tutorials/Matrices/SparseMatrixExample_rowsReduction_vectorProduct.cu diff --git a/Documentation/Examples/Matrices/SparseMatrix/SparseMatrixExample_forAllRows.cpp b/Documentation/Examples/Matrices/SparseMatrix/SparseMatrixExample_forAllRows.cpp index 739600539..a8f6108bc 100644 --- a/Documentation/Examples/Matrices/SparseMatrix/SparseMatrixExample_forAllRows.cpp +++ b/Documentation/Examples/Matrices/SparseMatrix/SparseMatrixExample_forAllRows.cpp @@ -9,13 +9,13 @@ void forAllRowsExample() TNL::Matrices::SparseMatrix< double, Device > matrix( { 1, 2, 3, 4, 5 }, 5 ); auto f = [=] __cuda_callable__ ( int rowIdx, int localIdx, int& columnIdx, double& value, bool& compute ) { - if( rowIdx < columnIdx ) // This is important, some matrix formats may allocate more matrix elements - // than we requested. These padding elements are processed here as well. + if( rowIdx < localIdx ) // This is important, some matrix formats may allocate more matrix elements + // than we requested. 
These padding elements are processed here as well. compute = false; else { columnIdx = localIdx; - value = rowIdx + localIdx; + value = rowIdx + localIdx + 1; } }; diff --git a/Documentation/Examples/Matrices/SparseMatrix/SparseMatrixExample_forRows.cpp b/Documentation/Examples/Matrices/SparseMatrix/SparseMatrixExample_forRows.cpp index 2330c2ca5..0e2ee3423 100644 --- a/Documentation/Examples/Matrices/SparseMatrix/SparseMatrixExample_forRows.cpp +++ b/Documentation/Examples/Matrices/SparseMatrix/SparseMatrixExample_forRows.cpp @@ -9,13 +9,13 @@ void forRowsExample() TNL::Matrices::SparseMatrix< double, Device > matrix( { 1, 2, 3, 4, 5 }, 5 ); auto f = [=] __cuda_callable__ ( int rowIdx, int localIdx, int& columnIdx, double& value, bool& compute ) { - if( rowIdx < columnIdx ) // This is important, some matrix formats may allocate more matrix elements - // than we requested. These padding elements are processed here as well. + if( rowIdx < localIdx ) // This is important, some matrix formats may allocate more matrix elements + // than we requested. These padding elements are processed here as well. compute = false; else { columnIdx = localIdx; - value = rowIdx + localIdx; + value = rowIdx + localIdx + 1; } }; diff --git a/Documentation/Examples/Matrices/SparseMatrix/SparseMatrixViewExample_forAllRows.cpp b/Documentation/Examples/Matrices/SparseMatrix/SparseMatrixViewExample_forAllRows.cpp index fda71a42f..ee09d6121 100644 --- a/Documentation/Examples/Matrices/SparseMatrix/SparseMatrixViewExample_forAllRows.cpp +++ b/Documentation/Examples/Matrices/SparseMatrix/SparseMatrixViewExample_forAllRows.cpp @@ -10,13 +10,13 @@ void forAllRowsExample() auto view = matrix.getView(); auto f = [=] __cuda_callable__ ( int rowIdx, int localIdx, int& columnIdx, double& value, bool& compute ) { - if( rowIdx < columnIdx ) // This is important, some matrix formats may allocate more matrix elements - // than we requested. These padding elements are processed here as well. 
+ if( rowIdx < localIdx ) // This is important, some matrix formats may allocate more matrix elements + // than we requested. These padding elements are processed here as well. compute = false; else { columnIdx = localIdx; - value = rowIdx + localIdx; + value = rowIdx + localIdx + 1; } }; diff --git a/Documentation/Examples/Matrices/SparseMatrix/SparseMatrixViewExample_forRows.cpp b/Documentation/Examples/Matrices/SparseMatrix/SparseMatrixViewExample_forRows.cpp index 987c3dec4..8b76bae18 100644 --- a/Documentation/Examples/Matrices/SparseMatrix/SparseMatrixViewExample_forRows.cpp +++ b/Documentation/Examples/Matrices/SparseMatrix/SparseMatrixViewExample_forRows.cpp @@ -10,13 +10,13 @@ void forRowsExample() auto view = matrix.getView(); auto f = [=] __cuda_callable__ ( int rowIdx, int localIdx, int& columnIdx, double& value, bool& compute ) { - if( rowIdx < columnIdx ) // This is important, some matrix formats may allocate more matrix elements - // than we requested. These padding elements are processed here as well. + if( rowIdx < localIdx ) // This is important, some matrix formats may allocate more matrix elements + // than we requested. These padding elements are processed here as well. 
compute = false; else { columnIdx = localIdx; - value = rowIdx + localIdx; + value = rowIdx + localIdx + 1; } }; diff --git a/Documentation/Tutorials/Matrices/CMakeLists.txt b/Documentation/Tutorials/Matrices/CMakeLists.txt index 50e34ce73..7ab2f29e1 100644 --- a/Documentation/Tutorials/Matrices/CMakeLists.txt +++ b/Documentation/Tutorials/Matrices/CMakeLists.txt @@ -79,6 +79,10 @@ IF( BUILD_CUDA ) ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/SparseMatrixExample_forRows.out OUTPUT SparseMatrixExample_forRows.out ) + CUDA_ADD_EXECUTABLE( SparseMatrixExample_rowsReduction_vectorProduct SparseMatrixExample_rowsReduction_vectorProduct.cu ) + ADD_CUSTOM_COMMAND( COMMAND SparseMatrixExample_rowsReduction_vectorProduct > + ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/SparseMatrixExample_rowsReduction_vectorProduct.out + OUTPUT SparseMatrixExample_rowsReduction_vectorProduct.out ) ELSE() # ADD_EXECUTABLE( UniquePointerExample UniquePointerExample.cpp ) @@ -105,6 +109,7 @@ ADD_CUSTOM_TARGET( TutorialsMatricesCuda ALL DEPENDS SparseMatrixExample_setElements_map.out SparseMatrixExample_setElement.out SparseMatrixExample_forRows.out + SparseMatrixExample_rowsReduction_vectorProduct.out ) ENDIF() # diff --git a/Documentation/Tutorials/Matrices/DenseMatrixExample_rowsReduction_vectorProduct.cpp b/Documentation/Tutorials/Matrices/DenseMatrixExample_rowsReduction_vectorProduct.cpp index 8f0f99cf4..1dcef95dd 100644 --- a/Documentation/Tutorials/Matrices/DenseMatrixExample_rowsReduction_vectorProduct.cpp +++ b/Documentation/Tutorials/Matrices/DenseMatrixExample_rowsReduction_vectorProduct.cpp @@ -3,6 +3,7 @@ #include #include #include +#include template< typename Device > void rowsReduction() @@ -39,13 +40,6 @@ void rowsReduction() return xView[ columnIdx ] * value; }; - /*** - * Reduce lambda return sum of given values. 
- */ - auto reduce = [=] __cuda_callable__ ( double& a, const double& b ) -> double { - return a + b; - }; - /*** * Keep lambda store the result of matrix-vector product to output vector y. */ @@ -56,7 +50,7 @@ void rowsReduction() /*** * Compute matrix-vector product. */ - matrix.rowsReduction( 0, matrix.getRows(), fetch, reduce, keep, 0.0 ); + matrix.rowsReduction( 0, matrix.getRows(), fetch, std::plus<>{}, keep, 0.0 ); std::cout << "The matrix reads as:" << std::endl << matrix << std::endl; std::cout << "The input vector is:" << x << std::endl; diff --git a/Documentation/Tutorials/Matrices/SparseMatrixExample_rowsReduction_vectorProduct.cpp b/Documentation/Tutorials/Matrices/SparseMatrixExample_rowsReduction_vectorProduct.cpp new file mode 100644 index 000000000..dd72230fe --- /dev/null +++ b/Documentation/Tutorials/Matrices/SparseMatrixExample_rowsReduction_vectorProduct.cpp @@ -0,0 +1,70 @@ +#include +#include +#include +#include +#include +#include + +template< typename Device > +void rowsReduction() +{ + TNL::Matrices::SparseMatrix< double, Device > matrix { 5, 5, { + { 0, 0, 1 }, + { 1, 0, 1 }, { 1, 1, 2 }, + { 2, 1, 1 }, { 2, 2, 8 }, + { 3, 2, 1 }, { 3, 3, 9 }, + { 4, 4, 1 } } }; + + /*** + * Allocate input and output vectors for matrix-vector product + */ + TNL::Containers::Vector< double, Device > x( matrix.getColumns() ), + y( matrix.getRows() ); + + /*** + * Fill the input vectors with ones. + */ + x = 1.0; + + /*** + * Prepare vector view for lambdas. + */ + auto xView = x.getView(); + auto yView = y.getView(); + + /*** + * Fetch lambda just returns product of appropriate matrix elements and the + * input vector elements. + */ + auto fetch = [=] __cuda_callable__ ( int rowIdx, int columnIdx, const double& value ) -> double { + return xView[ columnIdx ] * value; + }; + + /*** + * Keep lambda store the result of matrix-vector product to output vector y. 
+ */ + auto keep = [=] __cuda_callable__ ( int rowIdx, const double& value ) mutable { + yView[ rowIdx ] = value; + }; + + /*** + * Compute matrix-vector product. + */ + matrix.rowsReduction( 0, matrix.getRows(), fetch, std::plus<>{}, keep, 0.0 ); + + std::cout << "The matrix reads as:" << std::endl << matrix << std::endl; + std::cout << "The input vector is:" << x << std::endl; + std::cout << "Result of matrix-vector multiplication is: " << y << std::endl; +} + +int main( int argc, char* argv[] ) +{ + std::cout << "Rows reduction on host:" << std::endl; + rowsReduction< TNL::Devices::Host >(); + +#ifdef HAVE_CUDA + std::cout << std::endl; + std::cout << "Rows reduction on CUDA device:" << std::endl; + rowsReduction< TNL::Devices::Cuda >(); +#endif +} diff --git a/Documentation/Tutorials/Matrices/SparseMatrixExample_rowsReduction_vectorProduct.cu b/Documentation/Tutorials/Matrices/SparseMatrixExample_rowsReduction_vectorProduct.cu new file mode 120000 index 000000000..1be7a26d8 --- /dev/null +++ b/Documentation/Tutorials/Matrices/SparseMatrixExample_rowsReduction_vectorProduct.cu @@ -0,0 +1 @@ +SparseMatrixExample_rowsReduction_vectorProduct.cpp \ No newline at end of file diff --git a/Documentation/Tutorials/Matrices/tutorial_Matrices.md b/Documentation/Tutorials/Matrices/tutorial_Matrices.md index 2f1591088..6ffe353bf 100644 --- a/Documentation/Tutorials/Matrices/tutorial_Matrices.md +++ b/Documentation/Tutorials/Matrices/tutorial_Matrices.md @@ -279,7 +279,6 @@ The result looks as follows: \include SparseMatrixExample_addElement.out - Finaly, for the most efficient way of setting the non-zero matrix elements, is use of a method `forRows`. It requires indexes of the range of rows (`begin` and `end`) to be processed and a lambda function `function` which is called for each non-zero element. The lambda functions provides the following data: * `rowIdx` is a row index of the matrix element. 
@@ -290,13 +289,150 @@ Finaly, for the most efficient way of setting the non-zero matrix elements, is u See the following example: +\includelineno SparseMatrixExample_forRows.cpp + +On the line 9, we allocate a lower triangular matrix (because the row capacities `{1,2,3,4,5}` are equal to row index) using the `SparseMatrix`. On the line 11, we prepare lambda function `f` which we execute on the line 22 just by calling the method `forRows` (\ref TNL::Matrices::SpartseMatrix::forRows). This method takes the range of matrix rows as the first two parameters and the lambda function as the last parameter. The lambda function receives parameters metioned above (see the line 11). We first check if the matrix element coordinates (`rowIdx` and `localIdx`) points to an element lying before the matrix diagonal or on the diagonal. At this moment we should better explain the meaning of the parameter `localIdx`. It says the local index or the range of the non-zero element in the matrix row. The sparse matrix formats usualy in the first step compress the matrix rows by omitting the zero matrix elements as follows + +\f[ +\left( +\begin{array}{ccccc} +0 & 1 & 0 & 2 & 0 \\ +0 & 0 & 5 & 0 & 0 \\ +4 & 0 & 0 & 0 & 7 \\ +0 & 3 & 0 & 8 & 5 \\ +0 & 5 & 7 & 0 & 0 +\end{array} +\right) +\rightarrow +\left( +\begin{array}{ccccc} +1 & 2 & . & . & . \\ +5 & . & . & . & . \\ +4 & 7 & . & . & . \\ +3 & 8 & 5 & . & . \\ +5 & 7 & . & . & . +\end{array} +\right) +\f] + +Some sparse matrix formats adds back padding zeros for better alignment of data in memory. But if this is not the case, the local indexes of the matrix elements would read as: + +\f[ +\left( +\begin{array}{ccccc} +0 & 1 & . & . & . \\ +0 & . & . & . & . \\ +0 & 1 & . & . & . \\ +0 & 1 & 2 & . & . \\ +0 & 1 & . & . & . +\end{array} +\right) +\f] + +In case of the lower triangular matrix in our example, the local index is in fact the same as the column index + +\f[ +\left( +\begin{array}{ccccc} +0 & . & . & . & . \\ +0 & 1 & . & . & . 
\\ +0 & 1 & 2 & . & . \\ +0 & 1 & 2 & 3 & . \\ +0 & 1 & 2 & 3 & 4 +\end{array} +\right) +\f] + +If we call the method `forRows` to setup the matrix elements for the first time, the parameter `columnIdx` makes no sense because the matrix elements and their column indexes were not set yet. Therefore it is important that the test on the line 12 reads as + +``` +if( rowIdx < localIdx ) +``` + +because + +``` +if( rowIdx < columnIdx ) +``` + +would not make sense. If we pass through this test, the matrix element lies in the lower triangular part of the matrix and we may set the matrix elements which is done on the lines 17 and 18. The column index (`columnIdx`) is set to the local index (line 17) and `value` is set on the line 18. The result looks as follows: + + +\includelineno SparseMatrixExample_forRows.out ### Flexible reduction in matrix rows -In the same way as with the dense matrices, we can perform *flexible parallel reduction* in rows even with sparse matrices. +The *flexible parallel reduction* in rows for sparse matrices is very similar to the one for dense matrices. It consists of three lambda functions: + +1. `fetch` reads and preprocesses data entering the flexible parallel reduction. +2. `reduce` performs the reduction operation. +3. `keep` stores the results from each matrix row. + +See the following example: + +\includelineno SparseMatrixExample_rowsReduction_vectorProduct.cpp + +On the lines 11-16 we set the following matrix: + +\f[ +\left( +\begin{array}{ccccc} +1 & . & . & . & . \\ +1 & 2 & . & . & . \\ +. & 1 & 8 & . & . \\ +. & . & 1 & 9 & . \\ +. & . & . & . & 1 +\end{array} +\right) +\f] + +Next we prepare input (`x`) and output (`y`) vectors on the lines 21 and 22 and set all elements of the input vector to one (line 27). Since we will need to access these vectors in lambda functions we prepare their views on lines 32 and 33. On the lines 39-41, we define the `fetch` lambda function. It receives three arguments: + +1.
`rowIdx` is a row index of the matrix element being currently processed. +2. `columnIdx` is a column index of the matrix elements being currently processed. +3. `value` is a value of the matrix element being currently processed. + +We omit the row index and take the column index which indicates the index of the element of the input vector we need to fetch (`xView[ columnIdx ]`). We take its value and multiply it with the value (`value`) of the current matrix element. We do not need to write lambda function for reduction since it is only summation of the intermediate results from the `fetch` lambda and we can use `std::plus<>{}` (see the line 60). The `keep` lambda function offers two parameters: + +1. `rowIdx` tells the index of the matrix row for which we aim to store the result. +2. `value` is the result obtained in the given matrix row. + +In our example, we just write the result into appropriate element of the output vector `y` which is given just by the row index `rowIdx` -- see the line 47. On the line 53 we start the computation of the matrix-vector product. The method `rowsReduction` (\ref TNL::Matrices::SparseMatrix::rowsReduction) accepts the following arguments: + +1. `begin` is the beginning of the matrix rows range on which the reduction will be performed. +2. `end` is the end of the matrix rows range on which the reduction will be performed. The last matrix row which is going to be processed has index `end-1`. +3. `fetch` is the fetch lambda function. +4. `reduce` is the lambda function performing the reduction. +5. `keep` is the lambda function responsible for processing the results from particular matrix rows. +6. `zero` is the "zero" element of the given reduction operation, also known as the *identity element*. It is really 0 for summation in our example (adding zero to any number does not change the result). + +At the end we print the matrix, the input and the output vector -- lines 55-57.
The result looks as follows: + +\include SparseMatrixExample_rowsReduction_vectorProduct.out ### Sparse-matrix vector product + +As we mentioned already in the part explaining the dense matrices, matrix-vector multiplication or in this case sparse matrix-vector multiplication ([SpMV](https://en.wikipedia.org/wiki/Sparse_matrix-vector_multiplication)) is one of the most important operations in numerical mathematics and high-performance computing. It is represented by a method `vectorProduct` (\ref TNL::Matrices::SparseMatrix::vectorProduct). It is a templated method with two template parameters `InVector` and `OutVector` telling the types of input and output vector respectively. Usually one will substitute some of \ref TNL::Containers::Array, \ref TNL::Containers::ArrayView, \ref TNL::Containers::Vector or \ref TNL::Containers::VectorView for these types. The method computes the following formula + +``` +outVector = matrixMultiplicator * ( *this ) * inVector + outVectorMultiplicator * outVector +``` + +and it accepts the following parameters: + +* `inVector` is the input vector having the same number of elements as the number of matrix columns. +* `outVector` is the output vector having the same number of elements as the number of matrix rows. +* `matrixMultiplicator` is a number by which the result of matrix-vector product is multiplied. +* `outVectorMultiplicator` is a number by which the output vector is multiplied before being added to the result of matrix-vector product. +* `begin` is an index of the first matrix row that is involved in the multiplication. It is zero by default. +* `end` is an index of the last matrix row that is involved in the multiplication. It is the last matrix row by default. + +Note that the output vector dimension must be the same as the number of matrix rows no matter how we set `begin` and `end` parameters. These parameters just say that some matrix rows and the output vector elements are omitted.
+ ### Sparse matrix IO + +The sparse matrix can be saved to a file using a method `save` (\ref TNL::Matrices::SparseMatrix::save) and restored with a method `load` (\ref TNL::Matrices::SparseMatrix::load). To print the matrix a method `print` (\ref TNL::Matrices::SparseMatrix::print) can be used. + ### Sparse matrix view ## Tridiagonal matrices -- GitLab From 31dc085f015a73811564f6fd58ece68f22140f28 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= Date: Mon, 31 Aug 2020 21:08:22 +0200 Subject: [PATCH 08/53] Writting tutorials on matrices. --- .../SparseMatrixViewExample_setElement.cpp | 4 ++ .../Tutorials/Matrices/CMakeLists.txt | 12 ++++ ...seMatrixViewExample_data_encapsulation.cpp | 64 +++++++++++++++++++ ...nseMatrixViewExample_data_encapsulation.cu | 1 + .../SparseMatrixViewExample_setElement.cpp | 1 + .../SparseMatrixViewExample_setElement.cu | 1 + .../Tutorials/Matrices/tutorial_Matrices.md | 46 +++++++++++-- src/TNL/Matrices/DenseMatrixView.h | 5 +- 8 files changed, 126 insertions(+), 8 deletions(-) create mode 100644 Documentation/Tutorials/Matrices/DenseMatrixViewExample_data_encapsulation.cpp create mode 120000 Documentation/Tutorials/Matrices/DenseMatrixViewExample_data_encapsulation.cu create mode 120000 Documentation/Tutorials/Matrices/SparseMatrixViewExample_setElement.cpp create mode 120000 Documentation/Tutorials/Matrices/SparseMatrixViewExample_setElement.cu diff --git a/Documentation/Examples/Matrices/SparseMatrix/SparseMatrixViewExample_setElement.cpp b/Documentation/Examples/Matrices/SparseMatrix/SparseMatrixViewExample_setElement.cpp index 3de6634a3..ada2b2a82 100644 --- a/Documentation/Examples/Matrices/SparseMatrix/SparseMatrixViewExample_setElement.cpp +++ b/Documentation/Examples/Matrices/SparseMatrix/SparseMatrixViewExample_setElement.cpp @@ -10,6 +10,10 @@ template< typename Device > void setElements() { TNL::Matrices::SparseMatrix< double, Device > matrix( { 1, 1, 1, 1, 1 }, 5 ); + + /**** + * Get the matrix view. 
+ */ auto view = matrix.getView(); for( int i = 0; i < 5; i++ ) view.setElement( i, i, i ); diff --git a/Documentation/Tutorials/Matrices/CMakeLists.txt b/Documentation/Tutorials/Matrices/CMakeLists.txt index 7ab2f29e1..8f66ce09e 100644 --- a/Documentation/Tutorials/Matrices/CMakeLists.txt +++ b/Documentation/Tutorials/Matrices/CMakeLists.txt @@ -34,6 +34,11 @@ IF( BUILD_CUDA ) ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/DenseMatrixViewExample_setElement.out OUTPUT DenseMatrixViewExample_setElement.out ) + CUDA_ADD_EXECUTABLE( DenseMatrixViewExample_data_encapsulation DenseMatrixViewExample_data_encapsulation.cu ) + ADD_CUSTOM_COMMAND( COMMAND DenseMatrixViewExample_data_encapsulation > + ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/DenseMatrixViewExample_data_encapsulation.out OUTPUT + DenseMatrixViewExample_data_encapsulation.out ) + CUDA_ADD_EXECUTABLE( SparseMatrixExample_Constructor_init_list_2 SparseMatrixExample_Constructor_init_list_2.cu ) ADD_CUSTOM_COMMAND( COMMAND SparseMatrixExample_Constructor_init_list_2 > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/SparseMatrixExample_Constructor_init_list_2.out @@ -84,6 +89,11 @@ IF( BUILD_CUDA ) ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/SparseMatrixExample_rowsReduction_vectorProduct.out OUTPUT SparseMatrixExample_rowsReduction_vectorProduct.out ) + CUDA_ADD_EXECUTABLE( SparseMatrixViewExample_setElement SparseMatrixViewExample_setElement.cu ) + ADD_CUSTOM_COMMAND( COMMAND SparseMatrixViewExample_setElement > + ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/SparseMatrixViewExample_setElement.out + OUTPUT SparseMatrixViewExample_setElement.out ) + ELSE() # ADD_EXECUTABLE( UniquePointerExample UniquePointerExample.cpp ) # ADD_CUSTOM_COMMAND( COMMAND UniquePointerExample > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/UniquePointerExample.out OUTPUT UniquePointerExample.out ) @@ -102,6 +112,7 @@ ADD_CUSTOM_TARGET( TutorialsMatricesCuda ALL DEPENDS DenseMatrixExample_rowsReduction_vectorProduct.out 
DenseMatrixExample_rowsReduction_maxNorm.out DenseMatrixViewExample_setElement.out + DenseMatrixViewExample_data_encapsulation.out SparseMatrixExample_Constructor_init_list_2.out SparseMatrixExample_setRowCapacities.out SparseMatrixExample_Constructor_std_map.out @@ -110,6 +121,7 @@ ADD_CUSTOM_TARGET( TutorialsMatricesCuda ALL DEPENDS SparseMatrixExample_setElement.out SparseMatrixExample_forRows.out SparseMatrixExample_rowsReduction_vectorProduct.out + SparseMatrixViewExample_setElement.out ) ENDIF() # diff --git a/Documentation/Tutorials/Matrices/DenseMatrixViewExample_data_encapsulation.cpp b/Documentation/Tutorials/Matrices/DenseMatrixViewExample_data_encapsulation.cpp new file mode 100644 index 000000000..99cf67583 --- /dev/null +++ b/Documentation/Tutorials/Matrices/DenseMatrixViewExample_data_encapsulation.cpp @@ -0,0 +1,64 @@ +#include +#ifdef HAVE_CUDA +#include +#endif +#include +#include +#include +#include + +template< typename Device > +void encapsulation() +{ + const int size = 5; + + /*** + * Allocate the dense matrix with no use of TNL + */ + double* host_data = new double[ size * size ]; + for( int row = 0; row < size; row++ ) + for( int column = 0; column < size; column++ ) + host_data[ row * size + column ] = row * size + column + 1; + double* data = nullptr; + if( std::is_same< Device, TNL::Devices::Host >::value ) + { + data = new double[ size * size ]; + memcpy( data, host_data, sizeof( double ) * size * size ); + } +#ifdef HAVE_CUDA + else if( std::is_same< Device, TNL::Devices::Cuda >::value ) + { + cudaMalloc( ( void**) &data, sizeof( double ) * size * size ); + cudaMemcpy( data, host_data, sizeof( double ) * size * size, cudaMemcpyHostToDevice ); + } +#endif + + /*** + * Encapsulate the matrix into DenseMatrixView. 
+ */ + TNL::Containers::VectorView< double, Device > dataView( data, size * size ); + TNL::Matrices::DenseMatrixView< double, Device, int, TNL::Algorithms::Segments::RowMajorOrder > matrix( 5, 5, dataView ); + + std::cout << "Dense matrix view reads as:" << std::endl; + std::cout << matrix << std::endl; + + auto f = [=] __cuda_callable__ ( int i ) mutable { + matrix.setElement( i, i, -i ); + }; + TNL::Algorithms::ParallelFor< Device >::exec( 0, 5, f ); + + std::cout << "Dense matrix view after elements manipulation:" << std::endl; + std::cout << matrix << std::endl; +} + +int main( int argc, char* argv[] ) +{ + std::cout << "Dense matrix encapsulation on host:" << std::endl; + encapsulation< TNL::Devices::Host >(); + +#ifdef HAVE_CUDA + std::cout << "Dense matrix encapsulation on CUDA device:" << std::endl; + encapsulation< TNL::Devices::Cuda >(); +#endif +} + diff --git a/Documentation/Tutorials/Matrices/DenseMatrixViewExample_data_encapsulation.cu b/Documentation/Tutorials/Matrices/DenseMatrixViewExample_data_encapsulation.cu new file mode 120000 index 000000000..8be2d545a --- /dev/null +++ b/Documentation/Tutorials/Matrices/DenseMatrixViewExample_data_encapsulation.cu @@ -0,0 +1 @@ +DenseMatrixViewExample_data_encapsulation.cpp \ No newline at end of file diff --git a/Documentation/Tutorials/Matrices/SparseMatrixViewExample_setElement.cpp b/Documentation/Tutorials/Matrices/SparseMatrixViewExample_setElement.cpp new file mode 120000 index 000000000..0b861369e --- /dev/null +++ b/Documentation/Tutorials/Matrices/SparseMatrixViewExample_setElement.cpp @@ -0,0 +1 @@ +../../Examples/Matrices/SparseMatrix/SparseMatrixViewExample_setElement.cpp \ No newline at end of file diff --git a/Documentation/Tutorials/Matrices/SparseMatrixViewExample_setElement.cu b/Documentation/Tutorials/Matrices/SparseMatrixViewExample_setElement.cu new file mode 120000 index 000000000..9a6e8304d --- /dev/null +++ b/Documentation/Tutorials/Matrices/SparseMatrixViewExample_setElement.cu @@ 
-0,0 +1 @@ +../../Examples/Matrices/SparseMatrix/SparseMatrixViewExample_setElement.cu \ No newline at end of file diff --git a/Documentation/Tutorials/Matrices/tutorial_Matrices.md b/Documentation/Tutorials/Matrices/tutorial_Matrices.md index 6ffe353bf..25ea8d793 100644 --- a/Documentation/Tutorials/Matrices/tutorial_Matrices.md +++ b/Documentation/Tutorials/Matrices/tutorial_Matrices.md @@ -126,22 +126,32 @@ The dense matrix can be saved to a file using a method `save` (\ref TNL::Matrice ### Dense matrix view -Similar to array view (\ref TNL::Containers::ArayView) and vector view (\ref TNL::Containers::VectorView), matrices also offer their view for easier use with lambda functions. For the dense matrix there is a `DenseMatrixView` (\ref TNL::Matrcioes::DenseMatrixView). +Similar to array view (\ref TNL::Containers::ArayView) and vector view (\ref TNL::Containers::VectorView), matrices also offer their view for easier use with lambda functions. For the dense matrix there is a `DenseMatrixView` (\ref TNL::Matrices::DenseMatrixView) which is a templated class with the following template arguments (they are the same as for `DenseMatrix` -- \ref TNL::Matrices::DenseMatrix -- except of the allocator): -TODO: Template parameters description +* `Real` is a type of matrix elements. +* `Device` is a device on which the matrix is allocated. This can be \ref TNL::Devices::Host or \ref TNL::Devices::Cuda. +* `Index` is a type for indexing the matrix elements and also row and column indexes. +* `Organization` tells the ordering of matrix elements in memory. It is either RowMajorOrder or ColumnMajorOrder. -We will demonstrate it on the example showing the method `setElement` (\ref TNL::Matrices::DenseMatrix::setElement). However, the `SharedPointer` will be replaced with the `DenseMatrixView`. The code looks as follows: +The first main reason for using the dense matrix view is its ability to be captured by lambda functions since the copy constructor makes only shallow copy. 
We will demonstrate it on the example showing the method `setElement` (\ref TNL::Matrices::DenseMatrix::setElement). However, the `SharedPointer` will be replaced with the `DenseMatrixView`. The code looks as follows: \includelineno DenseMatrixViewExample_setElement.cpp -And the result is: +You can see that we do not need to use the shared pointer (\ref TNL::Pointers::SharedPointer) as we did in the example demonstrating the method `setElement` for dense matrix. And the result is: \include DenseMatrixViewExample_setElement.out -The dense matrix view offers almost all methods which the dense matrix does. So it can be easily used at almost any situation the same way as the dense matrix itself. +The second reason for using the `DenseMatrixView` is to encapsulate data allocated by some other library or program than TNL. The following example demonstrates how to do it: + +\includelineno DenseMatrixViewExample_data_encapsulation.cpp +On the lines 18--34 we create the matrix by allocating array `data` and filling the matrix using a formula \f$ a_{ij} = i * size + j + 1\f$. We do it first on the host (lines 18--21) in auxiliary array `host_data` to make initiation of the array `data` easier in case when `Device` is GPU. Next, depending on the argument `Device`, we allocate the array `data` on the host or on GPU and copy data from the array `host_data` to the array `data`. To insert this array into the dense matrix view, we first need to encapsulate it with vector view (\ref TNL::Containers::VectorView) `dataView` on the line 39 which can be then used to create the dense matrix view `matrix` on the line 40. Note that we must set proper matrix elements organization which is `RowMajorOrder` (\ref TNL::Algorithms::Segments::RowMajorOrder) in this example. Next, we print the matrix to see if the encapsulation was successful (lines 42 and 43) and finally we demonstrate manipulation with matrix elements (lines 45--48) and we print the result (lines 50 and 51). 
+ +The result looks as follows: -TODO: Using DenseMatrixView for data encapsulation +\include DenseMatrixViewExample_data_encapsulation.out + +The dense matrix view offers almost all methods which the dense matrix does. So it can be easily used at almost any situation the same way as the dense matrix itself. ## Sparse matrices @@ -435,6 +445,30 @@ The sparse matrix can be saved to a file using a method `save` (\ref TNL::Matric ### Sparse matrix view +Sparse matrix view serves, similar to other views in TNL, to data sharing and for use with lambda functions (views can be easily captured since they make only shallow copy). The sparse matrix view (\ref TNL::Matrices::SparseMatrixView) is a templated class having the following template arguments (they are the same as for `SparseMatrix` -- \ref TNL::Matrices::SparseMatrix -- except for the allocators): + +* `Real` is a type of the matrix elements. It is `double` by default. +* `Device` is a device where the matrix is allocated. Currently it can be either \ref TNL::Devices::Host for CPU or \ref TNL::Devices::Cuda for GPU supporting CUDA. It is \ref TNL::Devices::Host by default. +* `Index` is a type to be used for indexing of the matrix elements. It is `int` by default. +* `MatrixType` tells if the matrix is symmetric (\ref TNL::Matrices::SymmetricMatrix) or general (\ref TNL::Matrices::GeneralMatrix). It is a \ref TNL::Matrices::GeneralMatrix by default. +* `Segments` define the format of the sparse matrix. It can be (by default, it is \ref TNL::Algorithms::Segments::CSR): + * \ref TNL::Algorithms::Segments::CSR for [CSR format](https://en.wikipedia.org/wiki/Sparse_matrix#Compressed_sparse_row_(CSR,_CRS_or_Yale_format)). + * \ref TNL::Algorithms::Segments::Ellpack for [Ellpack format](http://mgarland.org/files/papers/nvr-2008-004.pdf). 
+ * \ref TNL::Algorithms::Segments::SlicedEllpack for [SlicedEllpack format](https://link.springer.com/chapter/10.1007/978-3-642-11515-8_10) which was also presented as [Row-grouped CSR format](https://arxiv.org/abs/1012.2270). + * \ref TNL::Algorithms::Segments::ChunkedEllpack for [ChunkedEllpack format](http://geraldine.fjfi.cvut.cz/~oberhuber/data/vyzkum/publikace/12-heller-oberhuber-improved-rgcsr-format.pdf) which we referred to as Improved Row-grouped CSR and we renamed it to Ellpack format since it uses padding zeros. + * \ref TNL::Algorithms::Segments::BiEllpack for [BiEllpack format](https://www.sciencedirect.com/science/article/pii/S0743731514000458?casa_token=2phrEj0Ef1gAAAAA:Lgf6rMBUN6T7TJne6mAgI_CSUJ-jR8jz7Eghdv6L0SJeGm4jfso-x6Wh8zgERk3Si7nFtTAJngg). +* `ComputeReal` is a type which is used for internal computations. By default it is the same as `Real` if `Real` is not `bool`. If `Real` is `bool`, `ComputeReal` is set to `Index` type. This can be changed, of course, by the user. + +**If `Real` is set to `bool`, we get *a binary matrix view*.** + +The following example shows the use of `SparseMatrixView` with lambda functions: + +\includelineno SparseMatrixViewExample_setElement.cpp + +The result looks as follows: + +\include SparseMatrixViewExample_setElement.out + ## Tridiagonal matrices ### Dense matrix allocation and initiation diff --git a/src/TNL/Matrices/DenseMatrixView.h b/src/TNL/Matrices/DenseMatrixView.h index 5f565734c..2cf971771 100644 --- a/src/TNL/Matrices/DenseMatrixView.h +++ b/src/TNL/Matrices/DenseMatrixView.h @@ -29,8 +29,9 @@ namespace Matrices { * \tparam Real is a type of matrix elements. * \tparam Device is a device where the matrix is allocated. * \tparam Index is a type for indexing of the matrix elements. - * \tparam MatrixElementsOrganization tells the ordering of matrix elements. It is either RowMajorOrder - * or ColumnMajorOrder. + * \tparam MatrixElementsOrganization tells the ordering of matrix elements in memory. 
It is either + * \ref TNL::Algorithms::Segments::RowMajorOrder + * or \ref TNL::Algorithms::Segments::ColumnMajorOrder. * * See \ref DenseMatrix. */ -- GitLab From 9df662f9c1b8102a58c4d0fd2236b112ee2d6e0f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= Date: Tue, 5 Jan 2021 16:23:52 +0100 Subject: [PATCH 09/53] Fix of rowsReduction in Tridiagonal matrix - wrong format of reduction lambda function. --- ...TridiagonalMatrixExample_rowsReduction.cpp | 2 +- .../TridiagonalMatrixViewExample_forRows.cpp | 20 +++++------ src/TNL/Matrices/TridiagonalMatrixView.hpp | 36 +++++++++---------- 3 files changed, 29 insertions(+), 29 deletions(-) diff --git a/Documentation/Examples/Matrices/TridiagonalMatrix/TridiagonalMatrixExample_rowsReduction.cpp b/Documentation/Examples/Matrices/TridiagonalMatrix/TridiagonalMatrixExample_rowsReduction.cpp index 792dc98d3..aae0bd4e3 100644 --- a/Documentation/Examples/Matrices/TridiagonalMatrix/TridiagonalMatrixExample_rowsReduction.cpp +++ b/Documentation/Examples/Matrices/TridiagonalMatrix/TridiagonalMatrixExample_rowsReduction.cpp @@ -46,7 +46,7 @@ void rowsReduction() /*** * Reduce lambda return maximum of given values. */ - auto reduce = [=] __cuda_callable__ ( double& a, const double& b ) -> double { + auto reduce = [=] __cuda_callable__ ( const double& a, const double& b ) -> double { return TNL::max( a, b ); }; diff --git a/Documentation/Examples/Matrices/TridiagonalMatrix/TridiagonalMatrixViewExample_forRows.cpp b/Documentation/Examples/Matrices/TridiagonalMatrix/TridiagonalMatrixViewExample_forRows.cpp index 24fe78f7f..d3ddd6208 100644 --- a/Documentation/Examples/Matrices/TridiagonalMatrix/TridiagonalMatrixViewExample_forRows.cpp +++ b/Documentation/Examples/Matrices/TridiagonalMatrix/TridiagonalMatrixViewExample_forRows.cpp @@ -10,11 +10,11 @@ void forRowsExample() * Set the following matrix (dots represent zero matrix elements and zeros are * padding zeros for memory alignment): * - * 0 / 1 3 . . . 
\ -> { 0, 1, 3 } - * | 2 1 3 . . | -> { 2, 1, 3 } - * | . 2 1 3 . | -> { 2, 1, 3 } - * | . . 2 1 3 | -> { 2, 1, 3 } - * \ . . . 2 1 / 0 -> { 2, 1, 0 } + * 0 / 2 1 . . . \ -> { 0, 2, 1 } + * | 3 2 1 . . | -> { 3, 2, 1 } + * | . 3 2 1 . | -> { 3, 2, 1 } + * | . . 3 2 1 | -> { 3, 2, 1 } + * \ . . . 3 2 / 0 -> { 3, 2, 0 } */ TNL::Matrices::TridiagonalMatrix< double, Device > matrix( 5, // number of matrix rows @@ -29,11 +29,11 @@ void forRowsExample() * * 0 1 2 <- localIdx values * ------- - * 0 / 1 3 . . . \ -> { 0, 1, 3 } - * | 2 1 3 . . | -> { 2, 1, 3 } - * | . 2 1 3 . | -> { 2, 1, 3 } - * | . . 2 1 3 | -> { 2, 1, 3 } - * \ . . . 2 1 / 0 -> { 2, 1, 0 } + * 0 / 2 1 . . . \ -> { 0, 2, 1 } + * | 3 2 1 . . | -> { 3, 2, 1 } + * | . 3 2 1 . | -> { 3, 2, 1 } + * | . . 3 2 1 | -> { 3, 2, 1 } + * \ . . . 3 2 / 0 -> { 3, 2, 0 } * */ value = 3 - localIdx; diff --git a/src/TNL/Matrices/TridiagonalMatrixView.hpp b/src/TNL/Matrices/TridiagonalMatrixView.hpp index b84c63f9b..d920b21d0 100644 --- a/src/TNL/Matrices/TridiagonalMatrixView.hpp +++ b/src/TNL/Matrices/TridiagonalMatrixView.hpp @@ -98,8 +98,8 @@ getCompressedRowLengths( Vector& rowLengths ) const auto fetch = [] __cuda_callable__ ( IndexType row, IndexType column, const RealType& value ) -> IndexType { return ( value != 0.0 ); }; - auto reduce = [] __cuda_callable__ ( IndexType& aux, const IndexType a ) { - aux += a; + auto reduce = [] __cuda_callable__ ( const IndexType& aux, const IndexType a ) -> IndexType { + return aux + a; }; auto keep = [=] __cuda_callable__ ( const IndexType rowIdx, const IndexType value ) mutable { rowLengths_view[ rowIdx ] = value; @@ -275,23 +275,23 @@ rowsReduction( IndexType first, IndexType last, Fetch& fetch, Reduce& reduce, Ke Real_ sum( zero ); if( rowIdx == 0 ) { - reduce( sum, fetch( 0, 1, values_view[ indexer.getGlobalIndex( 0, 1 ) ] ) ); - reduce( sum, fetch( 0, 2, values_view[ indexer.getGlobalIndex( 0, 2 ) ] ) ); + sum = reduce( sum, fetch( 0, 1, values_view[ 
indexer.getGlobalIndex( 0, 1 ) ] ) ); + sum = reduce( sum, fetch( 0, 2, values_view[ indexer.getGlobalIndex( 0, 2 ) ] ) ); keep( 0, sum ); return; } if( rowIdx + 1 < indexer.getColumns() ) { - reduce( sum, fetch( rowIdx, rowIdx - 1, values_view[ indexer.getGlobalIndex( rowIdx, 0 ) ] ) ); - reduce( sum, fetch( rowIdx, rowIdx, values_view[ indexer.getGlobalIndex( rowIdx, 1 ) ] ) ); - reduce( sum, fetch( rowIdx, rowIdx + 1, values_view[ indexer.getGlobalIndex( rowIdx, 2 ) ] ) ); + sum = reduce( sum, fetch( rowIdx, rowIdx - 1, values_view[ indexer.getGlobalIndex( rowIdx, 0 ) ] ) ); + sum = reduce( sum, fetch( rowIdx, rowIdx, values_view[ indexer.getGlobalIndex( rowIdx, 1 ) ] ) ); + sum = reduce( sum, fetch( rowIdx, rowIdx + 1, values_view[ indexer.getGlobalIndex( rowIdx, 2 ) ] ) ); keep( rowIdx, sum ); return; } if( rowIdx < indexer.getColumns() ) { - reduce( sum, fetch( rowIdx, rowIdx - 1, values_view[ indexer.getGlobalIndex( rowIdx, 0 ) ] ) ); - reduce( sum, fetch( rowIdx, rowIdx, values_view[ indexer.getGlobalIndex( rowIdx, 1 ) ] ) ); + sum = reduce( sum, fetch( rowIdx, rowIdx - 1, values_view[ indexer.getGlobalIndex( rowIdx, 0 ) ] ) ); + sum = reduce( sum, fetch( rowIdx, rowIdx, values_view[ indexer.getGlobalIndex( rowIdx, 1 ) ] ) ); keep( rowIdx, sum ); } else @@ -319,23 +319,23 @@ rowsReduction( IndexType first, IndexType last, Fetch& fetch, Reduce& reduce, Ke Real_ sum( zero ); if( rowIdx == 0 ) { - reduce( sum, fetch( 0, 1, values_view[ indexer.getGlobalIndex( 0, 1 ) ] ) ); - reduce( sum, fetch( 0, 2, values_view[ indexer.getGlobalIndex( 0, 2 ) ] ) ); + sum = reduce( sum, fetch( 0, 1, values_view[ indexer.getGlobalIndex( 0, 1 ) ] ) ); + sum = reduce( sum, fetch( 0, 2, values_view[ indexer.getGlobalIndex( 0, 2 ) ] ) ); keep( 0, sum ); return; } if( rowIdx + 1 < indexer.getColumns() ) { - reduce( sum, fetch( rowIdx, rowIdx - 1, values_view[ indexer.getGlobalIndex( rowIdx, 0 ) ] ) ); - reduce( sum, fetch( rowIdx, rowIdx, values_view[ indexer.getGlobalIndex( 
rowIdx, 1 ) ] ) ); - reduce( sum, fetch( rowIdx, rowIdx + 1, values_view[ indexer.getGlobalIndex( rowIdx, 2 ) ] ) ); + sum = reduce( sum, fetch( rowIdx, rowIdx - 1, values_view[ indexer.getGlobalIndex( rowIdx, 0 ) ] ) ); + sum = reduce( sum, fetch( rowIdx, rowIdx, values_view[ indexer.getGlobalIndex( rowIdx, 1 ) ] ) ); + sum = reduce( sum, fetch( rowIdx, rowIdx + 1, values_view[ indexer.getGlobalIndex( rowIdx, 2 ) ] ) ); keep( rowIdx, sum ); return; } if( rowIdx < indexer.getColumns() ) { - reduce( sum, fetch( rowIdx, rowIdx - 1, values_view[ indexer.getGlobalIndex( rowIdx, 0 ) ] ) ); - reduce( sum, fetch( rowIdx, rowIdx, values_view[ indexer.getGlobalIndex( rowIdx, 1 ) ] ) ); + sum = reduce( sum, fetch( rowIdx, rowIdx - 1, values_view[ indexer.getGlobalIndex( rowIdx, 0 ) ] ) ); + sum = reduce( sum, fetch( rowIdx, rowIdx, values_view[ indexer.getGlobalIndex( rowIdx, 1 ) ] ) ); keep( rowIdx, sum ); } else @@ -487,8 +487,8 @@ vectorProduct( const InVector& inVector, auto fetch = [=] __cuda_callable__ ( const IndexType& row, const IndexType& column, const RealType& value ) -> RealType { return value * inVectorView[ column ]; }; - auto reduction = [] __cuda_callable__ ( RealType& sum, const RealType& value ) { - sum += value; + auto reduction = [] __cuda_callable__ ( const RealType& sum, const RealType& value ) -> RealType { + return sum + value; }; auto keeper1 = [=] __cuda_callable__ ( IndexType row, const RealType& value ) mutable { outVectorView[ row ] = value; -- GitLab From 55f72761df9da05814ec6d825ccce06d6cb47626 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= Date: Tue, 5 Jan 2021 16:25:32 +0100 Subject: [PATCH 10/53] Writting tutorial for tridiagonal matrix. 
--- ...lMatrixExample_Constructor_init_list_1.cpp | 1 + ...TridiagonalMatrixExample_rowsReduction.cpp | 1 + .../TridiagonalMatrixExample_setElement.cpp | 1 + .../TridiagonalMatrixExample_setElements.cpp | 1 + .../TridiagonalMatrixViewExample_forRows.cpp | 1 + .../TridiagonalMatrixViewExample_getRow.cpp | 1 + ...ridiagonalMatrixViewExample_setElement.cpp | 1 + .../Tutorials/Matrices/tutorial_Matrices.md | 292 +++++++++++++++++- 8 files changed, 288 insertions(+), 11 deletions(-) create mode 120000 Documentation/Tutorials/Matrices/TridiagonalMatrixExample_Constructor_init_list_1.cpp create mode 120000 Documentation/Tutorials/Matrices/TridiagonalMatrixExample_rowsReduction.cpp create mode 120000 Documentation/Tutorials/Matrices/TridiagonalMatrixExample_setElement.cpp create mode 120000 Documentation/Tutorials/Matrices/TridiagonalMatrixExample_setElements.cpp create mode 120000 Documentation/Tutorials/Matrices/TridiagonalMatrixViewExample_forRows.cpp create mode 120000 Documentation/Tutorials/Matrices/TridiagonalMatrixViewExample_getRow.cpp create mode 120000 Documentation/Tutorials/Matrices/TridiagonalMatrixViewExample_setElement.cpp diff --git a/Documentation/Tutorials/Matrices/TridiagonalMatrixExample_Constructor_init_list_1.cpp b/Documentation/Tutorials/Matrices/TridiagonalMatrixExample_Constructor_init_list_1.cpp new file mode 120000 index 000000000..f074fa48b --- /dev/null +++ b/Documentation/Tutorials/Matrices/TridiagonalMatrixExample_Constructor_init_list_1.cpp @@ -0,0 +1 @@ +../../Examples/Matrices/TridiagonalMatrix/TridiagonalMatrixExample_Constructor_init_list_1.cpp \ No newline at end of file diff --git a/Documentation/Tutorials/Matrices/TridiagonalMatrixExample_rowsReduction.cpp b/Documentation/Tutorials/Matrices/TridiagonalMatrixExample_rowsReduction.cpp new file mode 120000 index 000000000..5a8b79027 --- /dev/null +++ b/Documentation/Tutorials/Matrices/TridiagonalMatrixExample_rowsReduction.cpp @@ -0,0 +1 @@ 
+../../Examples/Matrices/TridiagonalMatrix/TridiagonalMatrixExample_rowsReduction.cpp \ No newline at end of file diff --git a/Documentation/Tutorials/Matrices/TridiagonalMatrixExample_setElement.cpp b/Documentation/Tutorials/Matrices/TridiagonalMatrixExample_setElement.cpp new file mode 120000 index 000000000..aa3443952 --- /dev/null +++ b/Documentation/Tutorials/Matrices/TridiagonalMatrixExample_setElement.cpp @@ -0,0 +1 @@ +../../Examples/Matrices/TridiagonalMatrix/TridiagonalMatrixExample_setElement.cpp \ No newline at end of file diff --git a/Documentation/Tutorials/Matrices/TridiagonalMatrixExample_setElements.cpp b/Documentation/Tutorials/Matrices/TridiagonalMatrixExample_setElements.cpp new file mode 120000 index 000000000..6a1a2e1ef --- /dev/null +++ b/Documentation/Tutorials/Matrices/TridiagonalMatrixExample_setElements.cpp @@ -0,0 +1 @@ +../../Examples/Matrices/TridiagonalMatrix/TridiagonalMatrixExample_setElements.cpp \ No newline at end of file diff --git a/Documentation/Tutorials/Matrices/TridiagonalMatrixViewExample_forRows.cpp b/Documentation/Tutorials/Matrices/TridiagonalMatrixViewExample_forRows.cpp new file mode 120000 index 000000000..8f072994a --- /dev/null +++ b/Documentation/Tutorials/Matrices/TridiagonalMatrixViewExample_forRows.cpp @@ -0,0 +1 @@ +../../Examples/Matrices/TridiagonalMatrix/TridiagonalMatrixViewExample_forRows.cpp \ No newline at end of file diff --git a/Documentation/Tutorials/Matrices/TridiagonalMatrixViewExample_getRow.cpp b/Documentation/Tutorials/Matrices/TridiagonalMatrixViewExample_getRow.cpp new file mode 120000 index 000000000..960c717f4 --- /dev/null +++ b/Documentation/Tutorials/Matrices/TridiagonalMatrixViewExample_getRow.cpp @@ -0,0 +1 @@ +../../Examples/Matrices/TridiagonalMatrix/TridiagonalMatrixViewExample_getRow.cpp \ No newline at end of file diff --git a/Documentation/Tutorials/Matrices/TridiagonalMatrixViewExample_setElement.cpp b/Documentation/Tutorials/Matrices/TridiagonalMatrixViewExample_setElement.cpp 
new file mode 120000 index 000000000..59094634e --- /dev/null +++ b/Documentation/Tutorials/Matrices/TridiagonalMatrixViewExample_setElement.cpp @@ -0,0 +1 @@ +../../Examples/Matrices/TridiagonalMatrix/TridiagonalMatrixViewExample_setElement.cpp \ No newline at end of file diff --git a/Documentation/Tutorials/Matrices/tutorial_Matrices.md b/Documentation/Tutorials/Matrices/tutorial_Matrices.md index 25ea8d793..1a630cd2d 100644 --- a/Documentation/Tutorials/Matrices/tutorial_Matrices.md +++ b/Documentation/Tutorials/Matrices/tutorial_Matrices.md @@ -15,7 +15,7 @@ TNL offers the following type of matrices: dense matrices, sparse matrices, tri ## Dense matrices -Dense matrix is a templated class defined in namespace \ref TNL::Matrices. It has five template parameters: +Dense matrix is a templated class defined in the namespace \ref TNL::Matrices. It has five template parameters: * `Real` is a type of the matrix elements. It is `double` by default. * `Device` is a device where the matrix shall be allocated. Currently it can be either \ref TNL::Devices::Host for CPU or \ref TNL::Devices::Cuda for GPU supporting CUDA. It is \ref TNL::Devices::Host by default. @@ -103,7 +103,7 @@ The `fetch` lambda function just returns absolute value of \f$a_{ij} \f$ which i \include DenseMatrixExample_rowsReduction_maxNorm.out -### Dense-matrix vector product +### Dense matrix-vector product One of the most important matrix operation is the matrix-vector multiplication. It is represented by a method `vectorProduct` (\ref TNL::Matrices::DenseMatrix::vectorProduct). It is templated method with two template parameters `InVector` and `OutVector` telling the types of input and output vector respectively. Usually one will substitute some of \ref TNL::Containers::Array, \ref TNL::Containers::ArrayView, \ref TNL::Containers::Vector or \ref TNL::Containers::VectorView for these types. 
The method accepts the following parameters: @@ -122,7 +122,7 @@ To summarize, this method computes the following formula: ### Dense matrix IO -The dense matrix can be saved to a file using a method `save` (\ref TNL::Matrices::DenseMatrix::save) and restored with a method `load` (\ref TNL::Matrices::DenseMatrix::load). To print the matrix a method `print` (\ref TNL::Matrices::DenseMatrix::print) can be used. +The dense matrix can be saved to a file using a method `save` (\ref TNL::Matrices::DenseMatrix::save) and restored with a method `load` (\ref TNL::Matrices::DenseMatrix::load). To print the matrix, there is a method `print` (\ref TNL::Matrices::DenseMatrix::print) can be used. ### Dense matrix view @@ -178,7 +178,7 @@ Major disadventage of sparse matrices is that there are a lot of different forma ### Sparse matrix allocation and initiation -Small matrices can be initialized by a constructor with initializer list. We assume having the follwong sparse matrix +Small matrices can be initialized by a constructor with initializer list. We assume having the following sparse matrix \f[ \left( @@ -404,7 +404,7 @@ Next we prepare input (`x`) and output (`y`) vectors on the lines 21 and 22 and We ommit the row index and take the column index which indicates index of the element of the input vector we need to fetch (`xView[ columnIdx ]`). We take its value and multiply it with the value (`value`) of the current matrix element. We do not need to write lambda function for reduction since it is only summation of the intermediate results from the `fetch` lamda and we can use `std::plus<>{}` (see the line 60). The `keep` lambda function offers two parameters: -1. `rowIdx` tells the index of the matrix row for which we aimm to store the result. +1. `rowIdx` tells the index of the matrix row for which we aim to store the result. 2. `value` is the result obtained in the given matrix row. 
In our example, we just write the result into appropriate element of the output vector `y` which is given just by the row index `rowIdx` -- see the line 47. On the line 53 we start the computation of the matrix-vector product. The method `rowsReduction` (\ref TNL::Matrices::SparseMatrix::rowsReduction) accepts the following arguments: @@ -420,7 +420,7 @@ At the end we print the matrix, the input and the output vector -- lines 55-57. \include SparseMatrixExample_rowsReduction_vectorProduct.out -### Sparse-matrix vector product +### Sparse matrix-vector product As we mentioned already in the part explaining the dense matrices, matrix-vector multiplication or in this case sparse matrix-vector multiplication ([SpMV](https://en.wikipedia.org/wiki/Sparse_matrix-vector_multiplication)) is one of the most important operations in numerical mathematics and high-performance computing. It is represented by a method `vectorProduct` (\ref TNL::Matrices::SparseMatrix::vectorProduct). It is templated method with two template parameters `InVector` and `OutVector` telling the types of input and output vector respectively. Usually one will substitute some of \ref TNL::Containers::Array, \ref TNL::Containers::ArrayView, \ref TNL::Containers::Vector or \ref TNL::Containers::VectorView for these types. The method computes the following formula @@ -441,7 +441,7 @@ Note that the ouput vector dimension must be the same as the number of matrix ro ### Sparse matrix IO -The sparse matrix can be saved to a file using a method `save` (\ref TNL::Matrices::SparseMatrix::save) and restored with a method `load` (\ref TNL::Matrices::SparseMatrix::load). To print the matrix a method `print` (\ref TNL::Matrices::SparseMatrix::print) can be used. +The sparse matrix can be saved to a file using a method `save` (\ref TNL::Matrices::SparseMatrix::save) and restored with a method `load` (\ref TNL::Matrices::SparseMatrix::load). 
For printing the matrix, there is a method `print` (\ref TNL::Matrices::SparseMatrix::print) can be used. ### Sparse matrix view @@ -471,11 +471,281 @@ The result looks as follows: ## Tridiagonal matrices -### Dense matrix allocation and initiation +Tridiagonal matrix format serves for specific matrix pattern when the nonzero matrix elements can be placed only at the diagonal and immediately next to the diagonal. Here is an example: + +\f[ +\left( + \begin{array}{ccccccc} + 2 & -1 & . & . & . & . \\ + -1 & 2 & -1 & . & . & . \\ + . & -1 & 2 & -1 & . & . \\ + . & . & -1 & 2 & -1 & . \\ + . & . & . & -1 & 2 & -1 \\ + . & . & . & . & -1 & 2 + \end{array} + \right) +\f] + +An advantage is that we do not store the column indexes explicitly as it is in \ref TNL::Matrices::SparseMatrix. This can reduce significantly the memory requirements which also means better performance. See the following table for the storage requirements comparison between \ref TNL::Matrices::TridiagonalMatrix and \ref TNL::Matrices::SparseMatrix. + + + Real | Index | SparseMatrix | TridiagonalMatrix | Ratio + --------|------------|----------------------|---------------------|-------- + float | 32-bit int | 8 bytes per element | 4 bytes per element | 50% + double | 32-bit int | 12 bytes per element | 8 bytes per element | 75% + float | 64-bit int | 12 bytes per element | 4 bytes per element | 30% + double | 64-bit int | 16 bytes per element | 8 bytes per element | 50% + +Tridiagonal matrix is a templated class defined in the namespace \ref TNL::Matrices. It has five template parameters: + +* `Real` is a type of the matrix elements. It is `double` by default. +* `Device` is a device where the matrix shall be allocated. Currently it can be either \ref TNL::Devices::Host for CPU or \ref TNL::Devices::Cuda for GPU supporting CUDA. It is \ref TNL::Devices::Host by default. +* `Index` is a type to be used for indexing of the matrix elements. It is `int` by default. 
+* `ElementsOrganization` defines the organization of the matrix elements in memory. It can be \ref TNL::Algorithms::Segments::ColumnMajorOrder or \ref TNL::Algorithms::Segments::RowMajorOrder for column-major and row-major organization respectively. Be default it is the row-major order if the matrix is allocated in the host system and column major order if it is allocated on GPU. +* `RealAllocator` is a memory allocator (one from \ref TNL::Allocators) which shall be used for allocation of the matrix elements. By default, it is the default allocator for given `Real` type and `Device` type -- see \ref TNL::Allocators::Default. + +### Tridiagonal matrix allocation and initiation + +The tridiagonal matrix can be initialized by the means of the constructor with initializer list. The matrix from the begining of this section can be constructed as the following example shows: + +\includelineno TridiagonalMatrixExample_Constructor_init_list_1.cpp + +For better alignment in the memory the tridiagonal format is organised like if there were three nonzero matrix elements in each row. This is not true for example in the first row where there is no matrix element on the left side of the diagonal. The same happens on the last row of the matrix. In our example, we have to add even the artificial matrix elements like this: + +\f[ +\begin{array}{c} +0 \\ +. \\ +. \\ +. \\ +. \\ +. +\end{array} +\left( + \begin{array}{ccccccc} + 2 & -1 & . & . & . & . \\ + -1 & 2 & -1 & . & . & . \\ + . & -1 & 2 & -1 & . & . \\ + . & . & -1 & 2 & -1 & . \\ + . & . & . & -1 & 2 & -1 \\ + . & . & . & . & -1 & 2 + \end{array} + \right) + \begin{array}{c} +. \\ +. \\ +. \\ +. \\ +. \\ +0 +\end{array} +\f] + +If a matrix has more rows then columns, we have to extend the last two rows with nonzero elements in this way + +\f[ +\left( + \begin{array}{ccccccc} + 2 & -1 & . & . & . & . \\ + -1 & 2 & -1 & . & . & . \\ + . & -1 & 2 & -1 & . & . \\ + . & . & -1 & 2 & -1 & . \\ + . & . & . & -1 & 2 & -1 \\ + . & . 
& . & . & -1 & 2 \\ + . & . & . & . & . & -1 + \end{array} + \right) +\rightarrow +\begin{array}{c} +0 \\ +. \\ +. \\ +. \\ +. \\ +. \\ +. +\end{array} +\left( + \begin{array}{ccccccc} + 2 & -1 & . & . & . & . \\ + -1 & 2 & -1 & . & . & . \\ + . & -1 & 2 & -1 & . & . \\ + . & . & -1 & 2 & -1 & . \\ + . & . & . & -1 & 2 & -1 \\ + . & . & . & . & -1 & 2 \\ + . & . & . & . & . & -1 + \end{array} + \right) + \begin{array}{cc} +. & . \\ +. & . \\ +. & . \\ +. & . \\ +. & . \\ +0 & . \\ +0 & 0 +\end{array} +\f] + +The output of the example looks as: + +\includelineno TridiagonalMatrixExample_Constructor_init_list_1.out + +Similar way of the tridiagonal matrix setup is offered by the method `setElements` (\ref TNL::Matrices::TridiagonalMatrix::setElements) as the following example demonstrates: + +\includelineno TridiagonalMatrixExample_setElements.cpp + + +Here we create the matrix in two steps. Firstly, we setup the matrix dimensions by the appropriate constructor (line 24) and after that we setup the matrix elemets (line 25-45). The result looks the same as in the previous example: + +\includelineno TridiagonalMatrixExample_setElements.out + +In the following example we create tridiagonal matrix with 5 rows and 5 columns (line 12-14) by the means of a shared pointer (\ref TNL::Pointers::SharedPointer) to make this work even on GPU. We set numbers 0,...,4 on the diagonal (line 16) and we print the matrix (line 18). Next we use a lambda function (lines 21-27) combined with parallel for (\ref TNL::Alfgorithms::ParallelFor) (line 35), to modify the matrix. The offdiagonal elements are set to 1 (lines 23 and 26) and for the diagonal elements, we change the sign (line 24). 
+ +\includelineno TridiagonalMatrixExample_setElement.cpp + +The result looks as follows: + +\includelineno TridiagonalMatrixExample_setElement.out + + A slightly simpler way to do the same, with no need for a shared pointer (\ref TNL::Pointers::SharedPointer), could be with the use of tridiagonal matrix view and the method `getRow` (\ref TNL::Matrices::TridiagonalMatrixView::getRow) as the following example demonstrates: + +\includelineno TridiagonalMatrixViewExample_getRow.cpp + +We create a matrix with the same size (lines 10-15) and set ones on the diagonal (lines 15-16). Next, we fetch the tridiagonal matrix view (line 16) which we can refer to in the lambda function for matrix elements modification (lines 18-26). Inside the lambda function, we first get a matrix row by calling the method `getRow` (\ref TNL::Matrices::TridiagonalMatrixView::getRow) using which we can access the matrix elements (lines 21-25). The lambda function is called by the parallel for (\ref TNL::Algorithms::ParallelFor). + +The result looks as follows: + +\includelineno TridiagonalMatrixViewExample_getRow.out + +Finally, an even simpler but a bit less flexible way of matrix elements manipulation with the use of the method `forRows` (\ref TNL::Matrices::TridiagonalMatrix::forRows) is demonstrated in the following example: + +\includelineno TridiagonalMatrixViewExample_forRows.cpp + +On the line 41 we call the method `forRows` (\ref TNL::Matrices::TridiagonalMatrix::forRows) instead of parallel for (\ref TNL::Algorithms::ParallelFor). This method iterates over all matrix rows and all nonzero matrix elements. The lambda function on the line 24 therefore does not receive only the matrix row index but also local index of the matrix element (`localIdx`) which is a rank of the nonzero matrix element in the given row. 
The values of the local index for given matrix elements are as follows + +\f[ +\left( +\begin{array}{cccccc} +1 & 2 & & & & \\ +0 & 1 & 2 & & & \\ + & 0 & 1 & 2 & & \\ + & & 0 & 1 & 2 & \\ + & & & 0 & 1 & 2 \\ + & & & & 0 & 1 +\end{array} +\right) +\f] + +The next parameter `columnIdx` received by the lambda function is the column index of the matrix element. The fourth parameter `value` is a reference on the matrix element which we use for its modification. If the last parameter `compute` is set to false, the iterations over the matrix rows are terminated. + +The result looks as follows: + +\includelineno TridiagonalMatrixViewExample_forRows.out + ### Flexible reduction in matrix rows -### Dense-matrix vector product -### Dense matrix IO -### Dense matrix view + +The *flexible parallel reduction* in rows for tridiagonal matrices is also similar to that for dense and sparse matrices. It is represented by three lambda functions: + +1. `fetch` reads and preprocesses data entering the flexible parallel reduction. +2. `reduce` performs the reduction operation. +3. `keep` stores the results from each matrix row. + +See the following example: + +\includelineno TridiagonalMatrixExample_rowsReduction.cpp + +Here we first set the tridiagonal matrix (lines 10-27) which looks as + +\f[ +\left( +\begin{array}{ccccc} +1 & 3 & & & & \\ +2 & 1 & 3 & & & \\ + & 2 & 1 & 3 & & \\ + & & 2 & 1 & 3 & \\ + & & & 2 & 1 & 3 +\end{array} +\right). +\f] + +Next we want to compute the maximal absolute value of the nonzero matrix elements in each row. We allocate the vector `rowMax` where we will store the results (line 32). The lambda function `fetch` (lines 42-44) is responsible for reading the matrix elements. It receives three arguments: + +1. `rowIdx` is a row index of the matrix element being currently processed. +2. `columnIdx` is a column index of the matrix elements being currently processed. +3. `value` is a value of the matrix element being currently processed. 
+ +In our example, the only thing this function has to do, is to compute the absolute value of each matrix element represented by variable `value`. The next lambda function, `reduce` (lines 49-51), performs the reduction operation. In this case, it returns the maximum of two input values `a` and `b`. Finally, the lambda function `keep` (lines 56-58) is defined with the following parameters: + +1. `rowIdx` tells the index of the matrix row for which we aim to store the result. +2. `value` is the result obtained in the given matrix row. + +In our example, it just takes the result of the reduction in variable `value` in each row and stores it into the vector `rowMax` via related vector view `rowMaxView`. + +The method `rowsReduction` (\ref TNL::Matrices::SparseMatrix::rowsReduction) activates all the mentioned lambda functions (line 63). It accepts the following arguments: + +1. `begin` is the beginning of the matrix rows range on which the reduction will be performed. +2. `end` is the end of the matrix rows range on which the reduction will be performed. The last matrix row which is going to be processed has index `end-1`. +3. `fetch` is the fetch lambda function. +4. `reduce` is the lambda function performing the reduction. +5. `keep` is the lambda function responsible for processing the results from particular matrix rows. +6. `zero` is the "zero" element of given reduction operation also known as the *identity element*. In our example, this role is played by the lowest number of the given type which we can obtain using function `std::numeric_limits< double >::lowest()` from STL. + + The results look as follows: + +\includelineno TridiagonalMatrixExample_rowsReduction.out + +### Tridiagonal matrix-vector product + +Similar to dense and sparse matrices, matrix-vector multiplication is represented by a method `vectorProduct` (\ref TNL::Matrices::TridiagonalMatrix::vectorProduct). 
It is templated method with two template parameters `InVector` and `OutVector` telling the types of input and output vector respectively. Usually one will substitute some of \ref TNL::Containers::Array, \ref TNL::Containers::ArrayView, \ref TNL::Containers::Vector or \ref TNL::Containers::VectorView for these types. The method computes the following formula + +``` +outVector = matrixMultiplicator * ( *this ) * inVector + outVectorMultiplicator * outVector +``` + +and it accepts the following parameters: + +* `inVector` is the input vector having the same number of elements as the number of matrix columns. +* `outVector` is the output vector having the same number of elements as the number of matrix rows. +* `matrixMultiplicator` is a number by which the result of matrix-vector product is multiplied. +* `outVectorMultiplicator` is a number by which the output vector is multiplied before added to the result of matrix-vector product. +* `begin` is an index of the first matrix row that is involved in the multiplication. It is zero be default. +* `end` is an index of the last matrix row that is involved in the multiplication. It is the last matrix row by default. + +Note that the ouput vector dimension must be the same as the number of matrix rows no matter how we set `begin` and `end` parameters. These parameters just say that some matrix rows and the output vector elements are omitted. + +### Tridiagonal matrix IO + +The tridiagonal matrix can be saved to a file using a method `save` (\ref TNL::Matrices::TridiagonalMatrix::save) and restored with a method `load` (\ref TNL::Matrices::TridiagonalMatrix::load). For printing the matrix, there is a method `print` (\ref TNL::Matrices::TridiagonalMatrix::print) can be used. + +### Tridiagonal matrix view + +Similar to dense and sparse matrix view, tridiagonal matrix also offers its view for easier use with lambda functions. 
It is represented by a templated class \ref TNL::Matrices::TridiagonalMatrixView with the following template parameters: + +* `Real` is a type of matrix elements. +* `Device` is a device on which the matrix is allocated. This can be \ref TNL::Devices::Host or \ref TNL::Devices::Cuda. +* `Index` is a type for indexing the matrix elements and also row and column indexes. +* `Organization` tells the ordering of matrix elements in memory. It is either RowMajorOrder or ColumnMajorOrder. + +The first main reason for using the tridiagonal matrix view is its ability to be captured by lambda functions since the copy constructor makes only a shallow copy. We can demonstrate it on the example showing the method `setElement` (\ref TNL::Matrices::TridiagonalMatrix::setElement). The code looks as follows: + +\includelineno TridiagonalMatrixViewExample_setElement.cpp + +The matrix view is obtained by the method `getView` (\ref TNL::Matrices::TridiagonalMatrix::getView) on the line 13. We first show that the view can be used the same way as a common matrix (lines 14 and 15) but it can be used the same way even in lambda functions as we can see on the lines 20-26. Compare it with the same example using shared pointer instead of the matrix view: + +\includelineno TridiagonalMatrixExample_setElement.cpp + +The main disadvantages are: + +1. The shared pointer must be created together with the matrix (line 14) and there is no way to get it later. The matrix view can be obtained from any matrix at any time. +2. We have to synchronize shared pointers explicitly by calling the function \ref TNL::Pointers::synchronizeSmartPointersOnDevice (line 34). + +So for the sake of using a matrix in lambda functions, the matrix view is a better tool. The result of both examples looks as: + +\include TridiagonalMatrixExample_setElement.out + +As we mentioned already, the tridiagonal matrix view offers almost all methods which the tridiagonal matrix does. 
So it can be easily used at almost any situation the same way as the tridiagonal matrix itself. + ## Multidiagonal matrices -- GitLab From 1646b7b5a61f742ade625c54dca6c31f6d3fd9bd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= Date: Fri, 8 Jan 2021 17:39:59 +0100 Subject: [PATCH 11/53] Improving table matrix types comparison in tridiagonal matrix documentation. --- src/TNL/Matrices/TridiagonalMatrix.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/TNL/Matrices/TridiagonalMatrix.h b/src/TNL/Matrices/TridiagonalMatrix.h index e5c9ee24a..426fa2e74 100644 --- a/src/TNL/Matrices/TridiagonalMatrix.h +++ b/src/TNL/Matrices/TridiagonalMatrix.h @@ -44,12 +44,12 @@ namespace Matrices { * memory requirements which also means better performance. See the following table * for the storage requirements comparison between \ref TridiagonalMatrix and \ref SparseMatrix. * - * Data types | SparseMatrix | TridiagonalMatrix | Ratio - * --------------------|----------------------|---------------------|-------- - * float + 32-bit int | 8 bytes per element | 4 bytes per element | 50% - * double + 32-bit int| 12 bytes per element | 8 bytes per element | 75% - * float + 64-bit int | 12 bytes per element | 4 bytes per element | 30% - * double + 64-bit int| 16 bytes per element | 8 bytes per element | 50% + * Real | Index | SparseMatrix | TridiagonalMatrix | Ratio + * --------|------------|----------------------|---------------------|------- + * float | 32-bit int | 8 bytes per element | 4 bytes per element | 50% + * double | 32-bit int | 12 bytes per element | 8 bytes per element | 75% + * float | 64-bit int | 12 bytes per element | 4 bytes per element | 30% + * double | 64-bit int | 16 bytes per element | 8 bytes per element | 50% * * \tparam Real is a type of matrix elements. * \tparam Device is a device where the matrix is allocated. 
-- GitLab From c766a855566070ee06c2043f0ecbae0494882e77 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= Date: Fri, 8 Jan 2021 17:57:45 +0100 Subject: [PATCH 12/53] Added methods TNL::Matrices::MutlidiagonalMatrix::setSubdiagonals. --- src/TNL/Matrices/MultidiagonalMatrix.h | 103 +++++++++++++----- src/TNL/Matrices/MultidiagonalMatrix.hpp | 37 ++++++- src/TNL/Matrices/MultidiagonalMatrixView.h | 50 ++++++--- .../Matrices/MultidiagonalMatrixTest.h | 38 +++++++ 4 files changed, 185 insertions(+), 43 deletions(-) diff --git a/src/TNL/Matrices/MultidiagonalMatrix.h b/src/TNL/Matrices/MultidiagonalMatrix.h index fa612e9f0..05d834750 100644 --- a/src/TNL/Matrices/MultidiagonalMatrix.h +++ b/src/TNL/Matrices/MultidiagonalMatrix.h @@ -47,14 +47,15 @@ namespace Matrices { * are \f$\{-3,-1,0,1,3\}\f$. Advantage is that we do not store the column indexes * explicitly as it is in \ref SparseMatrix. This can reduce significantly the * memory requirements which also means better performance. See the following table - * for the storage requirements comparison between \ref MultidiagonalMatrix and \ref SparseMatrix. + * for the storage requirements comparison between \ref TNL::Matrices::MultidiagonalMatrix + * and \ref TNL::Matrices::SparseMatrix. 
* - * Data types | SparseMatrix | MultidiagonalMatrix | Ratio - * --------------------|----------------------|---------------------|-------- - * float + 32-bit int | 8 bytes per element | 4 bytes per element | 50% - * double + 32-bit int| 12 bytes per element | 8 bytes per element | 75% - * float + 64-bit int | 12 bytes per element | 4 bytes per element | 30% - * double + 64-bit int| 16 bytes per element | 8 bytes per element | 50% + * Real | Index | SparseMatrix | MultidiagonalMatrix | Ratio + * --------|-----------|----------------------|---------------------|------- + * float | 32-bit int| 8 bytes per element | 4 bytes per element | 50% + * double | 32-bit int| 12 bytes per element | 8 bytes per element | 75% + * float | 64-bit int| 12 bytes per element | 4 bytes per element | 30% + * double | 64-bit int| 16 bytes per element | 8 bytes per element | 50% * * \tparam Real is a type of matrix elements. * \tparam Device is a device where the matrix is allocated. @@ -296,6 +297,28 @@ class MultidiagonalMatrix : public Matrix< Real, Device, Index, RealAllocator > const IndexType columns, const Vector& diagonalsOffsets ); + /** + * @brief Set the diagonals offsets by means of vector-like container. + * + * This method deletes current matrix elements. + * + * @tparam Vector is a type of vector-like container holding the subdiagonals offsets. + * @param diagonalsOffsets is a vector-like container holding the subdiagonals offsets. + */ + template< typename Vector > + void setDiagonalsOffsets( const Vector& diagonalsOffsets ); + + /** + * @brief Set the diagonals offsets by means of initializer list. + * + * This method deletes current matrix elements. + * + * @tparam ListIndex is type of indexes used for the subdiagonals offsets definition. + * @param diagonalsOffsets is a initializer list with subdiagonals offsets. 
+ */ + template< typename ListIndex > + void setDiagonalsOffsets( const std::initializer_list< ListIndex > diagonalsOffsets ); + /** * \brief This method is for compatibility with \ref SparseMatrix. * @@ -311,6 +334,22 @@ class MultidiagonalMatrix : public Matrix< Real, Device, Index, RealAllocator > template< typename RowCapacitiesVector > void setRowCapacities( const RowCapacitiesVector& rowCapacities ); + /** + * \brief Returns number of diagonals. + * + * \return Number of diagonals. + */ + const IndexType& getDiagonalsCount() const; + + /** + * \brief Returns vector with diagonals offsets. + * + * \return vector with diagonals offsets. + */ + const DiagonalsOffsetsType& getDiagonalsOffsets() const; + template< typename RowCapacitiesVector > + void setRowCapacities( const RowCapacitiesVector& rowCapacities ); + /** * \brief Set matrix elements from an initializer list. * @@ -329,20 +368,6 @@ class MultidiagonalMatrix : public Matrix< Real, Device, Index, RealAllocator > template< typename ListReal > void setElements( const std::initializer_list< std::initializer_list< ListReal > >& data ); - /** - * \brief Returns number of diagonals. - * - * \return Number of diagonals. - */ - const IndexType& getDiagonalsCount() const; - - /** - * \brief Returns vector with diagonals offsets. - * - * \return vector with diagonals offsets. - */ - const DiagonalsOffsetsType& getDiagonalsOffsets() const; - /** * \brief Computes number of non-zeros in each row. * @@ -666,9 +691,21 @@ class MultidiagonalMatrix : public Matrix< Real, Device, Index, RealAllocator > * * \tparam Function is type of lambda function that will operate on matrix elements. * It is should have form like - * `function( IndexType rowIdx, IndexType localIdx, IndexType columnIdx, const RealType& value, bool& compute )`. - * The \e localIdx parameter is a rank of the non-zero element in given row. 
- * If the 'compute' variable is set to false the iteration over the row can + * + * `function( IndexType rowIdx, IndexType localIdx, IndexType columnIdx, const RealType& value, bool& compute )`, + * + * where + * + * \e rowIdx is an index of the matrix row. + * + * \e localIdx parameter is a rank of the non-zero element in given row. It is also, in fact, + * index of the matrix subdiagonal. + * + * \e columnIdx is a column index of the matrx element. + * + * \e value is the matrix element value. + * + * \e compute is a reference to a boolen variable. If it is set to false the iteration over the row can * be interrupted. * * \param begin defines beginning of the range [begin,end) of rows to be processed. @@ -688,9 +725,21 @@ class MultidiagonalMatrix : public Matrix< Real, Device, Index, RealAllocator > * * \tparam Function is type of lambda function that will operate on matrix elements. * It is should have form like - * `function( IndexType rowIdx, IndexType localIdx, IndexType columnIdx, const RealType& value, bool& compute )`. - * The \e localIdx parameter is a rank of the non-zero element in given row. - * If the 'compute' variable is set to false the iteration over the row can + * + * `function( IndexType rowIdx, IndexType localIdx, IndexType columnIdx, const RealType& value, bool& compute )`, + * + * where + * + * \e rowIdx is an index of the matrix row. + * + * \e localIdx parameter is a rank of the non-zero element in given row. It is also, in fact, + * index of the matrix subdiagonal. + * + * \e columnIdx is a column index of the matrx element. + * + * \e value is a reference to the matrix element value. It can be used even for changing the matrix element value. + * + * \e compute is a reference to a boolen variable. If it is set to false the iteration over the row can * be interrupted. * * \param begin defines beginning of the range [begin,end) of rows to be processed. 
diff --git a/src/TNL/Matrices/MultidiagonalMatrix.hpp b/src/TNL/Matrices/MultidiagonalMatrix.hpp index 1adad89ea..b0494ee1f 100644 --- a/src/TNL/Matrices/MultidiagonalMatrix.hpp +++ b/src/TNL/Matrices/MultidiagonalMatrix.hpp @@ -74,9 +74,9 @@ MultidiagonalMatrix( const IndexType columns, const std::initializer_list< ListIndex > diagonalsOffsets, const std::initializer_list< std::initializer_list< ListReal > >& data ) { - Containers::Vector< IndexType, DeviceType, IndexType > shifts( diagonalsOffsets ); - TNL_ASSERT_GT( shifts.getSize(), 0, "Cannot construct multidiagonal matrix with no diagonals shifts." ); - this->setDimensions( data.size(), columns, shifts ); + Containers::Vector< IndexType, DeviceType, IndexType > offsets( diagonalsOffsets ); + TNL_ASSERT_GT( offsets.getSize(), 0, "Cannot construct multidiagonal matrix with no diagonals offsets." ); + this->setDimensions( data.size(), columns, offsets ); this->setElements( data ); } @@ -149,6 +149,37 @@ setDimensions( const IndexType rows, this->view = this->getView(); } +template< typename Real, + typename Device, + typename Index, + ElementsOrganization Organization, + typename RealAllocator, + typename IndexAllocator > + template< typename Vector > +void +MultidiagonalMatrix< Real, Device, Index, Organization, RealAllocator, IndexAllocator >:: +setDiagonalsOffsets( const Vector& diagonalsOffsets ) +{ + TNL_ASSERT_GT( diagonalsOffsets.getSize(), 0, "Cannot construct multidiagonal matrix with no diagonals offsets." 
); + this->setDimensions( this->getRows(), this->getColumns(), diagonalsOffsets ); +} + +template< typename Real, + typename Device, + typename Index, + ElementsOrganization Organization, + typename RealAllocator, + typename IndexAllocator > + template< typename ListIndex > +void +MultidiagonalMatrix< Real, Device, Index, Organization, RealAllocator, IndexAllocator >:: +setDiagonalsOffsets( const std::initializer_list< ListIndex > diagonalsOffsets ) +{ + Containers::Vector< IndexType, DeviceType, IndexType > offsets( diagonalsOffsets ); + TNL_ASSERT_GT( offsets.getSize(), 0, "Cannot construct multidiagonal matrix with no diagonals offsets." ); + this->setDimensions( this->getRows(), this->getColumns(), offsets ); +} + template< typename Real, typename Device, typename Index, diff --git a/src/TNL/Matrices/MultidiagonalMatrixView.h b/src/TNL/Matrices/MultidiagonalMatrixView.h index a058e643e..a26251a3b 100644 --- a/src/TNL/Matrices/MultidiagonalMatrixView.h +++ b/src/TNL/Matrices/MultidiagonalMatrixView.h @@ -427,7 +427,7 @@ class MultidiagonalMatrixView : public MatrixView< Real, Device, Index > /** * \brief Method for performing general reduction on all matrix rows. - * + * * \tparam Fetch is a type of lambda function for data fetch declared as * `fetch( IndexType rowIdx, IndexType& columnIdx, RealType& elementValue ) -> FetchValue`. * The return type of this lambda can be any non void. @@ -436,12 +436,12 @@ class MultidiagonalMatrixView : public MatrixView< Real, Device, Index > * \tparam Keep is a type of lambda function for storing results of reduction in each row. * It is declared as `keep( const IndexType rowIdx, const double& value )`. * \tparam FetchValue is type returned by the Fetch lambda function. - * + * * \param fetch is an instance of lambda function for data fetch. * \param reduce is an instance of lambda function for reduction. * \param keep in an instance of lambda function for storing results. 
* \param zero is zero of given reduction operation also known as idempotent element. - * + * * \par Example * \include Matrices/MultidiagonalMatrix/MultidiagonalMatrixViewExample_allRowsReduction.cpp * \par Output @@ -455,15 +455,27 @@ class MultidiagonalMatrixView : public MatrixView< Real, Device, Index > * * \tparam Function is type of lambda function that will operate on matrix elements. * It is should have form like - * `function( IndexType rowIdx, IndexType localIdx, IndexType columnIdx, const RealType& value, bool& compute )`. - * The \e localIdx parameter is a rank of the non-zero element in given row. - * If the 'compute' variable is set to false the iteration over the row can + * + * `function( IndexType rowIdx, IndexType localIdx, IndexType columnIdx, const RealType& value, bool& compute )`, + * + * where + * + * \e rowIdx is an index of the matrix row. + * + * \e localIdx parameter is a rank of the non-zero element in given row. It is also, in fact, + * index of the matrix subdiagonal. + * + * \e columnIdx is a column index of the matrx element. + * + * \e value is the matrix element value. + * + * \e compute is a reference to a boolen variable. If it is set to false the iteration over the row can * be interrupted. - * + * * \param begin defines beginning of the range [begin,end) of rows to be processed. * \param end defines ending of the range [begin,end) of rows to be processed. * \param function is an instance of the lambda function to be called in each row. - * + * * \par Example * \include Matrices/MultidiagonalMatrix/MultidiagonalMatrixViewExample_forRows.cpp * \par Output @@ -477,15 +489,27 @@ class MultidiagonalMatrixView : public MatrixView< Real, Device, Index > * * \tparam Function is type of lambda function that will operate on matrix elements. * It is should have form like - * `function( IndexType rowIdx, IndexType localIdx, IndexType columnIdx, const RealType& value, bool& compute )`. 
- * The \e localIdx parameter is a rank of the non-zero element in given row. - * If the 'compute' variable is set to false the iteration over the row can - * be interrupted. * + * `function( IndexType rowIdx, IndexType localIdx, IndexType columnIdx, const RealType& value, bool& compute )`, + * + * where + * + * \e rowIdx is an index of the matrix row. + * + * \e localIdx parameter is a rank of the non-zero element in given row. It is also, in fact, + * index of the matrix subdiagonal. + * + * \e columnIdx is a column index of the matrx element. + * + * \e value is a reference to the matrix element value. It can be used even for changing the matrix element value. + * + * \e compute is a reference to a boolen variable. If it is set to false the iteration over the row can + * be interrupted. + * * \param begin defines beginning of the range [begin,end) of rows to be processed. * \param end defines ending of the range [begin,end) of rows to be processed. * \param function is an instance of the lambda function to be called in each row. 
- * + * * \par Example * \include Matrices/MultidiagonalMatrix/MultidiagonalMatrixViewExample_forRows.cpp * \par Output diff --git a/src/UnitTests/Matrices/MultidiagonalMatrixTest.h b/src/UnitTests/Matrices/MultidiagonalMatrixTest.h index dd721dd89..8051f039f 100644 --- a/src/UnitTests/Matrices/MultidiagonalMatrixTest.h +++ b/src/UnitTests/Matrices/MultidiagonalMatrixTest.h @@ -64,6 +64,44 @@ void test_SetDimensions() EXPECT_EQ( m.getColumns(), 8 ); } +template< typename Matrix > +void test_SetDiagonalsOffsets() +{ + using RealType = typename Matrix::RealType; + using DeviceType = typename Matrix::DeviceType; + using IndexType = typename Matrix::IndexType; + using DiagonalsOffsetsType = typename Matrix::DiagonalsOffsetsType; + + const IndexType rows = 9; + const IndexType cols = 8; + const DiagonalsOffsetsType diagonalsOffsets{ -3, -1, 0, 2, 4 }; + + Matrix m; + m.setDimensions( rows, cols ); + m.setDiagonalsOffsets( diagonalsOffsets ); + + EXPECT_EQ( m.getRows(), 9 ); + EXPECT_EQ( m.getColumns(), 8 ); +} + +template< typename Matrix > +void test_SetDiagonalsOffsets_initalizer_list() +{ + using RealType = typename Matrix::RealType; + using DeviceType = typename Matrix::DeviceType; + using IndexType = typename Matrix::IndexType; + using DiagonalsOffsetsType = typename Matrix::DiagonalsOffsetsType; + + const IndexType rows = 9; + const IndexType cols = 8; + + Matrix m; + m.setDimensions( rows, cols ); + m.setDiagonalsOffsets( { -3, -1, 0, 2, 4 } ); + + EXPECT_EQ( m.getRows(), 9 ); + EXPECT_EQ( m.getColumns(), 8 ); +} template< typename Matrix1, typename Matrix2 > void test_SetLike() -- GitLab From 983508a7999f3146dc5b547e6f80eb976d0b1c80 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= Date: Fri, 8 Jan 2021 17:58:36 +0100 Subject: [PATCH 13/53] Writting documentation on tridiagonal and multidiagonal matrices. 
--- ...ltidiagonalMatrixExample_rowsReduction.cpp | 2 +- .../MultidiagonalMatrixViewExample_getRow.cpp | 5 +- ...MultidiagonalMatrixExample_Constructor.cpp | 1 + ...lMatrixExample_Constructor_init_list_1.cpp | 1 + .../Tutorials/Matrices/tutorial_Matrices.md | 408 +++++++++++++++++- 5 files changed, 405 insertions(+), 12 deletions(-) create mode 120000 Documentation/Tutorials/Matrices/MultidiagonalMatrixExample_Constructor.cpp create mode 120000 Documentation/Tutorials/Matrices/MultidiagonalMatrixExample_Constructor_init_list_1.cpp diff --git a/Documentation/Examples/Matrices/MultidiagonalMatrix/MultidiagonalMatrixExample_rowsReduction.cpp b/Documentation/Examples/Matrices/MultidiagonalMatrix/MultidiagonalMatrixExample_rowsReduction.cpp index dc3d40483..2b579d963 100644 --- a/Documentation/Examples/Matrices/MultidiagonalMatrix/MultidiagonalMatrixExample_rowsReduction.cpp +++ b/Documentation/Examples/Matrices/MultidiagonalMatrix/MultidiagonalMatrixExample_rowsReduction.cpp @@ -48,7 +48,7 @@ void rowsReduction() /*** * Reduce lambda return maximum of given values. */ - auto reduce = [=] __cuda_callable__ ( double& a, const double& b ) -> double { + auto reduce = [=] __cuda_callable__ ( const double& a, const double& b ) -> double { return TNL::max( a, b ); }; diff --git a/Documentation/Examples/Matrices/MultidiagonalMatrix/MultidiagonalMatrixViewExample_getRow.cpp b/Documentation/Examples/Matrices/MultidiagonalMatrix/MultidiagonalMatrixViewExample_getRow.cpp index ac322f9aa..db0872b10 100644 --- a/Documentation/Examples/Matrices/MultidiagonalMatrix/MultidiagonalMatrixViewExample_getRow.cpp +++ b/Documentation/Examples/Matrices/MultidiagonalMatrix/MultidiagonalMatrixViewExample_getRow.cpp @@ -8,14 +8,11 @@ template< typename Device > void getRowExample() { const int matrixSize( 5 ); - auto diagonalsOffsets = { -1, 0, 1 }; // Variadic templates in SharedPointer - // constructor do not recognize initializer - // list so we give it a hint. 
using MatrixType = TNL::Matrices::MultidiagonalMatrix< double, Device >; MatrixType matrix( matrixSize, // number of matrix rows matrixSize, // number of matrix columns - diagonalsOffsets ); + { -1, 0, 1 } ); auto view = matrix.getView(); auto f = [=] __cuda_callable__ ( int rowIdx ) mutable { diff --git a/Documentation/Tutorials/Matrices/MultidiagonalMatrixExample_Constructor.cpp b/Documentation/Tutorials/Matrices/MultidiagonalMatrixExample_Constructor.cpp new file mode 120000 index 000000000..da7690427 --- /dev/null +++ b/Documentation/Tutorials/Matrices/MultidiagonalMatrixExample_Constructor.cpp @@ -0,0 +1 @@ +../../Examples/Matrices/MultidiagonalMatrix/MultidiagonalMatrixExample_Constructor.cpp \ No newline at end of file diff --git a/Documentation/Tutorials/Matrices/MultidiagonalMatrixExample_Constructor_init_list_1.cpp b/Documentation/Tutorials/Matrices/MultidiagonalMatrixExample_Constructor_init_list_1.cpp new file mode 120000 index 000000000..1e5ca52b0 --- /dev/null +++ b/Documentation/Tutorials/Matrices/MultidiagonalMatrixExample_Constructor_init_list_1.cpp @@ -0,0 +1 @@ +../../Examples/Matrices/MultidiagonalMatrix/MultidiagonalMatrixExample_Constructor_init_list_1.cpp \ No newline at end of file diff --git a/Documentation/Tutorials/Matrices/tutorial_Matrices.md b/Documentation/Tutorials/Matrices/tutorial_Matrices.md index 1a630cd2d..760325a58 100644 --- a/Documentation/Tutorials/Matrices/tutorial_Matrices.md +++ b/Documentation/Tutorials/Matrices/tutorial_Matrices.md @@ -596,11 +596,11 @@ Similar way of the tridiagonal matrix setup is offered by the method `setElement \includelineno TridiagonalMatrixExample_setElements.cpp -Here we create the matrix in two steps. Firstly, we setup the matrix dimensions by the appropriate constructor (line 24) and after that we setup the matrix elemets (line 25-45). The result looks the same as in the previous example: +Here we create the matrix in two steps. 
Firstly, we setup the matrix dimensions by the appropriate constructor (line 24) and after that we setup the matrix elements (line 25-45). The result looks the same as in the previous example: \includelineno TridiagonalMatrixExample_setElements.out -In the following example we create tridiagonal matrix with 5 rows and 5 columns (line 12-14) by the means of a shared pointer (\ref TNL::Pointers::SharedPointer) to make this work even on GPU. We set numbers 0,...,4 on the diagonal (line 16) and we print the matrix (line 18). Next we use a lambda function (lines 21-27) combined with parallel for (\ref TNL::Alfgorithms::ParallelFor) (line 35), to modify the matrix. The offdiagonal elements are set to 1 (lines 23 and 26) and for the diagonal elements, we change the sign (line 24). +In the following example we create tridiagonal matrix with 5 rows and 5 columns (line 12-14) by the means of a shared pointer (\ref TNL::Pointers::SharedPointer) to make this work even on GPU. We set numbers 0,...,4 on the diagonal (line 16) and we print the matrix (line 18). Next we use a lambda function (lines 21-27) combined with parallel for (\ref TNL::Algorithms::ParallelFor) (line 35), to modify the matrix. The offdiagonal elements are set to 1 (lines 23 and 26) and for the diagonal elements, we change the sign (line 24). \includelineno TridiagonalMatrixExample_setElement.cpp @@ -710,9 +710,9 @@ and it accepts the following parameters: * `matrixMultiplicator` is a number by which the result of matrix-vector product is multiplied. * `outVectorMultiplicator` is a number by which the output vector is multiplied before added to the result of matrix-vector product. * `begin` is an index of the first matrix row that is involved in the multiplication. It is zero be default. -* `end` is an index of the last matrix row that is involved in the multiplication. It is the last matrix row by default. 
+* `end` is an index indicating the last matrix row that is involved in the multiplication which is `end - 1`. It is the number of matrix rows. -Note that the ouput vector dimension must be the same as the number of matrix rows no matter how we set `begin` and `end` parameters. These parameters just say that some matrix rows and the output vector elements are omitted. +Note that the output vector dimension must be the same as the number of matrix rows no matter how we set `begin` and `end` parameters. These parameters just say that some matrix rows and the output vector elements are omitted. ### Tridiagonal matrix IO @@ -720,14 +720,14 @@ The tridiagonal matrix can be saved to a file using a method `save` (\ref TNL::M ### Tridiagonal matrix view -Similar to dense and sparse matrix view, tridiagonal matrix also offers its view for easier use with lambda functions. It represented by a templated class \ref TNL::Matrices::TridiagonalMatrixView with the following template parameters: +Similar to dense and sparse matrix view, tridiagonal matrix also offers its view for easier use with lambda functions. It is represented by a templated class \ref TNL::Matrices::TridiagonalMatrixView with the following template parameters: * `Real` is a type of matrix elements. * `Device` is a device on which the matrix is allocated. This can be \ref TNL::Devices::Host or \ref TNL::Devices::Cuda. * `Index` is a type for indexing the matrix elements and also row and column indexes. * `Organization` tells the ordering of matrix elements in memory. It is either RowMajorOrder or ColumnMajorOrder. -The first main reason for using the dense matrix view is its ability to be captured by lambda functions since the copy constructor makes only shallow copy. We can demonstrate it on the example showing the method `setElement` (\ref TNL::Matrices::TridiagonalMatrix::setElement). 
The code looks as follows: +The first main reason for using the matrix view is its ability to be captured by lambda functions since the copy constructor makes only shallow copy. We can demonstrate it on the example showing the method `setElement` (\ref TNL::Matrices::TridiagonalMatrix::setElement). The code looks as follows: \includelineno TridiagonalMatrixViewExample_setElement.cpp @@ -746,7 +746,401 @@ So for the sake of using a matrix in lambda functions, the matrix view is better As we mentioned already, the tridiagonal matrix view offers almost all methods which the tridiagonal matrix does. So it can be easily used at almost any situation the same way as the tridiagonal matrix itself. - ## Multidiagonal matrices +Multidiagonal matrices are generalization of the tridiagonal matrix. It is a special type of sparse matrices with specific pattern of the nonzero matrix elements which are positioned only parallel along diagonal. See the following example: + +\f[ + \left( + \begin{array}{ccccccc} + 4 & -1 & . & -1 & . & . \\ + -1 & 4 & -1 & . & -1 & . \\ + . & -1 & 4 & -1 & . & -1 \\ + -1 & . & -1 & 4 & -1 & . \\ + . & -1 & . & -1 & 4 & -1 \\ + . & . & -1 & . & -1 & 4 + \end{array} + \right) + \f] + + We can see that the matrix elements lay on lines parallel to the main diagonal. Such lines can be expressed by their offsets from the main diagonal. On the following figure, each such line is depicted in different color: + + \f[ +\begin{array}{ccc} +\color{green}{-3} & . & \color{cyan}{-1} \\ +\hline + \color{green}{*} & . & \color{cyan}{*} \\ + . & \color{green}{*} & . \\ + . & . & \color{green}{*} \\ + . & . & . \\ + . & . & . \\ + . & . & . +\end{array} +\left( + \begin{array}{ccccccc} + \color{blue}{0} & \color{magenta}{1} & . & \color{red}{3} & . & . \\ + \hline + \color{blue}{4} & \color{magenta}{-1} & . & \color{red}{-1} & . & . \\ + \color{cyan}{-1} & \color{blue}{4} & \color{magenta}{-1} & . & \color{red}{-1} & . \\ + . 
& \color{cyan}{-1} & \color{blue}{4} & \color{magenta}{-1} & . & \color{red}{-1} \\ + \color{green}{-1} & . & \color{cyan}{-1} & \color{blue}{4} & \color{magenta}{-1} & . \\ + . & \color{green}{-1} & . & \color{cyan}{-1} & \color{blue}{4} & \color{magenta}{-1} \\ + . & . & \color{green}{-1} & . & \color{cyan}{-1} & \color{blue}{4} + \end{array} + \right) + \f] + + In this matrix, the offsets reads as \f$\{-3, -1, 0, +1, +3\}\f$. It also means that the column indexes on \f$i-\f$th row are \f$\{i-3, i-1, i, i+1, i+3\}\f$ (where the resulting index is non-negative and smaller than the number of matrix columns). An advantage is that, similar to the tridiagonal matrix (\ref TNL::Matrices::TridiagonalMatrix), we do not store the column indexes explicitly as it is in \ref SparseMatrix. This can reduce significantly the memory requirements which also means better performance. See the following table for the storage requirements comparison between \ref TNL::Matrices::MultidiagonalMatrix and \ref TNL::Matrices::SparseMatrix. + + Real | Index | SparseMatrix | MultidiagonalMatrix | Ratio + --------|-----------|----------------------|---------------------|-------- + float | 32-bit int| 8 bytes per element | 4 bytes per element | 50% + double | 32-bit int| 12 bytes per element | 8 bytes per element | 75% + float | 64-bit int| 12 bytes per element | 4 bytes per element | 30% + double | 64-bit int| 16 bytes per element | 8 bytes per element | 50% + +Multidiagonal matrix is a templated class defined in the namespace \ref TNL::Matrices. It has six template parameters: + +* `Real` is a type of the matrix elements. It is `double` by default. +* `Device` is a device where the matrix shall be allocated. Currently it can be either \ref TNL::Devices::Host for CPU or \ref TNL::Devices::Cuda for GPU supporting CUDA. It is \ref TNL::Devices::Host by default. +* `Index` is a type to be used for indexing of the matrix elements. It is `int` by default. 
+* `ElementsOrganization` defines the organization of the matrix elements in memory. It can be \ref TNL::Algorithms::Segments::ColumnMajorOrder or \ref TNL::Algorithms::Segments::RowMajorOrder for column-major and row-major organization respectively. Be default it is the row-major order if the matrix is allocated in the host system and column major order if it is allocated on GPU. +* `RealAllocator` is a memory allocator (one from \ref TNL::Allocators) which shall be used for allocation of the matrix elements. By default, it is the default allocator for given `Real` type and `Device` type -- see \ref TNL::Allocators::Default. +* `IndexAllocator` is a memory allocator (one from \ref TNL::Allocators) which shall be used for allocation of the matrix elements offsets. By default, it is the default allocator for given `Index` type and `Device` type -- see \ref TNL::Allocators::Default. + +### Multidiagonal matrix allocation and initiation + +The construction of the multidiagonal matrix differs from the tridiagonal mainly in necessity to define the offsets of "subdiagonals" as we demonstrate on the following example which creates matrix like of the following form: + +\f[ +\left( +\begin{array}{cccccccccccccccc} +1 & . & & & . & & & & & & & & & & & \\ +. & 1 & . & & & . & & & & & & & & & & \\ + & . & 1 & . & & & . & & & & & & & & & \\ + & & . & 1 & . & & & . & & & & & & & & \\ +. & & & . & 1 & . & & & . & & & & & & & \\ + & -1 & & & -1 & 1 & -1 & & & -1 & & & & & & \\ + & & -1 & & & -1 & 1 & -1 & & & -1 & & & & & \\ + & & & . & & & . & 1 & . & & & . & & & & \\ + & & & & . & & & . & 1 & . & & & . & & & \\ + & & & & & -1 & & & -1 & 1 & -1 & & & -1 & & \\ + & & & & & & -1 & & & -1 & 1 & -1 & & & -1 & \\ + & & & & & & & . & & & . & 1 & . & & & . \\ + & & & & & & & & . & & & . & 1 & . & & \\ + & & & & & & & & & . & & & . & 1 & . & \\ + & & & & & & & & & & . & & & . & 1 & . \\ + & & & & & & & & & & & . & & & . 
& 1
+\end{array}
+\right)
+\f]
+
+The code reads as:
+
+\includelineno MultidiagonalMatrixExample_Constructor.cpp
+
+The matrix from this example arises from a discretization of the [Laplace operator in 2D by the finite difference method](https://en.wikipedia.org/wiki/Discrete_Poisson_equation). We use this example because it is a very frequent numerical problem. If the reader, however, is not familiar with the finite difference method, please, do not be scared, we will just create the matrix mentioned above.
+
+We firstly compute the matrix size (`matrixSize`) based on the numerical grid dimensions on the line 16. The subdiagonals offsets are defined by the numerical grid size and since it is four in this example the offsets read as \f$\left\{-4,-1,0,1,4 \right\} \f$ or `{ -gridSize, -1, 0, 1, gridSize}` (line 17). Here we store the offsets (referred to as `shifts`) in a vector (\ref TNL::Containers::Vector). Next we use a constructor with matrix dimensions and offsets passed via TNL vector (line 18). Next we fetch the matrix view (line 19) (see [Multidiagonal matrix view](#multidiagonal_matrix_view)).
+
+The matrix is constructed by iterating over particular nodes of the numerical grid. Each node corresponds to one matrix row. This is why the lambda function `f` (lines 20-35) takes two indexes `i` and `j` (line 20). Their values are coordinates of the two-dimensional numerical grid. Based on these coordinates we compute the index (`elementIdx`) of the corresponding matrix row (line 21). We fetch the matrix row (`row`) by calling the `getRow` method (\ref TNL::Matrices::MultidiagonalMatrix::getRow) (line 22). Depending on the grid node coordinates we set either the boundary conditions (lines 23-26) for the boundary nodes (those lying on the boundary of the grid and so their coordinates fulfil the condition `i == 0 || j == 0 || i == gridSize - 1 || j == gridSize - 1` ) for which we set only the diagonal element to 1.
The inner nodes of the numerical grid are handled on the lines 29-33 where we set coefficients approximating the Laplace operator. We use the method `setElement` of the matrix row (\ref TNL::Matrices::MultidiagonalMatrixRow::setElement) which takes the local index of the nonzero matrix element as the first parametr and the new value of the element as the second parameter. The local indexes, in fact, refer to particular subdiagonals as depicted on the following figure (in blue): + +\f[ +\begin{array}{cccc} +\color{blue}{-4} & & & \color{blue}{-1} \\ +\hline +. & & & . \\ + & . & & \\ + & & . & \\ + & & & . \\ + & & & \\ + & & & \\ + & & & \\ + & & & \\ + & & & \\ + & & & \\ + & & & \\ + & & & \\ + & & & \\ + & & & \\ + & & & \\ + & & & +\end{array} +\left( +\begin{array}{cccccccccccccccc} +\color{blue}{0} & \color{blue}{1} & & & \color{blue}{4} & & & & & & & & & & & \\ +\hline +1 & . & & & . & & & & & & & & & & & \\ +. & 1 & . & & & . & & & & & & & & & & \\ + & . & 1 & . & & & . & & & & & & & & & \\ + & & . & 1 & . & & & . & & & & & & & & \\ +. & & & . & 1 & . & & & . & & & & & & & \\ + & -1 & & & -1 & 1 & -1 & & & -1 & & & & & & \\ + & & -1 & & & -1 & 1 & -1 & & & -1 & & & & & \\ + & & & . & & & . & 1 & . & & & . & & & & \\ + & & & & . & & & . & 1 & . & & & . & & & \\ + & & & & & -1 & & & -1 & 1 & -1 & & & -1 & & \\ + & & & & & & -1 & & & -1 & 1 & -1 & & & -1 & \\ + & & & & & & & . & & & . & 1 & . & & & . \\ + & & & & & & & & . & & & . & 1 & . & & \\ + & & & & & & & & & . & & & . & 1 & . & \\ + & & & & & & & & & & . & & & . & 1 & . \\ + & & & & & & & & & & & . & & & . & 1 +\end{array} +\right) +\f] + +We use `ParallelFor2D` (\ref TNL::Algorithms::ParallelFor2D) to iterate over all nodes of the numerical grid (line 36) and apply the lambda function. 
Also note that for the sake of better memory alignemnt and faster acces to the matrix elements, we store all subdiagonals in complete form including the elemenets which are outside the matrix as depicted on the following figure where zeros stand for the padding artificial zero matrix elements + +\f[ +\begin{array}{cccc} +0 & & & 0 \\ + & 0 & & \\ + & & 0 & \\ + & & & 0 \\ + & & & \\ + & & & \\ + & & & \\ + & & & \\ + & & & \\ + & & & \\ + & & & \\ + & & & \\ + & & & \\ + & & & \\ + & & & \\ + & & & +\end{array} +\left( +\begin{array}{cccccccccccccccc} +1 & 0 & & & 0 & & & & & & & & & & & \\ +0 & 1 & 0 & & & 0 & & & & & & & & & & \\ + & 0 & 1 & 0 & & & 0 & & & & & & & & & \\ + & & 0 & 1 & 0 & & & 0 & & & & & & & & \\ +0 & & & 0 & 1 & 0 & & & 0 & & & & & & & \\ + & -1 & & & -1 & 1 & -1 & & & -1 & & & & & & \\ + & & -1 & & & -1 & 1 & -1 & & & -1 & & & & & \\ + & & & 0 & & & 0 & 1 & 0 & & & 0 & & & & \\ + & & & & 0 & & & 0 & 1 & 0 & & & 0 & & & \\ + & & & & & -1 & & & -1 & 1 & -1 & & & -1 & & \\ + & & & & & & -1 & & & -1 & 1 & -1 & & & -1 & \\ + & & & & & & & 0 & & & 0 & 1 & 0 & & & 0 \\ + & & & & & & & & 0 & & & 0 & 1 & 0 & & \\ + & & & & & & & & & 0 & & & 0 & 1 & 0 & \\ + & & & & & & & & & & 0 & & & 0 & 1 & 0 \\ + & & & & & & & & & & & 0 & & & 0 & 1 +\end{array} +\right) +\begin{array} + & & & \\ + & & & \\ + & & & \\ + & & & \\ + & & & \\ + & & & \\ + & & & \\ + & & & \\ + & & & \\ + & & & \\ + & & & \\ + & & & \\ +0 & & & \\ + & 0 & & \\ + & & 0 & \\ +0 & & & 0 +\end{array} +\f] + + +The result looks as follows: + +\includelineno MultidiagonalMatrixExample_Constructor.out + +Slightly simpler way of doing the same is by using the constructor of multidiagonal matrix taking the subdiagonals offsets as an STL initializer list: + +\includelineno MultidiagonalMatrixExample_Constructor_init_list_1.cpp + +The only change is on the line 17 which reads as + +``` +TNL::Matrices::MultidiagonalMatrix< double, Device > matrix( matrixSize, matrixSize, { - gridSize, -1, 0, 1, 
gridSize } );
+```
+
+Here we call the mentioned constructor, which accepts the matrix dimensions (number of rows and columns) as the first two parameters and the initializer list with the subdiagonal offsets as the last one. The result looks the same as in the previous example.
+
+There is also a constructor with initializer list for matrix elements values as demonstrated by the following example:
+
+\includelineno MultidiagonalMatrixExample_Constructor_init_list_2.cpp
+
+Here, we create a matrix which looks as
+
+\f[
+\left(
+\begin{array}{cccccc}
+4 & -1 & & -1 & & \\
+-1 & 4 & -1 & & -1 & \\
+ & -1 & 4 & -1 & & -1 \\
+-1 & & -1 & 4 & -1 & \\
+ & -1 & & -1 & 4 & -1 \\
+ & & -1 & & -1 & 4 \\
+\end{array}
+\right).
+\f]
+
+On the lines 25-46, we call the constructor which, in addition to matrix dimensions and subdiagonals offsets, accepts also an initializer list of initializer lists with matrix elements values. Each embedded list corresponds to one matrix row and it contains values of matrix elements on particular subdiagonals including those which lie out of the matrix. The result looks as follows:
+
+\includelineno MultidiagonalMatrixExample_Constructor_init_list_2.out
+
+The matrix elements values can be changed the same way using the method `setElements` (\ref TNL::Matrices::MultidiagonalMatrix::setElements) which accepts the elements values in the same form of embedded initializer list. It just does not allow changing the subdiagonals offsets. For this purpose the method `setDiagonalsOffsets` (\ref TNL::Matrices::MultidiagonalMatrix::setDiagonalsOffsets) can be used. Note, however, that this method deletes all current matrix elements.
+
+Another way of setting the matrix elements is by means of the method `setElement` (\ref TNL::Matrices::MultidiagonalMatrix::setElement).
It works the same way as with other matrix types as we can see in the following example:
+
+\includelineno MultidiagonalMatrixExample_setElement.cpp
+
+This example shows that the method `setElement` can be used both on the host (CPU) (line 17) as well as in the GPU kernels (lines 23-27). Here we use a shared pointer (\ref TNL::Pointers::SharedPointer) (line 15) to pass the multidiagonal matrix to the lambda function `f` (lines 22-28) which may run on GPU. In this case we have to synchronize the shared pointer explicitly by calling the function \ref TNL::Pointers::synchronizeSmartPointersOnDevice. To avoid this inconvenience the same can be achieved with the multidiagonal matrix view:
+
+\includelineno MultidiagonalMatrixViewExample_setElement.cpp
+
+In this example, we fetch the matrix view (line 16) immediately after creating the matrix itself (line 15). Note that the matrix view can be obtained from the matrix at any time while the shared pointer only at the time of the matrix creation. On the other hand, if the original matrix is changed, all matrix views become invalid which is not true for the shared pointers. So it is better to fetch the matrix view immediately before we use it to avoid the situation that you would use an invalid matrix view. The method `setElement` (\ref TNL::Matrices::MultidiagonalMatrixView::setElement) can be used on both the host (CPU) (line 19) and the device (lines 25-29) if the lambda function `f` (lines 24-30) runs in a GPU kernel. The result of both examples looks the same:
+
+\includelineno MultidiagonalMatrixViewExample_setElement.out
+
+Another way for setting the matrix elements is by means of the multidiagonal matrix row:
+
+\includelineno MultidiagonalMatrixViewExample_getRow.cpp
+
+Here we use the matrix view again (line 19) and in the lambda function `f` which serves for the matrix elements setting, we fetch the matrix row just at the beginning (line 22).
Next we use the method `setElement` (\ref TNL::Matrices::MultidiagonalMatrixRow::setElement) which accepts two parameters. The first is the local index of the matrix element which in case of the multidiagonal matrix agrees with index of the subdiagonal as demonstrated on this figure which shows just the matrix we are creating in this example (the subdiagonal indexes are depicted in blue color): + +\f[ +\begin{array}{c} +\color{blue}{0} \\ +\hline +* \\ + \\ + \\ + \\ +~ +\end{array} +\left( +\begin{array}{ccccc} + \color{blue}{1} & \color{blue}{2} & & & \\ + \hline +2 & -1 & & & \\ +-1 & 2 & -1 & & \\ + & -1 & 2 & -1 & \\ + & & -1 & 2 & -1 \\ + & & & -1 & 2 +\end{array} +\right) +\f] + +The second parameter of the method `setElement` is the new matrix elements value. An adventage of this method is that it can acces the matrix elements faster. The output of this example looks as follows: + +\includelineno MultidiagonalMatrixViewExample_getRow.out + +Similar and even a bit simpler way of setting the matrix elements is offered by the method `forRows` (\ref TNL::Matrices::MultidiagonalMatrix::forRows, \ref TNL::Matrices::MultidiagonalMatrixView::forRows) as demonstrated in the following example: + +\includelineno MultidiagonalMatrixViewExample_forRows.cpp + +In this case, we need to provide a lambda function `f` (lines 27-43) which is called for each matrix row just by the method `forRows` (line 44). The lambda function `f` provides the following parameters + +* `rowIdx` is an index iof the matrix row. +* `localIdx` is in index of the matrix subdiagonal. +* `columnIdx` is a column index of the matrix element. +* `value` is a reference to the matrix element value. It can be used even for changing the value. +* `compute` is a reference to boolean. If it is set to false, the iteration over the matrix row can be stopped. + +In this example, the matrix element value depends only on the subdiagonal index `localIdx` as we can see on the line 42. 
The result looks as follows:
+
+\includelineno MultidiagonalMatrixExample_forRows.out
+
+### Flexible reduction in matrix rows
+
+The flexible parallel reduction in rows for multidiagonal matrices works the same way as for other matrix types. It consists of three lambda functions:
+
+1. `fetch` reads and preprocesses data entering the flexible parallel reduction.
+2. `reduce` performs the reduction operation.
+3. `keep` stores the results from each matrix row.
+
+See the following example:
+
+\includelineno MultidiagonalMatrixExample_rowsReduction.cpp
+
+On the lines 10-29, we first create the following matrix
+
+\f[
+\left(
+\begin{array}{ccccc}
+1 & & & & \\
+2 & 1 & & & \\
+3 & 2 & 1 & & \\
+ & 3 & 2 & 1 & \\
+ & & 3 & 2 & 1
+\end{array}
+\right)
+\f]
+
+and we aim to compute the maximal value in each row. We first create the vector `rowMax` into which we will store the results and fetch its view `rowMaxView` (line 39). Next we prepare the necessary lambda functions:
+
+* `fetch` (lines 44-46) is responsible for reading the matrix element value which is stored in the constant reference `value` and for returning its absolute value. The other parameters `rowIdx` and `columnIdx` correspond to row and column indexes respectively and they are omitted in our example.
+* `reduce` (lines 51-53) returns the maximum value of the two input values `a` and `b`.
+* `keep` (lines 58-60) stores the input `value` at the corresponding position, given by the row index `rowIdx`, in the output vector view `rowMaxView`.
+
+Finally we call the method `rowsReduction` (\ref TNL::Matrices::MultidiagonalMatrix::rowsReduction) with parameters telling the interval of rows to be processed (the first and second parameter), the lambda functions `fetch`, `reduce` and `keep`, and the identity element for the reduction operation which is the lowest number of given type (\ref std::numeric_limits< double >::lowest ).
The result looks as follows:
+
+\includelineno MultidiagonalMatrixExample_rowsReduction.out
+
+### Multidiagonal matrix-vector product
+
+Similar to other matrix types, matrix-vector multiplication is represented by the method `vectorProduct` (\ref TNL::Matrices::MultidiagonalMatrix::vectorProduct). It is a templated method with two template parameters `InVector` and `OutVector` telling the types of the input and output vector respectively. Usually one will substitute some of \ref TNL::Containers::Array, \ref TNL::Containers::ArrayView, \ref TNL::Containers::Vector or \ref TNL::Containers::VectorView for these types. The method computes the following formula
+
+```
+outVector = matrixMultiplicator * ( *this ) * inVector + outVectorMultiplicator * outVector
+```
+
+and it accepts the following parameters:
+
+* `inVector` is the input vector having the same number of elements as the number of matrix columns.
+* `outVector` is the output vector having the same number of elements as the number of matrix rows.
+* `matrixMultiplicator` is a number by which the result of matrix-vector product is multiplied.
+* `outVectorMultiplicator` is a number by which the output vector is multiplied before it is added to the result of matrix-vector product.
+* `begin` is an index of the first matrix row that is involved in the multiplication. It is zero by default.
+* `end` is an index of the matrix row where the multiplication ends, i.e. the last row involved in the multiplication is `end - 1`. By default it equals the number of matrix rows.
+
+Note that the output vector dimension must be the same as the number of matrix rows no matter how we set `begin` and `end` parameters. These parameters just say that some matrix rows and the output vector elements are omitted.
+
+### Multidiagonal matrix IO
+
+The multidiagonal matrix can be saved to a file using a method `save` (\ref TNL::Matrices::MultidiagonalMatrix::save) and restored with a method `load` (\ref TNL::Matrices::MultidiagonalMatrix::load).
For printing the matrix, there is a method `print` (\ref TNL::Matrices::MultidiagonalMatrix::print) can be used. + +### Multidiagonal matrix view + +Multidiagonal matrix also offers its view for easier use with lambda functions. It is represented by a templated class \ref TNL::Matrices::MultidiagonalMatrixView with the following template parameters: + +* `Real` is a type of matrix elements. +* `Device` is a device on which the matrix is allocated. This can be \ref TNL::Devices::Host or \ref TNL::Devices::Cuda. +* `Index` is a type for indexing the matrix elements and also row and column indexes. +* `Organization` tells the ordering of matrix elements in memory. It is either RowMajorOrder or ColumnMajorOrder. + +The first main reason for using the matrix view is its ability to be captured by lambda functions since the copy constructor makes only shallow copy. We can demonstrate it on the example showing the method `setElement` (\ref TNL::Matrices::MultidiagonalMatrix::setElement). The code looks as follows: + +\includelineno MultidiagonalMatrixViewExample_setElement.cpp + +The matrix view is obtained by the method `getView` (\ref TNL::Matrices::MultidiagonalMatrix::getView) on the line 13. We firsrt show, that the view can be used the same way as common matrix (lines 14 and 15) but it can be used the same way even in lambda functions as we can see on the lines 20-26. Compare it with the same example using shared pointer instead of the matrix view: + +\includelineno MultidiagonalMatrixExample_setElement.cpp + +The main disadventages are: + +1. The shared pointer must be created together with the matrix (line 14) and there is no way to get it later. The matrix view can be obtained from any matrix at any time. +2. We have to synchronize shared pointers explicitly by calling the function \ref TNL::Pointers::synchronizeSmartPointersOnDevice (line 34). + +So for the sake of using a matrix in lambda functions, the matrix view is better tool. 
The result of both examples looks as: + +\include MultidiagonalMatrixExample_setElement.out + +As we mentioned already, the multidiagonal matrix view offers almost all methods which the multidiagonal matrix does. So it can be easily used at almost any situation the same way as the multidiagonal matrix itself. + +TODO: Move to explanation of the matrix view to introduction. + ## Lambda matrices -- GitLab From 18779e30097186cbd1fc3d4db02b0c08664129fb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= Date: Fri, 8 Jan 2021 22:04:27 +0100 Subject: [PATCH 14/53] Writting tutorials on lambda matrix. --- .../LambdaMatrixExample_rowsReduction.cpp | 2 +- .../Tutorials/Matrices/tutorial_Matrices.md | 110 +++++++++++++++++- src/TNL/Matrices/LambdaMatrix.h | 16 +-- 3 files changed, 118 insertions(+), 10 deletions(-) diff --git a/Documentation/Examples/Matrices/LambdaMatrix/LambdaMatrixExample_rowsReduction.cpp b/Documentation/Examples/Matrices/LambdaMatrix/LambdaMatrixExample_rowsReduction.cpp index 17f3ace0d..4cb0aedab 100644 --- a/Documentation/Examples/Matrices/LambdaMatrix/LambdaMatrixExample_rowsReduction.cpp +++ b/Documentation/Examples/Matrices/LambdaMatrix/LambdaMatrixExample_rowsReduction.cpp @@ -40,7 +40,7 @@ void rowsReduction() /*** * Reduce lambda return maximum of given values. */ - auto reduce = [=] __cuda_callable__ ( double& a, const double& b ) -> double { + auto reduce = [=] __cuda_callable__ ( const double& a, const double& b ) -> double { return TNL::max( a, b ); }; diff --git a/Documentation/Tutorials/Matrices/tutorial_Matrices.md b/Documentation/Tutorials/Matrices/tutorial_Matrices.md index 760325a58..d6b15c76c 100644 --- a/Documentation/Tutorials/Matrices/tutorial_Matrices.md +++ b/Documentation/Tutorials/Matrices/tutorial_Matrices.md @@ -1137,10 +1137,118 @@ The main disadventages are: So for the sake of using a matrix in lambda functions, the matrix view is better tool. 
The result of both examples looks as:
 
-\include MultidiagonalMatrixExample_setElement.out
+\includelineno MultidiagonalMatrixExample_setElement.out
 
 As we mentioned already, the multidiagonal matrix view offers almost all methods which the multidiagonal matrix does. So it can be easily used at almost any situation the same way as the multidiagonal matrix itself.
 
 TODO: Move to explanation of the matrix view to introduction.
 
 ## Lambda matrices
+
+Lambda matrix (\ref TNL::Matrices::LambdaMatrix) is a special type of matrix which could be also called ***matrix-free matrix***. Its elements are not stored in memory explicitly but they are evaluated on-the-fly by means of user defined lambda functions. If the matrix elements can be expressed by a computationally inexpensive formula, we can significantly reduce the memory consumption which can be appreciated especially on GPU. Since the memory accesses are quite expensive even on CPU, we can get, at the end, even much faster code.
+
+The lambda matrix (\ref TNL::Matrices::LambdaMatrix) is a templated class with the following template parameters:
+
+* `MatrixElementsLambda` is a lambda function which evaluates the matrix elements values and column indexes.
+* `CompressedRowLengthsLambda` is a lambda function telling how many nonzero elements there are in given matrix row.
+* `Real` is a type of the matrix elements values.
+* `Device` is a device on which the lambda functions mentioned above will be evaluated.
+* `Index` is a type to be used for indexing.
+
+The lambda function `MatrixElementsLambda` is supposed to have the following declaration:
+
+```
+matrixElements( Index rows,
+                Index columns,
+                Index row,
+                Index localIdx,
+                Index& columnIdx,
+                Real& value )
+```
+where the particular parameters have the following meaning:
+
+* `rows` tells the number of matrix rows.
+* `columns` tells the number of matrix columns.
+* `row` is the index of the matrix row in which we are supposed to evaluate the matrix element.
+* `localIdx` is a rank of the nonzero matrix element.
+* `columnIdx` is a reference on variable where we are supposed to store the matrix element column index.
+* `value` is a reference on variable where we are supposed to store the matrix element value.
+
+The lambda function `CompressedRowLengthsLambda` is supposed to look like this:
+
+```
+rowLengths( Index rows,
+            Index columns,
+            Index rowIdx ) -> Index
+```
+
+where the parameters can be described as follows:
+
+* `rows` tells the number of matrix rows.
+* `columns` tells the number of matrix columns.
+* `rowIdx` is index of the matrix row for which we are supposed to evaluate the number of nonzero matrix elements.
+
+The lambda function is supposed to return just the number of the nonzero matrix elements in given matrix row.
+
+### Lambda matrix initiation
+
+See the following example which demonstrates how to create the lambda matrix:
+
+\includelineno LambdaMatrixExample_Constructor.cpp
+
+Here we create two simple diagonal matrices. Therefore they share the same lambda function `rowLengths` telling the number of nonzero matrix elements in particular matrix rows which is always one (line 9). The first matrix, defined by the lambda function `matrixElements1`, is the identity matrix and so each of its diagonal elements equals one. We set the matrix element value to `1.0` (line 12) and the column index equals the row index (line 15). The second matrix, defined by the lambda function `matrixElements2`, is also diagonal but not the identity matrix. The values of the diagonal elements equal the row index (line 16).
+
+With the same lambda functions we can define matrices with different dimensions. In this example, we set the matrix size to five (line 19). It can be quite difficult to express the lambda matrix type because it depends on the types of the lambda functions. To make this easier, one may use a lambda-matrix factory (\ref TNL::Matrices::LambdaMatrixFactory). 
Using `decltype` one can deduce even the matrix type (line 24) followed by calling lambda matrix constructor with matrix dimensions and instances of the lambda functions (line 25). Or one can just simply employ the keyword `auto` (line 30) followed by setting the matrix dimensions (line 31).
+
+The result looks as follows:
+
+\includelineno LambdaMatrixExample_Constructor.out
+
+Of course, the lambda matrix has the same interface as other matrix types. The following example demonstrates the use of the method `forRows` to copy the lambda matrix into the dense matrix:
+
+\includelineno LambdaMatrixExample_forRows.cpp
+
+Here, we treat the lambda matrix as if it was a dense matrix. The lambda function `rowLengths` returns the number of the nonzero elements equal to the number of matrix columns (line 13). However, the lambda function `matrixElements` (lines 14-17), sets nonzero values only to the lower triangular part of the matrix. The elements in the upper part are equal to zero (line 16). Next we create an instance of the lambda matrix with help of the lambda matrix factory (\ref TNL::Matrices::LambdaMatrixFactory) (lines 19-20) and an instance of the dense matrix (\ref TNL::Matrices::DenseMatrix) (lines 22-23).
+
+Next we call the lambda function `f` by the method `forRows` (\ref TNL::Matrices::LambdaMatrix::forRows) to set the matrix elements of the dense matrix `denseMatrix` (line 26) via the dense matrix view (`denseView`) (\ref TNL::Matrices::DenseMatrixView). Note, that in the lambda function `f` we get the matrix element value already evaluated in the variable `value` as we are used to from other matrix types. So in fact, the same lambda function `f` would do the same job even for sparse matrix or any other. Also note, that in this case we iterate even over all zero matrix elements because the lambda function `rowLengths` (line 13) tells so. 
The result looks as follows:
+
+\includelineno LambdaMatrixExample_forRows.out
+
+### Flexible reduction in matrix rows
+
+The reduction of matrix rows is available for the lambda matrices as well. See the following example:
+
+\includelineno LambdaMatrixExample_rowsReduction.cpp
+
+On the lines 14-21, we create the same lower triangular lambda matrix as in the previous example. As we did it in similar examples for other matrix types, we want to compute maximal absolute value of matrix elements in each row. For this purpose we define well known lambda functions:
+
+* `fetch` takes the value of the lambda matrix element and returns its absolute value.
+* `reduce` computes maximum value of two input variables.
+* `keep` stores the results into output vector `rowMax`.
+
+Note that the interface of the lambda functions is the same as for other matrix types. The result looks as follows:
+
+\includelineno LambdaMatrixExample_rowsReduction.out
+
+### Lambda matrix-vector product
+
+The matrix-vector multiplication is represented by the method `vectorProduct` (\ref TNL::Matrices::LambdaMatrix::vectorProduct). It is a templated method with two template parameters `InVector` and `OutVector` telling the types of the input and output vector respectively. Usually one will substitute some of \ref TNL::Containers::Array, \ref TNL::Containers::ArrayView, \ref TNL::Containers::Vector or \ref TNL::Containers::VectorView for these types. The method computes the following formula
+
+```
+outVector = matrixMultiplicator * ( *this ) * inVector + outVectorMultiplicator * outVector
+```
+
+and it accepts the following parameters:
+
+* `inVector` is the input vector having the same number of elements as the number of matrix columns.
+* `outVector` is the output vector having the same number of elements as the number of matrix rows.
+* `matrixMultiplicator` is a number by which the result of matrix-vector product is multiplied. 
+* `outVectorMultiplicator` is a number by which the output vector is multiplied before it is added to the result of matrix-vector product.
+* `begin` is an index of the first matrix row that is involved in the multiplication. It is zero by default.
+* `end` is an index indicating the last matrix row that is involved in the multiplication which is `end - 1`. It is the number of matrix rows by default.
+
+Note that the output vector dimension must be the same as the number of matrix rows no matter how we set `begin` and `end` parameters. These parameters just say that some matrix rows and the output vector elements are omitted.
+
+### Lambda matrix IO
+
+The lambda matrix can be printed by the means of the method `print` (\ref TNL::Matrices::LambdaMatrix::print). The lambda matrix does not offer the methods `save` and `load` since it does not manage any data. Of course, the lambda function evaluating the matrix elements can use any supporting data containers but it is up to these containers to manage the IO operations.
\ No newline at end of file
diff --git a/src/TNL/Matrices/LambdaMatrix.h b/src/TNL/Matrices/LambdaMatrix.h
index 1692510e7..cfd0a330e 100644
--- a/src/TNL/Matrices/LambdaMatrix.h
+++ b/src/TNL/Matrices/LambdaMatrix.h
@@ -17,7 +17,7 @@ namespace TNL {
 namespace Matrices {
 
 /**
- * \brief "Matrix-free" matrix based on lambda functions.
+ * \brief "Matrix-free matrix" based on lambda functions.
  *
  * The elements of this matrix are not stored explicitly in memory but
  * implicitly on a form of lambda functions. 
@@ -26,22 +26,22 @@ namespace Matrices { * * It has the following form: * - * `matrixElements( IndexType rows, IndexType columns, IndexType row, IndexType localIdx, IndexType& elementColumn, RealType& elementValue )` + * `matrixElements( Index rows, Index columns, Index rowIdx, Index localIdx, Index& columnIdx, Real& value )` * - * where \e rows is the number of matrix rows, \e columns is the number of matrix columns, \e row is the index of matrix row being queried, - * \e localIdx is the rank of the non-zero element in given row, \e elementColumn is a column index of the matrix element computed by - * this lambda and \e elementValue is a value of the matrix element computed by this lambda. + * where \e rows is the number of matrix rows, \e columns is the number of matrix columns, \e rowIdx is the index of matrix row being queried, + * \e localIdx is the rank of the non-zero element in given row, \e columnIdx is a column index of the matrix element computed by + * this lambda and \e value is a value of the matrix element computed by this lambda. * \tparam CompressedRowLengthsLambda is a lambda function returning a number of non-zero elements in each row. * * It has the following form: * - * `rowLengths( IndexType rows, IndexType columns, IndexType row ) -> IndexType` + * `rowLengths( Index rows, Index columns, Index rowIdx ) -> IndexType` * - * where \e rows is the number of matrix rows, \e columns is the number of matrix columns and \e row is an index of the row being queried. + * where \e rows is the number of matrix rows, \e columns is the number of matrix columns and \e rowIdx is an index of the row being queried. * * \tparam Real is a type of matrix elements values. * \tparam Device is a device on which the lambda functions will be evaluated. - * \ẗparam Index is a type used for indexing. + * \ẗparam Index is a type to be used for indexing. 
*/ template< typename MatrixElementsLambda, typename CompressedRowLengthsLambda, -- GitLab From f4f5bf7caf0bc3fbe7f359ac4fb1912645280fc7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= Date: Fri, 8 Jan 2021 22:04:53 +0100 Subject: [PATCH 15/53] Small fixes in documentation of multidiagonal matrix. --- src/TNL/Matrices/MultidiagonalMatrix.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/TNL/Matrices/MultidiagonalMatrix.h b/src/TNL/Matrices/MultidiagonalMatrix.h index 05d834750..6508d10ce 100644 --- a/src/TNL/Matrices/MultidiagonalMatrix.h +++ b/src/TNL/Matrices/MultidiagonalMatrix.h @@ -696,7 +696,7 @@ class MultidiagonalMatrix : public Matrix< Real, Device, Index, RealAllocator > * * where * - * \e rowIdx is an index of the matrix row. + * \e rowIdx is an index of the matrix row. * * \e localIdx parameter is a rank of the non-zero element in given row. It is also, in fact, * index of the matrix subdiagonal. @@ -705,7 +705,7 @@ class MultidiagonalMatrix : public Matrix< Real, Device, Index, RealAllocator > * * \e value is the matrix element value. * - * \e compute is a reference to a boolen variable. If it is set to false the iteration over the row can + * \e compute is a reference to a boolen variable. If it is set to false the iteration over the row can * be interrupted. * * \param begin defines beginning of the range [begin,end) of rows to be processed. @@ -730,16 +730,16 @@ class MultidiagonalMatrix : public Matrix< Real, Device, Index, RealAllocator > * * where * - * \e rowIdx is an index of the matrix row. + * \e rowIdx is an index of the matrix row. * * \e localIdx parameter is a rank of the non-zero element in given row. It is also, in fact, * index of the matrix subdiagonal. * - * \e columnIdx is a column index of the matrx element. + * \e columnIdx is a column index of the matrix element. * * \e value is a reference to the matrix element value. 
It can be used even for changing the matrix element value. * - * \e compute is a reference to a boolen variable. If it is set to false the iteration over the row can + * \e compute is a reference to a boolen variable. If it is set to false the iteration over the row can * be interrupted. * * \param begin defines beginning of the range [begin,end) of rows to be processed. -- GitLab From 9f01bcf18fc94f025edf66975d5dc1a524930bc2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= Date: Sat, 9 Jan 2021 12:51:56 +0100 Subject: [PATCH 16/53] Removed ambiguous declaration of MultidiagonalMatrix::setRowCapavities. --- src/TNL/Matrices/MultidiagonalMatrix.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/TNL/Matrices/MultidiagonalMatrix.h b/src/TNL/Matrices/MultidiagonalMatrix.h index 6508d10ce..797d16a3f 100644 --- a/src/TNL/Matrices/MultidiagonalMatrix.h +++ b/src/TNL/Matrices/MultidiagonalMatrix.h @@ -347,8 +347,6 @@ class MultidiagonalMatrix : public Matrix< Real, Device, Index, RealAllocator > * \return vector with diagonals offsets. */ const DiagonalsOffsetsType& getDiagonalsOffsets() const; - template< typename RowCapacitiesVector > - void setRowCapacities( const RowCapacitiesVector& rowCapacities ); /** * \brief Set matrix elements from an initializer list. -- GitLab From ca921a47a881c874640ea71037e3252ee6494efa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= Date: Sat, 9 Jan 2021 12:52:53 +0100 Subject: [PATCH 17/53] Fixed printing of tridiagonal matrix. 
--- src/TNL/Matrices/TridiagonalMatrixView.hpp | 2 +- .../Matrices/TridiagonalMatrixTest.h | 55 ------------------- 2 files changed, 1 insertion(+), 56 deletions(-) diff --git a/src/TNL/Matrices/TridiagonalMatrixView.hpp b/src/TNL/Matrices/TridiagonalMatrixView.hpp index d920b21d0..d7537c20f 100644 --- a/src/TNL/Matrices/TridiagonalMatrixView.hpp +++ b/src/TNL/Matrices/TridiagonalMatrixView.hpp @@ -695,7 +695,7 @@ void TridiagonalMatrixView< Real, Device, Index, Organization >::print( std::ost { auto v = this->getElement( row, column ); if( v ) - str << " Col:" << column << "->" << v << "\t"; + str << column << ":" << v << "\t"; } str << std::endl; } diff --git a/src/UnitTests/Matrices/TridiagonalMatrixTest.h b/src/UnitTests/Matrices/TridiagonalMatrixTest.h index 500ed9938..3b68f7490 100644 --- a/src/UnitTests/Matrices/TridiagonalMatrixTest.h +++ b/src/UnitTests/Matrices/TridiagonalMatrixTest.h @@ -1264,54 +1264,6 @@ void test_SaveAndLoad() EXPECT_EQ( savedMatrix.getElement( 3, 3 ), 16 ); } -template< typename Matrix > -void test_Print() -{ - using RealType = typename Matrix::RealType; - using DeviceType = typename Matrix::DeviceType; - using IndexType = typename Matrix::IndexType; - - /* - * Sets up the following 5x4 sparse matrix: - * - * / 1 2 0 0 \ - * | 5 6 7 0 | - * | 0 10 11 12 | - * | 0 0 15 16 | - * \ 0 0 0 20 / - */ - const IndexType rows = 5; - const IndexType cols = 4; - - Matrix m( rows, cols ); - - RealType value = 1; - for( IndexType i = 0; i < rows; i++) - for( IndexType j = 0; j < cols; j++) - { - if( abs( i - j ) <= 1 ) - m.setElement( i, j, value ); - value++; - } - - std::stringstream printed; - std::stringstream couted; - - //change the underlying buffer and save the old buffer - auto old_buf = std::cout.rdbuf(printed.rdbuf()); - - m.print( std::cout ); //all the std::cout goes to ss - - std::cout.rdbuf(old_buf); //reset - couted << "Row: 0 -> Col:0->1\t Col:1->2\t\n" - "Row: 1 -> Col:0->5\t Col:1->6\t Col:2->7\t\n" - "Row: 2 -> Col:1->10\t 
Col:2->11\t Col:3->12\t\n" - "Row: 3 -> Col:2->15\t Col:3->16\t\n" - "Row: 4 -> Col:3->20\t\n"; - - EXPECT_EQ( printed.str(), couted.str() ); -} - // test fixture for typed tests template< typename Matrix > class MatrixTest : public ::testing::Test @@ -1478,13 +1430,6 @@ TYPED_TEST( MatrixTest, saveAndLoadTest ) test_SaveAndLoad< MatrixType >(); } -TYPED_TEST( MatrixTest, printTest ) -{ - using MatrixType = typename TestFixture::MatrixType; - - test_Print< MatrixType >(); -} - //// test_getType is not general enough yet. DO NOT TEST IT YET. //TEST( TridiagonalMatrixTest, Tridiagonal_GetTypeTest_Host ) -- GitLab From be26a735f460553864a93871474baf3e36699cd4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= Date: Sat, 9 Jan 2021 16:52:36 +0100 Subject: [PATCH 18/53] One more fix of tridiagonal matrix printing. --- src/TNL/Matrices/TridiagonalMatrixView.hpp | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/src/TNL/Matrices/TridiagonalMatrixView.hpp b/src/TNL/Matrices/TridiagonalMatrixView.hpp index d7537c20f..30afaa938 100644 --- a/src/TNL/Matrices/TridiagonalMatrixView.hpp +++ b/src/TNL/Matrices/TridiagonalMatrixView.hpp @@ -10,6 +10,7 @@ #pragma once +#include #include #include #include @@ -693,9 +694,13 @@ void TridiagonalMatrixView< Real, Device, Index, Organization >::print( std::ost for( IndexType column = row - 1; column < row + 2; column++ ) if( column >= 0 && column < this->columns ) { - auto v = this->getElement( row, column ); - if( v ) - str << column << ":" << v << "\t"; + auto value = this->getElement( row, column ); + if( value ) + { + std::stringstream str_; + str_ << std::setw( 4 ) << std::right << column << ":" << std::setw( 4 ) << std::left << value; + str << std::setw( 10 ) << str_.str(); + } } str << std::endl; } -- GitLab From cfd0bb8828b71d350f7fc6efd309f3f19215663e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= Date: Sat, 9 Jan 2021 17:05:28 +0100 Subject: [PATCH 19/53] 
Added check of correct lambda functions definition in lambda matrix factory. --- src/TNL/Matrices/LambdaMatrix.h | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/src/TNL/Matrices/LambdaMatrix.h b/src/TNL/Matrices/LambdaMatrix.h index cfd0a330e..aa3602ba2 100644 --- a/src/TNL/Matrices/LambdaMatrix.h +++ b/src/TNL/Matrices/LambdaMatrix.h @@ -10,6 +10,7 @@ #pragma once +#include #include #include @@ -377,6 +378,15 @@ struct LambdaMatrixFactory CompressedRowLengthsLambda& compressedRowLengthsLambda ) -> LambdaMatrix< MatrixElementsLambda, CompressedRowLengthsLambda, Real, Device, Index > { + // TODO: fix the following asserts, they do not work in fact + static_assert( std::is_same< + std::enable_if_t< true, decltype(matrixElementsLambda( Index(), Index(), Index(), Index(), std::declval< Index& >(), std::declval< Real& >() ) ) >, + void >::value, + "Wong type of MatrixElementsLambda, it should be - matrixElementsLambda( Index rows, Index columns, Index rowIdx, Index localIdx, Index& columnIdx, Real& value )" ); + static_assert( std::is_integral< + std::enable_if_t< true, decltype(compressedRowLengthsLambda( Index(), Index(), Index() ) ) > + >::value , + "Wong type of CompressedRowLengthsLambda, it should be - matrixElementsLambda( Index rows, Index columns, Index rowIdx )" ); return LambdaMatrix< MatrixElementsLambda, CompressedRowLengthsLambda, Real, Device, Index >( matrixElementsLambda, compressedRowLengthsLambda ); @@ -405,6 +415,16 @@ struct LambdaMatrixFactory CompressedRowLengthsLambda& compressedRowLengthsLambda ) -> LambdaMatrix< MatrixElementsLambda, CompressedRowLengthsLambda, Real, Device, Index > { + // TODO: fix the following asserts, they do not work in fact + static_assert( std::is_same< + std::enable_if_t< true, decltype(matrixElementsLambda( Index(), Index(), Index(), Index(), std::declval< Index& >(), std::declval< Real& >() ) ) >, + void >::value, + "Wong type of MatrixElementsLambda, it should be - matrixElementsLambda( Index 
rows, Index columns, Index rowIdx, Index localIdx, Index& columnIdx, Real& value )" ); + static_assert( std::is_integral< + std::enable_if_t< true, decltype(compressedRowLengthsLambda( Index(), Index(), Index() ) ) > + >::value , + "Wong type of CompressedRowLengthsLambda, it should be - matrixElementsLambda( Index rows, Index columns, Index rowIdx )" ); + return LambdaMatrix< MatrixElementsLambda, CompressedRowLengthsLambda, Real, Device, Index >( rows, columns, matrixElementsLambda, -- GitLab From d2f465ee1235f92b033751b64bbe81a55f1251e9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= Date: Sat, 9 Jan 2021 17:06:34 +0100 Subject: [PATCH 20/53] Added two examples of approximation of the Laplace operator with the lambda matrices. --- .../Matrices/LambdaMatrix/CMakeLists.txt | 26 ++++++- .../LambdaMatrixExample_Laplace.cpp | 75 +++++++++++++++++++ .../LambdaMatrixExample_Laplace.cu | 1 + .../LambdaMatrixExample_Laplace_2.cpp | 63 ++++++++++++++++ .../LambdaMatrixExample_Laplace_2.cu | 1 + .../Tutorials/Matrices/tutorial_Matrices.md | 12 +++ 6 files changed, 176 insertions(+), 2 deletions(-) create mode 100644 Documentation/Examples/Matrices/LambdaMatrix/LambdaMatrixExample_Laplace.cpp create mode 120000 Documentation/Examples/Matrices/LambdaMatrix/LambdaMatrixExample_Laplace.cu create mode 100644 Documentation/Examples/Matrices/LambdaMatrix/LambdaMatrixExample_Laplace_2.cpp create mode 120000 Documentation/Examples/Matrices/LambdaMatrix/LambdaMatrixExample_Laplace_2.cu diff --git a/Documentation/Examples/Matrices/LambdaMatrix/CMakeLists.txt b/Documentation/Examples/Matrices/LambdaMatrix/CMakeLists.txt index 6315309b2..9bb955626 100644 --- a/Documentation/Examples/Matrices/LambdaMatrix/CMakeLists.txt +++ b/Documentation/Examples/Matrices/LambdaMatrix/CMakeLists.txt @@ -13,9 +13,19 @@ ADD_CUSTOM_COMMAND( COMMAND LambdaMatrixExample_getNonzeroElementsCount > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/LambdaMatrixExample_getNonzeroElementsCount.out 
OUTPUT LambdaMatrixExample_getNonzeroElementsCount.out ) - IF( BUILD_CUDA ) - CUDA_ADD_EXECUTABLE( LambdaMatrixExample_rowsReduction_cuda LambdaMatrixExample_rowsReduction.cu ) + CUDA_ADD_EXECUTABLE( LambdaMatrixExample_Laplace_cuda LambdaMatrixExample_Laplace.cu ) + ADD_CUSTOM_COMMAND( COMMAND LambdaMatrixExample_Laplace_cuda > + ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/LambdaMatrixExample_Laplace.out + OUTPUT LambdaMatrixExample_Laplace.out ) + + CUDA_ADD_EXECUTABLE( LambdaMatrixExample_Laplace_2_cuda LambdaMatrixExample_Laplace_2.cu ) + ADD_CUSTOM_COMMAND( COMMAND LambdaMatrixExample_Laplace_2_cuda > + ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/LambdaMatrixExample_Laplace_2.out + OUTPUT LambdaMatrixExample_Laplace_2.out ) + + + CUDA_ADD_EXECUTABLE( LambdaMatrixExample_rowsReduction_cuda LambdaMatrixExample_rowsReduction.cu ) ADD_CUSTOM_COMMAND( COMMAND LambdaMatrixExample_rowsReduction_cuda > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/LambdaMatrixExample_rowsReduction.out OUTPUT LambdaMatrixExample_rowsReduction.out ) @@ -36,6 +46,16 @@ IF( BUILD_CUDA ) OUTPUT LambdaMatrixExample_forAllRows.out ) ELSE() + ADD_EXECUTABLE( LambdaMatrixExample_Laplace LambdaMatrixExample_Laplace.cpp ) + ADD_CUSTOM_COMMAND( COMMAND LambdaMatrixExample_Laplace > + ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/LambdaMatrixExample_Laplace.out + OUTPUT LambdaMatrixExample_Laplace.out ) + + ADD_EXECUTABLE( LambdaMatrixExample_Laplace_2 LambdaMatrixExample_Laplace_2.cpp ) + ADD_CUSTOM_COMMAND( COMMAND LambdaMatrixExample_Laplace_2 > + ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/LambdaMatrixExample_Laplace_2.out + OUTPUT LambdaMatrixExample_Laplace_2.out ) + ADD_EXECUTABLE( LambdaMatrixExample_rowsReduction LambdaMatrixExample_rowsReduction.cpp ) ADD_CUSTOM_COMMAND( COMMAND LambdaMatrixExample_rowsReduction > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/LambdaMatrixExample_rowsReduction.out @@ -59,6 +79,8 @@ ENDIF() ADD_CUSTOM_TARGET( RunLambdaMatricesExamples ALL DEPENDS 
LambdaMatrixExample_Constructor.out + LambdaMatrixExample_Laplace.out + LambdaMatrixExample_Laplace_2.out LambdaMatrixExample_getCompressedRowLengths.out LambdaMatrixExample_getNonzeroElementsCount.out LambdaMatrixExample_rowsReduction.out diff --git a/Documentation/Examples/Matrices/LambdaMatrix/LambdaMatrixExample_Laplace.cpp b/Documentation/Examples/Matrices/LambdaMatrix/LambdaMatrixExample_Laplace.cpp new file mode 100644 index 000000000..ac2295e63 --- /dev/null +++ b/Documentation/Examples/Matrices/LambdaMatrix/LambdaMatrixExample_Laplace.cpp @@ -0,0 +1,75 @@ +#include +#include +#include +#include + + +template< typename Device > +void laplaceOperatorMatrix() +{ + /*** + * Set matrix representing approximation of the Laplace operator on regular + * grid using the finite difference method. + */ + const int gridSize( 4 ); + const int matrixSize = gridSize * gridSize; + auto rowLengths = [=] __cuda_callable__ ( const int rows, const int columns, const int rowIdx ) -> int + { + const int gridRow = rowIdx / gridSize; // coordinates in the numerical grid + const int gridColumn = rowIdx % gridSize; + if( gridRow == 0 || gridRow == gridSize - 1 || // boundary grid node + gridColumn == 0 || gridColumn == gridSize - 1 ) + return 1; + return 5; + }; + auto matrixElements = [=] __cuda_callable__ ( const int rows, const int columns, const int rowIdx, const int localIdx, int& columnIdx, double& value) { + const int gridRow = rowIdx / gridSize; // coordinates in the numerical grid + const int gridColumn = rowIdx % gridSize; + if( gridRow == 0 || gridRow == gridSize - 1 || // boundary grid node + gridColumn == 0 || gridColumn == gridSize - 1 ) + { + columnIdx = rowIdx; // diagonal element .... + value = 1.0; // ... 
is set to 1 + } + else // interior grid node + { + switch( localIdx ) // set diagonal element to 4 + { // and the others to -1 + case 0: + columnIdx = rowIdx - gridSize; + value = -1; + break; + case 1: + columnIdx = rowIdx - 1; + value = -1; + break; + case 2: + columnIdx = rowIdx; + value = 4; + break; + case 3: + columnIdx = rowIdx + 1; + value = -1; + break; + case 4: + columnIdx = rowIdx + gridSize; + value = -1; + break; + } + } + }; + auto matrix = TNL::Matrices::LambdaMatrixFactory< double, Device, int >::create( + matrixSize, matrixSize, matrixElements, rowLengths ); + std::cout << "Laplace operator matrix: " << std::endl << matrix << std::endl; +} + +int main( int argc, char* argv[] ) +{ + std::cout << "Creating Laplace operator matrix on CPU ... " << std::endl; + laplaceOperatorMatrix< TNL::Devices::Host >(); + +#ifdef HAVE_CUDA + std::cout << "Creating Laplace operator matrix on CUDA GPU ... " << std::endl; + laplaceOperatorMatrix< TNL::Devices::Cuda >(); +#endif +} diff --git a/Documentation/Examples/Matrices/LambdaMatrix/LambdaMatrixExample_Laplace.cu b/Documentation/Examples/Matrices/LambdaMatrix/LambdaMatrixExample_Laplace.cu new file mode 120000 index 000000000..288e1097e --- /dev/null +++ b/Documentation/Examples/Matrices/LambdaMatrix/LambdaMatrixExample_Laplace.cu @@ -0,0 +1 @@ +LambdaMatrixExample_Laplace.cpp \ No newline at end of file diff --git a/Documentation/Examples/Matrices/LambdaMatrix/LambdaMatrixExample_Laplace_2.cpp b/Documentation/Examples/Matrices/LambdaMatrix/LambdaMatrixExample_Laplace_2.cpp new file mode 100644 index 000000000..3c70325d3 --- /dev/null +++ b/Documentation/Examples/Matrices/LambdaMatrix/LambdaMatrixExample_Laplace_2.cpp @@ -0,0 +1,63 @@ +#include +#include +#include +#include + + +template< typename Device > +void laplaceOperatorMatrix() +{ + /*** + * Set matrix representing approximation of the Laplace operator on regular + * grid using the finite difference method. 
+ */ + const int gridSize( 4 ); + const int matrixSize = gridSize * gridSize; + TNL::Containers::Vector< int, Device > columnOffsets{ 0, -1, 0, 1, 0 }; // helper vector for getting matrix elements column indexes + columnOffsets.setElement( 0, -gridSize ); + columnOffsets.setElement( 4, gridSize ); + auto columnOffsetsView = columnOffsets.getView(); + TNL::Containers::Vector< double, Device > values{ 1, 1, -4, 1, 1 }; // helper vector for getting matrix elements values + auto valuesView = values.getView(); + + auto rowLengths = [=] __cuda_callable__ ( const int rows, const int columns, const int rowIdx ) -> int + { + const int gridRow = rowIdx / gridSize; // coordinates in the numerical grid + const int gridColumn = rowIdx % gridSize; + + if( gridRow == 0 || gridRow == gridSize - 1 || // boundary grid node + gridColumn == 0 || gridColumn == gridSize - 1 ) + return 1; + return 5; + }; + + auto matrixElements = [=] __cuda_callable__ ( const int rows, const int columns, const int rowIdx, const int localIdx, int& columnIdx, double& value) { + const int gridRow = rowIdx / gridSize; // coordinates in the numerical grid + const int gridColumn = rowIdx % gridSize; + if( gridRow == 0 || gridRow == gridSize - 1 || // boundary grid node + gridColumn == 0 || gridColumn == gridSize - 1 ) + { + columnIdx = rowIdx; // diagonal element .... + value = 1.0; // ... is set to 1 + } + else // interior grid node + { + columnIdx = rowIdx + columnOffsetsView[ localIdx ]; + value = valuesView[ localIdx ]; + } + }; + auto matrix = TNL::Matrices::LambdaMatrixFactory< double, Device, int >::create( + matrixSize, matrixSize, matrixElements, rowLengths ); + std::cout << "Laplace operator matrix: " << std::endl << matrix << std::endl; +} + +int main( int argc, char* argv[] ) +{ + std::cout << "Creating Laplace operator matrix on CPU ... " << std::endl; + laplaceOperatorMatrix< TNL::Devices::Host >(); + +#ifdef HAVE_CUDA + std::cout << "Creating Laplace operator matrix on CUDA GPU ... 
" << std::endl; + laplaceOperatorMatrix< TNL::Devices::Cuda >(); +#endif +} diff --git a/Documentation/Examples/Matrices/LambdaMatrix/LambdaMatrixExample_Laplace_2.cu b/Documentation/Examples/Matrices/LambdaMatrix/LambdaMatrixExample_Laplace_2.cu new file mode 120000 index 000000000..30b1ab049 --- /dev/null +++ b/Documentation/Examples/Matrices/LambdaMatrix/LambdaMatrixExample_Laplace_2.cu @@ -0,0 +1 @@ +LambdaMatrixExample_Laplace_2.cpp \ No newline at end of file diff --git a/Documentation/Tutorials/Matrices/tutorial_Matrices.md b/Documentation/Tutorials/Matrices/tutorial_Matrices.md index d6b15c76c..3e421fb4b 100644 --- a/Documentation/Tutorials/Matrices/tutorial_Matrices.md +++ b/Documentation/Tutorials/Matrices/tutorial_Matrices.md @@ -1214,6 +1214,18 @@ Next we call the lambda function `f` by the method `forRows` (\ref TNL::Matrices \includelineno LambdaMatrixExample_forRows.out +At the end of this part, we show two more examples, how to express a matrix approximating the Laplace operator: + +\includelineno LambdaMatrixExample_Laplace.cpp + +The following is another way of doing the same but precomputed supporting vectors: + +\includelineno LambdaMatrixExample_Laplace_2.cpp + +The result of both examples looks as follows: + +\includelineno LambdaMatrixExample_Laplace.out + ### Flexible reduction in matrix rows The reduction of matrix rows is available for the lambda matrices as well. 
See the follogin example: -- GitLab From a59d2131251c61a014e83f4cac436d56b7b5efa6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= Date: Sat, 9 Jan 2021 17:07:38 +0100 Subject: [PATCH 21/53] Removed trailing whitespaces in LambdaMatrix.h --- src/TNL/Matrices/LambdaMatrix.h | 104 ++++++++++++++++---------------- 1 file changed, 52 insertions(+), 52 deletions(-) diff --git a/src/TNL/Matrices/LambdaMatrix.h b/src/TNL/Matrices/LambdaMatrix.h index aa3602ba2..27ba94cea 100644 --- a/src/TNL/Matrices/LambdaMatrix.h +++ b/src/TNL/Matrices/LambdaMatrix.h @@ -19,29 +19,29 @@ namespace Matrices { /** * \brief "Matrix-free matrix" based on lambda functions. - * + * * The elements of this matrix are not stored explicitly in memory but * implicitly on a form of lambda functions. - * + * * \tparam MatrixElementsLambda is a lambda function returning matrix elements values and positions. - * + * * It has the following form: - * + * * `matrixElements( Index rows, Index columns, Index rowIdx, Index localIdx, Index& columnIdx, Real& value )` - * + * * where \e rows is the number of matrix rows, \e columns is the number of matrix columns, \e rowIdx is the index of matrix row being queried, * \e localIdx is the rank of the non-zero element in given row, \e columnIdx is a column index of the matrix element computed by * this lambda and \e value is a value of the matrix element computed by this lambda. * \tparam CompressedRowLengthsLambda is a lambda function returning a number of non-zero elements in each row. - * + * * It has the following form: - * + * * `rowLengths( Index rows, Index columns, Index rowIdx ) -> IndexType` - * + * * where \e rows is the number of matrix rows, \e columns is the number of matrix columns and \e rowIdx is an index of the row being queried. * * \tparam Real is a type of matrix elements values. - * \tparam Device is a device on which the lambda functions will be evaluated. 
+ * \tparam Device is a device on which the lambda functions will be evaluated. * \ẗparam Index is a type to be used for indexing. */ template< typename MatrixElementsLambda, @@ -73,13 +73,13 @@ class LambdaMatrix /** * \brief Constructor with lambda functions defining the matrix elements. - * + * * Note: It might be difficult to express the types of the lambdas. For easier creation of * \e LambdaMatrix you may use \ref LambdaMatrixFactory. - * + * * \param matrixElements is a lambda function giving matrix elements position and value. * \param compressedRowLentghs is a lambda function returning how many non-zero matrix elements are in given row. - * + * * \par Example * \include Matrices/LambdaMatrix/LambdaMatrixExample_Constructor.cpp * \par Output @@ -90,15 +90,15 @@ class LambdaMatrix /** * \brief Constructor with matrix dimensions and lambda functions defining the matrix elements. - * + * * Note: It might be difficult to express the types of the lambdas. For easier creation of * \e LambdaMatrix you may use \ref LambdaMatrixFactory. - * + * * \param rows is a number of the matrix rows. * \param columns is a number of the matrix columns. * \param matrixElements is a lambda function giving matrix elements position and value. * \param compressedRowLentghs is a lambda function returning how many non-zero matrix elements are in given row. - * + * * \par Example * \include Matrices/LambdaMatrix/LambdaMatrixExample_Constructor.cpp * \par Output @@ -111,21 +111,21 @@ class LambdaMatrix /** * \brief Copy constructor. - * + * * \param matrix is input matrix. */ LambdaMatrix( const LambdaMatrix& matrix ) = default; /** * \brief Move constructor. - * + * * \param matrix is input matrix. */ LambdaMatrix( LambdaMatrix&& matrix ) = default; /** * \brief Set number of rows and columns of this matrix. - * + * * \param rows is the number of matrix rows. * \param columns is the number of matrix columns. 
*/ @@ -134,7 +134,7 @@ class LambdaMatrix /** * \brief Returns a number of matrix rows. - * + * * \return number of matrix rows. */ __cuda_callable__ @@ -142,7 +142,7 @@ class LambdaMatrix /** * \brief Returns a number of matrix columns. - * + * * \return number of matrix columns. */ __cuda_callable__ @@ -150,10 +150,10 @@ class LambdaMatrix /** * \brief Computes number of non-zeros in each row. - * + * * \param rowLengths is a vector into which the number of non-zeros in each row * will be stored. - * + * * \par Example * \include Matrices/LambdaMatrix/LambdaMatrixExample_getCompressedRowLengths.cpp * \par Output @@ -164,9 +164,9 @@ class LambdaMatrix /** * \brief Returns number of non-zero matrix elements. - * + * * \return number of all non-zero matrix elements. - * + * * \par Example * \include Matrices/LambdaMatrix/LambdaMatrixExample_getElementsCount.cpp * \par Output @@ -176,10 +176,10 @@ class LambdaMatrix /** * \brief Returns value of matrix element at position given by its row and column index. - * + * * \param row is a row index of the matrix element. * \param column i a column index of the matrix element. - * + * * \return value of given matrix element. */ RealType getElement( const IndexType row, @@ -187,7 +187,7 @@ class LambdaMatrix /** * \brief Method for performing general reduction on matrix rows. - * + * * \tparam Fetch is a type of lambda function for data fetch declared as * `fetch( IndexType rowIdx, IndexType columnIdx, RealType elementValue ) -> FetchValue`. * The return type of this lambda can be any non void. @@ -196,14 +196,14 @@ class LambdaMatrix * \tparam Keep is a type of lambda function for storing results of reduction in each row. * It is declared as `keep( const IndexType rowIdx, const double& value )`. * \tparam FetchValue is type returned by the Fetch lambda function. - * + * * \param begin defines beginning of the range [begin,end) of rows to be processed. 
* \param end defines ending of the range [begin,end) of rows to be processed. * \param fetch is an instance of lambda function for data fetch. * \param reduce is an instance of lambda function for reduction. * \param keep in an instance of lambda function for storing results. * \param zero is zero of given reduction operation also known as idempotent element. - * + * * \par Example * \include Matrices/LambdaMatrix/LambdaMatrixExample_rowsReduction.cpp * \par Output @@ -214,7 +214,7 @@ class LambdaMatrix /** * \brief Method for performing general reduction on ALL matrix rows. - * + * * \tparam Fetch is a type of lambda function for data fetch declared as * `fetch( IndexType rowIdx, IndexType columnIdx, RealType elementValue ) -> FetchValue`. * The return type of this lambda can be any non void. @@ -223,12 +223,12 @@ class LambdaMatrix * \tparam Keep is a type of lambda function for storing results of reduction in each row. * It is declared as `keep( const IndexType rowIdx, const double& value )`. * \tparam FetchValue is type returned by the Fetch lambda function. - * + * * \param fetch is an instance of lambda function for data fetch. * \param reduce is an instance of lambda function for reduction. * \param keep in an instance of lambda function for storing results. * \param zero is zero of given reduction operation also known as idempotent element. - * + * * \par Example * \include Matrices/LambdaMatrix/LambdaMatrixExample_allRowsReduction.cpp * \par Output @@ -239,18 +239,18 @@ class LambdaMatrix /** * \brief Method for iteration over all matrix rows for constant instances. - * + * * \tparam Function is type of lambda function that will operate on matrix elements. * It is should have form like * `function( IndexType rowIdx, IndexType columnIdx, IndexType columnIdx_, const RealType& value, bool& compute )`. - * The column index repeats twice only for compatibility with sparse matrices. 
- * If the 'compute' variable is set to false the iteration over the row can + * The column index repeats twice only for compatibility with sparse matrices. + * If the 'compute' variable is set to false the iteration over the row can * be interrupted. - * + * * \param begin defines beginning of the range [begin,end) of rows to be processed. * \param end defines ending of the range [begin,end) of rows to be processed. * \param function is an instance of the lambda function to be called in each row. - * + * * \par Example * \include Matrices/LambdaMatrix/LambdaMatrixExample_forRows.cpp * \par Output @@ -261,12 +261,12 @@ class LambdaMatrix /** * \brief This method calls \e forRows for all matrix rows (for constant instances). - * + * * See \ref LambdaMatrix::forRows. - * + * * \tparam Function is a type of lambda function that will operate on matrix elements. * \param function is an instance of the lambda function to be called in each row. - * + * * \par Example * \include Matrices/LambdaMatrix/LambdaMatrixExample_forAllRows.cpp * \par Output @@ -277,16 +277,16 @@ class LambdaMatrix /** * \brief Computes product of matrix and vector. - * + * * More precisely, it computes: - * + * * `outVector = matrixMultiplicator * ( *this ) * inVector + outVectorMultiplicator * outVector` - * + * * \tparam InVector is type of input vector. It can be \ref Vector, * \ref VectorView, \ref Array, \ref ArraView or similar container. * \tparam OutVector is type of output vector. It can be \ref Vector, * \ref VectorView, \ref Array, \ref ArraView or similar container. - * + * * \param inVector is input vector. * \param outVector is output vector. * \param matrixMultiplicator is a factor by which the matrix is multiplied. It is one by default. @@ -315,7 +315,7 @@ class LambdaMatrix /** * \brief Method for printing the matrix to output stream. - * + * * \param str is the output stream. 
*/ void print( std::ostream& str ) const; @@ -331,7 +331,7 @@ class LambdaMatrix /** * \brief Insertion operator for dense matrix and output stream. - * + * * \param str is the output stream. * \param matrix is the lambda matrix. * \return reference to the stream. @@ -345,9 +345,9 @@ std::ostream& operator<< ( std::ostream& str, const LambdaMatrix< MatrixElements /** * \brief Helper class for creating instances of LambdaMatrix. - * + * * See \ref LambdaMatrix. - * + * * \param matrixElementsLambda * \param compressedRowLengthsLambda */ @@ -361,12 +361,12 @@ struct LambdaMatrixFactory /** * \brief Creates lambda matrix with given lambda functions. - * + * * \param matrixElementsLambda is a lambda function evaluating matrix elements. * \param compressedRowLengthsLambda is a lambda function returning number of * non-zero matrix elements in given \e row. * \return instance of LambdaMatrix. - * + * * \par Example * \include Matrices/LambdaMatrix/LambdaMatrixExample_Constructor.cpp * \par Output @@ -394,14 +394,14 @@ struct LambdaMatrixFactory /** * \brief Creates lambda matrix with given dimensions and lambda functions. - * + * * \param rows is number of matrix rows. * \param columns is number of matrix columns. * \param matrixElementsLambda is a lambda function evaluating matrix elements. * \param compressedRowLengthsLambda is a lambda function returning number of * non-zero matrix elements in given \e row. * \return instance of LambdaMatrix. - * + * * \par Example * \include Matrices/LambdaMatrix/LambdaMatrixExample_Constructor.cpp * \par Output -- GitLab From 2280d8c9c7f9b4598bb237b508374ec3bf8aff20 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= Date: Sat, 9 Jan 2021 22:51:25 +0100 Subject: [PATCH 22/53] Improving tutorials on matrices. 
--- .../LambdaMatrixExample_Laplace.cpp | 10 +- .../Tutorials/Matrices/tutorial_Matrices.md | 1034 +++++++++-------- 2 files changed, 575 insertions(+), 469 deletions(-) diff --git a/Documentation/Examples/Matrices/LambdaMatrix/LambdaMatrixExample_Laplace.cpp b/Documentation/Examples/Matrices/LambdaMatrix/LambdaMatrixExample_Laplace.cpp index ac2295e63..26b3e741a 100644 --- a/Documentation/Examples/Matrices/LambdaMatrix/LambdaMatrixExample_Laplace.cpp +++ b/Documentation/Examples/Matrices/LambdaMatrix/LambdaMatrixExample_Laplace.cpp @@ -37,23 +37,23 @@ void laplaceOperatorMatrix() { // and the others to -1 case 0: columnIdx = rowIdx - gridSize; - value = -1; + value = 1; break; case 1: columnIdx = rowIdx - 1; - value = -1; + value = 1; break; case 2: columnIdx = rowIdx; - value = 4; + value = -4; break; case 3: columnIdx = rowIdx + 1; - value = -1; + value = 1; break; case 4: columnIdx = rowIdx + gridSize; - value = -1; + value = 1; break; } } diff --git a/Documentation/Tutorials/Matrices/tutorial_Matrices.md b/Documentation/Tutorials/Matrices/tutorial_Matrices.md index 3e421fb4b..086ae7aef 100644 --- a/Documentation/Tutorials/Matrices/tutorial_Matrices.md +++ b/Documentation/Tutorials/Matrices/tutorial_Matrices.md @@ -2,18 +2,112 @@ ## Introduction -TNL offers the following type of matrices: dense matrices, sparse matrices, tridiagonal matrices, multidiagonal matrices and lambda matrices. The sparse matrices can be marked as symmetric to optimize memory requirements. The interfaces of given matrix types are designed to be as unified as possible to ensure that the user can easily switch between different matrix types while making no or only a little changes in the source code. All matrix types allows traversing all matrix elements and manipulate them using a lambda function as well as performing flexible reduction in matrix rows. The following text describes particular matrix types in details. 
- +TNL offers several types of matrices like dense (\ref TNL::Matrices::DenseMatrix), sparse (\ref TNL::Matrices::SparseMatrix), tridiagonal (\ref TNL::Matrices::TridiagonalMatrix), multidiagonal (\ref TNL::Matrices::MultidiagonalMatrix) and lambda matrices (\ref TNL::Matrices::LambdaMatrix). The sparse matrices can be marked as symmetric to lower the memory requirements. The interfaces of given matrix types are designed to be as unified as possible to ensure that the user can easily switch between different matrix types while making no or only little changes in the source code. All matrix types allow traversing all matrix elements and manipulating them using lambda functions as well as performing flexible reduction in matrix rows. The following text describes particular matrix types and their unified interface in detail. ## Table of Contents -1. [Dense matrices](#dense_matrices) -2. [Sparse matrices](#sparse_matrices) -3. [Tridiagonal matrices](#tridiagonal_matrices) -4. [Multidiagonal matrices](#multidiagonal_matrices) -5. [Lambda matrices](#lambda_matrices) +1. [Overview of matrix types](#overview_of_matrix_types) +2. [Allocation and setup of different matrix types](#allocation_and_setup_of_different_matrix_types) + 1. [Dense matrices](#dense_matrices_setup) + 2. [Sparse matrices](#sparse_matrices_setup) + 3. [Tridiagonal matrices](#tridiagonal_matrices_setup) + 4. [Multidiagonal matrices](#multidiagonal_matrices_setup) + 5. [Lambda matrices](#lambda_matrices_setup) +3. [Flexible reduction in matrix rows](#flexible_reduction_in_matrix_rows) +4. [Matrix-vector product](#matrix_vector_product) +5. [Matrix I/O operations](#matrix_io_operations) + + +## Overview of matrix types + +In the majority of numerical algorithms either dense or sparse matrices are used. The dense matrix (\ref TNL::Matrices::DenseMatrix) is such that all or at least most of its matrix elements are nonzero.
On the other hand [sparse matrix](https://en.wikipedia.org/wiki/Sparse_matrix) (\ref TNL::Matrices::SparseMatrix) is a matrix which has most of the matrix elements equal to zero. From the implementation point of view, the data structures for the dense matrices allocate all matrix elements while formats for the sparse matrices aim to store explicitly only the nonzero matrix elements. The most popular format for storing the sparse matrices is [CSR format](https://en.wikipedia.org/wiki/Sparse_matrix#Compressed_sparse_row_(CSR,_CRS_or_Yale_format)). However, especially for better data alignment in memory of GPUs, many other formats were designed. In TNL, the user may choose between several different sparse matrix formats. There are also sparse matrices with specific pattern of the nonzero elements like [tridiagonal matrices](https://en.wikipedia.org/wiki/Tridiagonal_matrix) (\ref TNL::Matrices::TridiagonalMatrix) which "has nonzero elements on the main diagonal, the first diagonal below this, and the first diagonal above the main diagonal only". An example of such matrix may look as follows: + +\f[ +\left( + \begin{array}{ccccccc} + -2 & 1 & . & . & . & . \\ + 1 & -2 & 1 & . & . & . \\ + . & 1 & -2 & 1 & . & . \\ + . & . & 1 & -2 & 1 & . \\ + . & . & . & 1 & -2 & 1 \\ + . & . & . & . & 1 & -2 + \end{array} + \right) +\f] + +Similar but more general type of matrices are multidiagonal matrices (\ref TNL::Matrices::MultidiagonalMatrix) which have the nonzero elements positioned only on lines parallel to the diagonal like the following matrix: + +\f[ + \left( + \begin{array}{ccccccc} + -4 & 1 & . & 1 & . & . \\ + 1 & -4 & 1 & . & 1 & . \\ + . & 1 & -4 & 1 & . & 1 \\ + 1 & . & 1 & -4 & 1 & . \\ + . & 1 & . & 1 & -4 & 1 \\ + . & . & 1 & . & 1 & -4 + \end{array} + \right) + \f] + +Finally, TNL offers so-called *lambda matrices* (\ref TNL::Matrices::LambdaMatrix) which are kind of "matrix-free matrices".
They do not store the matrix elements explicitly in the memory, but rather evaluates them on-the-fly based on user defined lambda functions. + +In the following table we show comparison of extreme example when we would express a tridiagonal matrix by means of different matrix types. + +| Matrix dimensions | Dense elems. | Dense mem. | Sparse elems. | Sparse mem. | Tridiag. elems. | Tridiag. mem. | Multidiag. elems. | Mutlidiag. mem. | +|------------------:|---------------:|-----------:|--------------:|-------------:|----------------:|--------------:|------------------:|----------------:| +| 10x10 | 100 | 800 B | >28 | >336 B | 30 | 240 B | 30 | 252 B | +| 100x100 | 10,000 | 80 kB | >298 | >3,576 B | 300 | 2,400 B | 300 | 2,412 B | +| 1,000x1,000 | 1,000,000 | 8 MB | >2,998 | >35,976 B | 3,000 | 24,000 B | 3,000 | 24,012 B | +| 10,000x10,000 | 100,000,000 | 800 MB | >29,998 | >359,976 B | 30,000 | 240,000 B | 30,000 | 240,012 B | +| 100,000x100,000 | 10,000,000,000 | 80 GB | > 299,998 | >3,599,876 B | 300,000 | 2,400,000 B | 300,000 | 2,400,012 B | + +In the table: + +* **Matrix dimensions** is the number of matrix rows and columns +* **Dense elems.** is the number of allocated matrix elements in the dense matrix. +* **Dense mem.** is the allocated memory for the matrix elements in the dense matrix if the elements are stored in the double precision. +* **Sparse elems.** is the number of allocated matrix elements in the sparse matrix. Some formats may allocate padding zeros for better data alignment in the memory. +* **Sparse mem.** is the allocated memory for the matrix elements in the sparse matrix if the elements are stored in the double precision and column indexes in 32-bit integer. +* **Tridiag. elems** is the number of allocated matrix elements in the tridiagonal matrix. +* **Tridiag mem.** is the allocated memory for the matrix elements in the tridiagonal matrix if the elements are stored in the double precision. +* **Multidiag. 
elems** is the number of allocated matrix elements in the multidiagonal matrix. +* **Multidiag mem.** is the allocated memory for the matrix elements in the multidiagonal matrix if the elements are stored in the double precision. + +Choosing the best matrix type can have tremendous impact on the performance but also memory requirements. If we would treat each matrix as a dense one we would not be able to to work with matrices larger than 50,000x50,000 on common personal computers because we would need tens of gibabytes of memory. At the same time we see that the other matrix types can do the same job with only few megabytes. In addition, other matrix types work with much less matrix types and so operations like matrix-vector multiplication can be done with much less operations which means much faster. Since in modern hardware architectures, the computing units are limited mainly by the performance of the memory chips, transferring less data from the memory increases the performance even more. + +The following table shows the same but when storing a matrix which has only five nonzero elements in each row. Such matrices arises often from the finite difference method for solution of the partial differential equations: + +| Matrix dimensions | Dense elems. | Dense mem. | Sparse elems. | Sparse mem. | Multidiag. elems. | Mutlidiag. mem. | +|------------------:|---------------:|-----------:|--------------:|-------------:|------------------:|----------------:| +| 10x10 | 100 | 800 B | >50 | >600 B | 50 | 420 B | +| 100x100 | 10,000 | 80 kB | >500 | >6,000 B | 500 | 4,020 B | +| 1,000x1,000 | 1,000,000 | 8 MB | >5,000 | >60,000 B | 5,000 | 40,020 B | +| 10,000x10,000 | 100,000,000 | 800 MB | >50,000 | >600,000 B | 50,000 | 400,020 B | +| 100,000x100,000 | 10,000,000,000 | 80 GB | >500,000 | >6,000,000 B | 500,000 | 4,000,020 B | + +There is no change in the dense matrix part of the table. The numbers grow proportionaly in case of sparse and mutlidiagonal matrix. 
We see that the multidiagonal matrix type is suitable for the finite difference method or similar numerical methods for solution of the partial differential equations. + +General sparse matrix formats needs to store columns indexes for each matrix element which is not true for the multidiagonal matrix. The following table shows how many bytes we need for storing of one matrix element with different matrix types depending on the type of the matrix elements (`Real`) and column indexes (`Index`): + +| Real | Index | Dense matrix | Multidiagonal matrix | Sparse matrix | Fill ratio | +|:------:|:------:|:------------:|:--------------------:|:--------------:|:----------:| +| float | 32-bit | 4 B | 4 B | 8 B | << 50% | +| float | 32-bit | 4 B | 4 B | 12 B | << 30% | +| double | 32-bit | 8 B | 8 B | 12 B | << 60% | +| double | 64-bit | 8 B | 8 B | 16 B | << 50% | + +In this table: + +* **Real** is matrix element type. +* **Index** is column index type. +* **Dense matrix** is number of bytes needed to store one matrix element in the dense matrix. +* **Multidiagonal matrix** is number of bytes needed to store one matrix element in the mutldiagonal matrix. +* **Sparse matrix** is number of bytes needed to store one matrix element in the sparse matrix. +* **Fill ratio** is maximal percentage of the nonzero matrix elements until which the sparse matrix can perform better. +## Allocation and setup of different matrix types -## Dense matrices +### Dense matrices Dense matrix is a templated class defined in the namespace \ref TNL::Matrices. It has five template parameters: @@ -23,8 +117,6 @@ Dense matrix is a templated class defined in the namespace \ref TNL::Matrices. I * `ElementsOrganization` defines the organization of the matrix elements in memory. It can be \ref TNL::Algorithms::Segments::ColumnMajorOrder or \ref TNL::Algorithms::Segments::RowMajorOrder for column-major and row-major organization respectively. 
Be default it is the row-major order if the matrix is allocated in the host system and column major order if it is allocated on GPU. * `RealAllocator` is a memory allocator (one from \ref TNL::Allocators) which shall be used for allocation of the matrix elements. By default, it is the default allocator for given `Real` type and `Device` type -- see \ref TNL::Allocators::Default. -### Dense matrix allocation and initiation - The following examples show how to allocate the dense matrix and how to initialize the matrix elements. Small matrices can be created simply by the constructor with an initializer list. \includelineno Matrices/DenseMatrix/DenseMatrixExample_Constructor_init_list.cpp @@ -64,102 +156,12 @@ The result looks as follows: \include DenseMatrixExample_forRows.out -### Flexible reduction in matrix rows - -Simillar operation to `forRows` is `rowsReduction` (\ref TNL::Matrices::DenseMatrix::rowsReduction) which performs given reduction in each matric row. For example, a matrix-vector product can be seen as a reduction of products of matrix elements and input vector in particular matrix rows. The first element of the result vector ios obtained as: - -\f[ -y_1 = a_{11} x_1 + a_{12} x_2 + \ldots + a_{1n} x_n = \sum_{j=1}^n a_{1j}x_j -\f] - -and in general i-th element of the result vector is computed as - -\f[ -y_i = a_{i1} x_1 + a_{i2} x_2 + \ldots + a_{in} x_n = \sum_{j=1}^n a_{ij}x_j. -\f] - -We see that in i-th matrix row we have to compute the sum \f$\sum_{j=1}^n a_{ij}x_j\f$ which is reduction of products \f$ a_{ij}x_j\f$. Similar to *flexible parallel reduction* (\ref TNL::Algorithms::Reduction) we just need to design proper lambda functions. See the following example: - - -\includelineno DenseMatrixExample_rowsReduction_vectorProduct.cpp - -The `fetch` lambda function computes the product \f$ a_{ij}x_j\f$ where \f$ a_{ij} \f$ is represented by `value` and \f$x_j \f$ is represented by `xView[columnIdx]`. 
The reduction is just sum of results particular products and it is represented by by the lambda function `reduce`. Finaly, the lambda function `keep` is responsible for storing the results of reduction in each matrix row (which is represented by the variable `value`) into the output vector `y`. -The result looks as: - -\include DenseMatrixExample_rowsReduction_vectorProduct.out - -We will show one more example which is computation of maximal absolute value in each matrix row. The results will be stored in a vector: - -\f[ -y_i = \max_{j=1,\ldots,n} |a_{ij}|. -\f] - -See the following example: - -\includelineno DenseMatrixExample_rowsReduction_maxNorm.cpp - - -The `fetch` lambda function just returns absolute value of \f$a_{ij} \f$ which is represented again by the varibale `value`. The `reduce` lambda function returns larger of given values and the lambda fuction 'keep' stores the results to the output vectro the same way as in the previous example. Of course, if we compute the maximum of all output vector elements we get some kined of max matrix norm. The output looks as: - -\include DenseMatrixExample_rowsReduction_maxNorm.out - -### Dense matrix-vector product - -One of the most important matrix operation is the matrix-vector multiplication. It is represented by a method `vectorProduct` (\ref TNL::Matrices::DenseMatrix::vectorProduct). It is templated method with two template parameters `InVector` and `OutVector` telling the types of input and output vector respectively. Usually one will substitute some of \ref TNL::Containers::Array, \ref TNL::Containers::ArrayView, \ref TNL::Containers::Vector or \ref TNL::Containers::VectorView for these types. The method accepts the following parameters: - -* `inVector` is the input vector having the same number of elements as the number of matrix columns. -* `outVector` is the output vector having the same number of elements as the number of matrix rows. 
-* `matrixMultiplicator` is a number by which the result of matrix-vector product is multiplied. -* `outVectorMultiplicator` is a number by which the output vector is multiplied before added to the result of matrix-vector product. -* `begin` is an index of the first matrix row that is involved in the multiplication. It is zero be default. -* `end` is an index of the last matrix row that is involved in the multiplication. It is the last matrix row by default. - -Note that the ouput vector dimension must be the same as the number of matrix rows no matter how we set `begin` and `end` parameters. These parameters just say that some matrix rows and the output vector elements are omitted. - -To summarize, this method computes the following formula: - -`outVector = matrixMultiplicator * ( *this ) * inVector + outVectorMultiplicator * outVector.` - -### Dense matrix IO - -The dense matrix can be saved to a file using a method `save` (\ref TNL::Matrices::DenseMatrix::save) and restored with a method `load` (\ref TNL::Matrices::DenseMatrix::load). To print the matrix, there is a method `print` (\ref TNL::Matrices::DenseMatrix::print) can be used. - -### Dense matrix view - -Similar to array view (\ref TNL::Containers::ArayView) and vector view (\ref TNL::Containers::VectorView), matrices also offer their view for easier use with lambda functions. For the dense matrix there is a `DenseMatrixView` (\ref TNL::Matrices::DenseMatrixView) which is a templated class with the following template arguments (they are the same as for `DenseMatrix` -- \ref TNL::Matrices::DenseMatrix -- except of the allocator): - -* `Real` is a type of matrix elements. -* `Device` is a device on which the matrix is allocated. This can be \ref TNL::Devices::Host or \ref TNL::Devices::Cuda. -* `Index` is a type for indexing the matrix elements and also row and column indexes. -* `Organization` tells the ordering of matrix elements in memory. It is either RowMajorOrder or ColumnMajorOrder. 
- -The first main reason for using the dense matrix view is its ability to be captured by lambda functions since the copy constructor makes only shallow copy. We will demonstrate it on the example showing the method `setElement` (\ref TNL::Matrices::DenseMatrix::setElement). However, the `SharedPointer` will be replaced with the `DenseMatrixView`. The code looks as follows: - -\includelineno DenseMatrixViewExample_setElement.cpp - -You can see that we do not need to use the shared pointer (\ref TNL::Pointers::SharedPointer) as we did in the example demonstrating the method `setElement` for dense matrix. And the result is: - -\include DenseMatrixViewExample_setElement.out - -The second reason for using the `DenseMatrixView` is to encapsulate data allocated by some other library or program then TNL. The following example demonstrates how to do it: - -\includelineno DenseMatrixViewExample_data_encapsulation.cpp - -On the lines 18--34 we create matrix by allocating array `data` and filling the matrix using a formula \f$ a_{ij} = i * size + j + 1\f$. We do it first on the host (lines 18--21) in auxilliary array `host_data` to make initiation of the array `data` easier in case when `Device` is GPU. Next, depending on the argument `Device`, we allocate the array `data` on the host or on GPU and copy data from the arary `host_data` to the array `data`. To insert this array into the dense matrix view, we first need to encapsulate it with vector view (\ref TNL::Conatianers::VectorView) `dataView` on the line 39 which can be then used to create the dense matrix view `matrix` on the line 40. Note that wee must set proper matrix elements organizationa which is `RowMajorOrder` (\ref TNL::Algorithms::Segments::RowMajorOrder) in this example. Next, we print the matrix to see if the encapsulation was succesfull (lines 42 and 43) and finaly we demonstrate manipulation with matrix elements (lines 45--48) and we print the result (lines 50 and 51). 
- -The result looks as follows: - -\includelineno DenseMatrixViewExample_data_encapsulation.out - -The dense matrix view offers almost all methods which the dense matrix does. So it can be easily used at almost any situation the same way as the dense matrix itself. - -## Sparse matrices +## Sparse matrices [Sparse matrices](https://en.wikipedia.org/wiki/Sparse_matrix) are extremely important in a lot of numerical algorithms. They are used at situations when we need to operate with matrices having majority of the matrix elements equal to zero. In this case, only the non-zero matrix elements are stored with possible some *padding zeros* used for memory alignment. This is necessary mainly on GPUs. Consider just matrix having 50,000 rows and columns whih is 2,500,000,000 matrix elements. If we store each matrix element in double precision (it means eight bytes per element) we need 20,000,000,000 bytes which is nearly 20 GB of memory. If there are only five non-zero elements in each row we need only \f$8 \times 5 \times 50,000=2,000,000\f$ bytes and so nearly 200 MB. It is really great difference. Major disadventage of sparse matrices is that there are a lot of different formats for storing such matrices. Though [CSR - Compressed Sparse Row](https://en.wikipedia.org/wiki/Sparse_matrix#Compressed_sparse_row_(CSR,_CRS_or_Yale_format)) format is the most popular of all, especially for GPUs there are many other formats which perform differently on various matrices. So it is a good idea to test several sparse matrix formats if you want to get the best performance. In TNL, there is one templated class \ref TNL::Matrices::SparseMatrix representing the sparse matrices. The change of underlying matrix format can be done just by changing one template parameter. The list of the template paramaters is as follows: - * `Real` is type if the matrix elements. It is `double` by default. * `Device` is a device where the matrix is allocated. 
Currently it can be either \ref TNL::Devices::Host for CPU or \ref TNL::Devices::Cuda for GPU supporting CUDA. It is \ref TNL::Devices::Host by default. * `Index` is a type to be used for indexing of the matrix elements. It is `int` by default. @@ -277,7 +279,7 @@ The method can be called from both host (CPU) and device (GPU) if the matrix is \includelineno SparseMatrixExample_setElement.cpp -Note that we use `SharedPointer` (\ref TNL::Pointers::SharedPointer) to pass the matrix easily into the lambda function when it runs on GPU. The first for-loop runs on CPU no matter where the matrix is allocated. Next we call the lambda function `f` from `ParallelFor` which is device sensitive and so it runs on CPU or GPU depending where the matrix is allocated. To avoid use of `SharedPointer`, which requires explicit synchronization of smart pointers, you may use `SparseMatrixView' (\ref TNL::Matrices::SparseMatrixView) to achiev the same. The result looks as follows: +Note that we use `SharedPointer` (\ref TNL::Pointers::SharedPointer) to pass the matrix easily into the lambda function when it runs on GPU. The first for-loop runs on CPU no matter where the matrix is allocated. Next we call the lambda function `f` from `ParallelFor` which is device sensitive and so it runs on CPU or GPU depending where the matrix is allocated. To avoid use of `SharedPointer`, which requires explicit synchronization of smart pointers, you may use `SparseMatrixView' (\ref TNL::Matrices::SparseMatrixView) to achieve the same. The result looks as follows: \include SparseMatrixExample_setElement.out @@ -367,109 +369,9 @@ if( rowIdx < columnIdx ) would not make sense. If we pass through this test, the matrix element lies in the lower triangular part of the matrix and we may set the matrix elements which is done on the lines 17 and 18. The column index (`columnIdx`) is set to local index (line 17) and `value` is set on the line 18. 
The result looks as follows: +\include SparseMatrixExample_forRows.out -\includelineno SparseMatrixExample_forRows.out - -### Flexible reduction in matrix rows - -The *flexible parallel reduction* in rows for sparse matrices is very simmilar to the one for dense matrices. It consits of three lambda functions: - -1. `fetch` reads and preproces data entering the flexible parallel reduction. -2. `reduce` performs the reduction operation. -3. `keep` stores the results from each matrix row. - -See the following example: - -\includelineno SparseMatrixExample_rowsReduction_vectorProduct.cpp - -On the lines 11-16 we set the following matrix: - -\f[ -\left( -\begin{array}{ccccc} -1 & . & . & . & . \\ -1 & 2 & . & . & . \\ -. & 1 & 8 & . & . \\ -. & . & 1 & 9 & . \\ -. & . & . & . & 1 -\end{array} -\right) -\f] - -Next we prepare input (`x`) and output (`y`) vectors on the lines 21 and 22 and set all elements of the input vector to one (line 27). Since we will need to access these vectors in lambda functions we prepare their views on lines 32 and 33. On the lines 39-41, we define the `fetch` lambda function. It receives three arguments: - -1. `rowIdx` is a row index of the matrix element being currently processed. -2. `columnIdx` is a column index of the matrix elements being currently processed. -3. `value` is a value of the matrix element being currently procesed. - -We ommit the row index and take the column index which indicates index of the element of the input vector we need to fetch (`xView[ columnIdx ]`). We take its value and multiply it with the value (`value`) of the current matrix element. We do not need to write lambda function for reduction since it is only summation of the intermediate results from the `fetch` lamda and we can use `std::plus<>{}` (see the line 60). The `keep` lambda function offers two parameters: - -1. `rowIdx` tells the index of the matrix row for which we aim to store the result. -2. `value` is the result obtained in the given matrix row. 
- -In our example, we just write the result into appropriate element of the output vector `y` which is given just by the row index `rowIdx` -- see the line 47. On the line 53 we start the computation of the matrix-vector product. The method `rowsReduction` (\ref TNL::Matrices::SparseMatrix::rowsReduction) accepts the following arguments: - -1. `begin` is the begining of the matrix rows range on which the reduction will be performed. -2. `end` is the end of the matrix rows range on which the reduction will be performed. The last matrix row which is going to be processed has index `end-1`. -3. `fetch` is the fetch lambda function. -4. `reduce` is the the lmabda function performing the reduction. -5. `keep` is the lambda function responsible for processing the results from particular matrix rows. -6. `zero` is the "zero" element of given reduction opertation also known as *idempotent*. It is really 0 for summation in our example (adding zero to any number does not change the result). - -At the end we print the matrix, the input and the output vector -- lines 55-57. The result looks as follows: - -\include SparseMatrixExample_rowsReduction_vectorProduct.out - -### Sparse matrix-vector product - -As we mentioned already in the part explaining the dense matrices, matrix-vector multiplication or in this case sparse matrix-vector multiplication ([SpMV](https://en.wikipedia.org/wiki/Sparse_matrix-vector_multiplication)) is one of the most important operations in numerical mathematics and high-performance computing. It is represented by a method `vectorProduct` (\ref TNL::Matrices::SparseMatrix::vectorProduct). It is templated method with two template parameters `InVector` and `OutVector` telling the types of input and output vector respectively. Usually one will substitute some of \ref TNL::Containers::Array, \ref TNL::Containers::ArrayView, \ref TNL::Containers::Vector or \ref TNL::Containers::VectorView for these types. 
The method computes the following formula - -``` -outVector = matrixMultiplicator * ( *this ) * inVector + outVectorMultiplicator * outVector -``` - -and it accepts the following parameters: - -* `inVector` is the input vector having the same number of elements as the number of matrix columns. -* `outVector` is the output vector having the same number of elements as the number of matrix rows. -* `matrixMultiplicator` is a number by which the result of matrix-vector product is multiplied. -* `outVectorMultiplicator` is a number by which the output vector is multiplied before added to the result of matrix-vector product. -* `begin` is an index of the first matrix row that is involved in the multiplication. It is zero be default. -* `end` is an index of the last matrix row that is involved in the multiplication. It is the last matrix row by default. - -Note that the ouput vector dimension must be the same as the number of matrix rows no matter how we set `begin` and `end` parameters. These parameters just say that some matrix rows and the output vector elements are omitted. - -### Sparse matrix IO - -The sparse matrix can be saved to a file using a method `save` (\ref TNL::Matrices::SparseMatrix::save) and restored with a method `load` (\ref TNL::Matrices::SparseMatrix::load). For printing the matrix, there is a method `print` (\ref TNL::Matrices::SparseMatrix::print) can be used. - -### Sparse matrix view - -Sparse matrix view serves, simillar to other views in TNL, to data sharing and for use with lambda functions (views can be easily captured since they make only shallow copy). The sparse matrix view (\ref TNL::Matrices::SparseMatrixView) is templated class having the following template arguments (they are the same as for `SparseMatrix` -- \ref TNL::Matrices::SparseMatrix -- except of the allocators): - -* `Real` is type if the matrix elements. It is `double` by default. -* `Device` is a device where the matrix is allocated. 
Currently it can be either \ref TNL::Devices::Host for CPU or \ref TNL::Devices::Cuda for GPU supporting CUDA. It is \ref TNL::Devices::Host by default. -* `Index` is a type to be used for indexing of the matrix elements. It is `int` by default. -* `MatrixType` tells if the matrix is symmetric (\ref TNL::Matrices::SymmetricMatrix) or general (\ref TNL::Matrices::GeneralMatrix). It is a \ref TNL::Matrices::GeneralMatrix by default. -* `Segments` define the format of the sparse matrix. It can be (by default, it is \ref TNL::Algorithms::Segments::CSR): - * \ref TNL::Algorithms::Segments::CSR for [CSR format](https://en.wikipedia.org/wiki/Sparse_matrix#Compressed_sparse_row_(CSR,_CRS_or_Yale_format)). - * \ref TNL::Algorithms::Segments::Ellpack for [Ellpack format](http://mgarland.org/files/papers/nvr-2008-004.pdf). - * \ref TNL::Algorithms::Segments::SlicedEllpack for [SlicedEllpack format](https://link.springer.com/chapter/10.1007/978-3-642-11515-8_10) which was also presented as [Row-grouped CSR format](https://arxiv.org/abs/1012.2270). - * \ref TNL::Algorithms::Segments::ChunkedEllpack for [ChunkedEllpack format](http://geraldine.fjfi.cvut.cz/~oberhuber/data/vyzkum/publikace/12-heller-oberhuber-improved-rgcsr-format.pdf) which we reffered as Improved Row-grouped CSR and we renamed it to Ellpack format since it uses padding zeros. - * \ref TNL::Algorithms::Segments::BiEllpack for [BiEllpack format](https://www.sciencedirect.com/science/article/pii/S0743731514000458?casa_token=2phrEj0Ef1gAAAAA:Lgf6rMBUN6T7TJne6mAgI_CSUJ-jR8jz7Eghdv6L0SJeGm4jfso-x6Wh8zgERk3Si7nFtTAJngg). -* `ComputeReal` is type which is used for internal computations. By default it is the same as `Real` if `Real` is not `bool`. If `Real` is `bool`, `ComputeReal` is set to `Index` type. This can be changed, of course, by the user. 
- -**If `Real` is set to `bool`, we get *a binary matrix view*.** - -The following example shows the use of `SparseMatrixView` with lambda functions: - -\includelineno SparseMatrixViewExample_setElement.cpp - -The result looks as follows: - -\includelineno SparseMatrixViewExample_setElement.out - -## Tridiagonal matrices +## Tridiagonal matrices Tridiagonal matrix format serves for specific matrix pattern when the nonzero matrix elements can be placed only at the diagonal and immediately next to the diagonal. Here is an example: @@ -488,14 +390,14 @@ Tridiagonal matrix format serves for specific matrix pattern when the nonzero ma An advantage is that we do not store the column indexes explicitly as it is in \ref TNL::Matrices::SparseMatrix. This can reduce significantly the memory requirements which also means better performance. See the following table for the storage requirements comparison between \ref TNL::Matrices::TridiagonalMatrix and \ref TNL::Matrices::SparseMatrix. - + Real | Index | SparseMatrix | TridiagonalMatrix | Ratio --------|------------|----------------------|---------------------|-------- float | 32-bit int | 8 bytes per element | 4 bytes per element | 50% double | 32-bit int | 12 bytes per element | 8 bytes per element | 75% float | 64-bit int | 12 bytes per element | 4 bytes per element | 30% double | 64-bit int | 16 bytes per element | 8 bytes per element | 50% - + Tridiagonal matrix is a templated class defined in the namespace \ref TNL::Matrices. It has five template parameters: * `Real` is a type of the matrix elements. It is `double` by default. 
@@ -589,7 +491,7 @@ If a matrix has more rows then columns, we have to extend the last two rows with The output of the example looks as: -\includelineno TridiagonalMatrixExample_Constructor_init_list_1.out +\include TridiagonalMatrixExample_Constructor_init_list_1.out Similar way of the tridiagonal matrix setup is offered by the method `setElements` (\ref TNL::Matrices::TridiagonalMatrix::setElements) as the following example demonstrates: @@ -598,7 +500,7 @@ Similar way of the tridiagonal matrix setup is offered by the method `setElement Here we create the matrix in two steps. Firstly, we setup the matrix dimensions by the appropriate constructor (line 24) and after that we setup the matrix elements (line 25-45). The result looks the same as in the previous example: -\includelineno TridiagonalMatrixExample_setElements.out +\include TridiagonalMatrixExample_setElements.out In the following example we create tridiagonal matrix with 5 rows and 5 columns (line 12-14) by the means of a shared pointer (\ref TNL::Pointers::SharedPointer) to make this work even on GPU. We set numbers 0,...,4 on the diagonal (line 16) and we print the matrix (line 18). Next we use a lambda function (lines 21-27) combined with parallel for (\ref TNL::Algorithms::ParallelFor) (line 35), to modify the matrix. The offdiagonal elements are set to 1 (lines 23 and 26) and for the diagonal elements, we change the sign (line 24). 
@@ -606,7 +508,7 @@ In the following example we create tridiagonal matrix with 5 rows and 5 columns The result looks as follows: -\includelineno TridiagonalMatrixExample_setElement.out +\include TridiagonalMatrixExample_setElement.out A slightly simpler way how to do the same with no need for shared pointer (\ref TNL::Pointers::SharedPointer), could be with the use of tridiagonal matrix view and the method `getRow` (\ref TNL::Matrices::TridiagonalMatrixView::getRow) as the following example demonstrates: @@ -616,15 +518,15 @@ We create a matrix with the same size (line 10-15) set ones on the diagonal (lin The result looks as follows: -\includelineno TridiagonalMatrixViewExample_getRow.out +\include TridiagonalMatrixViewExample_getRow.out -Finaly, even a bit more simple and bit less flexible way of matrix elements manipulation with use of the method `forRows` (\ref TNL::Matrices::TridiagonalMatrix::forRows) is demosntrated in the following example: +Finally, even a bit more simple and bit less flexible way of matrix elements manipulation with use of the method `forRows` (\ref TNL::Matrices::TridiagonalMatrix::forRows) is demonstrated in the following example: \includelineno TridiagonalMatrixViewExample_forRows.cpp -On the line 41 we call the method `forRows` (\ref TNL::Matrices::TridiagonalMatrix::forRows) instead of parallel for (\ref TNL::Algorithms::ParallelFor). This method iterates overl all matrix rows and all nonzero matrix elements. The lambda function function on the line 24 therefore do not receive only the matrix row index but also local index of the matrix element (`localIdx`) which is a rank of the nonzero matrix element in given row. The values of the local index for given matrix elements is as follows +On the line 41 we call the method `forRows` (\ref TNL::Matrices::TridiagonalMatrix::forRows) instead of parallel for (\ref TNL::Algorithms::ParallelFor). This method iterates over all matrix rows and all nonzero matrix elements. 
The lambda function function on the line 24 therefore do not receive only the matrix row index but also local index of the matrix element (`localIdx`) which is a rank of the nonzero matrix element in given row. The values of the local index for given matrix elements is as follows -\f[ +\f[ \left( \begin{array}{cccccc} 1 & 2 & & & & \\ @@ -641,129 +543,26 @@ Next parameter `columnIdx` received by the lambda function is the column index o The result looks as follows: -\includelineno TridiagonalMatrixViewExample_forRows.out - -### Flexible reduction in matrix rows - -The *flexible parallel reduction* in rows for tridiagonal matrices is also simmilar as for dense and sparse matrices. It is represented by three lambda functions: - -1. `fetch` reads and preproces data entering the flexible parallel reduction. -2. `reduce` performs the reduction operation. -3. `keep` stores the results from each matrix row. - -See the following example: +\include TridiagonalMatrixViewExample_forRows.out -\includelineno TridiagonalMatrixExample_rowsReduction.cpp +## Multidiagonal matrices -Here we first set tridiagonal matrix (lines 10-27) which looks as +Multidiagonal matrices are generalization of the tridiagonal matrix. It is a special type of sparse matrices with specific pattern of the nonzero matrix elements which are positioned only parallel along diagonal. See the following example: \f[ -\left( -\begin{array}{ccccc} -1 & 3 & & & & \\ -2 & 1 & 3 & & & \\ - & 2 & 1 & 3 & & \\ - & & 2 & 1 & 3 & \\ - & & & 2 & 1 & 3 -\end{array} -\right). -\f] - -Next we want to compute maximal absolute value of the nonzero matrix elements in each row. We allocate the vector `rowMax` where we will store the results (line 32). The lambda function `fetch` (lines 42-44) is responsible for reading the matrix elements. It receives three arguments: + \left( + \begin{array}{ccccccc} + 4 & -1 & . & -1 & . & . \\ + -1 & 4 & -1 & . & -1 & . \\ + . & -1 & 4 & -1 & . & -1 \\ + -1 & . & -1 & 4 & -1 & . \\ + . 
& -1 & . & -1 & 4 & -1 \\ + . & . & -1 & . & -1 & 4 + \end{array} + \right) + \f] -1. `rowIdx` is a row index of the matrix element being currently processed. -2. `columnIdx` is a column index of the matrix elements being currently processed. -3. `value` is a value of the matrix element being currently procesed. - -In our example, the only thing this function has to do, is to compute the absolute value of each matrix element represented by variable `value`. The next lambda function, `reduce` (lines 49-51), performs reduction operation. In this case, it returns maximum of two input values `a` and `b`. Finaly, the lambda function `keep` (lines 56-58) is defined with the following parameters: - -1. `rowIdx` tells the index of the matrix row for which we aim to store the result. -2. `value` is the result obtained in the given matrix row. - -In our example, it just takes the result of the reduction in variable `value` in each row and stores it into the vector `rowMax` via related vector view `rowMaxView`. - -The method `rowsReduction` (\ref TNL::Matrices::SparseMatrix::rowsReduction) activates all the mantioned lambda functions (line 63). It accepts the following arguments: - -1. `begin` is the begining of the matrix rows range on which the reduction will be performed. -2. `end` is the end of the matrix rows range on which the reduction will be performed. The last matrix row which is going to be processed has index `end-1`. -3. `fetch` is the fetch lambda function. -4. `reduce` is the the lmabda function performing the reduction. -5. `keep` is the lambda function responsible for processing the results from particular matrix rows. -6. `zero` is the "zero" element of given reduction opertation also known as *idempotent*. In our example, the role of this element has the lowest number of given type which we can obtain using function `std::numeric_limits< double >::lowest()` from STL. 
- - The results looks as follows: - -\includelineno TridiagonalMatrixExample_rowsReduction.out - -### Tridiagonal matrix-vector product - -Similar to dense and sparse matrices, matrix-vector multiplication is represented by a method `vectorProduct` (\ref TNL::Matrices::TridiagonalMatrix::vectorProduct). It is templated method with two template parameters `InVector` and `OutVector` telling the types of input and output vector respectively. Usually one will substitute some of \ref TNL::Containers::Array, \ref TNL::Containers::ArrayView, \ref TNL::Containers::Vector or \ref TNL::Containers::VectorView for these types. The method computes the following formula - -``` -outVector = matrixMultiplicator * ( *this ) * inVector + outVectorMultiplicator * outVector -``` - -and it accepts the following parameters: - -* `inVector` is the input vector having the same number of elements as the number of matrix columns. -* `outVector` is the output vector having the same number of elements as the number of matrix rows. -* `matrixMultiplicator` is a number by which the result of matrix-vector product is multiplied. -* `outVectorMultiplicator` is a number by which the output vector is multiplied before added to the result of matrix-vector product. -* `begin` is an index of the first matrix row that is involved in the multiplication. It is zero be default. -* `end` is an index indicating the last matrix row that is involved in the multiplication which is `end - 1`. It is the number of matrix rows. - -Note that the output vector dimension must be the same as the number of matrix rows no matter how we set `begin` and `end` parameters. These parameters just say that some matrix rows and the output vector elements are omitted. - -### Tridiagonal matrix IO - -The tridiagonal matrix can be saved to a file using a method `save` (\ref TNL::Matrices::TridiagonalMatrix::save) and restored with a method `load` (\ref TNL::Matrices::TridiagonalMatrix::load). 
For printing the matrix, there is a method `print` (\ref TNL::Matrices::TridiagonalMatrix::print) can be used. - -### Tridiagonal matrix view - -Similar to dense and sparse matrix view, tridiagonal matrix also offers its view for easier use with lambda functions. It is represented by a templated class \ref TNL::Matrices::TridiagonalMatrixView with the following template parameters: - -* `Real` is a type of matrix elements. -* `Device` is a device on which the matrix is allocated. This can be \ref TNL::Devices::Host or \ref TNL::Devices::Cuda. -* `Index` is a type for indexing the matrix elements and also row and column indexes. -* `Organization` tells the ordering of matrix elements in memory. It is either RowMajorOrder or ColumnMajorOrder. - -The first main reason for using the matrix view is its ability to be captured by lambda functions since the copy constructor makes only shallow copy. We can demonstrate it on the example showing the method `setElement` (\ref TNL::Matrices::TridiagonalMatrix::setElement). The code looks as follows: - -\includelineno TridiagonalMatrixViewExample_setElement.cpp - -The matrix view is obtained by the method `getView` (\ref TNL::Matrices::TridiagonalMatrix::getView) on the line 13. We firsrt show, that the view can be used the same way as common matrix (lines 14 and 15) but it can be used the same way even in lambda functions as we can see on the lines 20-26. Compare it with the same example using shared pointer instead of the matrix view: - -\includelineno TridiagonalMatrixExample_setElement.cpp - -The main disadventages are: - -1. The shared pointer must be created together with the matrix (line 14) and there is no way to get it later. The matrix view can be obtained from any matrix at any time. -2. We have to synchronize shared pointers explicitly by calling the function \ref TNL::Pointers::synchronizeSmartPointersOnDevice (line 34). - -So for the sake of using a matrix in lambda functions, the matrix view is better tool. 
The result of both examples looks as: - -\include TridiagonalMatrixExample_setElement.out - -As we mentioned already, the tridiagonal matrix view offers almost all methods which the tridiagonal matrix does. So it can be easily used at almost any situation the same way as the tridiagonal matrix itself. - -## Multidiagonal matrices - -Multidiagonal matrices are generalization of the tridiagonal matrix. It is a special type of sparse matrices with specific pattern of the nonzero matrix elements which are positioned only parallel along diagonal. See the following example: - -\f[ - \left( - \begin{array}{ccccccc} - 4 & -1 & . & -1 & . & . \\ - -1 & 4 & -1 & . & -1 & . \\ - . & -1 & 4 & -1 & . & -1 \\ - -1 & . & -1 & 4 & -1 & . \\ - . & -1 & . & -1 & 4 & -1 \\ - . & . & -1 & . & -1 & 4 - \end{array} - \right) - \f] - - We can see that the matrix elements lay on lines parallel to the main diagonal. Such lines can be expressed by their offsets from the main diagonal. On the following figure, each such line is depicted in different color: + We can see that the matrix elements lay on lines parallel to the main diagonal. Such lines can be expressed by their offsets from the main diagonal. On the following figure, each such line is depicted in different color: \f[ \begin{array}{ccc} @@ -774,7 +573,7 @@ Multidiagonal matrices are generalization of the tridiagonal matrix. It is a spe . & . & \color{green}{*} \\ . & . & . \\ . & . & . \\ - . & . & . + . & . & . \end{array} \left( \begin{array}{ccccccc} @@ -798,7 +597,7 @@ Multidiagonal matrices are generalization of the tridiagonal matrix. It is a spe double | 32-bit int| 12 bytes per element | 8 bytes per element | 75% float | 64-bit int| 12 bytes per element | 4 bytes per element | 30% double | 64-bit int| 16 bytes per element | 8 bytes per element | 50% - + Multidiagonal matrix is a templated class defined in the namespace \ref TNL::Matrices. It has six template parameters: * `Real` is a type of the matrix elements. 
It is `double` by default. @@ -843,7 +642,7 @@ The matrix from this example arises from a discretization of the [Laplace operat We firstly compute the matrix size (`matrixSize`) based on the numerical grid dimensions on the line 16. The subdiagonals offsets are defined by the numerical grid size and since it is four in this example the offsets read as \f$\left\{-4,-1,0,1,4 \right\} \f$ or `{ -gridSize, -1, 0, 1, gridSize}` (line 17). Here we store the offsets (referred as `shifts`) in vector (\ref TNL::Containers::Vector). Next we use a constructor with matrix dimensions and offsets passed via TNL vector (line 18). Next we fetch matrix view (line 19) (see [Multidiagonal matrix view](#multidiagonal_matrix_view)). -The matrix is constructed by iterating over particular nodes of the numerical grid. Each node corresponed to one matrix row. This is why the lambda function `f` (lines 20-35) take two indexes `i` and `j` (line 20). Their values are coordinates of the twodimensional numerical grid. Based on these coodrinates we compute index (`elementIdx`) of the corresponding matrix row (line 21). We fetch matrix row (`row`) by calling the `getRow` method (\ref TNL::Matrices::MutlidiagonalMatrix::getRow) (line 22). Depending on the grid node coordinates we set either the boundary conditions (lines 23-26) for the boundary nodes (those laying on the boundary of the grid and so their coordinates fulfil the condition `i == 0 || j == 0 || i == gridSize - 1 || j == gridSize - 1` ) for which se set onle diagonal element to 1. The inner nodes of the numerical grid are handled on the lines 29-33 where we set coefficients approximating the Laplace operator. We use the method `setElement` of the matrix row (\ref TNL::Matrices::MultidiagonalMatrixRow::setElement) which takes the local index of the nonzero matrix element as the first parametr and the new value of the element as the second parameter. 
The local indexes, in fact, refer to particular subdiagonals as depicted on the following figure (in blue): +The matrix is constructed by iterating over particular nodes of the numerical grid. Each node corresponed to one matrix row. This is why the lambda function `f` (lines 20-35) take two indexes `i` and `j` (line 20). Their values are coordinates of the twodimensional numerical grid. Based on these coodrinates we compute index (`elementIdx`) of the corresponding matrix row (line 21). We fetch matrix row (`row`) by calling the `getRow` method (\ref TNL::Matrices::MutlidiagonalMatrix::getRow) (line 22). Depending on the grid node coordinates we set either the boundary conditions (lines 23-26) for the boundary nodes (those laying on the boundary of the grid and so their coordinates fulfil the condition `i == 0 || j == 0 || i == gridSize - 1 || j == gridSize - 1` ) for which se set onle diagonal element to 1. The inner nodes of the numerical grid are handled on the lines 29-33 where we set coefficients approximating the Laplace operator. We use the method `setElement` of the matrix row (\ref TNL::Matrices::MultidiagonalMatrixRow::setElement) which takes the local index of the nonzero matrix element as the first parametr and the new value of the element as the second parameter. 
The local indexes, in fact, refer to particular subdiagonals as depicted on the following figure (in blue): \f[ \begin{array}{cccc} @@ -864,7 +663,7 @@ The matrix is constructed by iterating over particular nodes of the numerical gr & & & \\ & & & \\ & & & \\ - & & & + & & & \end{array} \left( \begin{array}{cccccccccccccccc} @@ -909,7 +708,7 @@ We use `ParallelFor2D` (\ref TNL::Algorithms::ParallelFor2D) to iterate over all & & & \\ & & & \\ & & & \\ - & & & + & & & \end{array} \left( \begin{array}{cccccccccccccccc} @@ -947,14 +746,14 @@ We use `ParallelFor2D` (\ref TNL::Algorithms::ParallelFor2D) to iterate over all 0 & & & \\ & 0 & & \\ & & 0 & \\ -0 & & & 0 +0 & & & 0 \end{array} \f] The result looks as follows: -\includelineno MultidiagonalMatrixExample_Constructor.out +\include MultidiagonalMatrixExample_Constructor.out Slightly simpler way of doing the same is by using the constructor of multidiagonal matrix taking the subdiagonals offsets as an STL initializer list: @@ -966,13 +765,13 @@ The only change is on the line 17 which reads as TNL::Matrices::MultidiagonalMatrix< double, Device > matrix( matrixSize, matrixSize, { - gridSize, -1, 0, 1, gridSize } ); ``` -Here we call the mentioned cosntructor, which accepts the matrix dimensions (number of rows and columns) as first two parameters and the initializer list with the subdiagonal offsets as the last one. The result looks the same as in the previous example. +Here we call the mentioned constructor, which accepts the matrix dimensions (number of rows and columns) as first two parameters and the initializer list with the subdiagonal offsets as the last one. The result looks the same as in the previous example. 
There is also a constructor with initializer list for matrix elements values as demonstrated by the following example: \includelineno MultidiagonalMatrixExample_Constructor_init_list_2.cpp -Here, we create a matrix which looks as +Here, we create a matrix which looks as \f[ \left( @@ -987,9 +786,9 @@ Here, we create a matrix which looks as \right). \f] -On the lines 25-46, we call the constructor which, in addition to matrix dimensions and subdiagonals offsets, accepts also initializer list of initializer lists with matrix elements values. Each embeded list corresponds to one matrix row and it contains values of matrix elements on particular subdiagonals including those which lies out of the matrix. The resuls looks as follows: +On the lines 25-46, we call the constructor which, in addition to matrix dimensions and subdiagonals offsets, accepts also initializer list of initializer lists with matrix elements values. Each embedded list corresponds to one matrix row and it contains values of matrix elements on particular subdiagonals including those which lies out of the matrix. The result looks as follows: -\includelineno MultidiagonalMatrixExample_Constructor_init_list_2.out +\include MultidiagonalMatrixExample_Constructor_init_list_2.out The matrix elements values can be changed the same way using the method method `setElements` (\ref TNL::Matrices::MutlidiagonalMatrix::setElements) which accepts the elements values in the same form of embedded initializer list. It just does not allow changing the subdiagonals offsets. For this purpose method `setDiagonalsOffsets` (\ref TNL::Matrices::MultidiagonalMatrix::setDiagonalsOffsets) can be used. Note, however, that this method deletes all current matrix elements. @@ -1003,7 +802,7 @@ This examples shows that the method `setElement` can be used both on the host (C In this example, we fetch the matrix view (line 16) immediately after creating the matrix itself (line 15). 
Note that the matrix view can be obtained from the matrix at any time while the shared pointer only at the time of the matrix creation. On the other hand, if the original matrix is changed, all matrix views become invalid which is not true for the shared pointers. So it is better to fetch the matrix view immediately before we use it to avoid the sitaution that you would use invalid matrix view. The method `setElement` (\ref TNL::Matrices::MutlidiagonalMatrixView::setElement) can be used on both host (CPU) (line 19) and the device (lines 25-29) if the lambda function `f` (lines 24-30) runs in GPU kernel. The result of both examles looks the same: -\includelineno MultidiagonalMatrixViewExample_setElement.out +\include MultidiagonalMatrixViewExample_setElement.out Another way for setting the matrix elements is by means of the multidiagonal matrix row: @@ -1028,15 +827,15 @@ Here we use the matrix view again (line 19) and in the lambda function `f` which 2 & -1 & & & \\ -1 & 2 & -1 & & \\ & -1 & 2 & -1 & \\ - & & -1 & 2 & -1 \\ + & & -1 & 2 & -1 \\ & & & -1 & 2 \end{array} \right) \f] -The second parameter of the method `setElement` is the new matrix elements value. An adventage of this method is that it can acces the matrix elements faster. The output of this example looks as follows: +The second parameter of the method `setElement` is the new matrix elements value. An advantage of this method is that it can access the matrix elements faster. 
The output of this example looks as follows: -\includelineno MultidiagonalMatrixViewExample_getRow.out +\include MultidiagonalMatrixViewExample_getRow.out Similar and even a bit simpler way of setting the matrix elements is offered by the method `forRows` (\ref TNL::Matrices::MultidiagonalMatrix::forRows, \ref TNL::Matrices::MultidiagonalMatrixView::forRows) as demonstrated in the following example: @@ -1052,98 +851,9 @@ In this case, we need to provide a lambda function `f` (lines 27-43) which is ca In this example, the matrix element value depends only on the subdiagonal index `localIdx` as we can see on the line 42. The result looks as follows: -\includelineno MultidiagonalMatrixExample_forRows.out - -### Flexible reduction in matrix rows - -The flexible parallel reduction in rows for multidiagonal matrices works the same way as for other matrix types. It consits of three lambda functions: - -1. `fetch` reads and preproces data entering the flexible parallel reduction. -2. `reduce` performs the reduction operation. -3. `keep` stores the results from each matrix row. - -See the following example: - -\includelineno MultidiagonalMatrixExample_rowsReduction.cpp - -On the lines 10-29, we first create the following matrix - -\f[ -\left( -\begin{array}{ccccc} -1 & & & & \\ -2 & 1 & & & \\ -3 & 2 & 1 & & \\ - & 3 & 2 & 1 & \\ - & & 3 & 2 & 1 -\end{array} -\right) -\f] - -and we aim to compute maximal value in each row. We first create vector `rowMax` into which we will store the results and fetch it view `rowMaxView` (line 39). Next we prepare necessary lambda functions: - -* `fetch` (lines 44-46) is responsible for reading the matrix element value which is stored in the constant reference `value` and for returning its absolute value. The other parameters `rowIdx` and `columnIdx` correspond to row and column indexes respectively and they are omitted in our example. -* `reduce` (lines 51-53) returns maximum value of the two input values `a` and `b`. 
-* `keep` (line 58-60) stores the input `value` at the corresponding position, given by the row index `rowIdx`, in the ouput vector view `rowMaxView`. - -Finaly we call the method `rowsReduction` (\ref TNL::Matrices::MultidiagonalMatrix::rowsReduction) with parameters telling the interval of rows to be processed (the first and second parameter), the lambda functions `fetch`, `reduce` and `keep`, and the idempotent element for the reduction operation which is the lowest number of given type (\ref std::numeric_limits< double >::lowest ). The result looks as follows: - -\includelineno MultidiagonalMatrixExample_rowsReduction.out - -### Multidiagonal matrix-vector product - -Similar to matrix types, matrix-vector multiplication is represented by the method `vectorProduct` (\ref TNL::Matrices::MultidiagonalMatrix::vectorProduct). It is templated method with two template parameters `InVector` and `OutVector` telling the types of the input and output vector respectively. Usually one will substitute some of \ref TNL::Containers::Array, \ref TNL::Containers::ArrayView, \ref TNL::Containers::Vector or \ref TNL::Containers::VectorView for these types. The method computes the following formula - -``` -outVector = matrixMultiplicator * ( *this ) * inVector + outVectorMultiplicator * outVector -``` - -and it accepts the following parameters: - -* `inVector` is the input vector having the same number of elements as the number of matrix columns. -* `outVector` is the output vector having the same number of elements as the number of matrix rows. -* `matrixMultiplicator` is a number by which the result of matrix-vector product is multiplied. -* `outVectorMultiplicator` is a number by which the output vector is multiplied before it is added to the result of matrix-vector product. -* `begin` is an index of the first matrix row that is involved in the multiplication. It is zero be default. 
-* `end` is an index indicating the last matrix row that is involved in the multiplication which is `end - 1`. It is the number of matrix rows. - -Note that the output vector dimension must be the same as the number of matrix rows no matter how we set `begin` and `end` parameters. These parameters just say that some matrix rows and the output vector elements are omitted. - -### Multidiagonal matrix IO - -The multidiagonal matrix can be saved to a file using a method `save` (\ref TNL::Matrices::MultiidiagonalMatrix::save) and restored with a method `load` (\ref TNL::Matrices::MultidiagonalMatrix::load). For printing the matrix, there is a method `print` (\ref TNL::Matrices::MultidiagonalMatrix::print) can be used. - -### Multidiagonal matrix view - -Multidiagonal matrix also offers its view for easier use with lambda functions. It is represented by a templated class \ref TNL::Matrices::MultidiagonalMatrixView with the following template parameters: - -* `Real` is a type of matrix elements. -* `Device` is a device on which the matrix is allocated. This can be \ref TNL::Devices::Host or \ref TNL::Devices::Cuda. -* `Index` is a type for indexing the matrix elements and also row and column indexes. -* `Organization` tells the ordering of matrix elements in memory. It is either RowMajorOrder or ColumnMajorOrder. - -The first main reason for using the matrix view is its ability to be captured by lambda functions since the copy constructor makes only shallow copy. We can demonstrate it on the example showing the method `setElement` (\ref TNL::Matrices::MultidiagonalMatrix::setElement). The code looks as follows: - -\includelineno MultidiagonalMatrixViewExample_setElement.cpp - -The matrix view is obtained by the method `getView` (\ref TNL::Matrices::MultidiagonalMatrix::getView) on the line 13. We firsrt show, that the view can be used the same way as common matrix (lines 14 and 15) but it can be used the same way even in lambda functions as we can see on the lines 20-26. 
Compare it with the same example using shared pointer instead of the matrix view: - -\includelineno MultidiagonalMatrixExample_setElement.cpp - -The main disadventages are: - -1. The shared pointer must be created together with the matrix (line 14) and there is no way to get it later. The matrix view can be obtained from any matrix at any time. -2. We have to synchronize shared pointers explicitly by calling the function \ref TNL::Pointers::synchronizeSmartPointersOnDevice (line 34). - -So for the sake of using a matrix in lambda functions, the matrix view is better tool. The result of both examples looks as: +\include MultidiagonalMatrixExample_forRows.out -\includelineno MultidiagonalMatrixExample_setElement.out - -As we mentioned already, the multidiagonal matrix view offers almost all methods which the multidiagonal matrix does. So it can be easily used at almost any situation the same way as the multidiagonal matrix itself. - -TODO: Move to explanation of the matrix view to introduction. - -## Lambda matrices +## Lambda matrices Lambda matrix (\ref TNL::Matrices::LambdaMatrix) is a special type of matrix which could be also called *** matrix-free matrix ***. Its elements are not stored in memory explicitlely but they are evaluated on-the-fly by means of user defined lambda functions. If the matrix elements can be expressed by computationaly not expansive formula, we can significantly reduce the memory consumptions which can be appriciated especially on GPU. Since the memory accesses are quite expensive even on CPU, we can get, at the end, even much faster code. @@ -1202,7 +912,7 @@ With the same lambda functions we can define matrices with different dimensions. The result looks as follows: -\includelineno LambdaMatrixExample_Constructor.out +\include LambdaMatrixExample_Constructor.out Of course, the lambda matrix has the same interface as other matrix types. 
The following example demonstrates the use of the method `forRows` to copy the lambda matrix into the dense matrix: @@ -1212,7 +922,7 @@ Here, we treat the lambda matrix as if it was dense matrix. The lambda function Next we call the lambda function `f` by the method `forRows` (\ref TNL::Matrices::LambdaMatrix::forRows) to set the matrix elements of the dense matrix `denseMatrix` (line 26) via the dense matrix view (`denseView`) (\ref TNL::Matrices::DenseMatrixView). Note, that in the lambda function `f` we get the matrix element value already evaluated in the variable `value` as we are used to from other matrix types. So in fact, the same lambda function `f` woudl do the same job even for sparse matrix or any other. Also note, that in this case we iterate even over all zero matrix elements because the lambda function `rowLengths` (line 13) tells so. The result looks as follows: -\includelineno LambdaMatrixExample_forRows.out +\include LambdaMatrixExample_forRows.out At the end of this part, we show two more examples, how to express a matrix approximating the Laplace operator: @@ -1224,43 +934,439 @@ The following is another way of doing the same but precomputed supporting vector The result of both examples looks as follows: -\includelineno LambdaMatrixExample_Laplace.out +\include LambdaMatrixExample_Laplace.out -### Flexible reduction in matrix rows +## Flexible reduction in matrix rows -The reduction of matrix rows is available for the lambda matrices as well. See the follogin example: +### Dense matrix -\includelineno LambdaMatrixExample_rowsReduction.cpp +Simillar operation to `forRows` is `rowsReduction` (\ref TNL::Matrices::DenseMatrix::rowsReduction) which performs given reduction in each matric row. For example, a matrix-vector product can be seen as a reduction of products of matrix elements and input vector in particular matrix rows. 
The first element of the result vector ios obtained as: -On the lines 14-21, we create the same lower trianguilar lambda matrix as in the previous example. As we did it in similar examples for other matrix types, we want to compute maximal absolute value of matrix elements in each row. For this purpose we define well known lambda functions: +\f[ +y_1 = a_{11} x_1 + a_{12} x_2 + \ldots + a_{1n} x_n = \sum_{j=1}^n a_{1j}x_j +\f] -* `fetch` takes the value of the lambda matrix element and returns its absolute value. -* `reduce` computes maximum value of two input variables. -* `keep` stores the results into output vector `rowMax`. +and in general i-th element of the result vector is computed as -Note that the interface of the lambda functions is the same as for other matrix types. The result looks as follows: +\f[ +y_i = a_{i1} x_1 + a_{i2} x_2 + \ldots + a_{in} x_n = \sum_{j=1}^n a_{ij}x_j. +\f] -\includelineno LambdaMatrixExample_rowsReduction.out +We see that in i-th matrix row we have to compute the sum \f$\sum_{j=1}^n a_{ij}x_j\f$ which is reduction of products \f$ a_{ij}x_j\f$. Similar to *flexible parallel reduction* (\ref TNL::Algorithms::Reduction) we just need to design proper lambda functions. See the following example: -### Lambda matrix-vector product -The matrix-vector multiplication is represented by the method `vectorProduct` (\ref TNL::Matrices::LambdaMatrix::vectorProduct). It is templated method with two template parameters `InVector` and `OutVector` telling the types of the input and output vector respectively. Usually one will substitute some of \ref TNL::Containers::Array, \ref TNL::Containers::ArrayView, \ref TNL::Containers::Vector or \ref TNL::Containers::VectorView for these types. 
The method computes the following formula +\includelineno DenseMatrixExample_rowsReduction_vectorProduct.cpp + +The `fetch` lambda function computes the product \f$ a_{ij}x_j\f$ where \f$ a_{ij} \f$ is represented by `value` and \f$x_j \f$ is represented by `xView[columnIdx]`. The reduction is just a sum of the particular products and it is represented by the lambda function `reduce`. Finally, the lambda function `keep` is responsible for storing the results of the reduction in each matrix row (which is represented by the variable `value`) into the output vector `y`. +The result looks as: + +\include DenseMatrixExample_rowsReduction_vectorProduct.out + +We will show one more example which is computation of maximal absolute value in each matrix row. The results will be stored in a vector: + +\f[ +y_i = \max_{j=1,\ldots,n} |a_{ij}|. +\f] + +See the following example: + +\includelineno DenseMatrixExample_rowsReduction_maxNorm.cpp + + +The `fetch` lambda function just returns the absolute value of \f$a_{ij} \f$ which is represented again by the variable `value`. The `reduce` lambda function returns the larger of the given values and the lambda function `keep` stores the results to the output vector the same way as in the previous example. Of course, if we compute the maximum of all output vector elements we get some kind of max matrix norm. The output looks as: + +\include DenseMatrixExample_rowsReduction_maxNorm.out + +### Sparse matrix + +The *flexible parallel reduction* in rows for sparse matrices is very similar to the one for dense matrices. It consists of three lambda functions: + +1. `fetch` reads and preprocesses data entering the flexible parallel reduction. +2. `reduce` performs the reduction operation. +3. `keep` stores the results from each matrix row. + +See the following example: + +\includelineno SparseMatrixExample_rowsReduction_vectorProduct.cpp + +On the lines 11-16 we set the following matrix: + +\f[ +\left( +\begin{array}{ccccc} +1 & . & . & . & . \\ +1 & 2 & .
& . & . \\ +. & 1 & 8 & . & . \\ +. & . & 1 & 9 & . \\ +. & . & . & . & 1 +\end{array} +\right) +\f] + +Next we prepare input (`x`) and output (`y`) vectors on the lines 21 and 22 and set all elements of the input vector to one (line 27). Since we will need to access these vectors in lambda functions we prepare their views on lines 32 and 33. On the lines 39-41, we define the `fetch` lambda function. It receives three arguments: + +1. `rowIdx` is a row index of the matrix element being currently processed. +2. `columnIdx` is a column index of the matrix element being currently processed. +3. `value` is a value of the matrix element being currently processed. + +We omit the row index and take the column index which indicates the index of the element of the input vector we need to fetch (`xView[ columnIdx ]`). We take its value and multiply it with the value (`value`) of the current matrix element. We do not need to write a lambda function for the reduction since it is only a summation of the intermediate results from the `fetch` lambda and we can use `std::plus<>{}` (see the line 60). The `keep` lambda function offers two parameters: + +1. `rowIdx` tells the index of the matrix row for which we aim to store the result. +2. `value` is the result obtained in the given matrix row. + +In our example, we just write the result into the appropriate element of the output vector `y` which is given just by the row index `rowIdx` -- see the line 47. On the line 53 we start the computation of the matrix-vector product. The method `rowsReduction` (\ref TNL::Matrices::SparseMatrix::rowsReduction) accepts the following arguments: + +1. `begin` is the beginning of the matrix rows range on which the reduction will be performed. +2. `end` is the end of the matrix rows range on which the reduction will be performed. The last matrix row which is going to be processed has index `end-1`. +3. `fetch` is the fetch lambda function. +4. `reduce` is the lambda function performing the reduction. +5.
`keep` is the lambda function responsible for processing the results from particular matrix rows. +6. `zero` is the "zero" element of given reduction operation also known as *idempotent*. It is really 0 for summation in our example (adding zero to any number does not change the result). + +At the end we print the matrix, the input and the output vector -- lines 55-57. The result looks as follows: + +\include SparseMatrixExample_rowsReduction_vectorProduct.out + +### Tridiagonal matrix + +The *flexible parallel reduction* in rows for tridiagonal matrices is also similar to the one for dense and sparse matrices. It is represented by three lambda functions: + +1. `fetch` reads and preprocesses data entering the flexible parallel reduction. +2. `reduce` performs the reduction operation. +3. `keep` stores the results from each matrix row. + +See the following example: + +\includelineno TridiagonalMatrixExample_rowsReduction.cpp + +Here we first set a tridiagonal matrix (lines 10-27) which looks as + +\f[ +\left( +\begin{array}{ccccc} +1 & 3 & & & \\ +2 & 1 & 3 & & \\ + & 2 & 1 & 3 & \\ + & & 2 & 1 & 3 \\ + & & & 2 & 1 +\end{array} +\right). +\f] + +Next we want to compute the maximal absolute value of the nonzero matrix elements in each row. We allocate the vector `rowMax` where we will store the results (line 32). The lambda function `fetch` (lines 42-44) is responsible for reading the matrix elements. It receives three arguments: + +1. `rowIdx` is a row index of the matrix element being currently processed. +2. `columnIdx` is a column index of the matrix element being currently processed. +3. `value` is a value of the matrix element being currently processed. + +In our example, the only thing this function has to do is to compute the absolute value of each matrix element represented by the variable `value`. The next lambda function, `reduce` (lines 49-51), performs the reduction operation. In this case, it returns the maximum of two input values `a` and `b`.
Finally, the lambda function `keep` (lines 56-58) is defined with the following parameters: + +1. `rowIdx` tells the index of the matrix row for which we aim to store the result. +2. `value` is the result obtained in the given matrix row. + +In our example, it just takes the result of the reduction in variable `value` in each row and stores it into the vector `rowMax` via the related vector view `rowMaxView`. + +The method `rowsReduction` (\ref TNL::Matrices::TridiagonalMatrix::rowsReduction) activates all the mentioned lambda functions (line 63). It accepts the following arguments: + +1. `begin` is the beginning of the matrix rows range on which the reduction will be performed. +2. `end` is the end of the matrix rows range on which the reduction will be performed. The last matrix row which is going to be processed has index `end-1`. +3. `fetch` is the fetch lambda function. +4. `reduce` is the lambda function performing the reduction. +5. `keep` is the lambda function responsible for processing the results from particular matrix rows. +6. `zero` is the "zero" element of given reduction operation also known as *idempotent*. In our example, the role of this element is played by the lowest number of the given type which we can obtain using the function `std::numeric_limits< double >::lowest()` from STL. + + The result looks as follows: + +\include TridiagonalMatrixExample_rowsReduction.out + +### Multidiagonal matrix + +The flexible parallel reduction in rows for multidiagonal matrices works the same way as for other matrix types. It consists of three lambda functions: + +1. `fetch` reads and preprocesses data entering the flexible parallel reduction. +2. `reduce` performs the reduction operation. +3. `keep` stores the results from each matrix row.
+ +See the following example: + +\includelineno MultidiagonalMatrixExample_rowsReduction.cpp + +On the lines 10-29, we first create the following matrix + +\f[ +\left( +\begin{array}{ccccc} +1 & & & & \\ +2 & 1 & & & \\ +3 & 2 & 1 & & \\ + & 3 & 2 & 1 & \\ + & & 3 & 2 & 1 +\end{array} +\right) +\f] + +and we aim to compute maximal value in each row. We first create vector `rowMax` into which we will store the results and fetch it view `rowMaxView` (line 39). Next we prepare necessary lambda functions: + +* `fetch` (lines 44-46) is responsible for reading the matrix element value which is stored in the constant reference `value` and for returning its absolute value. The other parameters `rowIdx` and `columnIdx` correspond to row and column indexes respectively and they are omitted in our example. +* `reduce` (lines 51-53) returns maximum value of the two input values `a` and `b`. +* `keep` (line 58-60) stores the input `value` at the corresponding position, given by the row index `rowIdx`, in the ouput vector view `rowMaxView`. + +Finaly we call the method `rowsReduction` (\ref TNL::Matrices::MultidiagonalMatrix::rowsReduction) with parameters telling the interval of rows to be processed (the first and second parameter), the lambda functions `fetch`, `reduce` and `keep`, and the idempotent element for the reduction operation which is the lowest number of given type (\ref std::numeric_limits< double >::lowest ). The result looks as follows: + +\include MultidiagonalMatrixExample_rowsReduction.out + +### Lambda matrix + +The reduction of matrix rows is available for the lambda matrices as well. See the follogin example: + +\includelineno LambdaMatrixExample_rowsReduction.cpp + +On the lines 14-21, we create the same lower trianguilar lambda matrix as in the previous example. As we did it in similar examples for other matrix types, we want to compute maximal absolute value of matrix elements in each row. 
For this purpose we define well known lambda functions: + +* `fetch` takes the value of the lambda matrix element and returns its absolute value. +* `reduce` computes maximum value of two input variables. +* `keep` stores the results into output vector `rowMax`. + +Note that the interface of the lambda functions is the same as for other matrix types. The result looks as follows: + +\include LambdaMatrixExample_rowsReduction.out + +## Matrix-vector product + +### Dense matrix + +One of the most important matrix operation is the matrix-vector multiplication. It is represented by a method `vectorProduct` (\ref TNL::Matrices::DenseMatrix::vectorProduct). It is templated method with two template parameters `InVector` and `OutVector` telling the types of input and output vector respectively. Usually one will substitute some of \ref TNL::Containers::Array, \ref TNL::Containers::ArrayView, \ref TNL::Containers::Vector or \ref TNL::Containers::VectorView for these types. The method accepts the following parameters: + +* `inVector` is the input vector having the same number of elements as the number of matrix columns. +* `outVector` is the output vector having the same number of elements as the number of matrix rows. +* `matrixMultiplicator` is a number by which the result of matrix-vector product is multiplied. +* `outVectorMultiplicator` is a number by which the output vector is multiplied before added to the result of matrix-vector product. +* `begin` is an index of the first matrix row that is involved in the multiplication. It is zero be default. +* `end` is an index of the last matrix row that is involved in the multiplication. It is the last matrix row by default. + +Note that the ouput vector dimension must be the same as the number of matrix rows no matter how we set `begin` and `end` parameters. These parameters just say that some matrix rows and the output vector elements are omitted. 
+ +To summarize, this method computes the following formula: + +`outVector = matrixMultiplicator * ( *this ) * inVector + outVectorMultiplicator * outVector.` + +### Sparse matrix + +As we mentioned already in the part explaining the dense matrices, matrix-vector multiplication or in this case sparse matrix-vector multiplication ([SpMV](https://en.wikipedia.org/wiki/Sparse_matrix-vector_multiplication)) is one of the most important operations in numerical mathematics and high-performance computing. It is represented by a method `vectorProduct` (\ref TNL::Matrices::SparseMatrix::vectorProduct). It is templated method with two template parameters `InVector` and `OutVector` telling the types of input and output vector respectively. Usually one will substitute some of \ref TNL::Containers::Array, \ref TNL::Containers::ArrayView, \ref TNL::Containers::Vector or \ref TNL::Containers::VectorView for these types. The method computes the following formula + +``` +outVector = matrixMultiplicator * ( *this ) * inVector + outVectorMultiplicator * outVector +``` + +and it accepts the following parameters: + +* `inVector` is the input vector having the same number of elements as the number of matrix columns. +* `outVector` is the output vector having the same number of elements as the number of matrix rows. +* `matrixMultiplicator` is a number by which the result of matrix-vector product is multiplied. +* `outVectorMultiplicator` is a number by which the output vector is multiplied before added to the result of matrix-vector product. +* `begin` is an index of the first matrix row that is involved in the multiplication. It is zero be default. +* `end` is an index of the last matrix row that is involved in the multiplication. It is the last matrix row by default. + +Note that the ouput vector dimension must be the same as the number of matrix rows no matter how we set `begin` and `end` parameters. 
These parameters just say that some matrix rows and the output vector elements are omitted. + +### Tridiagonal matrix + +Similar to dense and sparse matrices, matrix-vector multiplication is represented by a method `vectorProduct` (\ref TNL::Matrices::TridiagonalMatrix::vectorProduct). It is templated method with two template parameters `InVector` and `OutVector` telling the types of input and output vector respectively. Usually one will substitute some of \ref TNL::Containers::Array, \ref TNL::Containers::ArrayView, \ref TNL::Containers::Vector or \ref TNL::Containers::VectorView for these types. The method computes the following formula + +``` +outVector = matrixMultiplicator * ( *this ) * inVector + outVectorMultiplicator * outVector +``` + +and it accepts the following parameters: + +* `inVector` is the input vector having the same number of elements as the number of matrix columns. +* `outVector` is the output vector having the same number of elements as the number of matrix rows. +* `matrixMultiplicator` is a number by which the result of matrix-vector product is multiplied. +* `outVectorMultiplicator` is a number by which the output vector is multiplied before added to the result of matrix-vector product. +* `begin` is an index of the first matrix row that is involved in the multiplication. It is zero be default. +* `end` is an index indicating the last matrix row that is involved in the multiplication which is `end - 1`. It is the number of matrix rows. + +Note that the output vector dimension must be the same as the number of matrix rows no +matter how we set `begin` and `end` parameters. These parameters just say that some matrix rows and the output vector elements are omitted. + +### Multidiagonal matrix + + +Similar to matrix types, matrix-vector multiplication is represented by the method `vectorProduct` (\ref TNL::Matrices::MultidiagonalMatrix::vectorProduct). 
It is templated method with two template parameters `InVector` and `OutVector` telling the types of the input and output vector respectively. Usually one will substitute some of \ref TNL::Containers::Array, \ref TNL::Containers::ArrayView, \ref TNL::Containers::Vector or \ref TNL::Containers::VectorView for these types. The method computes the following formula + +``` +outVector = matrixMultiplicator * ( *this ) * inVector + outVectorMultiplicator * outVector +``` + +and it accepts the following parameters: + +* `inVector` is the input vector having the same number of elements as the number of matrix columns. +* `outVector` is the output vector having the same number of elements as the number of matrix rows. +* `matrixMultiplicator` is a number by which the result of matrix-vector product is multiplied. +* `outVectorMultiplicator` is a number by which the output vector is multiplied before it is added to the result of matrix-vector product. +* `begin` is an index of the first matrix row that is involved in the multiplication. It is zero be default. +* `end` is an index indicating the last matrix row that is involved in the multiplication which is `end - 1`. It is the number of matrix rows. + +Note that the output vector dimension must be the same as the number of matrix rows no matter how we set `begin` and `end` parameters. These parameters just say that some matrix rows and the output vector elements are omitted. + +### Lambda matrix + +The matrix-vector multiplication is represented by the method `vectorProduct` (\ref TNL::Matrices::LambdaMatrix::vectorProduct). It is templated method with two template parameters `InVector` and `OutVector` telling the types of the input and output vector respectively. Usually one will substitute some of \ref TNL::Containers::Array, \ref TNL::Containers::ArrayView, \ref TNL::Containers::Vector or \ref TNL::Containers::VectorView for these types. 
The method computes the following formula + +``` +outVector = matrixMultiplicator * ( *this ) * inVector + outVectorMultiplicator * outVector +``` + +and it accepts the following parameters: + +* `inVector` is the input vector having the same number of elements as the number of matrix columns. +* `outVector` is the output vector having the same number of elements as the number of matrix rows. +* `matrixMultiplicator` is a number by which the result of matrix-vector product is multiplied. +* `outVectorMultiplicator` is a number by which the output vector is multiplied before it is added to the result of matrix-vector product. +* `begin` is an index of the first matrix row that is involved in the multiplication. It is zero be default. +* `end` is an index indicating the last matrix row that is involved in the multiplication which is `end - 1`. It is the number of matrix rows. + +Note that the output vector dimension must be the same as the number of matrix rows no matter how we set `begin` and `end` parameters. These parameters just say that some matrix rows and the output vector elements are omitted. + +## Matrix I/O operations + +### Dense matrix + +The dense matrix can be saved to a file using a method `save` (\ref TNL::Matrices::DenseMatrix::save) and restored with a method `load` (\ref TNL::Matrices::DenseMatrix::load). To print the matrix, there is a method `print` (\ref TNL::Matrices::DenseMatrix::print) can be used. + +### Sparse matrix +The sparse matrix can be saved to a file using a method `save` (\ref TNL::Matrices::SparseMatrix::save) and restored with a method `load` (\ref TNL::Matrices::SparseMatrix::load). For printing the matrix, there is a method `print` (\ref TNL::Matrices::SparseMatrix::print) can be used. + +### Tridiagonal matrix IO + +The tridiagonal matrix can be saved to a file using a method `save` (\ref TNL::Matrices::TridiagonalMatrix::save) and restored with a method `load` (\ref TNL::Matrices::TridiagonalMatrix::load). 
For printing the matrix, the method `print` (\ref TNL::Matrices::TridiagonalMatrix::print) can be used. + +### Multidiagonal matrix IO + +The multidiagonal matrix can be saved to a file using a method `save` (\ref TNL::Matrices::MultidiagonalMatrix::save) and restored with a method `load` (\ref TNL::Matrices::MultidiagonalMatrix::load). For printing the matrix, the method `print` (\ref TNL::Matrices::MultidiagonalMatrix::print) can be used. + +### Lambda matrix IO + +The lambda matrix can be printed by means of the method `print` (\ref TNL::Matrices::LambdaMatrix::print). The lambda matrix does not offer the methods `save` and `load` since it does not manage any data. Of course, the lambda function evaluating the matrix elements can use any supporting data containers but it is up to these containers to manage the IO operations. + +## Matrix view + +### Dense matrix view + +Similar to array view (\ref TNL::Containers::ArrayView) and vector view (\ref TNL::Containers::VectorView), matrices also offer their view for easier use with lambda functions. For the dense matrix there is a `DenseMatrixView` (\ref TNL::Matrices::DenseMatrixView) which is a templated class with the following template arguments (they are the same as for `DenseMatrix` -- \ref TNL::Matrices::DenseMatrix -- except for the allocator): + +* `Real` is a type of matrix elements. +* `Device` is a device on which the matrix is allocated. This can be \ref TNL::Devices::Host or \ref TNL::Devices::Cuda. +* `Index` is a type for indexing the matrix elements and also row and column indexes. +* `Organization` tells the ordering of matrix elements in memory. It is either RowMajorOrder or ColumnMajorOrder. + +The first main reason for using the dense matrix view is its ability to be captured by lambda functions since the copy constructor makes only a shallow copy. We will demonstrate it on the example showing the method `setElement` (\ref TNL::Matrices::DenseMatrix::setElement).
However, the `SharedPointer` will be replaced with the `DenseMatrixView`. The code looks as follows: + +\includelineno DenseMatrixViewExample_setElement.cpp + +You can see that we do not need to use the shared pointer (\ref TNL::Pointers::SharedPointer) as we did in the example demonstrating the method `setElement` for dense matrix. And the result is: + +\include DenseMatrixViewExample_setElement.out + +The second reason for using the `DenseMatrixView` is to encapsulate data allocated by some other library or program then TNL. The following example demonstrates how to do it: + +\includelineno DenseMatrixViewExample_data_encapsulation.cpp + +On the lines 18--34 we create matrix by allocating array `data` and filling the matrix using a formula \f$ a_{ij} = i * size + j + 1\f$. We do it first on the host (lines 18--21) in auxilliary array `host_data` to make initiation of the array `data` easier in case when `Device` is GPU. Next, depending on the argument `Device`, we allocate the array `data` on the host or on GPU and copy data from the arary `host_data` to the array `data`. To insert this array into the dense matrix view, we first need to encapsulate it with vector view (\ref TNL::Conatianers::VectorView) `dataView` on the line 39 which can be then used to create the dense matrix view `matrix` on the line 40. Note that wee must set proper matrix elements organizationa which is `RowMajorOrder` (\ref TNL::Algorithms::Segments::RowMajorOrder) in this example. Next, we print the matrix to see if the encapsulation was succesfull (lines 42 and 43) and finaly we demonstrate manipulation with matrix elements (lines 45--48) and we print the result (lines 50 and 51). + +The result looks as follows: + +\include DenseMatrixViewExample_data_encapsulation.out + +The dense matrix view offers almost all methods which the dense matrix does. So it can be easily used at almost any situation the same way as the dense matrix itself. 
+ +### Sparse matrix view + +Sparse matrix view serves, simillar to other views in TNL, to data sharing and for use with lambda functions (views can be easily captured since they make only shallow copy). The sparse matrix view (\ref TNL::Matrices::SparseMatrixView) is templated class having the following template arguments (they are the same as for `SparseMatrix` -- \ref TNL::Matrices::SparseMatrix -- except of the allocators): + +* `Real` is type if the matrix elements. It is `double` by default. +* `Device` is a device where the matrix is allocated. Currently it can be either \ref TNL::Devices::Host for CPU or \ref TNL::Devices::Cuda for GPU supporting CUDA. It is \ref TNL::Devices::Host by default. +* `Index` is a type to be used for indexing of the matrix elements. It is `int` by default. +* `MatrixType` tells if the matrix is symmetric (\ref TNL::Matrices::SymmetricMatrix) or general (\ref TNL::Matrices::GeneralMatrix). It is a \ref TNL::Matrices::GeneralMatrix by default. +* `Segments` define the format of the sparse matrix. It can be (by default, it is \ref TNL::Algorithms::Segments::CSR): + * \ref TNL::Algorithms::Segments::CSR for [CSR format](https://en.wikipedia.org/wiki/Sparse_matrix#Compressed_sparse_row_(CSR,_CRS_or_Yale_format)). + * \ref TNL::Algorithms::Segments::Ellpack for [Ellpack format](http://mgarland.org/files/papers/nvr-2008-004.pdf). + * \ref TNL::Algorithms::Segments::SlicedEllpack for [SlicedEllpack format](https://link.springer.com/chapter/10.1007/978-3-642-11515-8_10) which was also presented as [Row-grouped CSR format](https://arxiv.org/abs/1012.2270). + * \ref TNL::Algorithms::Segments::ChunkedEllpack for [ChunkedEllpack format](http://geraldine.fjfi.cvut.cz/~oberhuber/data/vyzkum/publikace/12-heller-oberhuber-improved-rgcsr-format.pdf) which we reffered as Improved Row-grouped CSR and we renamed it to Ellpack format since it uses padding zeros. 
+ * \ref TNL::Algorithms::Segments::BiEllpack for [BiEllpack format](https://www.sciencedirect.com/science/article/pii/S0743731514000458?casa_token=2phrEj0Ef1gAAAAA:Lgf6rMBUN6T7TJne6mAgI_CSUJ-jR8jz7Eghdv6L0SJeGm4jfso-x6Wh8zgERk3Si7nFtTAJngg). +* `ComputeReal` is type which is used for internal computations. By default it is the same as `Real` if `Real` is not `bool`. If `Real` is `bool`, `ComputeReal` is set to `Index` type. This can be changed, of course, by the user. + +**If `Real` is set to `bool`, we get *a binary matrix view*.** + +The following example shows the use of `SparseMatrixView` with lambda functions: + +\includelineno SparseMatrixViewExample_setElement.cpp + +The result looks as follows: + +\include SparseMatrixViewExample_setElement.out + +### Tridiagonal matrix view + +Similar to dense and sparse matrix view, tridiagonal matrix also offers its view for easier use with lambda functions. It is represented by a templated class \ref TNL::Matrices::TridiagonalMatrixView with the following template parameters: + +* `Real` is a type of matrix elements. +* `Device` is a device on which the matrix is allocated. This can be \ref TNL::Devices::Host or \ref TNL::Devices::Cuda. +* `Index` is a type for indexing the matrix elements and also row and column indexes. +* `Organization` tells the ordering of matrix elements in memory. It is either RowMajorOrder or ColumnMajorOrder. + +The first main reason for using the matrix view is its ability to be captured by lambda functions since the copy constructor makes only shallow copy. We can demonstrate it on the example showing the method `setElement` (\ref TNL::Matrices::TridiagonalMatrix::setElement). The code looks as follows: + +\includelineno TridiagonalMatrixViewExample_setElement.cpp + +The matrix view is obtained by the method `getView` (\ref TNL::Matrices::TridiagonalMatrix::getView) on the line 13. 
We first show that the view can be used the same way as a common matrix (lines 14 and 15) but it can be used the same way even in lambda functions as we can see on the lines 20-26. Compare it with the same example using shared pointer instead of the matrix view: + +\includelineno TridiagonalMatrixExample_setElement.cpp + +The main disadvantages are: + +1. The shared pointer must be created together with the matrix (line 14) and there is no way to get it later. The matrix view can be obtained from any matrix at any time. +2. We have to synchronize shared pointers explicitly by calling the function \ref TNL::Pointers::synchronizeSmartPointersOnDevice (line 34). + +So for the sake of using a matrix in lambda functions, the matrix view is a better tool. The result of both examples looks as follows: + +\include TridiagonalMatrixExample_setElement.out + +As we mentioned already, the tridiagonal matrix view offers almost all methods which the tridiagonal matrix does. So it can be easily used in almost any situation the same way as the tridiagonal matrix itself. + +### Multidiagonal matrix view + +Multidiagonal matrix also offers its view for easier use with lambda functions. It is represented by a templated class \ref TNL::Matrices::MultidiagonalMatrixView with the following template parameters: + +* `Real` is a type of matrix elements. +* `Device` is a device on which the matrix is allocated. This can be \ref TNL::Devices::Host or \ref TNL::Devices::Cuda. +* `Index` is a type for indexing the matrix elements and also row and column indexes. +* `Organization` tells the ordering of matrix elements in memory. It is either RowMajorOrder or ColumnMajorOrder. + +The first main reason for using the matrix view is its ability to be captured by lambda functions since the copy constructor makes only shallow copy. We can demonstrate it on the example showing the method `setElement` (\ref TNL::Matrices::MultidiagonalMatrix::setElement).
The code looks as follows: + +\includelineno MultidiagonalMatrixViewExample_setElement.cpp + +The matrix view is obtained by the method `getView` (\ref TNL::Matrices::MultidiagonalMatrix::getView) on the line 13. We first show that the view can be used the same way as a common matrix (lines 14 and 15) but it can be used the same way even in lambda functions as we can see on the lines 20-26. Compare it with the same example using shared pointer instead of the matrix view: + +\includelineno MultidiagonalMatrixExample_setElement.cpp + +The main disadvantages are: + +1. The shared pointer must be created together with the matrix (line 14) and there is no way to get it later. The matrix view can be obtained from any matrix at any time. +2. We have to synchronize shared pointers explicitly by calling the function \ref TNL::Pointers::synchronizeSmartPointersOnDevice (line 34). + +So for the sake of using a matrix in lambda functions, the matrix view is a better tool. The result of both examples looks as follows: + +\include MultidiagonalMatrixExample_setElement.out + +As we mentioned already, the multidiagonal matrix view offers almost all methods which the multidiagonal matrix does. So it can be easily used in almost any situation the same way as the multidiagonal matrix itself. + +TODO: Move the explanation of the matrix view to the introduction. + -``` -outVector = matrixMultiplicator * ( *this ) * inVector + outVectorMultiplicator * outVector -``` -and it accepts the following parameters: -* `inVector` is the input vector having the same number of elements as the number of matrix columns. -* `outVector` is the output vector having the same number of elements as the number of matrix rows. -* `matrixMultiplicator` is a number by which the result of matrix-vector product is multiplied. -* `outVectorMultiplicator` is a number by which the output vector is multiplied before it is added to the result of matrix-vector product.
-* `begin` is an index of the first matrix row that is involved in the multiplication. It is zero be default. -* `end` is an index indicating the last matrix row that is involved in the multiplication which is `end - 1`. It is the number of matrix rows. -Note that the output vector dimension must be the same as the number of matrix rows no matter how we set `begin` and `end` parameters. These parameters just say that some matrix rows and the output vector elements are omitted. -### Lambda matrix IO -The lambda matrix, can be printed by the means of the method `print` (\ref TNL::Matrices::LambdaMatrix::print). The lambda matrix do not offer the methods `save` and `load` since it does not manage any data. Of course, the lambda function evaluating the matrix elements can use any supporting data containers but it is up these containers to manage the IO operations. \ No newline at end of file -- GitLab From 5b70df92ef360992c01462815e83824fa69b88c9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= Date: Sun, 10 Jan 2021 17:29:39 +0100 Subject: [PATCH 23/53] Added DenseMatrixRowView::setElement with localIdx parameter for compatibility with sparse matrices. --- src/TNL/Matrices/DenseMatrixRowView.h | 34 +++++++++++++++++-------- src/TNL/Matrices/DenseMatrixRowView.hpp | 14 +++++++++- 2 files changed, 37 insertions(+), 11 deletions(-) diff --git a/src/TNL/Matrices/DenseMatrixRowView.h b/src/TNL/Matrices/DenseMatrixRowView.h index 498ec84f1..49774949b 100644 --- a/src/TNL/Matrices/DenseMatrixRowView.h +++ b/src/TNL/Matrices/DenseMatrixRowView.h @@ -15,17 +15,17 @@ namespace TNL { /** * \brief RowView is a simple structure for accessing rows of dense matrix. - * + * * \tparam SegmentView is a segment view of segments representing the matrix format. * \tparam ValuesView is a vector view storing the matrix elements values. - * + * * See \ref DenseMatrix and \ref DenseMatrixView. 
- * + * * \par Example * \include Matrices/DenseMatrix/DenseMatrixExample_getRow.cpp * \par Output * \include DenseMatrixExample_getRow.out - * + * * \par Example * \include Matrices/DenseMatrix/DenseMatrixViewExample_getRow.cpp * \par Output @@ -59,7 +59,7 @@ class DenseMatrixRowView /** * \brief Constructor with \e segmentView and \e values - * + * * \param segmentView instance of SegmentViewType representing matrix row. * \param values is a container view for storing the matrix elements values. */ @@ -69,7 +69,7 @@ class DenseMatrixRowView /** * \brief Returns size of the matrix row, i.e. number of matrix elements in this row. - * + * * \return Size of the matrix row. */ __cuda_callable__ @@ -77,9 +77,9 @@ class DenseMatrixRowView /** * \brief Returns constants reference to an element with given column index. - * + * * \param column is column index of the matrix element. - * + * * \return constant reference to the matrix element. */ __cuda_callable__ @@ -87,9 +87,9 @@ class DenseMatrixRowView /** * \brief Returns non-constants reference to an element with given column index. - * + * * \param column is a column index of the matrix element. - * + * * \return non-constant reference to the matrix element. */ __cuda_callable__ @@ -104,6 +104,20 @@ class DenseMatrixRowView __cuda_callable__ void setElement( const IndexType column, const RealType& value ); + + /** + * \brief Sets value of matrix element with given column index + * + * The \e localIdx parameter is here only for compatibility with + * the sparse matrices and it is omitted. + * + * \param column is a column index of the matrix element. + * \param value is a value the matrix element will be set to. 
+ */ + __cuda_callable__ + void setElement( const IndexType localIdx, + const IndexType column, + const RealType& value ); protected: SegmentViewType segmentView; diff --git a/src/TNL/Matrices/DenseMatrixRowView.hpp b/src/TNL/Matrices/DenseMatrixRowView.hpp index 9ca725396..1c7af4adf 100644 --- a/src/TNL/Matrices/DenseMatrixRowView.hpp +++ b/src/TNL/Matrices/DenseMatrixRowView.hpp @@ -56,7 +56,7 @@ getElement( const IndexType column ) -> RealType& template< typename SegmentView, typename ValuesView > -__cuda_callable__ void +__cuda_callable__ void DenseMatrixRowView< SegmentView, ValuesView >:: setElement( const IndexType column, const RealType& value ) @@ -66,6 +66,18 @@ setElement( const IndexType column, values[ globalIdx ] = value; } +template< typename SegmentView, + typename ValuesView > +__cuda_callable__ void +DenseMatrixRowView< SegmentView, ValuesView >:: +setElement( const IndexType localIdx, + const IndexType column, + const RealType& value ) +{ + TNL_ASSERT_LT( column, this->getSize(), "Column index exceeds matrix row size." ); + const IndexType globalIdx = segmentView.getGlobalIndex( column ); + values[ globalIdx ] = value; +} } // namespace Matrices } // namespace TNL -- GitLab From df48151beb3f082c1bb820886fec860b6747182c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= Date: Sun, 10 Jan 2021 17:30:38 +0100 Subject: [PATCH 24/53] Added simple benchmark for comparions of different ways of matrix setup for a purpose of matrix tutorial. 
--- .../Matrices/MatrixSetup_Benchmark.cpp | 315 ++++++++++++++++++ .../Matrices/MatrixSetup_Benchmark.cu | 1 + 2 files changed, 316 insertions(+) create mode 100644 Documentation/Tutorials/Matrices/MatrixSetup_Benchmark.cpp create mode 120000 Documentation/Tutorials/Matrices/MatrixSetup_Benchmark.cu diff --git a/Documentation/Tutorials/Matrices/MatrixSetup_Benchmark.cpp b/Documentation/Tutorials/Matrices/MatrixSetup_Benchmark.cpp new file mode 100644 index 000000000..fd1841e4d --- /dev/null +++ b/Documentation/Tutorials/Matrices/MatrixSetup_Benchmark.cpp @@ -0,0 +1,315 @@ +#include +#include +#include +#include +#include +#include + +const int testsCount = 5; + +template< typename Matrix > +void STL_Map( const int gridSize, Matrix& matrix ) +{ + /*** + * Set matrix representing approximation of the Laplace operator on regular + * grid using the finite difference method by means of STL map. + */ + const int matrixSize = gridSize * gridSize; + matrix.setDimensions( matrixSize, matrixSize ); + std::map< std::pair< int, int >, double > map; + for( int j = 0; j < gridSize; j++ ) + for( int i = 0; i < gridSize; i++ ) + { + const int rowIdx = j * gridSize + i; + if( i == 0 || j == 0 || i == gridSize - 1 || j == gridSize - 1 ) + map.insert( std::make_pair( std::make_pair( rowIdx, rowIdx ), 1.0 ) ); + else + { + map.insert( std::make_pair( std::make_pair( rowIdx, rowIdx - gridSize ), 1.0 ) ); + map.insert( std::make_pair( std::make_pair( rowIdx, rowIdx - 1 ), 1.0 ) ); + map.insert( std::make_pair( std::make_pair( rowIdx, rowIdx ), -4.0 ) ); + map.insert( std::make_pair( std::make_pair( rowIdx, rowIdx + 1 ), 1.0 ) ); + map.insert( std::make_pair( std::make_pair( rowIdx, rowIdx + gridSize ), 1.0 ) ); + } + } + matrix.setElements( map ); +} + +template< typename Matrix > +void setElement_on_host( const int gridSize, Matrix& matrix ) +{ + /*** + * Set matrix representing approximation of the Laplace operator on regular + * grid using the finite difference method by means 
setElement method called + * from the host system. + */ + const int matrixSize = gridSize * gridSize; + TNL::Containers::Vector< int, typename Matrix::DeviceType, int > rowCapacities( matrixSize, 5 ); + matrix.setDimensions( matrixSize, matrixSize ); + matrix.setRowCapacities( rowCapacities ); + + for( int j = 0; j < gridSize; j++ ) + for( int i = 0; i < gridSize; i++ ) + { + const int rowIdx = j * gridSize + i; + if( i == 0 || j == 0 || i == gridSize - 1 || j == gridSize - 1 ) + matrix.setElement( rowIdx, rowIdx, 1.0 ); + else + { + matrix.setElement( rowIdx, rowIdx - gridSize, 1.0 ); + matrix.setElement( rowIdx, rowIdx - 1, 1.0 ); + matrix.setElement( rowIdx, rowIdx, -4.0 ); + matrix.setElement( rowIdx, rowIdx + 1, 1.0 ); + matrix.setElement( rowIdx, rowIdx + gridSize, 1.0 ); + } + } +} + +template< typename Matrix > +void setElement_on_device( const int gridSize, Matrix& matrix ) +{ + /*** + * Set matrix representing approximation of the Laplace operator on regular + * grid using the finite difference method by means of setElement method called + * from the native device. 
+ */ + const int matrixSize = gridSize * gridSize; + TNL::Containers::Vector< int, typename Matrix::DeviceType, int > rowCapacities( matrixSize, 5 ); + matrix.setDimensions( matrixSize, matrixSize ); + matrix.setRowCapacities( rowCapacities ); + + auto matrixView = matrix.getView(); + auto f = [=] __cuda_callable__ ( int i, int j ) mutable { + const int rowIdx = j * gridSize + i; + if( i == 0 || j == 0 || i == gridSize - 1 || j == gridSize - 1 ) + matrixView.setElement( rowIdx, rowIdx, 1.0 ); + else + { + matrixView.setElement( rowIdx, rowIdx - gridSize, 1.0 ); + matrixView.setElement( rowIdx, rowIdx - 1, 1.0 ); + matrixView.setElement( rowIdx, rowIdx, -4.0 ); + matrixView.setElement( rowIdx, rowIdx + 1, 1.0 ); + matrixView.setElement( rowIdx, rowIdx + gridSize, 1.0 ); + } + }; + TNL::Algorithms::ParallelFor2D< typename Matrix::DeviceType >::exec( 0, 0, gridSize, gridSize, f ); +} + +template< typename Matrix > +void getRow( const int gridSize, Matrix& matrix ) +{ + /*** + * Set matrix representing approximation of the Laplace operator on regular + * grid using the finite difference method by means of getRow method. 
+ */ + const int matrixSize = gridSize * gridSize; + TNL::Containers::Vector< int, typename Matrix::DeviceType, int > rowCapacities( matrixSize, 5 ); + matrix.setDimensions( matrixSize, matrixSize ); + matrix.setRowCapacities( rowCapacities ); + + auto matrixView = matrix.getView(); + auto f = [=] __cuda_callable__ ( int rowIdx ) mutable { + const int i = rowIdx % gridSize; + const int j = rowIdx / gridSize; + auto row = matrixView.getRow( rowIdx ); + if( i == 0 || j == 0 || i == gridSize - 1 || j == gridSize - 1 ) + row.setElement( 2, rowIdx, 1.0 ); + else + { + row.setElement( 0, rowIdx - gridSize, 1.0 ); + row.setElement( 1, rowIdx - 1, 1.0 ); + row.setElement( 2, rowIdx, -4.0 ); + row.setElement( 3, rowIdx + 1, 1.0 ); + row.setElement( 4, rowIdx + gridSize, 1.0 ); + } + }; + TNL::Algorithms::ParallelFor< typename Matrix::DeviceType >::exec( 0, matrixSize, f ); +} + +template< typename Matrix > +void forRows( const int gridSize, Matrix& matrix ) +{ + /*** + * Set matrix representing approximation of the Laplace operator on regular + * grid using the finite difference method by means of forRows method. 
+ */ + + const int matrixSize = gridSize * gridSize; + TNL::Containers::Vector< int, typename Matrix::DeviceType, int > rowCapacities( matrixSize, 5 ); + matrix.setDimensions( matrixSize, matrixSize ); + matrix.setRowCapacities( rowCapacities ); + auto matrixView = matrix.getView(); + + auto f = [=] __cuda_callable__ ( int rowIdx, int localIdx, int& columnIdx, float& value, bool& compute ) mutable { + const int i = rowIdx % gridSize; + const int j = rowIdx / gridSize; + auto row = matrixView.getRow( rowIdx ); + if( i == 0 || j == 0 || i == gridSize - 1 || j == gridSize - 1 && localIdx == 0 ) + { + columnIdx = rowIdx; + value = 1.0; + } + else + { + switch( localIdx ) + { + case 0: + columnIdx = rowIdx - gridSize; + value = 1.0; + break; + case 1: + columnIdx = rowIdx - 1; + value = 1.0; + break; + case 2: + columnIdx = rowIdx; + value = -4.0; + break; + case 3: + columnIdx = rowIdx + 1; + value = 1.0; + break; + case 4: + columnIdx = rowIdx + gridSize; + value = 1.0; + break; + } + } + }; + matrix.forRows( 0, matrixSize, f ); +} + +template< typename Device > +void laplaceOperatorDenseMatrix() +{ + std::cout << " Dense matrix test:" << std::endl; + for( int gridSize = 16; gridSize <= 8192; gridSize *= 2 ) + { + std::cout << " Grid size = " << gridSize << std::endl; + TNL::Timer timer; + + std::cout << " setElement on host: "; + timer.reset(); + timer.start(); + for( int i = 0; i < testsCount; i++ ) + { + TNL::Matrices::DenseMatrix< float, Device, int > matrix; + setElement_on_host( gridSize, matrix ); + } + timer.stop(); + std::cout << timer.getRealTime() / ( double ) testsCount << " sec." << std::endl; + + std::cout << " setElement on device: "; + timer.reset(); + timer.start(); + for( int i = 0; i < testsCount; i++ ) + { + TNL::Matrices::DenseMatrix< float, Device, int > matrix; + setElement_on_device( gridSize, matrix ); + } + timer.stop(); + std::cout << timer.getRealTime() / ( double ) testsCount << " sec." 
<< std::endl; + + std::cout << " getRow: "; + timer.reset(); + timer.start(); + for( int i = 0; i < testsCount; i++ ) + { + TNL::Matrices::DenseMatrix< float, Device, int > matrix; + getRow( gridSize, matrix ); + } + timer.stop(); + std::cout << timer.getRealTime() / ( double ) testsCount << " sec." << std::endl; + + std::cout << " forRows: "; + timer.reset(); + timer.start(); + for( int i = 0; i < testsCount; i++ ) + { + TNL::Matrices::DenseMatrix< float, Device, int > matrix; + forRows( gridSize, matrix ); + } + timer.stop(); + std::cout << timer.getRealTime() / ( double ) testsCount << " sec." << std::endl; + } +} + +template< typename Device > +void laplaceOperatorSparseMatrix() +{ + std::cout << " Sparse matrix test:" << std::endl; + for( int gridSize = 16; gridSize <= 8192; gridSize *= 2 ) + { + std::cout << " Grid size = " << gridSize << std::endl; + TNL::Timer timer; + + std::cout << " STL map: "; + timer.reset(); + timer.start(); + for( int i = 0; i < testsCount; i++ ) + { + TNL::Matrices::SparseMatrix< float, Device, int > matrix; + STL_Map( gridSize, matrix ); + } + timer.stop(); + std::cout << timer.getRealTime() / ( double ) testsCount << " sec." << std::endl; + + std::cout << " setElement on host: "; + timer.reset(); + timer.start(); + for( int i = 0; i < testsCount; i++ ) + { + TNL::Matrices::SparseMatrix< float, Device, int > matrix; + setElement_on_host( gridSize, matrix ); + } + timer.stop(); + std::cout << timer.getRealTime() / ( double ) testsCount << " sec." << std::endl; + + std::cout << " setElement on device: "; + timer.reset(); + timer.start(); + for( int i = 0; i < testsCount; i++ ) + { + TNL::Matrices::SparseMatrix< float, Device, int > matrix; + setElement_on_device( gridSize, matrix ); + } + timer.stop(); + std::cout << timer.getRealTime() / ( double ) testsCount << " sec." 
<< std::endl; + + std::cout << " getRow: "; + timer.reset(); + timer.start(); + for( int i = 0; i < testsCount; i++ ) + { + TNL::Matrices::SparseMatrix< float, Device, int > matrix; + getRow( gridSize, matrix ); + } + timer.stop(); + std::cout << timer.getRealTime() / ( double ) testsCount << " sec." << std::endl; + + std::cout << " forRows: "; + timer.reset(); + timer.start(); + for( int i = 0; i < testsCount; i++ ) + { + TNL::Matrices::SparseMatrix< float, Device, int > matrix; + forRows( gridSize, matrix ); + } + timer.stop(); + std::cout << timer.getRealTime() / ( double ) testsCount << " sec." << std::endl; + + } +} + +int main( int argc, char* argv[] ) +{ + std::cout << "Creating Laplace operator matrix on CPU ... " << std::endl; + //laplaceOperatorDenseMatrix< TNL::Devices::Host >(); + laplaceOperatorSparseMatrix< TNL::Devices::Host >(); + +#ifdef HAVE_CUDA + std::cout << "Creating Laplace operator matrix on CUDA GPU ... " << std::endl; + laplaceOperatorDenseMatrix< TNL::Devices::Cuda >(); + laplaceOperatorSparseMatrix< TNL::Devices::Cuda >(); +#endif +} diff --git a/Documentation/Tutorials/Matrices/MatrixSetup_Benchmark.cu b/Documentation/Tutorials/Matrices/MatrixSetup_Benchmark.cu new file mode 120000 index 000000000..9b65ac7a2 --- /dev/null +++ b/Documentation/Tutorials/Matrices/MatrixSetup_Benchmark.cu @@ -0,0 +1 @@ +MatrixSetup_Benchmark.cpp \ No newline at end of file -- GitLab From 556f127ba55d2aca901676c101ca6901ff4cc657 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= Date: Sun, 10 Jan 2021 17:31:05 +0100 Subject: [PATCH 25/53] Added simple benchmark for comparions of different ways of matrix setup for a purpose of matrix tutorial - CMakeLists.txt. 
--- .../Tutorials/Matrices/CMakeLists.txt | 23 ++++++++++++------- 1 file changed, 15 insertions(+), 8 deletions(-) diff --git a/Documentation/Tutorials/Matrices/CMakeLists.txt b/Documentation/Tutorials/Matrices/CMakeLists.txt index 8f66ce09e..d0a3f210a 100644 --- a/Documentation/Tutorials/Matrices/CMakeLists.txt +++ b/Documentation/Tutorials/Matrices/CMakeLists.txt @@ -23,7 +23,7 @@ IF( BUILD_CUDA ) ADD_CUSTOM_COMMAND( COMMAND DenseMatrixExample_rowsReduction_vectorProduct > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/DenseMatrixExample_rowsReduction_vectorProduct.out OUTPUT DenseMatrixExample_rowsReduction_vectorProduct.out ) - + CUDA_ADD_EXECUTABLE( DenseMatrixExample_rowsReduction_maxNorm DenseMatrixExample_rowsReduction_maxNorm.cu ) ADD_CUSTOM_COMMAND( COMMAND DenseMatrixExample_rowsReduction_maxNorm > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/DenseMatrixExample_rowsReduction_maxNorm.out @@ -94,15 +94,17 @@ IF( BUILD_CUDA ) ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/SparseMatrixViewExample_setElement.out OUTPUT SparseMatrixViewExample_setElement.out ) + CUDA_ADD_EXECUTABLE( MatrixSetup_Benchmark_cuda MatrixSetup_Benchmark.cu ) + ADD_CUSTOM_COMMAND( COMMAND MatrixSetup_Benchmark_cuda > + ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/MatrixSetup_Benchmark.out + OUTPUT MatrixSetup_Benchmark.out ) ELSE() -# ADD_EXECUTABLE( UniquePointerExample UniquePointerExample.cpp ) -# ADD_CUSTOM_COMMAND( COMMAND UniquePointerExample > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/UniquePointerExample.out OUTPUT UniquePointerExample.out ) + ADD_EXECUTABLE( MatrixSetup_Benchmark MatrixSetup_Benchmark.cpp ) + ADD_CUSTOM_COMMAND( COMMAND MatrixSetup_Benchmark > + ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/MatrixSetup_Benchmark.out + OUTPUT MatrixSetup_Benchmark.out ) ENDIF() -# -#ADD_EXECUTABLE( UniquePointerHostExample UniquePointerHostExample.cpp ) -#ADD_CUSTOM_COMMAND( COMMAND UniquePointerHostExample > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/UniquePointerHostExample.out OUTPUT
UniquePointerHostExample.out ) -# -# + IF( BUILD_CUDA ) ADD_CUSTOM_TARGET( TutorialsMatricesCuda ALL DEPENDS DenseMatrixExample_Constructor_init_list.out @@ -122,7 +124,12 @@ ADD_CUSTOM_TARGET( TutorialsMatricesCuda ALL DEPENDS SparseMatrixExample_forRows.out SparseMatrixExample_rowsReduction_vectorProduct.out SparseMatrixViewExample_setElement.out + MatrixSetup_Benchmark.out ) +ELSE() +ADD_CUSTOM_TARGET( TutorialsMatrices ALL DEPENDS + MatrixSetup_Benchmark.out +) ENDIF() # #ADD_CUSTOM_TARGET( TutorialsPointers ALL DEPENDS -- GitLab From 010045a16a999fd8944e1935a7b6b1e5f93816f0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= Date: Sun, 10 Jan 2021 17:31:59 +0100 Subject: [PATCH 26/53] Writting documentation for matrices - comparison of different types and matrix setup. --- .../Tutorials/Matrices/tutorial_Matrices.md | 65 ++++++++++++++----- 1 file changed, 49 insertions(+), 16 deletions(-) diff --git a/Documentation/Tutorials/Matrices/tutorial_Matrices.md b/Documentation/Tutorials/Matrices/tutorial_Matrices.md index 086ae7aef..b9118218c 100644 --- a/Documentation/Tutorials/Matrices/tutorial_Matrices.md +++ b/Documentation/Tutorials/Matrices/tutorial_Matrices.md @@ -8,13 +8,14 @@ TNL offers several types of matrices like dense (\ref TNL::Matrices::DenseMatrix 1. [Overview of matrix types](#overview_of_matrix_types) 2. [Allocation and setup of different matrix types](#allocation_and_setup_of_different_matrix_types) 1. [Dense matrices](#dense_matrices_setup) - 2. [Sparse matrices](#sparse_matrices)_setup + 2. [Sparse matrices](#sparse_matrices_setup) 3. [Tridiagonal matrices](#tridiagonal_matrices_setup) 4. [Multidiagonal matrices](#multidiagonal_matrices_setup) 5. [Lambda matrices](#lambda_matrices_setup) -3. [Flexible reduction in matrix rows](#flexible_reduction_in_matrix_rows) -4. [Matrix-vector product](#matrix_vector_product) -5. [Matrix I/O operations](#matrix_io_operations) +3. [Matrix view](#matrix_view) +4. 
[Flexible reduction in matrix rows](#flexible_reduction_in_matrix_rows) +5. [Matrix-vector product](#matrix_vector_product) +6. [Matrix I/O operations](#matrix_io_operations) ## Overview of matrix types @@ -107,6 +108,38 @@ In this table: ## Allocation and setup of different matrix types +There are several ways to create a new matrix: + +1. **Initializer lists** allow creating a matrix from the C++ initializer lists. The matrix elements must be therefore encoded in the source code and so it is useful for rather smaller matrices. Methods and constructors with initializer lists are user friendly and simple to use. It is a good choice for toy problems with small matrices. +2. **STL map** can be used for creation of sparse matrices only. The user first inserts all matrix elements together with their coordinates into `std::map` based on which the sparse matrix is created in the next step. It is a simple and user friendly approach suitable for creation of large matrices. An advantage is that we do not need to know the distribution of the matrix elements in matrix rows in advance like we do in other ways of matrix construction. This makes the use of STL map suitable for combining of sparse matrices in TNL with other numerical packages. However, the sparse matrix is constructed on the host and then copied on GPU if necessary. Therefore, this approach is not a good choice if fast and efficient matrix construction is required. +3. **Methods `setElement` and `addElement` called from the host** allow changing particular matrix elements. The methods can be called from host even for matrices allocated on GPU. In this case, however, the matrix elements are transferred on GPU one by one which is very inefficient. If the matrix is allocated on the host system (CPU), the efficiency is good. In case of sparse matrices, one must set row capacities (i.e. maximal number of nonzero elements in each row) before using these methods.
If the row capacity is exceeded, the matrix has to be reallocated and all matrix elements are lost. +4. **Methods `setElement` and `addElement` called from native device** allow efficient matrix elements setup even on devices (GPUs). In this case, the methods must be called from a GPU kernel or a lambda function combined with parallel for (\ref TNL::Algorithms::ParallelFor). The user gets very good performance even when manipulating a matrix allocated on GPU. On the other hand, only data structures allocated on GPUs can be used in the kernel or lambda function. The matrix can be accessed in the GPU kernel or lambda function by means of [matrix view](#matrix_view) or the shared pointer (\ref TNL::Pointers::SharedPointer). +5. **Method `getRow` combined with `ParallelFor`** is very similar to the previous one. The difference is that we first fetch a helper object called *matrix row* which is linked to particular matrix row. Using methods of this object, one may change the matrix elements in given matrix row. An advantage is that the access to the matrix row is resolved only once for all elements in the row. In some more sophisticated sparse matrix formats, this can be a nontrivial operation and this approach may slightly improve the performance. Another advantage for sparse matrices is that we access the matrix elements based on their *local index* in the row which is something like a rank of the nonzero element in the row. This is more efficient than addressing the matrix elements by the column indexes which requires searching in the matrix row. So this may significantly improve the performance of setup of sparse matrices. When it comes to dense matrices, there should not be great difference in performance compared to use of the methods `setElement` and `getElement`. Note that when the method is called from GPU kernel or lambda function, only data structures allocated on GPU can be accessed and the matrix must be made accessible by means of a matrix view or a shared pointer.
**Method `forRows`** this approach is very similar to the previous one but it avoids using `ParallelFor` and necessity of passing the matrix to GPU kernels by matrix view or shared pointers. + +The following table shows pros and cons of particular mathods: + +| Method | Pros | Cons | +|:----------------------------------------|:-----------------------------------------------------------------------|:----------------------------------------------------------------------| +| **Initializer list** | Simple. | Only for small matrices. | +| **STL map** | Simplest of all methods for sparse matrices. | Higher memory requirements, slow transfer on GPU. | +| **[set,add]Element on host** | Simple. | Requires setting of row capacities, slow transfer on GPU. | +| **[set,add]Element on native device** | Good efficiency. | Requires setting of row capacities. | +| | | Requires writting GPU kernel or lambda function. | +| | | Allows accessing only data allocated on the same device/memory space. | +| **getRow and ParallelFor** | Best efficiency for sparse matrices. | Requires setting of row capacities. | +| | | Requires writting GPU kernel or lambda function. | +| | | Allows accessing only data allocated on the same device/memory space. | +| | | Use of matrix local indexes can be less intuitive. | +| **forRows** | Best efficiency for sparse matrices. | Requires setting of row capacities. | +| | Avoid use of matrix view or shared pointer in kernels/lambda function. | Requires writting GPU kernel or lambda function. | +| | | Allows accessing only data allocated on the same device/memory space. | +| | | Use of matrix local indexes is less intuitive. | + +Though it may seem that the later methods come with more cons than pros they offer much higher performance and we believe they even them are still very user friendly. On the other hand, if the matrix setup performance is not a priority the use the simple but slow method can still be a good choice. 
+ + + ### Dense matrices Dense matrix is a templated class defined in the namespace \ref TNL::Matrices. It has five template parameters: @@ -868,7 +901,7 @@ The lambda matrix (\ref TNL::Matrices::LambdaMatrix) is a templated class with t The lambda function `MatrixElementsLambda` is supposed to have the following declaration: ``` -matrixElements( Index rows, +matrixElements( Index rows, Index columns, Index row, Index localIdx, @@ -887,7 +920,7 @@ where the particular parameterts have the following meaning: The lambda function `CompressedRowLengthsLambda` is supposed to look like this: ``` -rowLengths( Index rows, +rowLengths( Index rows, Index columns, Index row ) -> Index ``` @@ -918,7 +951,7 @@ Of course, the lambda matrix has the same interface as other matrix types. The f \includelineno LambdaMatrixExample_forRows.cpp -Here, we treat the lambda matrix as if it was dense matrix. The lambda function `rowLengths` returns the number of the nonzero elements equal to the number of matrix columns (line 13). However, the lambda function `matrixElements` (lines 14-17), sets nozero values only to lower triangular part of the matrix. The elements in the upper part are equal to zero (line 16). Next we create an instance of the lambda matrix with help of the lambda matrix factory (\ref TNL::Matrices::LambdaMatrixFactory) (lines 19-20) and an instance of the dense matrix (\ref TNL::Matrices::DenseMatrix) (lines 22-23). +Here, we treat the lambda matrix as if it was dense matrix. The lambda function `rowLengths` returns the number of the nonzero elements equal to the number of matrix columns (line 13). However, the lambda function `matrixElements` (lines 14-17), sets nozero values only to lower triangular part of the matrix. The elements in the upper part are equal to zero (line 16). 
Next we create an instance of the lambda matrix with help of the lambda matrix factory (\ref TNL::Matrices::LambdaMatrixFactory) (lines 19-20) and an instance of the dense matrix (\ref TNL::Matrices::DenseMatrix) (lines 22-23). Next we call the lambda function `f` by the method `forRows` (\ref TNL::Matrices::LambdaMatrix::forRows) to set the matrix elements of the dense matrix `denseMatrix` (line 26) via the dense matrix view (`denseView`) (\ref TNL::Matrices::DenseMatrixView). Note, that in the lambda function `f` we get the matrix element value already evaluated in the variable `value` as we are used to from other matrix types. So in fact, the same lambda function `f` woudl do the same job even for sparse matrix or any other. Also note, that in this case we iterate even over all zero matrix elements because the lambda function `rowLengths` (line 13) tells so. The result looks as follows: @@ -1139,7 +1172,7 @@ One of the most important matrix operation is the matrix-vector multiplication. * `inVector` is the input vector having the same number of elements as the number of matrix columns. * `outVector` is the output vector having the same number of elements as the number of matrix rows. -* `matrixMultiplicator` is a number by which the result of matrix-vector product is multiplied. +* `matrixMultiplicator` is a number by which the result of matrix-vector product is multiplied. * `outVectorMultiplicator` is a number by which the output vector is multiplied before added to the result of matrix-vector product. * `begin` is an index of the first matrix row that is involved in the multiplication. It is zero be default. * `end` is an index of the last matrix row that is involved in the multiplication. It is the last matrix row by default. @@ -1162,7 +1195,7 @@ and it accepts the following parameters: * `inVector` is the input vector having the same number of elements as the number of matrix columns. 
* `outVector` is the output vector having the same number of elements as the number of matrix rows. -* `matrixMultiplicator` is a number by which the result of matrix-vector product is multiplied. +* `matrixMultiplicator` is a number by which the result of matrix-vector product is multiplied. * `outVectorMultiplicator` is a number by which the output vector is multiplied before added to the result of matrix-vector product. * `begin` is an index of the first matrix row that is involved in the multiplication. It is zero be default. * `end` is an index of the last matrix row that is involved in the multiplication. It is the last matrix row by default. @@ -1181,12 +1214,12 @@ and it accepts the following parameters: * `inVector` is the input vector having the same number of elements as the number of matrix columns. * `outVector` is the output vector having the same number of elements as the number of matrix rows. -* `matrixMultiplicator` is a number by which the result of matrix-vector product is multiplied. +* `matrixMultiplicator` is a number by which the result of matrix-vector product is multiplied. * `outVectorMultiplicator` is a number by which the output vector is multiplied before added to the result of matrix-vector product. * `begin` is an index of the first matrix row that is involved in the multiplication. It is zero be default. * `end` is an index indicating the last matrix row that is involved in the multiplication which is `end - 1`. It is the number of matrix rows. -Note that the output vector dimension must be the same as the number of matrix rows no +Note that the output vector dimension must be the same as the number of matrix rows no matter how we set `begin` and `end` parameters. These parameters just say that some matrix rows and the output vector elements are omitted. 
### Multidiagonal matrix @@ -1202,7 +1235,7 @@ and it accepts the following parameters: * `inVector` is the input vector having the same number of elements as the number of matrix columns. * `outVector` is the output vector having the same number of elements as the number of matrix rows. -* `matrixMultiplicator` is a number by which the result of matrix-vector product is multiplied. +* `matrixMultiplicator` is a number by which the result of matrix-vector product is multiplied. * `outVectorMultiplicator` is a number by which the output vector is multiplied before it is added to the result of matrix-vector product. * `begin` is an index of the first matrix row that is involved in the multiplication. It is zero be default. * `end` is an index indicating the last matrix row that is involved in the multiplication which is `end - 1`. It is the number of matrix rows. @@ -1221,7 +1254,7 @@ and it accepts the following parameters: * `inVector` is the input vector having the same number of elements as the number of matrix columns. * `outVector` is the output vector having the same number of elements as the number of matrix rows. -* `matrixMultiplicator` is a number by which the result of matrix-vector product is multiplied. +* `matrixMultiplicator` is a number by which the result of matrix-vector product is multiplied. * `outVectorMultiplicator` is a number by which the output vector is multiplied before it is added to the result of matrix-vector product. * `begin` is an index of the first matrix row that is involved in the multiplication. It is zero be default. * `end` is an index indicating the last matrix row that is involved in the multiplication which is `end - 1`. It is the number of matrix rows. @@ -1249,7 +1282,7 @@ The multidiagonal matrix can be saved to a file using a method `save` (\ref TNL: The lambda matrix, can be printed by the means of the method `print` (\ref TNL::Matrices::LambdaMatrix::print). 
The lambda matrix do not offer the methods `save` and `load` since it does not manage any data. Of course, the lambda function evaluating the matrix elements can use any supporting data containers but it is up these containers to manage the IO operations. -## Matrix view +## Matrix view ### Dense matrix view @@ -1310,7 +1343,7 @@ The result looks as follows: Similar to dense and sparse matrix view, tridiagonal matrix also offers its view for easier use with lambda functions. It is represented by a templated class \ref TNL::Matrices::TridiagonalMatrixView with the following template parameters: -* `Real` is a type of matrix elements. +* `Real` is a type of matrix elements. * `Device` is a device on which the matrix is allocated. This can be \ref TNL::Devices::Host or \ref TNL::Devices::Cuda. * `Index` is a type for indexing the matrix elements and also row and column indexes. * `Organization` tells the ordering of matrix elements in memory. It is either RowMajorOrder or ColumnMajorOrder. @@ -1338,7 +1371,7 @@ As we mentioned already, the tridiagonal matrix view offers almost all methods w Multidiagonal matrix also offers its view for easier use with lambda functions. It is represented by a templated class \ref TNL::Matrices::MultidiagonalMatrixView with the following template parameters: -* `Real` is a type of matrix elements. +* `Real` is a type of matrix elements. * `Device` is a device on which the matrix is allocated. This can be \ref TNL::Devices::Host or \ref TNL::Devices::Cuda. * `Index` is a type for indexing the matrix elements and also row and column indexes. * `Organization` tells the ordering of matrix elements in memory. It is either RowMajorOrder or ColumnMajorOrder. -- GitLab From f5010d55c38f2fbfa2511e7a1efffa7cb2fd3056 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= Date: Mon, 11 Jan 2021 17:39:34 +0100 Subject: [PATCH 27/53] Added matrix setup bechmarks for dense and multidiagonal matrix. 
--- .../Tutorials/Matrices/CMakeLists.txt | 37 ++- .../Matrices/DenseMatrixSetup_Benchmark.cpp | 123 ++++++++++ .../Matrices/DenseMatrixSetup_Benchmark.cu | 1 + .../Matrices/MatrixSetup_Benchmark.cu | 1 - .../MultidiagonalMatrixSetup_Benchmark.cpp | 221 ++++++++++++++++++ .../MultidiagonalMatrixSetup_Benchmark.cu | 1 + ...rk.cpp => SparseMatrixSetup_Benchmark.cpp} | 59 ----- .../Matrices/SparseMatrixSetup_Benchmark.cu | 1 + 8 files changed, 376 insertions(+), 68 deletions(-) create mode 100644 Documentation/Tutorials/Matrices/DenseMatrixSetup_Benchmark.cpp create mode 120000 Documentation/Tutorials/Matrices/DenseMatrixSetup_Benchmark.cu delete mode 120000 Documentation/Tutorials/Matrices/MatrixSetup_Benchmark.cu create mode 100644 Documentation/Tutorials/Matrices/MultidiagonalMatrixSetup_Benchmark.cpp create mode 120000 Documentation/Tutorials/Matrices/MultidiagonalMatrixSetup_Benchmark.cu rename Documentation/Tutorials/Matrices/{MatrixSetup_Benchmark.cpp => SparseMatrixSetup_Benchmark.cpp} (82%) create mode 120000 Documentation/Tutorials/Matrices/SparseMatrixSetup_Benchmark.cu diff --git a/Documentation/Tutorials/Matrices/CMakeLists.txt b/Documentation/Tutorials/Matrices/CMakeLists.txt index d0a3f210a..d4ef4444d 100644 --- a/Documentation/Tutorials/Matrices/CMakeLists.txt +++ b/Documentation/Tutorials/Matrices/CMakeLists.txt @@ -94,15 +94,36 @@ IF( BUILD_CUDA ) ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/SparseMatrixViewExample_setElement.out OUTPUT SparseMatrixViewExample_setElement.out ) - CUDA_ADD_EXECUTABLE( MatrixSetup_Benchmark_cuda MatrixSetup_Benchmark.cu ) - ADD_CUSTOM_COMMAND( COMMAND MatrixSetup_Benchmark_cuda > - ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/MatrixSetup_Benchmark.out - OUTPUT MatrixSetup_Benchmark.out ) + CUDA_ADD_EXECUTABLE( DenseMatrixSetup_Benchmark_cuda DenseMatrixSetup_Benchmark.cu ) + ADD_CUSTOM_COMMAND( COMMAND DenseMatrixSetup_Benchmark_cuda > + ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/DenseMatrixSetup_Benchmark.out + OUTPUT 
DenseMatrixSetup_Benchmark.out ) + + CUDA_ADD_EXECUTABLE( SparseMatrixSetup_Benchmark_cuda SparseMatrixSetup_Benchmark.cu ) + ADD_CUSTOM_COMMAND( COMMAND SparseMatrixSetup_Benchmark_cuda > + ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/SparseMatrixSetup_Benchmark.out + OUTPUT SparseMatrixSetup_Benchmark.out ) + + CUDA_ADD_EXECUTABLE( MultidiagonalMatrixSetup_Benchmark_cuda MultidiagonalMatrixSetup_Benchmark.cu ) + ADD_CUSTOM_COMMAND( COMMAND MultidiagonalMatrixSetup_Benchmark_cuda > + ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/MultidiagonalMatrixSetup_Benchmark.out + OUTPUT MultidiagonalMatrixSetup_Benchmark.out ) + ELSE() - ADD_EXECUTABLE( MatrixSetup_Benchmark MatrixSetup_Benchmark_cuda.cpp ) - ADD_CUSTOM_COMMAND( COMMAND MatrixSetup_Benchmark > - ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/MatrixSetup_Benchmark.out - OUTPUT MatrixSetup_Benchmark.out ) + ADD_EXECUTABLE( DenseMatrixSetup_Benchmark DenseMatrixSetup_Benchmark_cuda.cpp ) + ADD_CUSTOM_COMMAND( COMMAND DenseMatrixSetup_Benchmark > + ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/DenseMatrixSetup_Benchmark.out + OUTPUT DenseMatrixSetup_Benchmark.out ) + + ADD_EXECUTABLE( SparseMatrixSetup_Benchmark SparseMatrixSetup_Benchmark_cuda.cpp ) + ADD_CUSTOM_COMMAND( COMMAND SparseMatrixSetup_Benchmark > + ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/SparseMatrixSetup_Benchmark.out + OUTPUT SparseMatrixSetup_Benchmark.out ) + + ADD_EXECUTABLE( MultidiagonalMatrixSetup_Benchmark MultidiagonalMatrixSetup_Benchmark_cuda.cpp ) + ADD_CUSTOM_COMMAND( COMMAND MultidiagonalMatrixSetup_Benchmark > + ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/MultidiagonalMatrixSetup_Benchmark.out + OUTPUT MultidiagonalMatrixSetup_Benchmark.out ) ENDIF() IF( BUILD_CUDA ) diff --git a/Documentation/Tutorials/Matrices/DenseMatrixSetup_Benchmark.cpp b/Documentation/Tutorials/Matrices/DenseMatrixSetup_Benchmark.cpp new file mode 100644 index 000000000..71a6eed2d --- /dev/null +++ b/Documentation/Tutorials/Matrices/DenseMatrixSetup_Benchmark.cpp @@ -0,0 
+1,123 @@ +#include +#include +#include +#include +#include +#include + +const int testsCount = 5; + +template< typename Matrix > +void setElement_on_host( const int matrixSize, Matrix& matrix ) +{ + matrix.setDimensions( matrixSize, matrixSize ); + + for( int j = 0; j < matrixSize; j++ ) + for( int i = 0; i < matrixSize; i++ ) + matrix.setElement( i, j, i + j ); +} + +template< typename Matrix > +void setElement_on_device( const int matrixSize, Matrix& matrix ) +{ + matrix.setDimensions( matrixSize, matrixSize ); + + auto matrixView = matrix.getView(); + auto f = [=] __cuda_callable__ ( int i, int j ) mutable { + matrixView.setElement( i, j, i + j ); + }; + TNL::Algorithms::ParallelFor2D< typename Matrix::DeviceType >::exec( 0, 0, matrixSize, matrixSize, f ); +} + +template< typename Matrix > +void getRow( const int matrixSize, Matrix& matrix ) +{ + matrix.setDimensions( matrixSize, matrixSize ); + + auto matrixView = matrix.getView(); + auto f = [=] __cuda_callable__ ( int rowIdx ) mutable { + auto row = matrixView.getRow( rowIdx ); + for( int i = 0; i < matrixSize; i++ ) + row.setElement( i, rowIdx + i ); + }; + TNL::Algorithms::ParallelFor< typename Matrix::DeviceType >::exec( 0, matrixSize, f ); +} + +template< typename Matrix > +void forRows( const int matrixSize, Matrix& matrix ) +{ + matrix.setDimensions( matrixSize, matrixSize ); + + auto f = [=] __cuda_callable__ ( int rowIdx, int localIdx, int& columnIdx, float& value, bool& compute ) mutable { + value = rowIdx + columnIdx; + }; + matrix.forRows( 0, matrixSize, f ); +} + +template< typename Device > +void setupDenseMatrix() +{ + std::cout << " Dense matrix test:" << std::endl; + for( int matrixSize = 16; matrixSize <= 8192; matrixSize *= 2 ) + { + std::cout << " Matrix size = " << matrixSize << std::endl; + TNL::Timer timer; + + std::cout << " setElement on host: "; + timer.reset(); + timer.start(); + for( int i = 0; i < testsCount; i++ ) + { + TNL::Matrices::DenseMatrix< float, Device, int > matrix; + 
setElement_on_host( matrixSize, matrix ); + } + timer.stop(); + std::cout << timer.getRealTime() / ( double ) testsCount << " sec." << std::endl; + + std::cout << " setElement on device: "; + timer.reset(); + timer.start(); + for( int i = 0; i < testsCount; i++ ) + { + TNL::Matrices::DenseMatrix< float, Device, int > matrix; + setElement_on_device( matrixSize, matrix ); + } + timer.stop(); + std::cout << timer.getRealTime() / ( double ) testsCount << " sec." << std::endl; + + std::cout << " getRow: "; + timer.reset(); + timer.start(); + for( int i = 0; i < testsCount; i++ ) + { + TNL::Matrices::DenseMatrix< float, Device, int > matrix; + getRow( matrixSize, matrix ); + } + timer.stop(); + std::cout << timer.getRealTime() / ( double ) testsCount << " sec." << std::endl; + + std::cout << " forRows: "; + timer.reset(); + timer.start(); + for( int i = 0; i < testsCount; i++ ) + { + TNL::Matrices::DenseMatrix< float, Device, int > matrix; + forRows( matrixSize, matrix ); + } + timer.stop(); + std::cout << timer.getRealTime() / ( double ) testsCount << " sec." << std::endl; + } +} + + +int main( int argc, char* argv[] ) +{ + std::cout << "Creating dense matrix on CPU ... " << std::endl; + setupDenseMatrix< TNL::Devices::Host >(); + + +#ifdef HAVE_CUDA + std::cout << "Creating dense matrix on CUDA GPU ... 
" << std::endl; + setupDenseMatrix< TNL::Devices::Cuda >(); +#endif +} diff --git a/Documentation/Tutorials/Matrices/DenseMatrixSetup_Benchmark.cu b/Documentation/Tutorials/Matrices/DenseMatrixSetup_Benchmark.cu new file mode 120000 index 000000000..d9b61a3cf --- /dev/null +++ b/Documentation/Tutorials/Matrices/DenseMatrixSetup_Benchmark.cu @@ -0,0 +1 @@ +DenseMatrixSetup_Benchmark.cpp \ No newline at end of file diff --git a/Documentation/Tutorials/Matrices/MatrixSetup_Benchmark.cu b/Documentation/Tutorials/Matrices/MatrixSetup_Benchmark.cu deleted file mode 120000 index 9b65ac7a2..000000000 --- a/Documentation/Tutorials/Matrices/MatrixSetup_Benchmark.cu +++ /dev/null @@ -1 +0,0 @@ -MatrixSetup_Benchmark.cpp \ No newline at end of file diff --git a/Documentation/Tutorials/Matrices/MultidiagonalMatrixSetup_Benchmark.cpp b/Documentation/Tutorials/Matrices/MultidiagonalMatrixSetup_Benchmark.cpp new file mode 100644 index 000000000..0ee70e79b --- /dev/null +++ b/Documentation/Tutorials/Matrices/MultidiagonalMatrixSetup_Benchmark.cpp @@ -0,0 +1,221 @@ +#include +#include +#include +#include +#include +#include + +const int testsCount = 5; + +template< typename Device > +TNL::Containers::Vector< int, Device > getOffsets( const int gridSize ) +{ + TNL::Containers::Vector< int, Device > offsets( 5 ); + offsets.setElement( 0, -gridSize ); + offsets.setElement( 1, -1 ); + offsets.setElement( 2, 0 ); + offsets.setElement( 3, 1 ); + offsets.setElement( 4, gridSize ); + return offsets; +} + +template< typename Matrix > +void setElement_on_host( const int gridSize, Matrix& matrix ) +{ + /*** + * Set matrix representing approximation of the Laplace operator on regular + * grid using the finite difference method by means setElement method called + * from the host system. 
+ */ + const int matrixSize = gridSize * gridSize; + matrix.setDimensions( matrixSize, matrixSize, getOffsets< typename Matrix::DeviceType >( gridSize ) ); + + for( int j = 0; j < gridSize; j++ ) + for( int i = 0; i < gridSize; i++ ) + { + const int rowIdx = j * gridSize + i; + if( i == 0 || j == 0 || i == gridSize - 1 || j == gridSize - 1 ) + matrix.setElement( rowIdx, rowIdx, 1.0 ); + else + { + matrix.setElement( rowIdx, rowIdx - gridSize, 1.0 ); + matrix.setElement( rowIdx, rowIdx - 1, 1.0 ); + matrix.setElement( rowIdx, rowIdx, -4.0 ); + matrix.setElement( rowIdx, rowIdx + 1, 1.0 ); + matrix.setElement( rowIdx, rowIdx + gridSize, 1.0 ); + } + } +} + +template< typename Matrix > +void setElement_on_device( const int gridSize, Matrix& matrix ) +{ + /*** + * Set matrix representing approximation of the Laplace operator on regular + * grid using the finite difference method by means of setElement method called + * from the native device. + */ + const int matrixSize = gridSize * gridSize; + matrix.setDimensions( matrixSize, matrixSize, getOffsets< typename Matrix::DeviceType >( gridSize ) ); + + auto matrixView = matrix.getView(); + auto f = [=] __cuda_callable__ ( int i, int j ) mutable { + const int rowIdx = j * gridSize + i; + if( i == 0 || j == 0 || i == gridSize - 1 || j == gridSize - 1 ) + matrixView.setElement( rowIdx, rowIdx, 1.0 ); + else + { + matrixView.setElement( rowIdx, rowIdx - gridSize, 1.0 ); + matrixView.setElement( rowIdx, rowIdx - 1, 1.0 ); + matrixView.setElement( rowIdx, rowIdx, -4.0 ); + matrixView.setElement( rowIdx, rowIdx + 1, 1.0 ); + matrixView.setElement( rowIdx, rowIdx + gridSize, 1.0 ); + } + }; + TNL::Algorithms::ParallelFor2D< typename Matrix::DeviceType >::exec( 0, 0, gridSize, gridSize, f ); +} + +template< typename Matrix > +void getRow( const int gridSize, Matrix& matrix ) +{ + /*** + * Set matrix representing approximation of the Laplace operator on regular + * grid using the finite difference method by means of getRow method. 
+ */ + const int matrixSize = gridSize * gridSize; + matrix.setDimensions( matrixSize, matrixSize, getOffsets< typename Matrix::DeviceType >( gridSize ) ); + + auto matrixView = matrix.getView(); + auto f = [=] __cuda_callable__ ( int rowIdx ) mutable { + const int i = rowIdx % gridSize; + const int j = rowIdx / gridSize; + auto row = matrixView.getRow( rowIdx ); + if( i == 0 || j == 0 || i == gridSize - 1 || j == gridSize - 1 ) + row.setElement( 2, 1.0 ); + else + { + row.setElement( 0, 1.0 ); + row.setElement( 1, 1.0 ); + row.setElement( 2, -4.0 ); + row.setElement( 3, 1.0 ); + row.setElement( 4, 1.0 ); + } + }; + TNL::Algorithms::ParallelFor< typename Matrix::DeviceType >::exec( 0, matrixSize, f ); +} + +template< typename Matrix > +void forRows( const int gridSize, Matrix& matrix ) +{ + /*** + * Set matrix representing approximation of the Laplace operator on regular + * grid using the finite difference method by means of forRows method. + */ + + const int matrixSize = gridSize * gridSize; + matrix.setDimensions( matrixSize, matrixSize, getOffsets< typename Matrix::DeviceType >( gridSize ) ); + + auto f = [=] __cuda_callable__ ( int rowIdx, int localIdx, int columnIdx, float& value, bool& compute ) mutable { + const int i = rowIdx % gridSize; + const int j = rowIdx / gridSize; + if( i == 0 || j == 0 || i == gridSize - 1 || j == gridSize - 1 && localIdx == 0 ) + { + columnIdx = rowIdx; + value = 1.0; + } + else + { + switch( localIdx ) + { + case 0: + columnIdx = rowIdx - gridSize; + value = 1.0; + break; + case 1: + columnIdx = rowIdx - 1; + value = 1.0; + break; + case 2: + columnIdx = rowIdx; + value = -4.0; + break; + case 3: + columnIdx = rowIdx + 1; + value = 1.0; + break; + case 4: + columnIdx = rowIdx + gridSize; + value = 1.0; + break; + } + } + }; + matrix.forRows( 0, matrixSize, f ); +} + +template< typename Device > +void laplaceOperatorMultidiagonalMatrix() +{ + std::cout << " Sparse matrix test:" << std::endl; + for( int gridSize = 16; gridSize <= 
8192; gridSize *= 2 ) + { + std::cout << " Grid size = " << gridSize << std::endl; + TNL::Timer timer; + + std::cout << " setElement on host: "; + timer.reset(); + timer.start(); + for( int i = 0; i < testsCount; i++ ) + { + TNL::Matrices::MultidiagonalMatrix< float, Device, int > matrix; + setElement_on_host( gridSize, matrix ); + } + timer.stop(); + std::cout << timer.getRealTime() / ( double ) testsCount << " sec." << std::endl; + + std::cout << " setElement on device: "; + timer.reset(); + timer.start(); + for( int i = 0; i < testsCount; i++ ) + { + TNL::Matrices::MultidiagonalMatrix< float, Device, int > matrix; + setElement_on_device( gridSize, matrix ); + } + timer.stop(); + std::cout << timer.getRealTime() / ( double ) testsCount << " sec." << std::endl; + + std::cout << " getRow: "; + timer.reset(); + timer.start(); + for( int i = 0; i < testsCount; i++ ) + { + TNL::Matrices::MultidiagonalMatrix< float, Device, int > matrix; + getRow( gridSize, matrix ); + } + timer.stop(); + std::cout << timer.getRealTime() / ( double ) testsCount << " sec." << std::endl; + + std::cout << " forRows: "; + timer.reset(); + timer.start(); + for( int i = 0; i < testsCount; i++ ) + { + TNL::Matrices::MultidiagonalMatrix< float, Device, int > matrix; + forRows( gridSize, matrix ); + } + timer.stop(); + std::cout << timer.getRealTime() / ( double ) testsCount << " sec." << std::endl; + + } +} + +int main( int argc, char* argv[] ) +{ + std::cout << "Creating Laplace operator matrix on CPU ... " << std::endl; + laplaceOperatorMultidiagonalMatrix< TNL::Devices::Host >(); + +#ifdef HAVE_CUDA + std::cout << "Creating Laplace operator matrix on CUDA GPU ... 
" << std::endl; + laplaceOperatorMultidiagonalMatrix< TNL::Devices::Cuda >(); +#endif +} diff --git a/Documentation/Tutorials/Matrices/MultidiagonalMatrixSetup_Benchmark.cu b/Documentation/Tutorials/Matrices/MultidiagonalMatrixSetup_Benchmark.cu new file mode 120000 index 000000000..ec14fc9ea --- /dev/null +++ b/Documentation/Tutorials/Matrices/MultidiagonalMatrixSetup_Benchmark.cu @@ -0,0 +1 @@ +MultidiagonalMatrixSetup_Benchmark.cpp \ No newline at end of file diff --git a/Documentation/Tutorials/Matrices/MatrixSetup_Benchmark.cpp b/Documentation/Tutorials/Matrices/SparseMatrixSetup_Benchmark.cpp similarity index 82% rename from Documentation/Tutorials/Matrices/MatrixSetup_Benchmark.cpp rename to Documentation/Tutorials/Matrices/SparseMatrixSetup_Benchmark.cpp index fd1841e4d..58ab83a9a 100644 --- a/Documentation/Tutorials/Matrices/MatrixSetup_Benchmark.cpp +++ b/Documentation/Tutorials/Matrices/SparseMatrixSetup_Benchmark.cpp @@ -138,12 +138,10 @@ void forRows( const int gridSize, Matrix& matrix ) TNL::Containers::Vector< int, typename Matrix::DeviceType, int > rowCapacities( matrixSize, 5 ); matrix.setDimensions( matrixSize, matrixSize ); matrix.setRowCapacities( rowCapacities ); - auto matrixView = matrix.getView(); auto f = [=] __cuda_callable__ ( int rowIdx, int localIdx, int& columnIdx, float& value, bool& compute ) mutable { const int i = rowIdx % gridSize; const int j = rowIdx / gridSize; - auto row = matrixView.getRow( rowIdx ); if( i == 0 || j == 0 || i == gridSize - 1 || j == gridSize - 1 && localIdx == 0 ) { columnIdx = rowIdx; @@ -179,61 +177,6 @@ void forRows( const int gridSize, Matrix& matrix ) matrix.forRows( 0, matrixSize, f ); } -template< typename Device > -void laplaceOperatorDenseMatrix() -{ - std::cout << " Dense matrix test:" << std::endl; - for( int gridSize = 16; gridSize <= 8192; gridSize *= 2 ) - { - std::cout << " Grid size = " << gridSize << std::endl; - TNL::Timer timer; - - std::cout << " setElement on host: "; - timer.reset(); - 
timer.start(); - for( int i = 0; i < testsCount; i++ ) - { - TNL::Matrices::DenseMatrix< float, Device, int > matrix; - setElement_on_host( gridSize, matrix ); - } - timer.stop(); - std::cout << timer.getRealTime() / ( double ) testsCount << " sec." << std::endl; - - std::cout << " setElement on device: "; - timer.reset(); - timer.start(); - for( int i = 0; i < testsCount; i++ ) - { - TNL::Matrices::DenseMatrix< float, Device, int > matrix; - setElement_on_device( gridSize, matrix ); - } - timer.stop(); - std::cout << timer.getRealTime() / ( double ) testsCount << " sec." << std::endl; - - std::cout << " getRow: "; - timer.reset(); - timer.start(); - for( int i = 0; i < testsCount; i++ ) - { - TNL::Matrices::DenseMatrix< float, Device, int > matrix; - getRow( gridSize, matrix ); - } - timer.stop(); - std::cout << timer.getRealTime() / ( double ) testsCount << " sec." << std::endl; - - std::cout << " forRows: "; - timer.reset(); - timer.start(); - for( int i = 0; i < testsCount; i++ ) - { - TNL::Matrices::DenseMatrix< float, Device, int > matrix; - forRows( gridSize, matrix ); - } - timer.stop(); - std::cout << timer.getRealTime() / ( double ) testsCount << " sec." << std::endl; - } -} - template< typename Device > void laplaceOperatorSparseMatrix() { @@ -304,12 +247,10 @@ void laplaceOperatorSparseMatrix() int main( int argc, char* argv[] ) { std::cout << "Creating Laplace operator matrix on CPU ... " << std::endl; - //laplaceOperatorDenseMatrix< TNL::Devices::Host >(); laplaceOperatorSparseMatrix< TNL::Devices::Host >(); #ifdef HAVE_CUDA std::cout << "Creating Laplace operator matrix on CUDA GPU ... 
" << std::endl; - laplaceOperatorDenseMatrix< TNL::Devices::Cuda >(); laplaceOperatorSparseMatrix< TNL::Devices::Cuda >(); #endif } diff --git a/Documentation/Tutorials/Matrices/SparseMatrixSetup_Benchmark.cu b/Documentation/Tutorials/Matrices/SparseMatrixSetup_Benchmark.cu new file mode 120000 index 000000000..f5a79c132 --- /dev/null +++ b/Documentation/Tutorials/Matrices/SparseMatrixSetup_Benchmark.cu @@ -0,0 +1 @@ +SparseMatrixSetup_Benchmark.cpp \ No newline at end of file -- GitLab From 83db0caf9d0e023d8b1f69e3a35e50ebfd5ff889 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= Date: Mon, 11 Jan 2021 19:47:51 +0100 Subject: [PATCH 28/53] Fix of CMakeLists.txt. --- Documentation/Tutorials/Matrices/CMakeLists.txt | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/Documentation/Tutorials/Matrices/CMakeLists.txt b/Documentation/Tutorials/Matrices/CMakeLists.txt index d4ef4444d..c6dd7cfe4 100644 --- a/Documentation/Tutorials/Matrices/CMakeLists.txt +++ b/Documentation/Tutorials/Matrices/CMakeLists.txt @@ -145,11 +145,15 @@ ADD_CUSTOM_TARGET( TutorialsMatricesCuda ALL DEPENDS SparseMatrixExample_forRows.out SparseMatrixExample_rowsReduction_vectorProduct.out SparseMatrixViewExample_setElement.out - MatrixSetup_Benchmark.out + DenseMatrixSetup_Benchmark.out + SparseMatrixSetup_Benchmark.out + MultidiagonalMatrixSetup_Benchmark.out ) ELSE() ADD_CUSTOM_TARGET( TutorialsMatrices ALL DEPENDS - MatrixSetup_Benchmark.out + DenseMatrixSetup_Benchmark.out + SparseMatrixSetup_Benchmark.out + MultidiagonalMatrixSetup_Benchmark.out ) ENDIF() # -- GitLab From 00c7e1cccf5ddc70cd554bc5b5ab0f5458cb2712 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= Date: Mon, 11 Jan 2021 21:42:43 +0100 Subject: [PATCH 29/53] Writting documentation on matrices. 
--- .../Tutorials/Matrices/tutorial_Matrices.md | 166 +++++++++++++++--- 1 file changed, 137 insertions(+), 29 deletions(-) diff --git a/Documentation/Tutorials/Matrices/tutorial_Matrices.md b/Documentation/Tutorials/Matrices/tutorial_Matrices.md index b9118218c..c376a51aa 100644 --- a/Documentation/Tutorials/Matrices/tutorial_Matrices.md +++ b/Documentation/Tutorials/Matrices/tutorial_Matrices.md @@ -6,13 +6,13 @@ TNL offers several types of matrices like dense (\ref TNL::Matrices::DenseMatrix ## Table of Contents 1. [Overview of matrix types](#overview_of_matrix_types) -2. [Allocation and setup of different matrix types](#allocation_and_setup_of_different_matrix_types) +2. [Matrix view](#matrix_view) +3. [Allocation and setup of different matrix types](#allocation_and_setup_of_different_matrix_types) 1. [Dense matrices](#dense_matrices_setup) 2. [Sparse matrices](#sparse_matrices_setup) 3. [Tridiagonal matrices](#tridiagonal_matrices_setup) 4. [Multidiagonal matrices](#multidiagonal_matrices_setup) 5. [Lambda matrices](#lambda_matrices_setup) -3. [Matrix view](#matrix_view) 4. [Flexible reduction in matrix rows](#flexible_reduction_in_matrix_rows) 5. [Matrix-vector product](#matrix_vector_product) 6. [Matrix I/O operations](#matrix_io_operations) @@ -106,12 +106,16 @@ In this table: * **Sparse matrix** is number of bytes needed to store one matrix element in the sparse matrix. * **Fill ratio** is maximal percentage of the nonzero matrix elements until which the sparse matrix can perform better. +## Matrix view + +TODO: concept of matrix view. Add reference to general concepts + ## Allocation and setup of different matrix types There are several ways how to create new matrix: -1. **Initializer lists** allow to create matrix from the C++ initializer lists. The matrix elements must be therefore encoded in the source code and so it is useful for rather smaller matrices. Methods and constructors with initializer lists are user friendly and simple to use. 
It is a good choice for tool problems with small matrices. -2. **STL map** can be used for creation of sparse matrices only. The user first insert all matrix elements together with their coordinates into `std::map` based on which the sparse matrix is created in the next step. It is simple and user friendly approach suitable for creation of large matrices. An advantage is that we do not need to know the distribution of the matrix elements in matrix rows in advance like we do in other ways of matrix construction. This makes the use of STL map suitable for combining of sparse matrices in TNL with other numerical packages. However, the sparse matrix is constructed on the host and then copied on GPU if necessary. Therefor, this approach is not a good choice if fast and efficient matrix construction is required. +1. **Initializer lists** allow to create matrix from the [C++ initializer lists](https://en.cppreference.com/w/cpp/utility/initializer_list). The matrix elements must be therefore encoded in the source code and so it is useful for rather smaller matrices. Methods and constructors with initializer lists are user friendly and simple to use. It is a good choice for tool problems with small matrices. +2. **STL map** can be used for creation of sparse matrices only. The user first insert all matrix elements together with their coordinates into [`std::map`](https://en.cppreference.com/w/cpp/container/map) based on which the sparse matrix is created in the next step. It is simple and user friendly approach suitable for creation of large matrices. An advantage is that we do not need to know the distribution of the matrix elements in matrix rows in advance like we do in other ways of matrix construction. This makes the use of STL map suitable for combining of sparse matrices in TNL with other numerical packages. However, the sparse matrix is constructed on the host and then copied on GPU if necessary. 
Therefor, this approach is not a good choice if fast and efficient matrix construction is required. 3. **Methods `setElement` and `addElement` called from the host** allows to change particular matrix elements. The methods can be called from host even for matrices allocated on GPU. In this case, however, the matrix elements are transferred on GPU one by one which is very inefficient. If the matrix is allocated on the host system (CPU), the efficiency is good. In case of sparse matrices, one must set row capacities (i.e. maximal number of nonzero elements in each row) before using these methods. If the row capacity is exceeded, the matrix has to be reallocated and all matrix elements are lost. 4. **Methods `setElement` and `addElement` called from native device** allows to do efficient matrix elements setup even on devices (GPUs). In this case, the methods must be called from a GPU kernel or a lambda function combined with parallel for (\ref TNL::Algorithms::ParallelFor). The user get very good performance even when manipulating matrix allocated on GPU. On the other hand, only data structures allocated on GPUs can be used in the kernel or lambda function. The the matrix can be accessed in the GPU kernel or lambda function by means of [matrix view](#matrix_view) or the shared pointer (\ref TNL::Pointers::SharedPointer). 5. **Method `getRow` combined with `ParallelFor`** is very simillar to the previous one. The difference is that with first fetch helper object called *matrix row* which is linked to particular matrix row. Using methods of this object, one may change the matrix elements in given matrix row. An advantage is that the access to the matrix row is resolved only once for all elements in the row. In some more sophisticated sparse matrix formats, this can be nontrivial operation and this approach may slightly improve the performance. 
Another advantage for sparse matrices is that we access the matrix elements based on their *local index* in the row which is something like a rank of the nonzero element in the row. This is more efficient than adressing the matrix elements by the column indexes which requires searching in the matrix row. So this may significantly improve the performance of setup of sparse matrices. When it comes to dense matrices, there should not be great difference in performance compared to use of the methods `setElement` and `getElement`. Note that when the method is called from GPU kernel or lambda function , only data structures allocated on GPU can be accessed and the matrix must be made accessible by the means of. @@ -122,8 +126,10 @@ The following table shows pros and cons of particular mathods: | Method | Pros | Cons | |:----------------------------------------|:-----------------------------------------------------------------------|:----------------------------------------------------------------------| | **Initializer list** | Simple. | Only for small matrices. | -| **STL map** | Simplest of all methods for sparse matrices. | Higher memory requirements, slow transfer on GPU. | -| **[set,add]Element on host** | Simple. | Requires setting of row capacities, slow transfer on GPU. | +| **STL map** | Simplest of all methods for sparse matrices. | Higher memory requirements. | +| | Does not need setting of matrix rows capacities | Slow transfer on GPU. | +| **[set,add]Element on host** | Simple. | Requires setting of row capacities. | +| | | Extremely slow transfer on GPU. | | **[set,add]Element on native device** | Good efficiency. | Requires setting of row capacities. | | | | Requires writting GPU kernel or lambda function. | | | | Allows accessing only data allocated on the same device/memory space. | @@ -136,21 +142,81 @@ The following table shows pros and cons of particular mathods: | | | Allows accessing only data allocated on the same device/memory space. 
| | | | Use of matrix local indexes is less intuitive. | -Though it may seem that the later methods come with more cons than pros they offer much higher performance and we believe they even them are still very user friendly. On the other hand, if the matrix setup performance is not a priority the use the simple but slow method can still be a good choice. +Though it may seem that the later methods come with more cons than pros they offer much higher performance and we believe they even them are still very user friendly. On the other hand, if the matrix setup performance is not a priority the use the simple but slow method can still be a good choice. The following tables demonstrate the performance of different methods. The tests were performed on CPU Intel Xeon CPU E5-2640 and GPU GeForce RTX 2070 in single precision. + +In the test of dense matrices, we set each matrix element to value equal to `rowIdx + columnIdx`. The times in seconds obtained on CPU looks as follows: + +| Matrix rows and columns | `setElement` on host | `setElement` with `ParallelFor` | `getRow` | `forRows` | +|----------------------------:|---------------------:|--------------------------------:|------------:|------------:| +| | | | | | + +And the same on GPU is in the following table: + +| Matrix rows and columns | `setElement` on host | `setElement` with `ParallelFor` | `getRow` | `forRows` | +|----------------------------:|---------------------:|--------------------------------:|------------:|------------:| +| | | | | | + + +The sparse matrices are tested on computation of matrix approximating the Laplace operator in 2D. This matrix has at most five non-zero elements in each row. 
The times for sparse matrix (and CSR formart) on CPU in seconds looks as follows: + +| Matrix rows and columns | STL Map | `setElement` on host | `getRow` | `forRows` | +|----------------------------:|-------------:|---------------------:|------------:|------------:| +| 256 | 0.00045 | 0.00007 | 0.00005 | 0.00007 | +| 1,024 | 0.00129 | 0.00015 | 0.00007 | 0.00008 | +| 4,096 | 0.00569 | 0.00040 | 0.00007 | 0.00009 | +| 16,384 | 0.02024 | 0.00144 | 0.00007 | 0.00014 | +| 65,536 | 0.08687 | 0.00373 | 0.00014 | 0.00040 | +| 262,144 | 0.42524 | 0.01039 | 0.00039 | 0.00146 | +| 1,048,576 | 1.90120 | 0.03860 | 0.00417 | 0.00770 | +| 4,194,304 | 9.89239 | 0.15147 | 0.01844 | 0.03164 | +| 16,777,216 | 55.81530 | 0.61169 | 0.08441 | 0.13739 | +| 67,108,864 | 268.66000 | 2.44765 | 0.33831 | 0.54954 | + +We see, that use of STL map makes sence only in situation when it is hard to estimate necessary row capasities. Otherwise very simple with `setElement` method is much faster. If the performance is the highest priority, `getRow` method should be prefered. 
And the same on GPU is in the following table: + +| Matrix rows and columns | STL Map | `setElement` on host | `setElement` on native device | `getRow` | `forRows` | +|----------------------------:|-------------:|---------------------:|------------------------------:|------------:|------------:| +| 256 | 0.02423 | 0.0457575 | 0.00027 | 0.00026 | 0.00027 | +| 1,024 | 0.00280 | 0.2043830 | 0.00028 | 0.00028 | 0.00028 | +| 4,096 | 0.00637 | 0.8647010 | 0.00031 | 0.00030 | 0.00031 | +| 16,384 | 0.02349 | 3.5592200 | 0.00032 | 0.00031 | 0.00032 | +| 65,536 | 0.10333 | 14.4267000 | 0.00072 | 0.00069 | 0.00070 | +| 262,144 | 0.52870 | 58.6620000 | 0.00117 | 0.00115 | 0.00115 | +| 1,048,576 | 2.17003 | 235.7660000 | 0.00335 | 0.00331 | 0.00333 | +| 4,194,304 | 11.98680 | 930.6170000 | 0.00993 | 0.00997 | 0.01003 | +| 16,777,216 | 64.24220 | 3737.8400000 | 0.02759 | 0.02751 | 0.02745 | +| 67,108,864 | 284.11700 | 15007.6000000 | 0.06648 | 0.06802 | 0.06834 | + +Here we see, the `setElement` methods performs extremely bad because all matrix elements are transfered to GPU one-by-one. Even STL map is much faster. Note, that the times for STL map are not much higher compared to CPU which indicates that the transfer of the matrix on GPU is not dominant. Another simple method could by to setup the matrix on CPU by the means of `setElement` method and trasnfer it on GPU. + +Finaly, the following tables show the times of the same test performed with multidiagonal matrix. 
Times on CPU looks as follows: + +| Matrix rows and columns | STL Map | `setElement` on host | `getRow` | `forRows` | +|----------------------------:|-------------:|---------------------:|------------:|------------:| +| | | | | | + +And on GPU like the fallowing table: +| Matrix rows and columns | STL Map | `setElement` on host | `setElement` on native device | `getRow` | `forRows` | +|----------------------------:|-------------:|---------------------:|------------------------------:|------------:|------------:| +| | | | | | | ### Dense matrices -Dense matrix is a templated class defined in the namespace \ref TNL::Matrices. It has five template parameters: +Dense matrix (\ref TNL::Matrices::DenseMatrix) is a templated class defined in the namespace \ref TNL::Matrices. It has five template parameters: * `Real` is a type of the matrix elements. It is `double` by default. * `Device` is a device where the matrix shall be allocated. Currently it can be either \ref TNL::Devices::Host for CPU or \ref TNL::Devices::Cuda for GPU supporting CUDA. It is \ref TNL::Devices::Host by default. * `Index` is a type to be used for indexing of the matrix elements. It is `int` by default. -* `ElementsOrganization` defines the organization of the matrix elements in memory. It can be \ref TNL::Algorithms::Segments::ColumnMajorOrder or \ref TNL::Algorithms::Segments::RowMajorOrder for column-major and row-major organization respectively. Be default it is the row-major order if the matrix is allocated in the host system and column major order if it is allocated on GPU. +* `ElementsOrganization` defines the organization of the matrix elements in memory. It can be \ref TNL::Algorithms::Segments::ColumnMajorOrder or \ref TNL::Algorithms::Segments::RowMajorOrder for column-major and row-major organization respectively. Be default it is the row-major order if the matrix is allocated on the host system and column major order if it is allocated on GPU. 
* `RealAllocator` is a memory allocator (one from \ref TNL::Allocators) which shall be used for allocation of the matrix elements. By default, it is the default allocator for given `Real` type and `Device` type -- see \ref TNL::Allocators::Default. -The following examples show how to allocate the dense matrix and how to initialize the matrix elements. Small matrices can be created simply by the constructor with an initializer list. +The following examples show how to allocate the dense matrix and how to initialize the matrix elements. + +#### Initializer list + +Small matrices can be created simply by the constructor with an [initializer list](https://en.cppreference.com/w/cpp/utility/initializer_list). \includelineno Matrices/DenseMatrix/DenseMatrixExample_Constructor_init_list.cpp @@ -158,15 +224,17 @@ In fact, the constructor takes a list of initializer lists. Each embedded list d \include DenseMatrixExample_Constructor_init_list.out -Larger matrices can be set-up with methods `setElement` and `addElement` (\ref TNL::Matrices::DenseMatrix::setElement, \ref TNL::Matrices::DenseMatrix::addElement). The following example shows how to call these methods from the host. +#### Methods `setElement` and `addElement` + +Larger matrices can be setup with methods `setElement` and `addElement` (\ref TNL::Matrices::DenseMatrix::setElement, \ref TNL::Matrices::DenseMatrix::addElement). The following example shows how to call these methods from the host. \includelineno DenseMatrixExample_addElement.cpp -As we can see, both methods can be called from the host no matter where the matrix is allocated. If it is on GPU, each call of `setElement` or `addElement` (\ref TNL::Matrices::DenseMatrix::setElement, \ref TNL::Matrices::DenseMatrix::addElement) causes slow transfer of tha data between CPU and GPU. Use this approach only if the performance is not a priority for example for matrices which are set only once this way. 
The result looks as follows: +As we can see, both methods can be called from the host no matter where the matrix is allocated. If it is on GPU, each call of `setElement` or `addElement` (\ref TNL::Matrices::DenseMatrix::setElement, \ref TNL::Matrices::DenseMatrix::addElement) causes slow transfer of tha data between CPU and GPU. Use this approach only if the performance is not a priority. The result looks as follows: \include DenseMatrixExample_addElement.out -More efficient way of the matrix initialization on GPU consists in calling the methods `setElement` and `addElement` (\ref TNL::Matrices::DenseMatrix::setElement, \ref TNL::Matrices::DenseMatrix::addElement) directly from GPU. It is demonstrated in the following example (of course it works even for CPU): +More efficient way of the matrix initialization on GPU consists of calling the methods `setElement` and `addElement` (\ref TNL::Matrices::DenseMatrix::setElement, \ref TNL::Matrices::DenseMatrix::addElement) directly from GPU. It is demonstrated in the following example (of course it works even for CPU): \includelineno DenseMatrixExample_setElement.cpp @@ -174,6 +242,12 @@ Here we use `SharedPointer` (\ref TNL::Pointers::SharedPointer) to make the matr \include DenseMatrixExample_setElement.out +#### Method `getRow` + +This method is available for the dense matrix (\ref TNL::Matrices::DenseMatrix::getRow) but only for compatibility with the sparse matrices which the method was designed for. Use it only when you need unified code for both dense and sparse matrices. + +#### Method `forRows` + If we want to set more matrix elements in each row, we can use inner for-loop in the lambda function `f`. This, however, is limiting the parallelization and it can be inefficient for larger matrices. 
The next example demonstrates a method `forRows` (\ref TNL::Matrices::DenseMatrix::forRows) which iterates over all matrix elements in parallel and it calls a lambda function defining an operation we want to do on the matrix elements. \includelineno DenseMatrixExample_forRows.cpp @@ -211,9 +285,11 @@ Major disadventage of sparse matrices is that there are a lot of different forma **If `Real` is set to `bool`, we get *a binary matrix* for which the non-zero elements can be equal only to one and so the matrix elements values are not stored explicitly in the memory.** -### Sparse matrix allocation and initiation +In the following text we will show how to create and setup sparse matrices. -Small matrices can be initialized by a constructor with initializer list. We assume having the following sparse matrix +#### Initializer list + +Small matrices can be initialized by a constructor with an [initializer list](https://en.cppreference.com/w/cpp/utility/initializer_list). We assume having the following sparse matrix \f[ \left( @@ -245,6 +321,20 @@ The result of both examples looks as follows: \include SparseMatrixExample_Constructor_init_list_2.out +#### STL map + +Finaly, there is a constructor which creates the sparse matrix from [`std::map`](https://en.cppreference.com/w/cpp/container/map). It is usefull especially in situation when you cannot compute the matrix elements by rows but rather in random order. You can do it on CPU and store the matrix elements in [`std::map`](https://en.cppreference.com/w/cpp/container/map) data structure in a [COO](https://en.wikipedia.org/wiki/Sparse_matrix#Coordinate_list_(COO)) format manner. It means that each entry of the `map` is the following pair: + +``` +std::pair( std::pair( row_index, column_index ), element_value ) +``` + +which defines one matrix element at given coordinates with given value. Of course, you can insert such entries in any order into the `map`. When it is complete you can pass it the sparse matrix. 
See the following example: + +\includelineno SparseMatrixExample_Constructor_std_map.cpp + +#### Setting of row capacities + Larger matrices are created in two steps: 1. We use a method \ref TNL::Matrices::SparseMatrix::setRowCapacities to initialize the underlying matrix format and to allocate memory for the matrix elements. This method only needs to know how many non-zero elements are supposed to be in each row. Once this is set, it cannot be changed only by reseting the whole matrix. In most situations, this is not an issue to compute the number of non-zero elements in each row. Note, however, that we do not tell the positions of the non-zeto elements. If some matrix format needs this information it cannot be used with this implementation of the sparse matrix. @@ -264,7 +354,6 @@ See the following example which creates lower triangular matrix like this one \right). \f] - \includelineno SparseMatrixExample_setRowCapacities.cpp The method \ref TNL::Matrices::SparseMatrix::setRowCapacities reads the required capacities of the matrix rows from a vector (or simmilar container - \ref TNL::Containers::Array, \ref TNL::Containers::ArrayView, \ref TNL::Containers::Vector and \ref TNL::Containers::VectorView) which has the same number of elements as the number of matrix rows and each element defines the capacity of the related row. The result looks as follows: @@ -283,19 +372,10 @@ The result of both examples looks as follows: \include SparseMatrixExample_Constructor_init_list_1.out -Finaly, there is a constructor which creates the sparse matrix from 'std::map'. It is usefull especially in situation when you cannot compute the matrix elements by rows but rather in random order. You can do it on CPU and store the matrix elements in `std::map` data structure in a [COO](https://en.wikipedia.org/wiki/Sparse_matrix#Coordinate_list_(COO)) format manner. 
It means that each entry of the `map` is the following pair: - -``` -std::pair( std::pair( row_index, column_index ), element_value ) -``` - -which defines one matrix element at given coordinates with given value. Of course, you can insert such entries in any order into the `map`. When it is complete you can pass it the sparse matrix. See the following example: - -\includelineno SparseMatrixExample_Constructor_std_map.cpp +#### Methods `setElement` and `addElement` A method `setElements` works the same way for already existing instances of sparse matrix: - \includelineno SparseMatrixExample_setElements_map.cpp The result of both examples looks as folows: @@ -324,6 +404,24 @@ The result looks as follows: \include SparseMatrixExample_addElement.out +#### Method `getRow` + +More efficient method is to combine `getRow` (\ref TNL::Matrices::SparseMatrix::getRow) method with `ParallelFor` (\ref TNL::Algorithms::ParallelFor) and lambda function as the following example demonstrates: + +\includelineno SparseMatrixViewExample_getRow.cpp + +On the line 11, we create small matrix having five rows (number of rows is given by the size of the [initializer list](https://en.cppreference.com/w/cpp/utility/initializer_list) ) and columns (number of columns is given by the second parameter) and we set each row capacity to one (particular elements of the initalizer list). On the line 22, we call `ParallelFor` to iterate over all matrix elements. Each row is processed by the lambda function `f` (lines14-17). In the lambda function, we first fetch a sparse matrix row (\ref TNL::Matrices::SparseMatrixRowView) which is a proxy to matrix row. This object has a method `setElement` accepting three parameters: + +1. `localIdx` is a rank of the nonzero element in given matrix row. +2. `columnIdx` is the new column index of the matrix element. +3. `value` is the new value of the matrix element. 
+ +The result looks as follows: + +\include SparseMatrixViewExample_getRow.out + +#### Method `forRows` + Finaly, for the most efficient way of setting the non-zero matrix elements, is use of a method `forRows`. It requires indexes of the range of rows (`begin` and `end`) to be processed and a lambda function `function` which is called for each non-zero element. The lambda functions provides the following data: * `rowIdx` is a row index of the matrix element. @@ -404,7 +502,7 @@ would not make sense. If we pass through this test, the matrix element lies in t \include SparseMatrixExample_forRows.out -## Tridiagonal matrices +### Tridiagonal matrices Tridiagonal matrix format serves for specific matrix pattern when the nonzero matrix elements can be placed only at the diagonal and immediately next to the diagonal. Here is an example: @@ -439,7 +537,9 @@ Tridiagonal matrix is a templated class defined in the namespace \ref TNL::Matri * `ElementsOrganization` defines the organization of the matrix elements in memory. It can be \ref TNL::Algorithms::Segments::ColumnMajorOrder or \ref TNL::Algorithms::Segments::RowMajorOrder for column-major and row-major organization respectively. Be default it is the row-major order if the matrix is allocated in the host system and column major order if it is allocated on GPU. * `RealAllocator` is a memory allocator (one from \ref TNL::Allocators) which shall be used for allocation of the matrix elements. By default, it is the default allocator for given `Real` type and `Device` type -- see \ref TNL::Allocators::Default. -### Tridiagonal matrix allocation and initiation +In the following text we shows different methods for setup of tridiagonal matrices. + +#### Initializer list The tridiagonal matrix can be initialized by the means of the constructor with initializer list. 
The matrix from the begining of this section can be constructed as the following example shows: @@ -526,6 +626,8 @@ The output of the example looks as: \include TridiagonalMatrixExample_Constructor_init_list_1.out +#### Methods `setElement` and `addElement` + Similar way of the tridiagonal matrix setup is offered by the method `setElements` (\ref TNL::Matrices::TridiagonalMatrix::setElements) as the following example demonstrates: \includelineno TridiagonalMatrixExample_setElements.cpp @@ -543,6 +645,8 @@ The result looks as follows: \include TridiagonalMatrixExample_setElement.out +#### Method `getRow` + A slightly simpler way how to do the same with no need for shared pointer (\ref TNL::Pointers::SharedPointer), could be with the use of tridiagonal matrix view and the method `getRow` (\ref TNL::Matrices::TridiagonalMatrixView::getRow) as the following example demonstrates: \includelineno TridiagonalMatrixViewExample_getRow.cpp @@ -553,6 +657,8 @@ The result looks as follows: \include TridiagonalMatrixViewExample_getRow.out +#### Method `forRows` + Finaly, even a bit more simple and bit less flexible way of matrix elements manipulation with use of the method `forRows` (\ref TNL::Matrices::TridiagonalMatrix::forRows) is demonstrated in the following example: \includelineno TridiagonalMatrixViewExample_forRows.cpp @@ -578,7 +684,7 @@ The result looks as follows: \include TridiagonalMatrixViewExample_forRows.out -## Multidiagonal matrices +### Multidiagonal matrices Multidiagonal matrices are generalization of the tridiagonal matrix. It is a special type of sparse matrices with specific pattern of the nonzero matrix elements which are positioned only parallel along diagonal. See the following example: @@ -640,6 +746,8 @@ Multidiagonal matrix is a templated class defined in the namespace \ref TNL::Mat * `RealAllocator` is a memory allocator (one from \ref TNL::Allocators) which shall be used for allocation of the matrix elements. 
By default, it is the default allocator for given `Real` type and `Device` type -- see \ref TNL::Allocators::Default. * `IndexAllocator` is a memory allocator (one from \ref TNL::Allocators) which shall be used for allocation of the matrix elements offsets. By default, it is the default allocator for given `Index` type and `Device` type -- see \ref TNL::Allocators::Default. +In the following text we show different methods how to setup multidiagonal matrices. + ### Multidiagonal matrix allocation and initiation The construction of the multidiagonal matrix differs from the tridiagonal mainly in necessity to define the offsets of "subdiagonals" as we demonstrate on the following example which creates matrix like of the following form: -- GitLab From 0763edd1d8b6f831f7f617df44b80f052a7b9350 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= Date: Mon, 11 Jan 2021 22:41:58 +0100 Subject: [PATCH 30/53] Writting documentation on matrices. --- .../Tutorials/Matrices/tutorial_Matrices.md | 210 +++++++++--------- 1 file changed, 109 insertions(+), 101 deletions(-) diff --git a/Documentation/Tutorials/Matrices/tutorial_Matrices.md b/Documentation/Tutorials/Matrices/tutorial_Matrices.md index c376a51aa..1e39fca89 100644 --- a/Documentation/Tutorials/Matrices/tutorial_Matrices.md +++ b/Documentation/Tutorials/Matrices/tutorial_Matrices.md @@ -6,16 +6,17 @@ TNL offers several types of matrices like dense (\ref TNL::Matrices::DenseMatrix ## Table of Contents 1. [Overview of matrix types](#overview_of_matrix_types) -2. [Matrix view](#matrix_view) -3. [Allocation and setup of different matrix types](#allocation_and_setup_of_different_matrix_types) +2. [Indexing of nonzero matrix elements in sparse matrices](#indexing_of_nonzero_matrix_elements_in_sparse_matrices) +3. [Matrix view](#matrix_view) +4. [Allocation and setup of different matrix types](#allocation_and_setup_of_different_matrix_types) 1. [Dense matrices](#dense_matrices_setup) 2. 
[Sparse matrices](#sparse_matrices_setup) 3. [Tridiagonal matrices](#tridiagonal_matrices_setup) 4. [Multidiagonal matrices](#multidiagonal_matrices_setup) 5. [Lambda matrices](#lambda_matrices_setup) -4. [Flexible reduction in matrix rows](#flexible_reduction_in_matrix_rows) -5. [Matrix-vector product](#matrix_vector_product) -6. [Matrix I/O operations](#matrix_io_operations) +5. [Flexible reduction in matrix rows](#flexible_reduction_in_matrix_rows) +6. [Matrix-vector product](#matrix_vector_product) +7. [Matrix I/O operations](#matrix_io_operations) ## Overview of matrix types @@ -106,6 +107,47 @@ In this table: * **Sparse matrix** is number of bytes needed to store one matrix element in the sparse matrix. * **Fill ratio** is maximal percentage of the nonzero matrix elements until which the sparse matrix can perform better. +## Indexing of nonzero matrix elements in sparse matrices + +The sparse matrix formats usualy, in the first step, compress the matrix rows by omitting the zero matrix elements as follows + +\f[ +\left( +\begin{array}{ccccc} +0 & 1 & 0 & 2 & 0 \\ +0 & 0 & 5 & 0 & 0 \\ +4 & 0 & 0 & 0 & 7 \\ +0 & 3 & 0 & 8 & 5 \\ +0 & 5 & 7 & 0 & 0 +\end{array} +\right) +\rightarrow +\left( +\begin{array}{ccccc} +1 & 2 & . & . & . \\ +5 & . & . & . & . \\ +4 & 7 & . & . & . \\ +3 & 8 & 5 & . & . \\ +5 & 7 & . & . & . +\end{array} +\right) +\f] + +In this case, it is more efficient to refer the nonzero matrix elements by their rank in the compressed matrix rather than by their column index in the original matrix. In methods for the sparse matrices, this parameter is called `localIdx`. Some sparse matrix formats adds some padding zeros for better alignment of data in memory. But if this is not the case, the variable `localIdx` of particular matrix elements would read as: + +\f[ +\left( +\begin{array}{ccccc} +0 & 1 & . & . & . \\ +0 & . & . & . & . \\ +0 & 1 & . & . & . \\ +0 & 1 & 2 & . & . \\ +0 & 1 & . & . & . 
+\end{array} +\right) +\f] + + ## Matrix view TODO: concept of matrix view. Add reference to general concepts @@ -263,7 +305,7 @@ The result looks as follows: \include DenseMatrixExample_forRows.out -## Sparse matrices +### Sparse matrices [Sparse matrices](https://en.wikipedia.org/wiki/Sparse_matrix) are extremely important in a lot of numerical algorithms. They are used at situations when we need to operate with matrices having majority of the matrix elements equal to zero. In this case, only the non-zero matrix elements are stored with possible some *padding zeros* used for memory alignment. This is necessary mainly on GPUs. Consider just matrix having 50,000 rows and columns whih is 2,500,000,000 matrix elements. If we store each matrix element in double precision (it means eight bytes per element) we need 20,000,000,000 bytes which is nearly 20 GB of memory. If there are only five non-zero elements in each row we need only \f$8 \times 5 \times 50,000=2,000,000\f$ bytes and so nearly 200 MB. It is really great difference. @@ -434,45 +476,7 @@ See the following example: \includelineno SparseMatrixExample_forRows.cpp -On the line 9, we allocate a lower triangular matrix (because the row capacities `{1,2,3,4,5}` are equal to row index) using the `SparseMatrix`. On the line 11, we prepare lambda function `f` which we execute on the line 22 just by calling the method `forRows` (\ref TNL::Matrices::SpartseMatrix::forRows). This method takes the range of matrix rows as the first two parameters and the lambda function as the last parameter. The lambda function receives parameters metioned above (see the line 11). We first check if the matrix element coordinates (`rowIdx` and `localIdx`) points to an element lying before the matrix diagonal or on the diagonal. At this moment we should better explain the meaning of the parameter `localIdx`. It says the local index or the range of the non-zero element in the matrix row. 
The sparse matrix formats usualy in the first step compress the matrix rows by omitting the zero matrix elements as follows - -\f[ -\left( -\begin{array}{ccccc} -0 & 1 & 0 & 2 & 0 \\ -0 & 0 & 5 & 0 & 0 \\ -4 & 0 & 0 & 0 & 7 \\ -0 & 3 & 0 & 8 & 5 \\ -0 & 5 & 7 & 0 & 0 -\end{array} -\right) -\rightarrow -\left( -\begin{array}{ccccc} -1 & 2 & . & . & . \\ -5 & . & . & . & . \\ -4 & 7 & . & . & . \\ -3 & 8 & 5 & . & . \\ -5 & 7 & . & . & . -\end{array} -\right) -\f] - -Some sparse matrix formats adds back padding zeros for better alignment of data in memory. But if this is not the case, the local indexes of the matrix elements would read as: - -\f[ -\left( -\begin{array}{ccccc} -0 & 1 & . & . & . \\ -0 & . & . & . & . \\ -0 & 1 & . & . & . \\ -0 & 1 & 2 & . & . \\ -0 & 1 & . & . & . -\end{array} -\right) -\f] - -In case of the lower triangular matrix in our example, the local index is in fact the same as the column index +On the line 9, we allocate a lower triangular matrix (because the row capacities `{1,2,3,4,5}` are equal to row index) using the `SparseMatrix`. On the line 11, we prepare lambda function `f` which we execute on the line 22 just by calling the method `forRows` (\ref TNL::Matrices::SpartseMatrix::forRows). This method takes the range of matrix rows as the first two parameters and the lambda function as the last parameter. The lambda function receives parameters metioned above (see the line 11). We first check if the matrix element coordinates (`rowIdx` and `localIdx`) points to an element lying before the matrix diagonal or on the diagonal. In case of the lower triangular matrix in our example, the local index is in fact the same as the column index \f[ \left( @@ -541,7 +545,7 @@ In the following text we shows different methods for setup of tridiagonal matric #### Initializer list -The tridiagonal matrix can be initialized by the means of the constructor with initializer list. 
The matrix from the begining of this section can be constructed as the following example shows: +The tridiagonal matrix can be initialized by the means of the constructor with [initializer list](https://en.cppreference.com/w/cpp/utility/initializer_list). The matrix from the begining of this section can be constructed as the following example shows: \includelineno TridiagonalMatrixExample_Constructor_init_list_1.cpp @@ -632,7 +636,6 @@ Similar way of the tridiagonal matrix setup is offered by the method `setElement \includelineno TridiagonalMatrixExample_setElements.cpp - Here we create the matrix in two steps. Firstly, we setup the matrix dimensions by the appropriate constructor (line 24) and after that we setup the matrix elements (line 25-45). The result looks the same as in the previous example: \include TridiagonalMatrixExample_setElements.out @@ -748,9 +751,62 @@ Multidiagonal matrix is a templated class defined in the namespace \ref TNL::Mat In the following text we show different methods how to setup multidiagonal matrices. -### Multidiagonal matrix allocation and initiation +#### Initializer list + +Smaller multidiagonal matrices can be constructed using the constructor of multidiagonal matrix taking the subdiagonals offsets as an initializer list: + +\includelineno MultidiagonalMatrixExample_Constructor_init_list_1.cpp + +The only change is on the line 17 which reads as -The construction of the multidiagonal matrix differs from the tridiagonal mainly in necessity to define the offsets of "subdiagonals" as we demonstrate on the following example which creates matrix like of the following form: +``` +TNL::Matrices::MultidiagonalMatrix< double, Device > matrix( matrixSize, matrixSize, { - gridSize, -1, 0, 1, gridSize } ); +``` + +Here we call the mentioned constructor, which accepts the matrix dimensions (number of rows and columns) as first two parameters and the initializer list with the subdiagonal offsets as the last one. 
The result looks the same as in the previous example. + +There is also a constructor with initializer list for matrix elements values as demonstrated by the following example: + +\includelineno MultidiagonalMatrixExample_Constructor_init_list_2.cpp + +Here, we create a matrix which looks as + +\f[ +\left( +\begin{array}{cccccc} +4 & -1 & & -1 & & \\ +-1 & 4 & -1 & & -1 & \\ + & -1 & 4 & -1 & & -1 \\ +-1 & & -1 & 4 & -1 & \\ + & -1 & & -1 & 4 & -1 \\ + & & -1 & & -1 & 4 \\ +\end{array} +\right). +\f] + +On the lines 25-46, we call the constructor which, in addition to matrix dimensions and subdiagonals offsets, accepts also initializer list of initializer lists with matrix elements values. Each embedded list corresponds to one matrix row and it contains values of matrix elements on particular subdiagonals including those which lies out of the matrix. The result looks as follows: + +\include MultidiagonalMatrixExample_Constructor_init_list_2.out + +#### Methods `setElement` and `addElement` + +The matrix elements values can be changed using the method method `setElements` (\ref TNL::Matrices::MutlidiagonalMatrix::setElements) which accepts the elements values in the same form of embedded initializer list. It just does not allow changing the subdiagonals offsets. For this purpose method `setDiagonalsOffsets` (\ref TNL::Matrices::MultidiagonalMatrix::setDiagonalsOffsets) can be used. Note, however, that this method deletes all current matrix elements. + +Another way of setting the matrix elements is by means of the method `setElement` (\ref TNL::Matrices::MutlidiagonalMatrix::setElement). It works the same way as with other matrix types as we can see in the follofwing example: + +\includelineno MultidiagonalMatrixExample_setElement.cpp + +This examples shows that the method `setElement` can be used both on the host (CPU) (line 17) as well as in the GPU kernels (lines 23-27). 
Here we use shared pointer (\ref TNL::Pointers::SharedPointer) (line 15) to pass the multidiagonal matrix to lambda function `f` (lines 22-28) which may run on GPU. In this case we have to synchronize the shared pointer explicitly by calling the function \ref TNL::Pointers::synchronizeSmartPointersOnDevice. To avoid this inconvenience the same can be achieved with the multidiagonal matrix view: + +\includelineno MultidiagonalMatrixViewExample_setElement.cpp + +In this example, we fetch the matrix view (line 16) immediately after creating the matrix itself (line 15). Note that the matrix view can be obtained from the matrix at any time while the shared pointer only at the time of the matrix creation. On the other hand, if the original matrix is changed, all matrix views become invalid which is not true for the shared pointers. So it is better to fetch the matrix view immediately before we use it to avoid the situation that you would use an invalid matrix view. The method `setElement` (\ref TNL::Matrices::MultidiagonalMatrixView::setElement) can be used on both host (CPU) (line 19) and the device (lines 25-29) if the lambda function `f` (lines 24-30) runs in a GPU kernel.
The result of both examles looks the same: + +\include MultidiagonalMatrixViewExample_setElement.out + +#### Method `getRow` + +In this example we will create matrix of the following form: \f[ \left( @@ -775,7 +831,7 @@ The construction of the multidiagonal matrix differs from the tridiagonal mainly \right) \f] -The code reads as: +The code based on use of method 'getRow' reads as: \includelineno MultidiagonalMatrixExample_Constructor.cpp @@ -891,61 +947,11 @@ We use `ParallelFor2D` (\ref TNL::Algorithms::ParallelFor2D) to iterate over all \end{array} \f] - The result looks as follows: \include MultidiagonalMatrixExample_Constructor.out -Slightly simpler way of doing the same is by using the constructor of multidiagonal matrix taking the subdiagonals offsets as an STL initializer list: - -\includelineno MultidiagonalMatrixExample_Constructor_init_list_1.cpp - -The only change is on the line 17 which reads as - -``` -TNL::Matrices::MultidiagonalMatrix< double, Device > matrix( matrixSize, matrixSize, { - gridSize, -1, 0, 1, gridSize } ); -``` - -Here we call the mentioned constructor, which accepts the matrix dimensions (number of rows and columns) as first two parameters and the initializer list with the subdiagonal offsets as the last one. The result looks the same as in the previous example. - -There is also a constructor with initializer list for matrix elements values as demonstrated by the following example: - -\includelineno MultidiagonalMatrixExample_Constructor_init_list_2.cpp - -Here, we create a matrix which looks as - -\f[ -\left( -\begin{array}{cccccc} -4 & -1 & & -1 & & \\ --1 & 4 & -1 & & -1 & \\ - & -1 & 4 & -1 & & -1 \\ --1 & & -1 & 4 & -1 & \\ - & -1 & & -1 & 4 & -1 \\ - & & -1 & & -1 & 4 \\ -\end{array} -\right). -\f] - -On the lines 25-46, we call the constructor which, in addition to matrix dimensions and subdiagonals offsets, accepts also initializer list of initializer lists with matrix elements values. 
Each embedded list corresponds to one matrix row and it contains values of matrix elements on particular subdiagonals including those which lies out of the matrix. The result looks as follows: - -\include MultidiagonalMatrixExample_Constructor_init_list_2.out - -The matrix elements values can be changed the same way using the method method `setElements` (\ref TNL::Matrices::MutlidiagonalMatrix::setElements) which accepts the elements values in the same form of embedded initializer list. It just does not allow changing the subdiagonals offsets. For this purpose method `setDiagonalsOffsets` (\ref TNL::Matrices::MultidiagonalMatrix::setDiagonalsOffsets) can be used. Note, however, that this method deletes all current matrix elements. - -Another way of setting the matrix elements is by means of the method `setElement` (\ref TNL::Matrices::MutlidiagonalMatrix::setElement). It works the same way as with other matrix types as we can see in the follofwing example: - -\includelineno MultidiagonalMatrixExample_setElement.cpp - -This examples shows that the method `setElement` can be used both on the host (CPU) (line 17) as well as in the GPU kernels (lines 23-27). Here we use shared pointer (\ref TNL::Pointers::SharedPointer) (line 15) to pass the multidiagonal matrix to lambda function `f` (lines 22-28) which may run on GPU. In this case we have to synchronize to share pointer explicitly by calling the function \ref TNL::Pointers::synchronizeSmartPointersOnDevice. To avoid this inconvenience the same can be achieved with the multidiagonal matrix view: - -\includelineno MultidiagonalMatrixViewExample_setElement.cpp - -In this example, we fetch the matrix view (line 16) immediately after creating the matrix itself (line 15). Note that the matrix view can be obtained from the matrix at any time while the shared pointer only at the time of the matrix creation. 
On the other hand, if the original matrix is changed, all matrix views become invalid which is not true for the shared pointers. So it is better to fetch the matrix view immediately before we use it to avoid the sitaution that you would use invalid matrix view. The method `setElement` (\ref TNL::Matrices::MutlidiagonalMatrixView::setElement) can be used on both host (CPU) (line 19) and the device (lines 25-29) if the lambda function `f` (lines 24-30) runs in GPU kernel. The result of both examles looks the same: - -\include MultidiagonalMatrixViewExample_setElement.out - -Another way for setting the matrix elements is by means of the multidiagonal matrix row: +Another way for setting the matrix elements is by means of the multidiagonal matrix row is as follows: \includelineno MultidiagonalMatrixViewExample_getRow.cpp @@ -978,6 +984,8 @@ The second parameter of the method `setElement` is the new matrix elements value \include MultidiagonalMatrixViewExample_getRow.out +#### Method `forRows` + Similar and even a bit simpler way of setting the matrix elements is offered by the method `forRows` (\ref TNL::Matrices::MultidiagonalMatrix::forRows, \ref TNL::Matrices::MultidiagonalMatrixView::forRows) as demonstrated in the following example: \includelineno MultidiagonalMatrixViewExample_forRows.cpp -- GitLab From 6149889f7721294f48a6a9a794fcd80c5a0eb360 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= Date: Tue, 12 Jan 2021 07:21:16 +0100 Subject: [PATCH 31/53] Deleting long time runnig matrix benchmarks from automatic run. 
--- .../Tutorials/Matrices/CMakeLists.txt | 33 +++---------------- 1 file changed, 5 insertions(+), 28 deletions(-) diff --git a/Documentation/Tutorials/Matrices/CMakeLists.txt b/Documentation/Tutorials/Matrices/CMakeLists.txt index c6dd7cfe4..0d672aa0b 100644 --- a/Documentation/Tutorials/Matrices/CMakeLists.txt +++ b/Documentation/Tutorials/Matrices/CMakeLists.txt @@ -94,36 +94,19 @@ IF( BUILD_CUDA ) ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/SparseMatrixViewExample_setElement.out OUTPUT SparseMatrixViewExample_setElement.out ) + #### + # THe following examples/benchmarks run for very long time CUDA_ADD_EXECUTABLE( DenseMatrixSetup_Benchmark_cuda DenseMatrixSetup_Benchmark.cu ) - ADD_CUSTOM_COMMAND( COMMAND DenseMatrixSetup_Benchmark_cuda > - ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/DenseMatrixSetup_Benchmark.out - OUTPUT DenseMatrixSetup_Benchmark.out ) - CUDA_ADD_EXECUTABLE( SparseMatrixSetup_Benchmark_cuda SparseMatrixSetup_Benchmark.cu ) - ADD_CUSTOM_COMMAND( COMMAND SparseMatrixSetup_Benchmark_cuda > - ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/SparseMatrixSetup_Benchmark.out - OUTPUT SparseMatrixSetup_Benchmark.out ) - CUDA_ADD_EXECUTABLE( MultidiagonalMatrixSetup_Benchmark_cuda MultidiagonalMatrixSetup_Benchmark.cu ) - ADD_CUSTOM_COMMAND( COMMAND MultidiagonalMatrixSetup_Benchmark_cuda > - ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/MultidiagonalMatrixSetup_Benchmark.out - OUTPUT MultidiagonalMatrixSetup_Benchmark.out ) ELSE() - ADD_EXECUTABLE( DenseMatrixSetup_Benchmark DenseMatrixSetup_Benchmark_cuda.cpp ) - ADD_CUSTOM_COMMAND( COMMAND DenseMatrixSetup_Benchmark > - ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/DenseMatrixSetup_Benchmark.out - OUTPUT DenseMatrixSetup_Benchmark.out ) + #### + # THe following examples/benchmarks run for very long time + ADD_EXECUTABLE( DenseMatrixSetup_Benchmark DenseMatrixSetup_Benchmark_cuda.cpp ) ADD_EXECUTABLE( SparseMatrixSetup_Benchmark SparseMatrixSetup_Benchmark_cuda.cpp ) - ADD_CUSTOM_COMMAND( COMMAND 
SparseMatrixSetup_Benchmark > - ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/SparseMatrixSetup_Benchmark.out - OUTPUT SparseMatrixSetup_Benchmark.out ) - ADD_EXECUTABLE( MultidiagonalMatrixSetup_Benchmark MultidiagonalMatrixSetup_Benchmark_cuda.cpp ) - ADD_CUSTOM_COMMAND( COMMAND MultidiagonalMatrixSetup_Benchmark > - ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/MultidiagonalMatrixSetup_Benchmark.out - OUTPUT MultidiagonalMatrixSetup_Benchmark.out ) ENDIF() IF( BUILD_CUDA ) @@ -145,15 +128,9 @@ ADD_CUSTOM_TARGET( TutorialsMatricesCuda ALL DEPENDS SparseMatrixExample_forRows.out SparseMatrixExample_rowsReduction_vectorProduct.out SparseMatrixViewExample_setElement.out - DenseMatrixSetup_Benchmark.out - SparseMatrixSetup_Benchmark.out - MultidiagonalMatrixSetup_Benchmark.out ) ELSE() ADD_CUSTOM_TARGET( TutorialsMatrices ALL DEPENDS - DenseMatrixSetup_Benchmark.out - SparseMatrixSetup_Benchmark.out - MultidiagonalMatrixSetup_Benchmark.out ) ENDIF() # -- GitLab From 333d604f331464b3e89014dbd2ff3dcd710154fa Mon Sep 17 00:00:00 2001 From: Tomas Oberhuber Date: Tue, 12 Jan 2021 12:51:20 +0100 Subject: [PATCH 32/53] Added tutorial on core concepts. --- .../CoreConcepts/tutorial_CoreConcepts.md | 15 +++++++++++++++ 1 file changed, 15 insertions(+) create mode 100644 Documentation/Tutorials/CoreConcepts/tutorial_CoreConcepts.md diff --git a/Documentation/Tutorials/CoreConcepts/tutorial_CoreConcepts.md b/Documentation/Tutorials/CoreConcepts/tutorial_CoreConcepts.md new file mode 100644 index 000000000..61082057e --- /dev/null +++ b/Documentation/Tutorials/CoreConcepts/tutorial_CoreConcepts.md @@ -0,0 +1,15 @@ +\page tutorial_CoreConcepts Core concepts + +## Introduction + + + + +## Table of Contents +1. [Devices and allocators](#devices_and_allocators) +2. [Algorithms and lambda functions](#algorithms_and_lambda_functions) +3. 
[Smart pointers and views](#static_for) + + +## Devices and allocators + -- GitLab From 75a8f3e34f83d23a61d402a42a188fb6a77d6c72 Mon Sep 17 00:00:00 2001 From: Tomas Oberhuber Date: Tue, 12 Jan 2021 12:51:48 +0100 Subject: [PATCH 33/53] Added tutorial on core concepts. --- Documentation/Tutorials/index.md | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/Documentation/Tutorials/index.md b/Documentation/Tutorials/index.md index d517faa3b..2addba243 100644 --- a/Documentation/Tutorials/index.md +++ b/Documentation/Tutorials/index.md @@ -3,9 +3,10 @@ ## Tutorials 1. [Building applications with TNL](tutorial_building_applications_with_tnl.html) -2. [Arrays](tutorial_Arrays.html) -3. [Vectors](tutorial_Vectors.html) -4. [Flexible parallel reduction and scan](tutorial_ReductionAndScan.html) -5. [For loops](tutorial_ForLoops.html) -6. [Cross-device pointers](tutorial_Pointers.html) -7. [Matrices](tutorial_Matrices.html) +2. [Core concepts](tutorial_CoreConcepts.html) +3. [Arrays](tutorial_Arrays.html) +4. [Vectors](tutorial_Vectors.html) +5. [Flexible parallel reduction and scan](tutorial_ReductionAndScan.html) +6. [For loops](tutorial_ForLoops.html) +7. [Cross-device pointers](tutorial_Pointers.html) +8. [Matrices](tutorial_Matrices.html) -- GitLab From fe1823331eb47dfa2ecabd859cd1bd727c64912f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= Date: Tue, 12 Jan 2021 22:35:40 +0100 Subject: [PATCH 34/53] Writting documentation - general concepts. 
--- Documentation/Tutorials/CMakeLists.txt | 1 + .../CoreConcepts/tutorial_CoreConcepts.md | 15 --- ...thms_and_lambda_functions_parallel_for.cpp | 8 ++ ...orithms_and_lambda_functions_reduction.cpp | 10 ++ ...ithms_and_lambda_functions_reduction_2.cpp | 12 ++ ..._and_lambda_functions_reduction_cublas.cpp | 9 ++ ...d_allocators_arrays_assignment_example.cpp | 1 + ...ces_and_allocators_arrays_device_check.cpp | 9 ++ ...and_allocators_arrays_device_deduction.cpp | 6 + ..._devices_and_allocators_arrays_example.cpp | 2 + ..._and_allocators_arrays_setsize_example.cpp | 2 + ...d_pointers_and_views_capture_reference.cpp | 11 ++ ...hared_pointers_and_views_capture_value.cpp | 11 ++ ...shared_pointers_and_views_capture_view.cpp | 10 ++ ...pointers_and_views_capture_view_change.cpp | 11 ++ ...inters_and_views_capture_view_change_2.cpp | 11 ++ .../tutorial_GeneralConcepts.md | 115 ++++++++++++++++++ Documentation/Tutorials/index.md | 2 +- 18 files changed, 230 insertions(+), 16 deletions(-) delete mode 100644 Documentation/Tutorials/CoreConcepts/tutorial_CoreConcepts.md create mode 100644 Documentation/Tutorials/GeneralConcepts/snippet_algorithms_and_lambda_functions_parallel_for.cpp create mode 100644 Documentation/Tutorials/GeneralConcepts/snippet_algorithms_and_lambda_functions_reduction.cpp create mode 100644 Documentation/Tutorials/GeneralConcepts/snippet_algorithms_and_lambda_functions_reduction_2.cpp create mode 100644 Documentation/Tutorials/GeneralConcepts/snippet_algorithms_and_lambda_functions_reduction_cublas.cpp create mode 100644 Documentation/Tutorials/GeneralConcepts/snippet_devices_and_allocators_arrays_assignment_example.cpp create mode 100644 Documentation/Tutorials/GeneralConcepts/snippet_devices_and_allocators_arrays_device_check.cpp create mode 100644 Documentation/Tutorials/GeneralConcepts/snippet_devices_and_allocators_arrays_device_deduction.cpp create mode 100644 
Documentation/Tutorials/GeneralConcepts/snippet_devices_and_allocators_arrays_example.cpp create mode 100644 Documentation/Tutorials/GeneralConcepts/snippet_devices_and_allocators_arrays_setsize_example.cpp create mode 100644 Documentation/Tutorials/GeneralConcepts/snippet_shared_pointers_and_views_capture_reference.cpp create mode 100644 Documentation/Tutorials/GeneralConcepts/snippet_shared_pointers_and_views_capture_value.cpp create mode 100644 Documentation/Tutorials/GeneralConcepts/snippet_shared_pointers_and_views_capture_view.cpp create mode 100644 Documentation/Tutorials/GeneralConcepts/snippet_shared_pointers_and_views_capture_view_change.cpp create mode 100644 Documentation/Tutorials/GeneralConcepts/snippet_shared_pointers_and_views_capture_view_change_2.cpp create mode 100644 Documentation/Tutorials/GeneralConcepts/tutorial_GeneralConcepts.md diff --git a/Documentation/Tutorials/CMakeLists.txt b/Documentation/Tutorials/CMakeLists.txt index 98734f50c..53ce4df62 100644 --- a/Documentation/Tutorials/CMakeLists.txt +++ b/Documentation/Tutorials/CMakeLists.txt @@ -1,3 +1,4 @@ +add_subdirectory( GeneralConcepts ) add_subdirectory( Arrays ) add_subdirectory( Vectors ) add_subdirectory( ReductionAndScan ) diff --git a/Documentation/Tutorials/CoreConcepts/tutorial_CoreConcepts.md b/Documentation/Tutorials/CoreConcepts/tutorial_CoreConcepts.md deleted file mode 100644 index 61082057e..000000000 --- a/Documentation/Tutorials/CoreConcepts/tutorial_CoreConcepts.md +++ /dev/null @@ -1,15 +0,0 @@ -\page tutorial_CoreConcepts Core concepts - -## Introduction - - - - -## Table of Contents -1. [Devices and allocators](#devices_and_allocators) -2. [Algorithms and lambda functions](#algorithms_and_lambda_functions) -3. 
[Smart pointers and views](#static_for) - - -## Devices and allocators - diff --git a/Documentation/Tutorials/GeneralConcepts/snippet_algorithms_and_lambda_functions_parallel_for.cpp b/Documentation/Tutorials/GeneralConcepts/snippet_algorithms_and_lambda_functions_parallel_for.cpp new file mode 100644 index 000000000..5477541cb --- /dev/null +++ b/Documentation/Tutorials/GeneralConcepts/snippet_algorithms_and_lambda_functions_parallel_for.cpp @@ -0,0 +1,8 @@ +template< typename Device > +void vectorAddition( double* v1, double* v2, double* sum, const int size ) +{ + auto sum_lambda = [=] __cuda_callable__ ( int i ) mutable { + sum[ i ] = v1[ i ] + v2[ i ]; + } + TNL::Algorithms::ParalellFor< Device >::exec( 0, size, sum_lambda ); +} \ No newline at end of file diff --git a/Documentation/Tutorials/GeneralConcepts/snippet_algorithms_and_lambda_functions_reduction.cpp b/Documentation/Tutorials/GeneralConcepts/snippet_algorithms_and_lambda_functions_reduction.cpp new file mode 100644 index 000000000..85ba93408 --- /dev/null +++ b/Documentation/Tutorials/GeneralConcepts/snippet_algorithms_and_lambda_functions_reduction.cpp @@ -0,0 +1,10 @@ +template< typename Device > +void scalarProduct( double* v1, double* v2, double* product, const int size ) +{ + auto fetch = [=] __cuda_callable__ ( int i ) -> double { + return = v1[ i ] * v2[ i ]; + } + auto reduce = [] __cuda_callable__ ( const double& a, const double& b ) { + return a + b; }; + TNL::Algorithms::Reduction< Device >::reduce( 0, size, reduce, fetch, 0.0 ); +} \ No newline at end of file diff --git a/Documentation/Tutorials/GeneralConcepts/snippet_algorithms_and_lambda_functions_reduction_2.cpp b/Documentation/Tutorials/GeneralConcepts/snippet_algorithms_and_lambda_functions_reduction_2.cpp new file mode 100644 index 000000000..deeb49dd5 --- /dev/null +++ b/Documentation/Tutorials/GeneralConcepts/snippet_algorithms_and_lambda_functions_reduction_2.cpp @@ -0,0 +1,12 @@ +template< typename Device > +void scalarProduct( 
double* u1, double* u2, + double* v1, double* v2, + double* product, const int size ) +{ + auto fetch = [=] __cuda_callable__ ( int i ) -> double { + return = ( u1[ i ] + u2[ i ] ) * ( v1[ i ] + v2[ i ] ); + } + auto reduce = [] __cuda_callable__ ( const double& a, const double& b ) { + return a + b; }; + TNL::Algorithms::Reduction< Device >::reduce( 0, size, reduce, fetch, 0.0 ); +} \ No newline at end of file diff --git a/Documentation/Tutorials/GeneralConcepts/snippet_algorithms_and_lambda_functions_reduction_cublas.cpp b/Documentation/Tutorials/GeneralConcepts/snippet_algorithms_and_lambda_functions_reduction_cublas.cpp new file mode 100644 index 000000000..ccb0329b9 --- /dev/null +++ b/Documentation/Tutorials/GeneralConcepts/snippet_algorithms_and_lambda_functions_reduction_cublas.cpp @@ -0,0 +1,9 @@ +void scalarProduct( double* u1, double* u2, + double* v1, double* v2, + double* product, const int size ) +{ + cublasHandle_t handle; + cublasSaxpy( handle, size, 1.0, u1, 1, u2, 1 ); + cublasSaxpy( handle, size, 1.0, v1, 1, v2, 1 ); + cublasSdot ( handle, size, 1.0, u1, 1, v1, 1, &product ); +} \ No newline at end of file diff --git a/Documentation/Tutorials/GeneralConcepts/snippet_devices_and_allocators_arrays_assignment_example.cpp b/Documentation/Tutorials/GeneralConcepts/snippet_devices_and_allocators_arrays_assignment_example.cpp new file mode 100644 index 000000000..3ee8150d4 --- /dev/null +++ b/Documentation/Tutorials/GeneralConcepts/snippet_devices_and_allocators_arrays_assignment_example.cpp @@ -0,0 +1 @@ +cuda_array = host_aray; \ No newline at end of file diff --git a/Documentation/Tutorials/GeneralConcepts/snippet_devices_and_allocators_arrays_device_check.cpp b/Documentation/Tutorials/GeneralConcepts/snippet_devices_and_allocators_arrays_device_check.cpp new file mode 100644 index 000000000..b927dc0dd --- /dev/null +++ b/Documentation/Tutorials/GeneralConcepts/snippet_devices_and_allocators_arrays_device_check.cpp @@ -0,0 +1,9 @@ +template< typename 
Array > +void checkDevice +{ + using Device = typename Array::DeviceType; + if( std::is_same< Device, TNL::Device::Host >::value ) + std::cout << "Device is host CPU." << std::endl; + if( std::is_same< Device, TNL::Device::Cuda >::value ) + std::cout << "Device is CUDA GPU." << std::endl; +} \ No newline at end of file diff --git a/Documentation/Tutorials/GeneralConcepts/snippet_devices_and_allocators_arrays_device_deduction.cpp b/Documentation/Tutorials/GeneralConcepts/snippet_devices_and_allocators_arrays_device_deduction.cpp new file mode 100644 index 000000000..f3cad42fe --- /dev/null +++ b/Documentation/Tutorials/GeneralConcepts/snippet_devices_and_allocators_arrays_device_deduction.cpp @@ -0,0 +1,6 @@ +template< typename Array > +void deduceDevice +{ + using Device = typename Array::DeviceType; + TNL::Container::Array< int, Device > array; +} \ No newline at end of file diff --git a/Documentation/Tutorials/GeneralConcepts/snippet_devices_and_allocators_arrays_example.cpp b/Documentation/Tutorials/GeneralConcepts/snippet_devices_and_allocators_arrays_example.cpp new file mode 100644 index 000000000..2fcb006f5 --- /dev/null +++ b/Documentation/Tutorials/GeneralConcepts/snippet_devices_and_allocators_arrays_example.cpp @@ -0,0 +1,2 @@ +TNL::Containers::Array< int, TNL::Devices::Host > host_array; +TNL::Containers::Array< int, TNL::Devices::Cuda > cuda_array; \ No newline at end of file diff --git a/Documentation/Tutorials/GeneralConcepts/snippet_devices_and_allocators_arrays_setsize_example.cpp b/Documentation/Tutorials/GeneralConcepts/snippet_devices_and_allocators_arrays_setsize_example.cpp new file mode 100644 index 000000000..8369de9f3 --- /dev/null +++ b/Documentation/Tutorials/GeneralConcepts/snippet_devices_and_allocators_arrays_setsize_example.cpp @@ -0,0 +1,2 @@ +host_array.setSize( 10 ); +cuda_array.setSize( 10 ); \ No newline at end of file diff --git a/Documentation/Tutorials/GeneralConcepts/snippet_shared_pointers_and_views_capture_reference.cpp 
b/Documentation/Tutorials/GeneralConcepts/snippet_shared_pointers_and_views_capture_reference.cpp new file mode 100644 index 000000000..8eeb657a4 --- /dev/null +++ b/Documentation/Tutorials/GeneralConcepts/snippet_shared_pointers_and_views_capture_reference.cpp @@ -0,0 +1,11 @@ +template< typename Device > +void lambda_capture_by_value( int size ) +{ + TNL::Containers::Array< int, Device > a( size ); + auto f = [&a] __cuda_callable__ ( int i ) mutable { + a[ i ] = 1; + }; + TNL::Algorithms::ParallelFor< Device >::exec( 0, size, f ); +} + + diff --git a/Documentation/Tutorials/GeneralConcepts/snippet_shared_pointers_and_views_capture_value.cpp b/Documentation/Tutorials/GeneralConcepts/snippet_shared_pointers_and_views_capture_value.cpp new file mode 100644 index 000000000..77bfccca2 --- /dev/null +++ b/Documentation/Tutorials/GeneralConcepts/snippet_shared_pointers_and_views_capture_value.cpp @@ -0,0 +1,11 @@ +template< typename Device > +void lambda_capture_by_value( int size ) +{ + TNL::Containers::Array< int, Device > a( size ); + auto f = [=] __cuda_callable__ ( int i ) mutable { + a[ i ] = 1; + }; + TNL::Algorithms::ParallelFor< Device >::exec( 0, size, f ); +} + + diff --git a/Documentation/Tutorials/GeneralConcepts/snippet_shared_pointers_and_views_capture_view.cpp b/Documentation/Tutorials/GeneralConcepts/snippet_shared_pointers_and_views_capture_view.cpp new file mode 100644 index 000000000..96d82fe0b --- /dev/null +++ b/Documentation/Tutorials/GeneralConcepts/snippet_shared_pointers_and_views_capture_view.cpp @@ -0,0 +1,10 @@ +template< typename Device > +void lambda_capture_by_value( int size ) +{ + TNL::Containers::Array< int, Device > a( size ); + auto view = a.getView(); + auto f = [=] __cuda_callable__ ( int i ) mutable { + view[ i ] = 1; + }; + TNL::Algorithms::ParallelFor< Device >::exec( 0, size, f ); +} \ No newline at end of file diff --git a/Documentation/Tutorials/GeneralConcepts/snippet_shared_pointers_and_views_capture_view_change.cpp 
b/Documentation/Tutorials/GeneralConcepts/snippet_shared_pointers_and_views_capture_view_change.cpp new file mode 100644 index 000000000..4d994df5a --- /dev/null +++ b/Documentation/Tutorials/GeneralConcepts/snippet_shared_pointers_and_views_capture_view_change.cpp @@ -0,0 +1,11 @@ +template< typename Device > +void lambda_capture_by_value( int size ) +{ + TNL::Containers::Array< int, Device > a( size ); + auto view = a.getView(); + a.setSize( 2 * size ); + auto f = [=] __cuda_callable__ ( int i ) mutable { + view[ i ] = 1; + }; + TNL::Algorithms::ParallelFor< Device >::exec( 0, size, f ); +} \ No newline at end of file diff --git a/Documentation/Tutorials/GeneralConcepts/snippet_shared_pointers_and_views_capture_view_change_2.cpp b/Documentation/Tutorials/GeneralConcepts/snippet_shared_pointers_and_views_capture_view_change_2.cpp new file mode 100644 index 000000000..e6699e9f9 --- /dev/null +++ b/Documentation/Tutorials/GeneralConcepts/snippet_shared_pointers_and_views_capture_view_change_2.cpp @@ -0,0 +1,11 @@ +template< typename Device > +void lambda_capture_by_value( int size ) +{ + TNL::Containers::Array< int, Device > a( size ); + auto view = a.getView(); + a.setElement( 0, 1 ); + auto f = [=] __cuda_callable__ ( int i ) mutable { + view[ i ] = 1; + }; + TNL::Algorithms::ParallelFor< Device >::exec( 0, size, f ); +} \ No newline at end of file diff --git a/Documentation/Tutorials/GeneralConcepts/tutorial_GeneralConcepts.md b/Documentation/Tutorials/GeneralConcepts/tutorial_GeneralConcepts.md new file mode 100644 index 000000000..8cf709fe1 --- /dev/null +++ b/Documentation/Tutorials/GeneralConcepts/tutorial_GeneralConcepts.md @@ -0,0 +1,115 @@ +\page tutorial_GeneralConcepts General concepts + +## Table of Contents +- [Table of Contents](#table-of-contents) +- [Introduction](#introduction) +- [Devices and allocators](#devices-and-allocators) +- [Algorithms and lambda functions](#algorithms-and-lambda-functions) +- [Shared pointers and 
views](#shared-pointers-and-views) + - [Data structures views](#data-structures-views) + - [Shared pointers](#shared-pointers) + +## Introduction + +In this part we describe some general and core concepts of programming with TNL. Understanding these ideas may significantly help to understand the design of TNL algorithms and data structures and it also helps to use TNL more efficiently. The main goal of TNL is to allow developing high performance algorithms that could run on multicore CPUs and GPUs. TNL offers unified interface and so the developer writes one code for both architectures. + +## Devices and allocators + +TNL offers unified interface for both CPUs (also referred to as a host system) and GPUs (also referred to as a device). Connection between CPU and GPU is usually represented by [PCI-Express bus](https://en.wikipedia.org/wiki/PCI_Express) which is orders of magnitude slower compared to speed of the global memory of GPU. Therefore, the communication between CPU and GPU must be reduced as much as possible. As a result, the programmer operates with two different address spaces, one for CPU and one for GPU. To distinguish between the address spaces, each data structure requiring dynamic allocation of memory needs to know on what device it resides. This is done by a template parameter `Device`. For example the following code creates two arrays, one on CPU and the other on GPU + +\includelineno snippet_devices_and_allocators_arrays_example.cpp + +From now on, [C++ template specialization](https://en.wikipedia.org/wiki/Partial_template_specialization) takes care of using the right methods for given device (note, that in this meaning device can be even CPU). For example, calling a method `setSize` + +\includelineno snippet_devices_and_allocators_arrays_setsize_example.cpp + +results in different memory allocation on CPU (for `host_array`) and on GPU (for `cuda_array`).
The same holds for assignment + +\includelineno snippet_devices_and_allocators_arrays_assignment_example.cpp + +in which case appropriate data transfer from CPU to GPU is performed. Each such data structure contains an inner type named `DeviceType` which tells where it resides as we can see here: + +\includelineno snippet_devices_and_allocators_arrays_device_deduction.cpp + +If we need to specialize some parts of algorithm with respect to its device we can do it by means of \ref std::is_same : + +\includelineno snippet_devices_and_allocators_arrays_device_check.cpp + +TODO: Allocators + +## Algorithms and lambda functions + +Developing a code for GPUs (in [CUDA](https://developer.nvidia.com/CUDA-zone) for example) consists mainly of writing [kernels](https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#kernels) which are special functions running on GPU in parallel. This can be very hard and tedious work especially when it comes to debugging. [Parallel reduction](https://developer.download.nvidia.com/assets/cuda/files/reduction.pdf) is a perfect example of an algorithm which is relatively hard to understand and implement on one hand but is frequently needed. Writing tens of lines of code every time we need to sum up some data is exactly what we mean by tedious programming. TNL offers skeletons of such algorithms and combines them with user defined [lambda functions](https://en.cppreference.com/w/cpp/language/lambda). This approach is not absolutely general which means that you can use it only in situations when there is a "skeleton" (see \ref TNL::Algorithms) suitable for your problem. But when there is, it offers several advantages: + +1. Implementing lambda functions is much easier compared to implementing GPU kernels. +2. All such algorithms work even on CPU, so the developer writes only one code for both hardware architectures. +3. The developer may debug the code on CPU first and then just run it on GPU.
Quite likely it will work with only little or no changes. + +The following code snippet demonstrates it on the use of \ref TNL::Algorithms::ParallelFor: + +\includelineno snippet_algorithms_and_lambda_functions_parallel_for.cpp + +In this example, we assume that all arrays `v1`, `v2` and `sum` were properly allocated on given `Device`. If `Device` equals \ref TNL::Devices::Host, the lambda function is processed sequentially or in parallel by several OpenMP threads on CPU. If `Device` equals \ref TNL::Devices::Cuda, the lambda function is called from a CUDA kernel (this is why it is defined as `__cuda_callable__` which is just a substitute for `__host__ __device__` ) by an appropriate number of CUDA threads. One more example demonstrates use of \ref TNL::Algorithms::Reduction . + +\includelineno snippet_algorithms_and_lambda_functions_reduction.cpp + +We will not explain the parallel reduction in TNL at this moment (see the section about [flexible parallel reduction](tutorial_ReductionAndScan.html#flexible_parallel_reduction) ), we hope that the idea is more or less clear from the code snippet. If `Device` equals \ref TNL::Devices::Host, the scalar product is evaluated sequentially or in parallel by several OpenMP threads on CPU, if `Device` equals \ref TNL::Devices::Cuda, the [parallel reduction](https://developer.download.nvidia.com/assets/cuda/files/reduction.pdf) fine-tuned with the lambda functions is performed. Fortunately, there is no performance drop, on the contrary, since it is easy to generate CUDA kernels for particular situations, we may get more efficient code. Consider computing the scalar product of sums of vectors like this
Now we get completely new CUDA kernel tailored exactly for our problem. Doing the same with [Cublas](https://developer.nvidia.com/cublas), for example, we would have to split into three separate kernels: + +1. Kernel to compute \f$u_1 = u_1 + u_2\f$. +2. Kernel to compute \f$v_1 = v_1 + v_2\f$. +3. Kernel to compute \f$product = ( u_1, v_1 )\f$. + +This could be achived with the following code: + +\includelineno snippet_algorithms_and_lambda_functions_reduction_cublas.cpp + +We believe that C++ lamnda functions with properly designed patterns of parallel algorithms could make programming of GPUs significantly easier. We see a parallel with [MPI standard](https://en.wikipedia.org/wiki/Message_Passing_Interface) which in nineties defined frequent communication operations in distributed parallel computing. It made programming of distributed systems much easier and at the same time MPI helps to write efficient programs. We aim to add additional skeletons or patterns to \ref TNL::Algorithms. + +## Shared pointers and views + +You might notice that in the previous section we used only C style arrays represented by pointers in the lambda functions. There is a difficulty when we want to access TNL arrays or other data structures inside the lambda functions. We may capture the outside varibles either by value or reference. The first case would as follows: + +\includelineno snippet_shared_pointers_and_views_capture_value.cpp + +In this case a deep copy of array `a` will be made and so there will be no effect of what we do with the lambda function. Capturing by a reference may look as follows: + +\includelineno snippet_shared_pointers_and_views_capture_reference.cpp + +This would be correct on CPU (i.e. when `Device` is \ref TNL::Devices::Host). However, we are not allowed to pass references to CUDA kernels and so this source code would not even compile with CUDA compiler. To overcome this issue, TNL offers two solutions: + +1. Data structures views +2. 
Shared pointers + +### Data structures views + +View is a kind of lightweight reference object which makes only a shallow copy of itself in copy constructor. Therefore view can by captured by value but beacause it is a reference to another object, everything we do with the view will affect the original object. The example with the array would look as follows: + +\includelineno snippet_shared_pointers_and_views_capture_view.cpp + +The differences are on the line 5 where we fetch the view by means of method `getView` and on the line 7 where we work with the `view` and not with the array `a`. The view has very simmilar interface (see \ref TNL::Containers::ArrayView) as the array (\ref TNL::Containers::Array) and so mostly there is no differnce in using array and its view for the programmer. In TNL, each data structure which can be accesed from GPU kernels (it means that it has methods defined as `__cuda_callable__`) provides also a method `getView` for getting appropriate view of the object. + +Views are simple objects because they must be transferred to GPU in each kernel call. So there are no smart links between a view and an original object. Therefore if the original object get changed, all views obtained from the object before may become invalid. See the following example: + +\includelineno snippet_shared_pointers_and_views_capture_view_change.cpp + +Such code would not work because after obtaining the view on the line 5, we change the size ot he array `a` which will cause data reallocation. In fact, the array view contains just a pointer the the data managed by the array and the size of the array. There is no pointer from the view to the array and so the view has no chance to check if it is still synchronized with the original object. However, if you fetch all necessary views immediately before capturing by a lambda fuction, this is not an issue. And this is why **the views are recommended for accesing TNL data structures in lamda functios or GPU kernels**. 
+ +Note, that changing the data managed by the array after fetching the view is not an issue. See the following example: + +\includelineno snippet_shared_pointers_and_views_capture_view_change_2.cpp + +On the line 6, we change value of the first element. This causes no data reallocation or change of size and so the view fetched on the line 5 is still valid. + +### Shared pointers + +TNL offers smart pointers working across different devices (meaning CPU or GPU). \ No newline at end of file diff --git a/Documentation/Tutorials/index.md b/Documentation/Tutorials/index.md index 2addba243..56a51cc22 100644 --- a/Documentation/Tutorials/index.md +++ b/Documentation/Tutorials/index.md @@ -3,7 +3,7 @@ ## Tutorials 1. [Building applications with TNL](tutorial_building_applications_with_tnl.html) -2. [Core concepts](tutorial_CoreConcepts.html) +2. [General concepts](tutorial_GeneralConcepts.html) 3. [Arrays](tutorial_Arrays.html) 4. [Vectors](tutorial_Vectors.html) 5. [Flexible parallel reduction and scan](tutorial_ReductionAndScan.html) -- GitLab From dffa1fcc8b1c95e5930772ad4ed1466bbb7f0938 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= Date: Wed, 13 Jan 2021 13:02:28 +0100 Subject: [PATCH 35/53] Correcrtions of the general concepts documentation. 
--- ...ces_and_allocators_arrays_device_test.cpp} | 2 +- .../tutorial_GeneralConcepts.md | 42 +++++++++---------- 2 files changed, 22 insertions(+), 22 deletions(-) rename Documentation/Tutorials/GeneralConcepts/{snippet_devices_and_allocators_arrays_device_check.cpp => snippet_devices_and_allocators_arrays_device_test.cpp} (94%) diff --git a/Documentation/Tutorials/GeneralConcepts/snippet_devices_and_allocators_arrays_device_check.cpp b/Documentation/Tutorials/GeneralConcepts/snippet_devices_and_allocators_arrays_device_test.cpp similarity index 94% rename from Documentation/Tutorials/GeneralConcepts/snippet_devices_and_allocators_arrays_device_check.cpp rename to Documentation/Tutorials/GeneralConcepts/snippet_devices_and_allocators_arrays_device_test.cpp index b927dc0dd..3f6d21721 100644 --- a/Documentation/Tutorials/GeneralConcepts/snippet_devices_and_allocators_arrays_device_check.cpp +++ b/Documentation/Tutorials/GeneralConcepts/snippet_devices_and_allocators_arrays_device_test.cpp @@ -1,5 +1,5 @@ template< typename Array > -void checkDevice +void testDevice { using Device = typename Array::DeviceType; if( std::is_same< Device, TNL::Device::Host >::value ) diff --git a/Documentation/Tutorials/GeneralConcepts/tutorial_GeneralConcepts.md b/Documentation/Tutorials/GeneralConcepts/tutorial_GeneralConcepts.md index 8cf709fe1..8c80c0df3 100644 --- a/Documentation/Tutorials/GeneralConcepts/tutorial_GeneralConcepts.md +++ b/Documentation/Tutorials/GeneralConcepts/tutorial_GeneralConcepts.md @@ -11,15 +11,15 @@ ## Introduction -In this part we desribe some general and core concepts of programming with TNL. Understaniding these ideas may significantly help to understand the design of TNL algortihms and data structure and it also helps ti use TNL more efficiently. The main goal of TNL is to allow developing high performance algorithms that could run on multicore CPUs and GPUs. TNL offers unified interface and so the developer writes one code for both architecures. 
+In this part we describe some general and core concepts of programming with TNL. Understanding these ideas may significantly help to understand the design of TNL algorithms and data structure and it also helps to use TNL more efficiently. The main goal of TNL is to allow developing high performance algorithms that could run on multicore CPUs and GPUs. TNL offers unified interface and so the developer writes one code for both architectures. ## Devices and allocators -TNL offers unified interface for both CPUs (also referred as a host system) and GPUs (also refered as device). Connection between CPU and GPU is usualy represented by [PCI-Express bus](https://en.wikipedia.org/wiki/PCI_Express) which is orders of magnitude slower compared to speed of the global memory of GPU. Therefore, the communication between CPU and GPU must be reduced as much as possible. As a result, the programmer operates with two different adress spaces, one for CPU and one for GPU. To distenguish between the adress spaces, each data structure requiring dynamic allocation of memory needs to now on what device it resides. This is done by a template parameter `Device`. +TNL offers unified interface for both CPUs (also referred as a host system) and GPUs (referred as device). Connection between CPU and GPU is usually represented by [PCI-Express bus](https://en.wikipedia.org/wiki/PCI_Express) which is orders of magnitude slower compared to speed of the global memory of GPU. Therefore, the communication between CPU and GPU must be reduced as much as possible. As a result, the programmer operates with two different address spaces, one for CPU and one for GPU. To distinguish between the address spaces, each data structure requiring dynamic allocation of memory needs to know on what device it resides. This is done by a template parameter `Device`. 
For example the following code creates two arrays, one on CPU and the other on GPU \includelineno snippet_devices_and_allocators_arrays_example.cpp -Since now, [C++ template sepcialization](https://en.wikipedia.org/wiki/Partial_template_specialization) takes care of using the right methods for given device (note, that in this meaning device can be even CPU). For examaple, calling a method `setSize` +Since now, [C++ template specialization](https://en.wikipedia.org/wiki/Partial_template_specialization) takes care of using the right methods for given device (in meaning hardware architecture and so the device can be even CPU). For example, calling a method `setSize` \includelineno snippet_devices_and_allocators_arrays_setsize_example.cpp @@ -27,7 +27,7 @@ results in different memory allocation on CPU (for `host_array`) and on GPU (for \includelineno snippet_devices_and_allocators_arrays_assignment_example.cpp -in which case apropriate data transfer from CPU to GPU is performed. Each such data structure contains inner type named `DeviceType` which tells where it resides as we can see here: +in which case appropriate data transfer from CPU to GPU is performed. Each such data structure contains inner type named `DeviceType` which tells where it resides as we can see here: \includelineno snippet_devices_and_allocators_arrays_device_deduction.cpp @@ -39,21 +39,21 @@ TODO: Allocators ## Algorithms and lambda functions -Developing a code for GPUs (in [CUDA](https://developer.nvidia.com/CUDA-zone) for example) consists mainly of writting [kernels](https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#kernels) which are special functioins running on GPU in parallel. This can be very hard and tedious work especially when it comes to debugging. [Parallel reduction](https://developer.download.nvidia.com/assets/cuda/files/reduction.pdf) is a perfect example of an algorithm which relatively hard to understand and implement on one hand but it necessary frequently. 
Writting tens of lines of code everytime we need to sum up some data is exactly what we mean by tedious programming. TNL offers skeletons of such algorithms and combines them with user defined [lambda functions](https://en.cppreference.com/w/cpp/language/lambda). This approach is not absolutely general which means that you can use it only in situation when there is a "skeleton" (see \ref TNL::Algorithms) suitable for your problem. But when there is, it offers several adventages: +Developing a code for GPUs (in [CUDA](https://developer.nvidia.com/CUDA-zone) for example) consists mainly of writing [kernels](https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#kernels) which are special functions running on GPU in parallel. This can be very hard and tedious work especially when it comes to debugging. [Parallel reduction](https://developer.download.nvidia.com/assets/cuda/files/reduction.pdf) is a perfect example of an algorithm which is relatively hard to understand and implement on one hand but it is necessary to use frequently. Writing tens of lines of code every time we need to sum up some data is exactly what we mean by tedious programming. TNL offers skeletons or patterns of such algorithms and combines them with user defined [lambda functions](https://en.cppreference.com/w/cpp/language/lambda). This approach is not absolutely general, which means that you can use it only in situation when there is a skeleton/pattern (see \ref TNL::Algorithms) suitable for your problem. But when there is, it offers several advantages: 1. Implementing lambda functions is much easier compared to implementing GPU kernels. -2. All such algorithms works even on CPU, so the developer write only one code for both hadrware architectures. +2. Code implemented this way works even on CPU, so the developer writes only one code for both hardware architectures. 3. The developer may debug the code on CPU first and then just run it on GPU. 
Quite likely it will work with only a little or no changes. -The follwing code snippet demonstrates it on use of \ref TNL::Algorithms::ParallelFor: +The following code snippet demonstrates it on use of \ref TNL::Algorithms::ParallelFor: \includelineno snippet_algorithms_and_lambda_functions_parallel_for.cpp -In this example, we assume that all arrays `v1`, `v2` and `sum` were properly allocated on given `Device`. If `Device` equals \ref TNL::Devices::Host, the lambda function is processed sequentialy or in parallel by several OpenMP threads on CPU. If `Device` equals \ref TNL::Devices::Cuda, the lambda function is called from CUDA kernel (this is why it is defined as `__cuda_callable__` which is just a substitute for `__host__ __device__` ) by apropriate number of CUDA threads. One more example demonstrates use of \ref TNL::Algorithms::Reduction . +In this example, we assume that all arrays `v1`, `v2` and `sum` were properly allocated on given `Device`. If `Device` equals \ref TNL::Devices::Host , the lambda function is processed sequentially or in parallel by several OpenMP threads on CPU. If `Device` equals \ref TNL::Devices::Cuda , the lambda function is called from CUDA kernel (this is why it is defined as `__cuda_callable__` which is just a substitute for `__host__ __device__` ) by appropriate number of CUDA threads. One more example demonstrates use of \ref TNL::Algorithms::Reduction . \includelineno snippet_algorithms_and_lambda_functions_reduction.cpp -We will not explain the parallel reduction in TNL at this moment (see the section about [flexible parallel reduction](tutorial_ReductionAndScan.html#flexible_parallel_reduction) ), we hope that the idea is more or less clear from the code snippet. 
If `Device` equals to \ref TNL::Device::Host, the scalar product is evaluated sequentialy or in parallel by several OpenMP threads on CPU, if `Device` equals \ref TNL::Algorithms::Cuda, the [parallel reduction](https://developer.download.nvidia.com/assets/cuda/files/reduction.pdf) is finetuned with the lambda functions is performed. Fortunately, there is no performance drop, on the contrary, since it is easy to generate CUDA kernels for particular situations, we may get more efficient code. Consider computing scalar product of sum of vectors like this +We will not explain the parallel reduction in TNL at this moment (see the section about [flexible parallel reduction](tutorial_ReductionAndScan.html#flexible_parallel_reduction) ), we hope that the idea is more or less clear from the code snippet. If `Device` equals to \ref TNL::Devices::Host , the scalar product is evaluated sequentially or in parallel by several OpenMP threads on CPU, if `Device` equals \ref TNL::Devices::Cuda, the [parallel reduction](https://developer.download.nvidia.com/assets/cuda/files/reduction.pdf) fine tuned with the lambda functions is performed. Fortunately, there is no performance drop. On the contrary, since it is easy to generate CUDA kernels for particular situations, we may get more efficient code. Consider computing a scalar product of sum of vectors like this \f[ s = (u_1 + u_2, v_1 + v_2). @@ -63,53 +63,53 @@ This can be solved by the following code \includelineno snippet_algorithms_and_lambda_functions_reduction_2.cpp -We have changed only the `fetch` lambda function to perform the sums of `u1[ i ] + u2[ i ]` and `v1[ i ] + v2[ i ]` (line 7). 
Now we get completely new CUDA kernel tailored exactly for our problem. Doing the same with [Cublas](https://developer.nvidia.com/cublas), for example, would require splitting into three separate kernels: 1. Kernel to compute \f$u_1 = u_1 + u_2\f$. 2. Kernel to compute \f$v_1 = v_1 + v_2\f$. 3. Kernel to compute \f$product = ( u_1, v_1 )\f$. -This could be achived with the following code: +This could be achieved with the following code: \includelineno snippet_algorithms_and_lambda_functions_reduction_cublas.cpp -We believe that C++ lamnda functions with properly designed patterns of parallel algorithms could make programming of GPUs significantly easier. We see a parallel with [MPI standard](https://en.wikipedia.org/wiki/Message_Passing_Interface) which in nineties defined frequent communication operations in distributed parallel computing. It made programming of distributed systems much easier and at the same time MPI helps to write efficient programs. We aim to add additional skeletons or patterns to \ref TNL::Algorithms. +We believe that C++ lambda functions with properly designed patterns of parallel algorithms could make programming of GPUs significantly easier. We see a parallel with [MPI standard](https://en.wikipedia.org/wiki/Message_Passing_Interface) which in nineties defined frequent communication operations in distributed parallel computing. It made programming of distributed systems much easier and at the same time MPI helps to write efficient programs. We aim to add additional skeletons or patterns to \ref TNL::Algorithms. ## Shared pointers and views -You might notice that in the previous section we used only C style arrays represented by pointers in the lambda functions. There is a difficulty when we want to access TNL arrays or other data structures inside the lambda functions. We may capture the outside varibles either by value or reference. 
The first case would as follows: +You might notice that in the previous section we used only C style arrays represented by pointers in the lambda functions. There is a difficulty when we want to access TNL arrays or other data structures inside the lambda functions. We may capture the outside variables either by a value or a reference. The first case would be as follows: \includelineno snippet_shared_pointers_and_views_capture_value.cpp -In this case a deep copy of array `a` will be made and so there will be no effect of what we do with the lambda function. Capturing by a reference may look as follows: +In this case a deep copy of array `a` will be made and so there will be no effect of what we do with the array `a` in the lambda function. Capturing by a reference may look as follows: \includelineno snippet_shared_pointers_and_views_capture_reference.cpp -This would be correct on CPU (i.e. when `Device` is \ref TNL::Devices::Host). However, we are not allowed to pass references to CUDA kernels and so this source code would not even compile with CUDA compiler. To overcome this issue, TNL offers two solutions: +This would be correct on CPU (i.e. when `Device` is \ref TNL::Devices::Host ). However, we are not allowed to pass references to CUDA kernels and so this source code would not even compile with CUDA compiler. To overcome this issue, TNL offers two solutions: 1. Data structures views 2. Shared pointers ### Data structures views -View is a kind of lightweight reference object which makes only a shallow copy of itself in copy constructor. Therefore view can by captured by value but beacause it is a reference to another object, everything we do with the view will affect the original object. The example with the array would look as follows: +View is a kind of lightweight reference object which makes only a shallow copy of itself in copy constructor. 
Therefore view can by captured by value, but because it is, in fact, a reference to another object, everything we do with the view will affect the original object. The example with the array would look as follows: \includelineno snippet_shared_pointers_and_views_capture_view.cpp -The differences are on the line 5 where we fetch the view by means of method `getView` and on the line 7 where we work with the `view` and not with the array `a`. The view has very simmilar interface (see \ref TNL::Containers::ArrayView) as the array (\ref TNL::Containers::Array) and so mostly there is no differnce in using array and its view for the programmer. In TNL, each data structure which can be accesed from GPU kernels (it means that it has methods defined as `__cuda_callable__`) provides also a method `getView` for getting appropriate view of the object. +The differences are on the line 5 where we fetch the view by means of method `getView` and on the line 7 where we work with the `view` and not with the array `a`. The view has very similar interface (see \ref TNL::Containers::ArrayView) as the array (\ref TNL::Containers::Array) and so mostly there is no difference in using array and its view for the programmer. In TNL, each data structure which can be accessed from GPU kernels (it means that it has methods defined as `__cuda_callable__`) provides also a method `getView` for getting appropriate view of the object. -Views are simple objects because they must be transferred to GPU in each kernel call. So there are no smart links between a view and an original object. Therefore if the original object get changed, all views obtained from the object before may become invalid. See the following example: +Views are simple objects because they must be transferred to GPU in each kernel call. So there are no smart links between a view and the original object. In fact, the array view contains just a pointer the the data managed by the array and the size of the array. 
Therefore if the original object get changed, all views obtained from the object before may become invalid. See the following example: \includelineno snippet_shared_pointers_and_views_capture_view_change.cpp -Such code would not work because after obtaining the view on the line 5, we change the size ot he array `a` which will cause data reallocation. In fact, the array view contains just a pointer the the data managed by the array and the size of the array. There is no pointer from the view to the array and so the view has no chance to check if it is still synchronized with the original object. However, if you fetch all necessary views immediately before capturing by a lambda fuction, this is not an issue. And this is why **the views are recommended for accesing TNL data structures in lamda functios or GPU kernels**. +Such code would not work because after obtaining the view on the line 5, we change the size of the array `a` which will cause data reallocation. As we mentioned, there is no pointer from the view to the array and so the view has no chance to check if it is still up-to-date with the original object. However, if you fetch all necessary views immediately before capturing by a lambda function, there will be no problem. And this is why **the views are recommended for accessing TNL data structures in lambda functions or GPU kernels**. Note, that changing the data managed by the array after fetching the view is not an issue. See the following example: \includelineno snippet_shared_pointers_and_views_capture_view_change_2.cpp -On the line 6, we change value of the first element. This causes no data reallocation or change of size and so the view fetched on the line 5 is still valid. +On the line 6, we change value of the first element. This causes no data reallocation or change of size and so the view fetched on the line 5 is still valid and up-to-date. ### Shared pointers -TNL offers smart pointers working across different devices (meaning CPU or GPU). 
\ No newline at end of file +TNL offers smart pointers working across different devices (meaning CPU or GPU). \ No newline at end of file -- GitLab From 03a5e6a2cbcc588f8ec2ec3c6de63d0d28a7da69 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= Date: Wed, 13 Jan 2021 13:18:45 +0100 Subject: [PATCH 36/53] Correction of table of contents in general concepts. --- .../Tutorials/GeneralConcepts/tutorial_GeneralConcepts.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/Documentation/Tutorials/GeneralConcepts/tutorial_GeneralConcepts.md b/Documentation/Tutorials/GeneralConcepts/tutorial_GeneralConcepts.md index 8c80c0df3..edab5bc8e 100644 --- a/Documentation/Tutorials/GeneralConcepts/tutorial_GeneralConcepts.md +++ b/Documentation/Tutorials/GeneralConcepts/tutorial_GeneralConcepts.md @@ -6,8 +6,8 @@ - [Devices and allocators](#devices-and-allocators) - [Algorithms and lambda functions](#algorithms-and-lambda-functions) - [Shared pointers and views](#shared-pointers-and-views) - - [Data structures views](#data-structures-views) - - [Shared pointers](#shared-pointers) + - [Data structures views](#data-structures-views) + - [Shared pointers](#shared-pointers) ## Introduction @@ -90,7 +90,7 @@ This would be correct on CPU (i.e. when `Device` is \ref TNL::Devices::Host ). H 1. Data structures views 2. Shared pointers -### Data structures views +### Data structures views View is a kind of lightweight reference object which makes only a shallow copy of itself in copy constructor. Therefore view can by captured by value, but because it is, in fact, a reference to another object, everything we do with the view will affect the original object. The example with the array would look as follows: @@ -110,6 +110,6 @@ Note, that changing the data managed by the array after fetching the view is not On the line 6, we change value of the first element. 
This causes no data reallocation or change of size and so the view fetched on the line 5 is still valid and up-to-date. -### Shared pointers +### Shared pointers TNL offers smart pointers working across different devices (meaning CPU or GPU). \ No newline at end of file -- GitLab From 7a579801accfbc73bdc386304c4912f7fb4b9e69 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= Date: Wed, 13 Jan 2021 13:19:23 +0100 Subject: [PATCH 37/53] Correcting table of contents in arrays tutorial. --- .../Tutorials/Arrays/tutorial_Arrays.md | 46 ++++++++++--------- 1 file changed, 24 insertions(+), 22 deletions(-) diff --git a/Documentation/Tutorials/Arrays/tutorial_Arrays.md b/Documentation/Tutorials/Arrays/tutorial_Arrays.md index 80e7c3816..b82caf0c3 100644 --- a/Documentation/Tutorials/Arrays/tutorial_Arrays.md +++ b/Documentation/Tutorials/Arrays/tutorial_Arrays.md @@ -1,22 +1,24 @@ \page tutorial_Arrays Arrays tutorial +## Table of Contents +- [Table of Contents](#table-of-contents) +- [Introduction](#introduction) +- [Arrays](#arrays) + - [Array views](#array-views) + - [Accessing the array elements](#accessing-the-array-elements) + - [Accessing the array elements with `operator[]`](#accessing-the-array-elements-with-operator) + - [Accessing the array elements with `setElement` and `getElement`](#accessing-the-array-elements-with-setelement-and-getelement) + - [Arrays initiation with lambdas](#arrays-initiation-with-lambdas) + - [Checking the array contents](#checking-the-array-contents) + - [IO operations with arrays](#io-operations-with-arrays) +- [Static arrays](#static-arrays) +- [Distributed arrays](#distributed-arrays) + ## Introduction This tutorial introduces arrays in TNL. There are three types - common arrays with dynamic allocation, static arrays allocated on stack and distributed arrays with dynamic allocation. Arrays are one of the most important structures for memory management. 
Methods implemented in arrays are particularly useful for GPU programming. From this point of view, the reader will learn how to easily allocate memory on GPU, transfer data between GPU and CPU but also, how to initialize data allocated on GPU. In addition, the resulting code is hardware platform independent, so it can be ran on CPU nad GPU without any changes. -## Table of Contents -1. [Arrays](#arrays) - 1. [Array views](#array_views) - 2. [Accessing the array elements](#accessing_the_array_elements) - 1. [Accessing the array elements with `operator[]`](#accessing_the_array_elements_with_operator) - 2. [Accessing the array elements with `setElement` and `getElement`](#accessing_the_array_elements_with_set_get_element) - 3. [Arrays initiation with lambdas](#arrays_initiation_with_lambdas) - 4. [Checking the array contents](#checking_the_array_contents) - 5. [IO operations with arrays](#io_operations_with-arrays) -2. [Static arrays](#static_arrays) -3. [Distributed arrays](#distributed_arrays) - -## Arrays +## Arrays Array is templated class defined in namespace `TNL::Containers` having three template parameters: @@ -33,7 +35,7 @@ The result looks as follows: \include ArrayAllocation.out -### Array views +### Array views Arrays cannot share data with each other or data allocated elsewhere. This can be achieved with the `ArrayView` structure which has similar semantics to `Array`, but it does not handle allocation and deallocation of the data. Hence, array view cannot be resized, but it can be used to wrap data allocated elsewhere (e.g. using an `Array` or an operator `new`) and to partition large arrays into subarrays. The process of wrapping external data with a view is called _binding_. @@ -55,11 +57,11 @@ Output: Since array views do not allocate or deallocate memory, they can be created even in CUDA kernels, which is not possible with `Array`. 
`ArrayView` can also be passed-by-value into CUDA kernels or captured-by-value by device lambda functions, because the `ArrayView`'s copy-constructor makes only a shallow copy (i.e., it copies only the data pointer and size). -### Accessing the array elements +### Accessing the array elements There are two ways how to work with the array (or array view) elements - using the indexing operator (`operator[]`) which is more efficient or using methods `setElement` and `getElement` which is more flexible. -#### Accessing the array elements with `operator[]` +#### Accessing the array elements with `operator[]` Indexing operator `operator[]` is implemented in both `Array` and `ArrayView` and it is defined as `__cuda_callable__`. It means that it can be called even in CUDA kernels if the data is allocated on GPU, i.e. the `Device` parameter is `Devices::Cuda`. This operator returns a reference to given array element and so it is very efficient. However, calling this operator from host for data allocated on device (or vice versa) leads to segmentation fault (on the host system) or broken state of the device. It means: @@ -76,7 +78,7 @@ Output: In general in TNL, each method defined as `__cuda_callable__` can be called from the CUDA kernels. The method `ArrayView::getSize` is another example. We also would like to point the reader to better ways of arrays initiation for example with method `ArrayView::evaluate` or with `ParallelFor`. -#### Accessing the array element with `setElement` and `getElement` +#### Accessing the array elements with `setElement` and `getElement` On the other hand, the methods `setElement` and `getElement` can be called from the host **no matter where the array is allocated**. In addition they can be called from kernels on device where the array is allocated. `getElement` returns copy of an element rather than a reference. Therefore it is slightly slower. 
If the array is on GPU and the methods are called from the host, the array element is copied from the device on the host (or vice versa) which is significantly slower. In the parts of code where the performance matters, these methods shall not be called from the host when the array is allocated on the device. In this way, their use is, however, easier compared to `operator[]` and they allow to write one simple code for both CPU and GPU. Both methods are good candidates for: @@ -92,7 +94,7 @@ Output: \include ElementsAccessing-2.out -### Arrays initiation with lambdas +### Arrays initiation with lambdas More efficient and still quite simple method for the arrays initiation is with the use of C++ lambda functions and method `evaluate`. This method is implemented in `ArrayView` only. As an argument a lambda function is passed which is then evaluated for all elements. Optionally one may define only subinterval of element indexes where the lambda shall be evaluated. If the underlying array is allocated on GPU, the lambda function is called from CUDA kernel. This is why it is more efficient than use of `setElement`. On the other hand, one must be careful to use only `__cuda_callable__` methods inside the lambda. The use of the method `evaluate` demonstrates the following example. @@ -102,7 +104,7 @@ Output: \include ArrayViewEvaluate.out -### Checking the array contents +### Checking the array contents Methods `containsValue` and `containsOnlyValue` serve for testing the contents of the arrays. `containsValue` returns `true` of there is at least one element in the array with given value. `containsOnlyValue` returns `true` only if all elements of the array equal given value. The test can be restricted to subinterval of array elements. Both methods are implemented in `Array` as well as in `ArrayView`. See the following code snippet for example of use. 
@@ -112,7 +114,7 @@ Output: \include ContainsValue.out -### IO operations with arrays +### IO operations with arrays Methods `save` and `load` serve for storing/restoring the array to/from a file in a binary form. In case of `Array`, loading of an array from a file causes data reallocation. `ArrayView` cannot do reallocation, therefore the data loaded from a file is copied to the memory managed by the `ArrayView`. The number of elements managed by the array view and those loaded from the file must be equal. See the following example. @@ -122,7 +124,7 @@ Output: \include ArrayIO.out -## Static arrays +## Static arrays Static arrays are allocated on stack and thus they can be created even in CUDA kernels. Their size is fixed and it is given by a template parameter. Static array is a templated class defined in namespace `TNL::Containers` having two template parameters: @@ -137,4 +139,4 @@ The output looks as: \include StaticArrayExample.out -## Distributed arrays +## Distributed arrays -- GitLab From 8c77683ebb8a70efd5a13b2c34f9a44b822b91a7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= Date: Wed, 13 Jan 2021 16:43:16 +0100 Subject: [PATCH 38/53] Added benchmark results to tutorial on matrices. --- .../Tutorials/Matrices/tutorial_Matrices.md | 133 ++++++++++++------ 1 file changed, 92 insertions(+), 41 deletions(-) diff --git a/Documentation/Tutorials/Matrices/tutorial_Matrices.md b/Documentation/Tutorials/Matrices/tutorial_Matrices.md index 1e39fca89..81a2702cd 100644 --- a/Documentation/Tutorials/Matrices/tutorial_Matrices.md +++ b/Documentation/Tutorials/Matrices/tutorial_Matrices.md @@ -184,65 +184,116 @@ The following table shows pros and cons of particular mathods: | | | Allows accessing only data allocated on the same device/memory space. | | | | Use of matrix local indexes is less intuitive. 
| -Though it may seem that the later methods come with more cons than pros they offer much higher performance and we believe they even them are still very user friendly. On the other hand, if the matrix setup performance is not a priority the use the simple but slow method can still be a good choice. The following tables demonstrate the performance of different methods. The tests were performed on CPU Intel Xeon CPU E5-2640 and GPU GeForce RTX 2070 in single precision. +Though it may seem that the later methods come with more cons than pros they offer much higher performance and we believe they even them are still very user friendly. On the other hand, if the matrix setup performance is not a priority the use the simple but slow method can still be a good choice. The following tables demonstrate the performance of different methods. The tests were performed with the following setup: + +| | | +|--------------|---------------------------------------------------| +| CPU | Intel i9-9900KF, 3.60GHz, 8 cores, 16384 KB cache | +| GPU | GeForce RTX 2070 | +| g++ version | 10.2.0 | +| nvcc version | 11.2.67 | +| Precision | single precision | + +### Dense matrix In the test of dense matrices, we set each matrix element to value equal to `rowIdx + columnIdx`. 
The times in seconds obtained on CPU looks as follows: -| Matrix rows and columns | `setElement` on host | `setElement` with `ParallelFor` | `getRow` | `forRows` | -|----------------------------:|---------------------:|--------------------------------:|------------:|------------:| -| | | | | | +| Matrix rows and columns | `setElement` on host | `setElement` with `ParallelFor` | `getRow` | `forRows` | +|----------------------------:|---------------------:|--------------------------------:|-------------:|------------:| +| 16 | 0.00000086 | 0.0000053 | 0.00000035 | 0.0000023 | +| 32 | 0.00000278 | 0.0000050 | 0.00000201 | 0.0000074 | +| 64 | 0.00000703 | 0.0000103 | 0.00000354 | 0.0000203 | +| 128 | 0.00002885 | 0.0000312 | 0.00000867 | 0.0000709 | +| 256 | 0.00017543 | 0.0000439 | 0.00002490 | 0.0001054 | +| 512 | 0.00078153 | 0.0001683 | 0.00005999 | 0.0002713 | +| 1024 | 0.00271989 | 0.0006691 | 0.00003808 | 0.0003942 | +| 2048 | 0.01273520 | 0.0038295 | 0.00039116 | 0.0017083 | +| 4096 | 0.08381450 | 0.0716542 | 0.00937997 | 0.0116771 | +| 8192 | 0.51596800 | 0.3535530 | 0.03971900 | 0.0467374 | + +The results on GPU looks as follows: And the same on GPU is in the following table: -| Matrix rows and columns | `setElement` on host | `setElement` with `ParallelFor` | `getRow` | `forRows` | -|----------------------------:|---------------------:|--------------------------------:|------------:|------------:| -| | | | | | +| Matrix rows and columns | `setElement` on host | `setElement` with `ParallelFor` | `getRow` | `forRows` | +|----------------------------:|---------------------:|--------------------------------:|-------------:|------------:| +| 16 | 0.027835 | 0.000101198 | 0.00009903 | 0.000101214 | +| 32 | 0.002776 | 0.000099197 | 0.00009901 | 0.000100481 | +| 64 | 0.010791 | 0.000094446 | 0.00009493 | 0.000101796 | +| 128 | 0.043014 | 0.000099397 | 0.00010024 | 0.000102729 | +| 256 | 0.171029 | 0.000100469 | 0.00010448 | 0.000105893 | +| 512 | 0.683627 | 
0.000103346 | 0.00011034 | 0.000112752 | +| 1024 | 2.736680 | 0.000158805 | 0.00016932 | 0.000170302 | +| 2048 | 10.930300 | 0.000509000 | 0.00050917 | 0.000511183 | +| 4096 | 43.728700 | 0.001557030 | 0.00156117 | 0.001557930 | +| 8192 | 174.923000 | 0.005312470 | 0.00526658 | 0.005263870 | -The sparse matrices are tested on computation of matrix approximating the Laplace operator in 2D. This matrix has at most five non-zero elements in each row. The times for sparse matrix (and CSR formart) on CPU in seconds looks as follows: +### Sparse matrix + +The sparse matrices are tested on computation of matrix approximating the Laplace operator in 2D. This matrix has at most five non-zero elements in each row. The times for sparse matrix (and CSR format) on CPU in seconds looks as follows: -| Matrix rows and columns | STL Map | `setElement` on host | `getRow` | `forRows` | -|----------------------------:|-------------:|---------------------:|------------:|------------:| -| 256 | 0.00045 | 0.00007 | 0.00005 | 0.00007 | -| 1,024 | 0.00129 | 0.00015 | 0.00007 | 0.00008 | -| 4,096 | 0.00569 | 0.00040 | 0.00007 | 0.00009 | -| 16,384 | 0.02024 | 0.00144 | 0.00007 | 0.00014 | -| 65,536 | 0.08687 | 0.00373 | 0.00014 | 0.00040 | -| 262,144 | 0.42524 | 0.01039 | 0.00039 | 0.00146 | -| 1,048,576 | 1.90120 | 0.03860 | 0.00417 | 0.00770 | -| 4,194,304 | 9.89239 | 0.15147 | 0.01844 | 0.03164 | -| 16,777,216 | 55.81530 | 0.61169 | 0.08441 | 0.13739 | -| 67,108,864 | 268.66000 | 2.44765 | 0.33831 | 0.54954 | +| Matrix rows and columns | STL Map | `setElement` on host | `setElement` with `ParallelFor` | `getRow` | `forRows` | +|----------------------------:|-------------:|---------------------:|--------------------------------:|------------:|-------------:| +| 256 | 0.00016 | 0.000017 | 0.000014 | 0.000013 | 0.000020 | +| 1,024 | 0.00059 | 0.000044 | 0.000021 | 0.000019 | 0.000022 | +| 4,096 | 0.00291 | 0.000130 | 0.000031 | 0.000022 | 0.000031 | +| 16,384 | 0.01414 | 0.000471 | 0.000067 
| 0.000031 | 0.000065 | +| 65,536 | 0.06705 | 0.001869 | 0.000218 | 0.000074 | 0.000209 | +| 262,144 | 0.31728 | 0.007436 | 0.000856 | 0.000274 | 0.000799 | +| 1,048,576 | 1.46388 | 0.027087 | 0.006162 | 0.005653 | 0.005904 | +| 4,194,304 | 7.46147 | 0.102808 | 0.028385 | 0.027925 | 0.027937 | +| 16,777,216 | 38.95900 | 0.413823 | 0.125870 | 0.124588 | 0.123858 | +| 67,108,864 | 185.75700 | 1.652580 | 0.505232 | 0.501003 | 0.500927 | -We see, that use of STL map makes sence only in situation when it is hard to estimate necessary row capasities. Otherwise very simple with `setElement` method is much faster. If the performance is the highest priority, `getRow` method should be prefered. And the same on GPU is in the following table: +We see, that use of STL map makes sense only in situation when it is hard to estimate necessary row capacities. Otherwise very simple with `setElement` method is much faster. If the performance is the highest priority, `getRow` method should be preferred. And the same on GPU is in the following table: | Matrix rows and columns | STL Map | `setElement` on host | `setElement` on native device | `getRow` | `forRows` | |----------------------------:|-------------:|---------------------:|------------------------------:|------------:|------------:| -| 256 | 0.02423 | 0.0457575 | 0.00027 | 0.00026 | 0.00027 | -| 1,024 | 0.00280 | 0.2043830 | 0.00028 | 0.00028 | 0.00028 | -| 4,096 | 0.00637 | 0.8647010 | 0.00031 | 0.00030 | 0.00031 | -| 16,384 | 0.02349 | 3.5592200 | 0.00032 | 0.00031 | 0.00032 | -| 65,536 | 0.10333 | 14.4267000 | 0.00072 | 0.00069 | 0.00070 | -| 262,144 | 0.52870 | 58.6620000 | 0.00117 | 0.00115 | 0.00115 | -| 1,048,576 | 2.17003 | 235.7660000 | 0.00335 | 0.00331 | 0.00333 | -| 4,194,304 | 11.98680 | 930.6170000 | 0.00993 | 0.00997 | 0.01003 | -| 16,777,216 | 64.24220 | 3737.8400000 | 0.02759 | 0.02751 | 0.02745 | -| 67,108,864 | 284.11700 | 15007.6000000 | 0.06648 | 0.06802 | 0.06834 | +| 256 | 0.002 | 0.036 | 0.00017 | 
0.00017 | 0.00017 | +| 1,024 | 0.001 | 0.161 | 0.00017 | 0.00017 | 0.00017 | +| 4,096 | 0.003 | 0.680 | 0.00020 | 0.00020 | 0.00020 | +| 16,384 | 0.015 | 2.800 | 0.00021 | 0.00020 | 0.00021 | +| 65,536 | 0.074 | 11.356 | 0.00048 | 0.00047 | 0.00048 | +| 262,144 | 0.350 | 45.745 | 0.00088 | 0.00087 | 0.00088 | +| 1,048,576 | 1.630 | 183.632 | 0.00247 | 0.00244 | 0.00245 | +| 4,194,304 | 8.036 | 735.848 | 0.00794 | 0.00783 | 0.00788 | +| 16,777,216 | 41.057 | 2946.610 | 0.02481 | 0.02429 | 0.02211 | +| 67,108,864 | 187.581 | 11791.601 | 0.07196 | 0.06329 | 0.06308 | + +Here we see, the `setElement` methods performs extremely bad because all matrix elements are transferred to GPU one-by-one. Even STL map is much faster. Note, that the times for STL map are not much higher compared to CPU which indicates that the transfer of the matrix on GPU is not dominant. Another simple method could by to setup the matrix on CPU by the means of `setElement` method and transfer it on GPU. -Here we see, the `setElement` methods performs extremely bad because all matrix elements are transfered to GPU one-by-one. Even STL map is much faster. Note, that the times for STL map are not much higher compared to CPU which indicates that the transfer of the matrix on GPU is not dominant. Another simple method could by to setup the matrix on CPU by the means of `setElement` method and trasnfer it on GPU. - -Finaly, the following tables show the times of the same test performed with multidiagonal matrix. Times on CPU looks as follows: - -| Matrix rows and columns | STL Map | `setElement` on host | `getRow` | `forRows` | -|----------------------------:|-------------:|---------------------:|------------:|------------:| -| | | | | | +### Multidiagonal matrix +Finally, the following tables show the times of the same test performed with multidiagonal matrix. 
Times on CPU looks as follows: + +| Matrix rows and columns | `setElement` on host | `setElement` with `ParallelFor` | `getRow` | `forRows` | +|----------------------------:|--------------------------:|--------------------------------:|------------:|------------:| +| 256 | 0.000055 | 0.0000038 | 0.000004 | 0.000009 | +| 1,024 | 0.000002 | 0.0000056 | 0.000003 | 0.000006 | +| 4,096 | 0.000087 | 0.0000130 | 0.000005 | 0.000014 | +| 16,384 | 0.000347 | 0.0000419 | 0.000010 | 0.000046 | +| 65,536 | 0.001378 | 0.0001528 | 0.000032 | 0.000177 | +| 262,144 | 0.005504 | 0.0006025 | 0.000131 | 0.000711 | +| 1,048,576 | 0.019392 | 0.0028773 | 0.001005 | 0.003265 | +| 4,194,304 | 0.072078 | 0.0162378 | 0.011915 | 0.018065 | +| 16,777,216 | 0.280085 | 0.0642682 | 0.048876 | 0.072084 | +| 67,108,864 | 1.105120 | 0.2427610 | 0.181974 | 0.272579 | And on GPU like the fallowing table: -| Matrix rows and columns | STL Map | `setElement` on host | `setElement` on native device | `getRow` | `forRows` | -|----------------------------:|-------------:|---------------------:|------------------------------:|------------:|------------:| -| | | | | | | +| Matrix rows and columns | `setElement` on host | `setElement` on native device | `getRow` | `forRows` | +|----------------------------:|---------------------:|------------------------------:|------------:|------------:| +| 256 | 0.035 | 0.000048 | 0.000045 | 0.000047 | +| 1,024 | 0.059 | 0.000047 | 0.000045 | 0.000047 | +| 4,096 | 0.251 | 0.000048 | 0.000045 | 0.000047 | +| 16,384 | 1.030 | 0.000049 | 0.000046 | 0.000048 | +| 65,536 | 4.169 | 0.000053 | 0.000048 | 0.000052 | +| 262,144 | 16.807 | 0.000216 | 0.000214 | 0.000217 | +| 1,048,576 | 67.385 | 0.000630 | 0.000629 | 0.000634 | +| 4,194,304 | 270.025 | 0.001939 | 0.001941 | 0.001942 | +| 16,777,216 | 1080.741 | 0.003212 | 0.004185 | 0.004207 | +| 67,108,864 | 4326.120 | 0.013672 | 0.022494 | 0.030369 | ### Dense matrices -- GitLab From cefa0dadacb132c704353deeaf3e80f25bb2c5a7 Mon 
Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= Date: Wed, 13 Jan 2021 19:58:16 +0100 Subject: [PATCH 39/53] Fixed CMakeLists in tutorials. --- Documentation/Tutorials/GeneralConcepts/CMakeLists.txt | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 Documentation/Tutorials/GeneralConcepts/CMakeLists.txt diff --git a/Documentation/Tutorials/GeneralConcepts/CMakeLists.txt b/Documentation/Tutorials/GeneralConcepts/CMakeLists.txt new file mode 100644 index 000000000..e69de29bb -- GitLab From c8bb8d1eaee54d3d162da623ead07d8cfd00ae8e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= Date: Wed, 13 Jan 2021 19:58:43 +0100 Subject: [PATCH 40/53] Added another benchmark of matrix setup. --- .../Matrices/DenseMatrixSetup_Benchmark.cpp | 28 ++++++++++++ .../MultidiagonalMatrixSetup_Benchmark.cpp | 42 ++++++++++++++++++ .../Matrices/SparseMatrixSetup_Benchmark.cpp | 43 +++++++++++++++++++ 3 files changed, 113 insertions(+) diff --git a/Documentation/Tutorials/Matrices/DenseMatrixSetup_Benchmark.cpp b/Documentation/Tutorials/Matrices/DenseMatrixSetup_Benchmark.cpp index 71a6eed2d..7545376c9 100644 --- a/Documentation/Tutorials/Matrices/DenseMatrixSetup_Benchmark.cpp +++ b/Documentation/Tutorials/Matrices/DenseMatrixSetup_Benchmark.cpp @@ -17,6 +17,20 @@ void setElement_on_host( const int matrixSize, Matrix& matrix ) matrix.setElement( i, j, i + j ); } +template< typename Matrix > +void setElement_on_host_and_transfer( const int matrixSize, Matrix& matrix ) +{ + using RealType = typename Matrix::RealType; + using IndexType = typename Matrix::IndexType; + using HostMatrix = TNL::Matrices::DenseMatrix< RealType, TNL::Devices::Host, IndexType >; + HostMatrix hostMatrix( matrixSize, matrixSize ); + + for( int j = 0; j < matrixSize; j++ ) + for( int i = 0; i < matrixSize; i++ ) + hostMatrix.setElement( i, j, i + j ); + matrix = hostMatrix; +} + template< typename Matrix > void setElement_on_device( const int matrixSize, 
Matrix& matrix ) { @@ -85,6 +99,20 @@ void setupDenseMatrix() timer.stop(); std::cout << timer.getRealTime() / ( double ) testsCount << " sec." << std::endl; + if( std::is_same< Device, TNL::Devices::Cuda >::value ) + { + std::cout << " setElement on host and transfer on GPU: "; + timer.reset(); + timer.start(); + for( int i = 0; i < testsCount; i++ ) + { + TNL::Matrices::DenseMatrix< float, Device, int > matrix; + setElement_on_host_and_transfer( matrixSize, matrix ); + } + timer.stop(); + std::cout << timer.getRealTime() / ( double ) testsCount << " sec." << std::endl; + } + std::cout << " getRow: "; timer.reset(); timer.start(); diff --git a/Documentation/Tutorials/Matrices/MultidiagonalMatrixSetup_Benchmark.cpp b/Documentation/Tutorials/Matrices/MultidiagonalMatrixSetup_Benchmark.cpp index 0ee70e79b..713394ced 100644 --- a/Documentation/Tutorials/Matrices/MultidiagonalMatrixSetup_Benchmark.cpp +++ b/Documentation/Tutorials/Matrices/MultidiagonalMatrixSetup_Benchmark.cpp @@ -47,6 +47,34 @@ void setElement_on_host( const int gridSize, Matrix& matrix ) } } +template< typename Matrix > +void setElement_on_host_and_transfer( const int gridSize, Matrix& matrix ) +{ + using RealType = typename Matrix::RealType; + using IndexType = typename Matrix::IndexType; + using HostMatrix = TNL::Matrices::MultidiagonalMatrix< RealType, TNL::Devices::Host, IndexType >; + const int matrixSize = gridSize * gridSize; + HostMatrix hostMatrix( matrixSize, matrixSize, getOffsets< typename Matrix::DeviceType >( gridSize ) ); + + for( int j = 0; j < gridSize; j++ ) + for( int i = 0; i < gridSize; i++ ) + { + const int rowIdx = j * gridSize + i; + if( i == 0 || j == 0 || i == gridSize - 1 || j == gridSize - 1 ) + hostMatrix.setElement( rowIdx, rowIdx, 1.0 ); + else + { + hostMatrix.setElement( rowIdx, rowIdx - gridSize, 1.0 ); + hostMatrix.setElement( rowIdx, rowIdx - 1, 1.0 ); + hostMatrix.setElement( rowIdx, rowIdx, -4.0 ); + hostMatrix.setElement( rowIdx, rowIdx + 1, 1.0 ); + 
hostMatrix.setElement( rowIdx, rowIdx + gridSize, 1.0 ); + } + } + matrix = hostMatrix; +} + + template< typename Matrix > void setElement_on_device( const int gridSize, Matrix& matrix ) { @@ -173,6 +201,20 @@ void laplaceOperatorMultidiagonalMatrix() timer.stop(); std::cout << timer.getRealTime() / ( double ) testsCount << " sec." << std::endl; + if( std::is_same< Device, TNL::Devices::Cuda >::value ) + { + std::cout << " setElement on host and transfer on GPU: "; + timer.reset(); + timer.start(); + for( int i = 0; i < testsCount; i++ ) + { + TNL::Matrices::MultidiagonalMatrix< float, Device, int > matrix; + setElement_on_host_and_transfer( gridSize, matrix ); + } + timer.stop(); + std::cout << timer.getRealTime() / ( double ) testsCount << " sec." << std::endl; + } + std::cout << " setElement on device: "; timer.reset(); timer.start(); diff --git a/Documentation/Tutorials/Matrices/SparseMatrixSetup_Benchmark.cpp b/Documentation/Tutorials/Matrices/SparseMatrixSetup_Benchmark.cpp index 58ab83a9a..c53a8f5b4 100644 --- a/Documentation/Tutorials/Matrices/SparseMatrixSetup_Benchmark.cpp +++ b/Documentation/Tutorials/Matrices/SparseMatrixSetup_Benchmark.cpp @@ -65,6 +65,35 @@ void setElement_on_host( const int gridSize, Matrix& matrix ) } } +template< typename Matrix > +void setElement_on_host_and_transfer( const int gridSize, Matrix& matrix ) +{ + using RealType = typename Matrix::RealType; + using HostMatrix = typename Matrix::Self< RealType, TNL::Devices::Host >; + + const int matrixSize = gridSize * gridSize; + TNL::Containers::Vector< int, typename HostMatrix::DeviceType, int > rowCapacities( matrixSize, 5 ); + HostMatrix hostMatrix( matrixSize, matrixSize ); + hostMatrix.setRowCapacities( rowCapacities ); + + for( int j = 0; j < gridSize; j++ ) + for( int i = 0; i < gridSize; i++ ) + { + const int rowIdx = j * gridSize + i; + if( i == 0 || j == 0 || i == gridSize - 1 || j == gridSize - 1 ) + hostMatrix.setElement( rowIdx, rowIdx, 1.0 ); + else + { + 
hostMatrix.setElement( rowIdx, rowIdx - gridSize, 1.0 ); + hostMatrix.setElement( rowIdx, rowIdx - 1, 1.0 ); + hostMatrix.setElement( rowIdx, rowIdx, -4.0 ); + hostMatrix.setElement( rowIdx, rowIdx + 1, 1.0 ); + hostMatrix.setElement( rowIdx, rowIdx + gridSize, 1.0 ); + } + } + matrix = hostMatrix; +} + template< typename Matrix > void setElement_on_device( const int gridSize, Matrix& matrix ) { @@ -208,6 +237,20 @@ void laplaceOperatorSparseMatrix() timer.stop(); std::cout << timer.getRealTime() / ( double ) testsCount << " sec." << std::endl; + if( std::is_same< Device, TNL::Devices::Cuda >::value ) + { + std::cout << " setElement on host and transfer on GPU: "; + timer.reset(); + timer.start(); + for( int i = 0; i < testsCount; i++ ) + { + TNL::Matrices::SparseMatrix< float, Device, int > matrix; + setElement_on_host_and_transfer( gridSize, matrix ); + } + timer.stop(); + std::cout << timer.getRealTime() / ( double ) testsCount << " sec." << std::endl; + } + std::cout << " setElement on device: "; timer.reset(); timer.start(); -- GitLab From a7202f62438d4f60e24c0ad5ba930c430e21f946 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= Date: Wed, 13 Jan 2021 20:39:30 +0100 Subject: [PATCH 41/53] Added results of new benchmark to matrices tutorial. --- .../Tutorials/Matrices/tutorial_Matrices.md | 82 +++++++++---------- 1 file changed, 41 insertions(+), 41 deletions(-) diff --git a/Documentation/Tutorials/Matrices/tutorial_Matrices.md b/Documentation/Tutorials/Matrices/tutorial_Matrices.md index 81a2702cd..ed2fb8d69 100644 --- a/Documentation/Tutorials/Matrices/tutorial_Matrices.md +++ b/Documentation/Tutorials/Matrices/tutorial_Matrices.md @@ -159,11 +159,12 @@ There are several ways how to create new matrix: 1. **Initializer lists** allow to create matrix from the [C++ initializer lists](https://en.cppreference.com/w/cpp/utility/initializer_list). 
The matrix elements must be therefore encoded in the source code and so it is useful for rather smaller matrices. Methods and constructors with initializer lists are user friendly and simple to use. It is a good choice for tool problems with small matrices. 2. **STL map** can be used for creation of sparse matrices only. The user first insert all matrix elements together with their coordinates into [`std::map`](https://en.cppreference.com/w/cpp/container/map) based on which the sparse matrix is created in the next step. It is simple and user friendly approach suitable for creation of large matrices. An advantage is that we do not need to know the distribution of the matrix elements in matrix rows in advance like we do in other ways of matrix construction. This makes the use of STL map suitable for combining of sparse matrices in TNL with other numerical packages. However, the sparse matrix is constructed on the host and then copied on GPU if necessary. Therefor, this approach is not a good choice if fast and efficient matrix construction is required. 3. **Methods `setElement` and `addElement` called from the host** allows to change particular matrix elements. The methods can be called from host even for matrices allocated on GPU. In this case, however, the matrix elements are transferred on GPU one by one which is very inefficient. If the matrix is allocated on the host system (CPU), the efficiency is good. In case of sparse matrices, one must set row capacities (i.e. maximal number of nonzero elements in each row) before using these methods. If the row capacity is exceeded, the matrix has to be reallocated and all matrix elements are lost. -4. **Methods `setElement` and `addElement` called from native device** allows to do efficient matrix elements setup even on devices (GPUs). In this case, the methods must be called from a GPU kernel or a lambda function combined with parallel for (\ref TNL::Algorithms::ParallelFor). 
The user get very good performance even when manipulating matrix allocated on GPU. On the other hand, only data structures allocated on GPUs can be used in the kernel or lambda function. The the matrix can be accessed in the GPU kernel or lambda function by means of [matrix view](#matrix_view) or the shared pointer (\ref TNL::Pointers::SharedPointer). -5. **Method `getRow` combined with `ParallelFor`** is very simillar to the previous one. The difference is that with first fetch helper object called *matrix row* which is linked to particular matrix row. Using methods of this object, one may change the matrix elements in given matrix row. An advantage is that the access to the matrix row is resolved only once for all elements in the row. In some more sophisticated sparse matrix formats, this can be nontrivial operation and this approach may slightly improve the performance. Another advantage for sparse matrices is that we access the matrix elements based on their *local index* in the row which is something like a rank of the nonzero element in the row. This is more efficient than adressing the matrix elements by the column indexes which requires searching in the matrix row. So this may significantly improve the performance of setup of sparse matrices. When it comes to dense matrices, there should not be great difference in performance compared to use of the methods `setElement` and `getElement`. Note that when the method is called from GPU kernel or lambda function , only data structures allocated on GPU can be accessed and the matrix must be made accessible by the means of. -6. **Method `forRows`** this approach is very similar to the previous one but it avoids using `ParallelFor` and necessity of passing the matrix to GPU kernels by matrix view or shared pointers. +4. 
**Methods `setElement` and `addElement` called from the host and copy matrix on GPU** setting particular matrix elements by the methods `setElement` and `addElement` when the matrix is allocated on GPU can be time consuming for large matrices. Setting up the matrix on CPU using the same methods and copying it on GPU at once when the setup is finished can be significantly more efficient. A drawback is that we need allocate temporarily whole matrix on CPU. +5. **Methods `setElement` and `addElement` called from native device** allows to do efficient matrix elements setup even on devices (GPUs). In this case, the methods must be called from a GPU kernel or a lambda function combined with parallel for (\ref TNL::Algorithms::ParallelFor). The user get very good performance even when manipulating matrix allocated on GPU. On the other hand, only data structures allocated on GPUs can be used in the kernel or lambda function. The the matrix can be accessed in the GPU kernel or lambda function by means of [matrix view](#matrix_view) or the shared pointer (\ref TNL::Pointers::SharedPointer). +6. **Method `getRow` combined with `ParallelFor`** is very similar to the previous one. The difference is that with first fetch helper object called *matrix row* which is linked to particular matrix row. Using methods of this object, one may change the matrix elements in given matrix row. An advantage is that the access to the matrix row is resolved only once for all elements in the row. In some more sophisticated sparse matrix formats, this can be nontrivial operation and this approach may slightly improve the performance. Another advantage for sparse matrices is that we access the matrix elements based on their *local index* in the row which is something like a rank of the nonzero element in the row. This is more efficient than addressing the matrix elements by the column indexes which requires searching in the matrix row. 
So this may significantly improve the performance of setup of sparse matrices. When it comes to dense matrices, there should not be great difference in performance compared to use of the methods `setElement` and `getElement`. Note that when the method is called from GPU kernel or lambda function , only data structures allocated on GPU can be accessed and the matrix must be made accessible by the means of. +7. **Method `forRows`** this approach is very similar to the previous one but it avoids using `ParallelFor` and necessity of passing the matrix to GPU kernels by matrix view or shared pointers. -The following table shows pros and cons of particular mathods: +The following table shows pros and cons of particular methods: | Method | Pros | Cons | |:----------------------------------------|:-----------------------------------------------------------------------|:----------------------------------------------------------------------| @@ -215,19 +216,18 @@ The results on GPU looks as follows: And the same on GPU is in the following table: -| Matrix rows and columns | `setElement` on host | `setElement` with `ParallelFor` | `getRow` | `forRows` | -|----------------------------:|---------------------:|--------------------------------:|-------------:|------------:| -| 16 | 0.027835 | 0.000101198 | 0.00009903 | 0.000101214 | -| 32 | 0.002776 | 0.000099197 | 0.00009901 | 0.000100481 | -| 64 | 0.010791 | 0.000094446 | 0.00009493 | 0.000101796 | -| 128 | 0.043014 | 0.000099397 | 0.00010024 | 0.000102729 | -| 256 | 0.171029 | 0.000100469 | 0.00010448 | 0.000105893 | -| 512 | 0.683627 | 0.000103346 | 0.00011034 | 0.000112752 | -| 1024 | 2.736680 | 0.000158805 | 0.00016932 | 0.000170302 | -| 2048 | 10.930300 | 0.000509000 | 0.00050917 | 0.000511183 | -| 4096 | 43.728700 | 0.001557030 | 0.00156117 | 0.001557930 | -| 8192 | 174.923000 | 0.005312470 | 0.00526658 | 0.005263870 | - +| Matrix rows and columns | `setElement` on host | `setElement` on host and copy | `setElement` with 
`ParallelFor` | `getRow` | `forRows` | +|----------------------------:|---------------------:|------------------------------:|--------------------------------:|-------------:|------------:| +| 16 | 0.027835 | 0.02675 | 0.000101198 | 0.00009903 | 0.000101214 | +| 32 | 0.002776 | 0.00018 | 0.000099197 | 0.00009901 | 0.000100481 | +| 64 | 0.010791 | 0.00015 | 0.000094446 | 0.00009493 | 0.000101796 | +| 128 | 0.043014 | 0.00021 | 0.000099397 | 0.00010024 | 0.000102729 | +| 256 | 0.171029 | 0.00056 | 0.000100469 | 0.00010448 | 0.000105893 | +| 512 | 0.683627 | 0.00192 | 0.000103346 | 0.00011034 | 0.000112752 | +| 1024 | 2.736680 | 0.00687 | 0.000158805 | 0.00016932 | 0.000170302 | +| 2048 | 10.930300 | 0.02474 | 0.000509000 | 0.00050917 | 0.000511183 | +| 4096 | 43.728700 | 0.13174 | 0.001557030 | 0.00156117 | 0.001557930 | +| 8192 | 174.923000 | 0.70602 | 0.005312470 | 0.00526658 | 0.005263870 | ### Sparse matrix @@ -248,18 +248,18 @@ The sparse matrices are tested on computation of matrix approximating the Laplac We see, that use of STL map makes sense only in situation when it is hard to estimate necessary row capacities. Otherwise very simple with `setElement` method is much faster. If the performance is the highest priority, `getRow` method should be preferred. 
And the same on GPU is in the following table: -| Matrix rows and columns | STL Map | `setElement` on host | `setElement` on native device | `getRow` | `forRows` | -|----------------------------:|-------------:|---------------------:|------------------------------:|------------:|------------:| -| 256 | 0.002 | 0.036 | 0.00017 | 0.00017 | 0.00017 | -| 1,024 | 0.001 | 0.161 | 0.00017 | 0.00017 | 0.00017 | -| 4,096 | 0.003 | 0.680 | 0.00020 | 0.00020 | 0.00020 | -| 16,384 | 0.015 | 2.800 | 0.00021 | 0.00020 | 0.00021 | -| 65,536 | 0.074 | 11.356 | 0.00048 | 0.00047 | 0.00048 | -| 262,144 | 0.350 | 45.745 | 0.00088 | 0.00087 | 0.00088 | -| 1,048,576 | 1.630 | 183.632 | 0.00247 | 0.00244 | 0.00245 | -| 4,194,304 | 8.036 | 735.848 | 0.00794 | 0.00783 | 0.00788 | -| 16,777,216 | 41.057 | 2946.610 | 0.02481 | 0.02429 | 0.02211 | -| 67,108,864 | 187.581 | 11791.601 | 0.07196 | 0.06329 | 0.06308 | +| Matrix rows and columns | STL Map | `setElement` on host | `setElement` on host and copy |`setElement` on native device | `getRow` | `forRows` | +|----------------------------:|-------------:|---------------------:|------------------------------:|-----------------------------:|------------:|------------:| +| 256 | 0.002 | 0.036 | 0.0280 | 0.00017 | 0.00017 | 0.00017 | +| 1,024 | 0.001 | 0.161 | 0.0006 | 0.00017 | 0.00017 | 0.00017 | +| 4,096 | 0.003 | 0.680 | 0.0010 | 0.00020 | 0.00020 | 0.00020 | +| 16,384 | 0.015 | 2.800 | 0.0034 | 0.00021 | 0.00020 | 0.00021 | +| 65,536 | 0.074 | 11.356 | 0.0130 | 0.00048 | 0.00047 | 0.00048 | +| 262,144 | 0.350 | 45.745 | 0.0518 | 0.00088 | 0.00087 | 0.00088 | +| 1,048,576 | 1.630 | 183.632 | 0.2057 | 0.00247 | 0.00244 | 0.00245 | +| 4,194,304 | 8.036 | 735.848 | 0.8119 | 0.00794 | 0.00783 | 0.00788 | +| 16,777,216 | 41.057 | 2946.610 | 3.2198 | 0.02481 | 0.02429 | 0.02211 | +| 67,108,864 | 187.581 | 11791.601 | 12.7775 | 0.07196 | 0.06329 | 0.06308 | Here we see, the `setElement` methods performs extremely bad because all matrix elements 
are transferred to GPU one-by-one. Even STL map is much faster. Note, that the times for STL map are not much higher compared to CPU which indicates that the transfer of the matrix on GPU is not dominant. Another simple method could by to setup the matrix on CPU by the means of `setElement` method and transfer it on GPU. @@ -282,18 +282,18 @@ Finally, the following tables show the times of the same test performed with mul And on GPU like the fallowing table: -| Matrix rows and columns | `setElement` on host | `setElement` on native device | `getRow` | `forRows` | -|----------------------------:|---------------------:|------------------------------:|------------:|------------:| -| 256 | 0.035 | 0.000048 | 0.000045 | 0.000047 | -| 1,024 | 0.059 | 0.000047 | 0.000045 | 0.000047 | -| 4,096 | 0.251 | 0.000048 | 0.000045 | 0.000047 | -| 16,384 | 1.030 | 0.000049 | 0.000046 | 0.000048 | -| 65,536 | 4.169 | 0.000053 | 0.000048 | 0.000052 | -| 262,144 | 16.807 | 0.000216 | 0.000214 | 0.000217 | -| 1,048,576 | 67.385 | 0.000630 | 0.000629 | 0.000634 | -| 4,194,304 | 270.025 | 0.001939 | 0.001941 | 0.001942 | -| 16,777,216 | 1080.741 | 0.003212 | 0.004185 | 0.004207 | -| 67,108,864 | 4326.120 | 0.013672 | 0.022494 | 0.030369 | +| Matrix rows and columns | `setElement` on host | `setElement` on host and copy | `setElement` on native device | `getRow` | `forRows` | +|----------------------------:|---------------------:|------------------------------:|------------------------------:|------------:|------------:| +| 256 | 0.035 | 0.02468 | 0.000048 | 0.000045 | 0.000047 | +| 1,024 | 0.059 | 0.00015 | 0.000047 | 0.000045 | 0.000047 | +| 4,096 | 0.251 | 0.00044 | 0.000048 | 0.000045 | 0.000047 | +| 16,384 | 1.030 | 0.00158 | 0.000049 | 0.000046 | 0.000048 | +| 65,536 | 4.169 | 0.00619 | 0.000053 | 0.000048 | 0.000052 | +| 262,144 | 16.807 | 0.02187 | 0.000216 | 0.000214 | 0.000217 | +| 1,048,576 | 67.385 | 0.08043 | 0.000630 | 0.000629 | 0.000634 | +| 4,194,304 | 270.025 | 0.31272 | 
0.001939 | 0.001941 | 0.001942 | +| 16,777,216 | 1080.741 | 1.18849 | 0.003212 | 0.004185 | 0.004207 | +| 67,108,864 | 4326.120 | 4.74481 | 0.013672 | 0.022494 | 0.030369 | ### Dense matrices -- GitLab From 395ad5643161c97aabfa4e79a1d58091d2718e85 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= Date: Fri, 15 Jan 2021 21:39:25 +0100 Subject: [PATCH 42/53] Working on the documentation of dense matrices. --- .../DenseMatrixExample_setElement.cpp | 6 +- .../DenseMatrixViewExample_setElement.cpp | 6 +- .../Tutorials/Matrices/tutorial_Matrices.md | 274 +++++++++++------- 3 files changed, 183 insertions(+), 103 deletions(-) diff --git a/Documentation/Examples/Matrices/DenseMatrix/DenseMatrixExample_setElement.cpp b/Documentation/Examples/Matrices/DenseMatrix/DenseMatrixExample_setElement.cpp index 9441cc60d..221a23c0c 100644 --- a/Documentation/Examples/Matrices/DenseMatrix/DenseMatrixExample_setElement.cpp +++ b/Documentation/Examples/Matrices/DenseMatrix/DenseMatrixExample_setElement.cpp @@ -16,8 +16,8 @@ void setElements() std::cout << "Matrix set from the host:" << std::endl; std::cout << *matrix << std::endl; - auto f = [=] __cuda_callable__ ( int i ) mutable { - matrix->setElement( i, i, -i ); + auto f = [=] __cuda_callable__ ( int i, int j ) mutable { + matrix->addElement( i, j, 5.0 ); }; /*** @@ -26,7 +26,7 @@ void setElements() * DenseMatrixView::getRow example for details. 
*/ TNL::Pointers::synchronizeSmartPointersOnDevice< Device >(); - TNL::Algorithms::ParallelFor< Device >::exec( 0, 5, f ); + TNL::Algorithms::ParallelFor2D< Device >::exec( 0, 0, 5, 5, f ); std::cout << "Matrix set from its native device:" << std::endl; std::cout << *matrix << std::endl; diff --git a/Documentation/Examples/Matrices/DenseMatrix/DenseMatrixViewExample_setElement.cpp b/Documentation/Examples/Matrices/DenseMatrix/DenseMatrixViewExample_setElement.cpp index 92985bc5a..b3760d976 100644 --- a/Documentation/Examples/Matrices/DenseMatrix/DenseMatrixViewExample_setElement.cpp +++ b/Documentation/Examples/Matrices/DenseMatrix/DenseMatrixViewExample_setElement.cpp @@ -14,10 +14,10 @@ void setElements() std::cout << "Matrix set from the host:" << std::endl; std::cout << matrix << std::endl; - auto f = [=] __cuda_callable__ ( int i ) mutable { - matrixView.setElement( i, i, -i ); + auto f = [=] __cuda_callable__ ( int i, int j ) mutable { + matrixView.addElement( i, j, 5.0 ); }; - TNL::Algorithms::ParallelFor< Device >::exec( 0, 5, f ); + TNL::Algorithms::ParallelFor2D< Device >::exec( 0, 0, 5, 5, f ); std::cout << "Matrix set from its native device:" << std::endl; std::cout << matrix << std::endl; diff --git a/Documentation/Tutorials/Matrices/tutorial_Matrices.md b/Documentation/Tutorials/Matrices/tutorial_Matrices.md index ed2fb8d69..0259039ff 100644 --- a/Documentation/Tutorials/Matrices/tutorial_Matrices.md +++ b/Documentation/Tutorials/Matrices/tutorial_Matrices.md @@ -1,9 +1,5 @@ \page tutorial_Matrices Matrices tutorial -## Introduction - -TNL offers several types of matrices like dense (\ref TNL::Matrices::DenseMatrix), sparse (\ref TNL::Matrices::SparseMatrix), tridiagonal (\ref TNL::Matrices::TridiagonalMatrix), multidiagonal (\ref TNL::Matrices::MultidiagonalMatrix) and lambda matrices (\ref TNL::Matrices::LambdaMatrix). The sparse matrices can be marked as symmetric to lower the memory requirements. 
The interfaces of given matrix types are designed to be as unified as possible to ensure that the user can easily switch between different matrix types while making no or only a little changes in the source code. All matrix types allows traversing all matrix elements and manipulate them using a lambda functions as well as performing flexible reduction in matrix rows. The following text describes particular matrix types and their unified interface in details. - ## Table of Contents 1. [Overview of matrix types](#overview_of_matrix_types) 2. [Indexing of nonzero matrix elements in sparse matrices](#indexing_of_nonzero_matrix_elements_in_sparse_matrices) @@ -17,11 +13,16 @@ TNL offers several types of matrices like dense (\ref TNL::Matrices::DenseMatrix 5. [Flexible reduction in matrix rows](#flexible_reduction_in_matrix_rows) 6. [Matrix-vector product](#matrix_vector_product) 7. [Matrix I/O operations](#matrix_io_operations) +8. [Appendix](#appendix) + +## Introduction + +TNL offers several types of matrices like dense (\ref TNL::Matrices::DenseMatrix), sparse (\ref TNL::Matrices::SparseMatrix), tridiagonal (\ref TNL::Matrices::TridiagonalMatrix), multidiagonal (\ref TNL::Matrices::MultidiagonalMatrix) and lambda matrices (\ref TNL::Matrices::LambdaMatrix). The sparse matrices can be symmetric to lower the memory requirements. The interfaces of given matrix types are designed to be as unified as possible to ensure that the user can easily switch between different matrix types while making no or only a little changes in the source code. All matrix types allows traversing all matrix elements and manipulate them using lambda functions as well as performing flexible reduction in matrix rows. The following text describes particular matrix types and their unified interface in details. ## Overview of matrix types -In majority of numerical algorithms either dense or sparse matrices are used. 
The dense matrix (\ref TNL::Matrices::DenseMatrix) is such that all or at least most of its matrix elements are nonzero. On the other hand [sparse matrix](https://en.wikipedia.org/wiki/Sparse_matrix) (\ref TNL::Matrices::SparseMatrix) is a matrix which has most of the matrix elements equal to zero. From the implementation point of view, the data structures for the dense matrices allocates all matrix elements while formats for the sparse matrices aims to store explicitly only the nonzero matrix elements. The most popular format for storing the sparse matrices in [CSR format](https://en.wikipedia.org/wiki/Sparse_matrix#Compressed_sparse_row_(CSR,_CRS_or_Yale_format)). However, especially for better data alignment in memory of GPUs, many other formats were designed. In TNL, the user may choose between several different sparse matrix formats. There are also sparse matrices with specific pattern of the nonzero elements like [tridiagonal matrices](https://en.wikipedia.org/wiki/Tridiagonal_matrix) (\ref TNL::Matrices::TridiagonalMatrix) which "has nonzero elements on the main diagonal, the first diagonal below this, and the first diagonal above the main diagonal only". An example of such matrix may look as follows: +In a lot of numerical algorithms either dense or sparse matrices are used. The dense matrix (\ref TNL::Matrices::DenseMatrix) is such that all or at least most of its matrix elements are nonzero. On the other hand [sparse matrix](https://en.wikipedia.org/wiki/Sparse_matrix) (\ref TNL::Matrices::SparseMatrix) is a matrix which has most of the matrix elements equal to zero. From the implementation point of view, the data structures for the dense matrices allocates all matrix elements while formats for the sparse matrices aim to store explicitly only the nonzero matrix elements. The most popular format for storing the sparse matrices is [CSR format](https://en.wikipedia.org/wiki/Sparse_matrix#Compressed_sparse_row_(CSR,_CRS_or_Yale_format)). 
However, especially for better data alignment in memory of GPUs, many other formats were designed. In TNL, the user may choose between several different sparse matrix formats. There are also sparse matrices with specific pattern of the nonzero elements like [tridiagonal matrices](https://en.wikipedia.org/wiki/Tridiagonal_matrix) (\ref TNL::Matrices::TridiagonalMatrix) which "has nonzero elements on the main diagonal, the first diagonal below this, and the first diagonal above the main diagonal only". An example of such matrix may look as follows: \f[ \left( @@ -36,7 +37,7 @@ In majority of numerical algorithms either dense or sparse matrices are used. Th \right) \f] -Similar but more general type of matrices are multidiagonal matrices (\ref TNL::Matrices::MultidigonalMatrix) which has the nonzero elements positioned only on lines parallel to the diagonal like the following matrix: +Similar but more general type of matrices are multidiagonal matrices (\ref TNL::Matrices::MultidiagonalMatrix) which have the nonzero matrix elements positioned only on lines parallel to the main diagonal like the following matrix: \f[ \left( @@ -51,9 +52,9 @@ Similar but more general type of matrices are multidiagonal matrices (\ref TNL:: \right) \f] -Finaly, TNL offers so called *lambda matrices* (\ref TNL::Matrices::LambdaMatrix) which are kind of "matrix-free matrices". They do not store the matrix elements explicitly in the memory, but rather evaluates them on-the-fly based on user defined lambda functions. +Finally, TNL offers so called *lambda matrices* (\ref TNL::Matrices::LambdaMatrix) which are kind of "matrix-free matrices". They do not store the matrix elements explicitly in the memory, but rather evaluates them on-the-fly based on user defined lambda functions. -In the following table we show comparison of extreme example when we would express a tridiagonal matrix by means of different matrix types. 
+In the following table we show comparison of expressing a tridiagonal matrix by means of different matrix types. | Matrix dimensions | Dense elems. | Dense mem. | Sparse elems. | Sparse mem. | Tridiag. elems. | Tridiag. mem. | Multidiag. elems. | Mutlidiag. mem. | |------------------:|---------------:|-----------:|--------------:|-------------:|----------------:|--------------:|------------------:|----------------:| @@ -61,23 +62,23 @@ In the following table we show comparison of extreme example when we would expre | 100x100 | 10,000 | 80 kB | >298 | >3,576 B | 300 | 2,400 B | 300 | 2,412 B | | 1,000x1,000 | 1,000,000 | 8 MB | >2,998 | >35,976 B | 3,000 | 24,000 B | 3,000 | 24,012 B | | 10,000x10,000 | 100,000,000 | 800 MB | >29,998 | >359,976 B | 30,000 | 240,000 B | 30,000 | 240,012 B | -| 100,000x100,000 | 10,000,000,000 | 80 GB | > 299,998 | >3,599,876 B | 300,000 | 2,400,000 B | 300,000 | 2,400,012 B | +| 100,000x100,000 | 10,000,000,000 | 80 GB | >299,998 | >3,599,876 B | 300,000 | 2,400,000 B | 300,000 | 2,400,012 B | In the table: * **Matrix dimensions** is the number of matrix rows and columns * **Dense elems.** is the number of allocated matrix elements in the dense matrix. * **Dense mem.** is the allocated memory for the matrix elements in the dense matrix if the elements are stored in the double precision. -* **Sparse elems.** is the number of allocated matrix elements in the sparse matrix. Some formats may allocate padding zeros for better data alignment in the memory. +* **Sparse elems.** is the number of allocated matrix elements in the sparse matrix. Some formats may allocate padding zeros for better data alignment in the memory and so the number of allocated matrix elements may increase. * **Sparse mem.** is the allocated memory for the matrix elements in the sparse matrix if the elements are stored in the double precision and column indexes in 32-bit integer. * **Tridiag. 
elems** is the number of allocated matrix elements in the tridiagonal matrix. * **Tridiag mem.** is the allocated memory for the matrix elements in the tridiagonal matrix if the elements are stored in the double precision. * **Multidiag. elems** is the number of allocated matrix elements in the multidiagonal matrix. * **Multidiag mem.** is the allocated memory for the matrix elements in the multidiagonal matrix if the elements are stored in the double precision. -Choosing the best matrix type can have tremendous impact on the performance but also memory requirements. If we would treat each matrix as a dense one we would not be able to to work with matrices larger than 50,000x50,000 on common personal computers because we would need tens of gibabytes of memory. At the same time we see that the other matrix types can do the same job with only few megabytes. In addition, other matrix types work with much less matrix types and so operations like matrix-vector multiplication can be done with much less operations which means much faster. Since in modern hardware architectures, the computing units are limited mainly by the performance of the memory chips, transferring less data from the memory increases the performance even more. +Choosing the best matrix type can have tremendous impact on the performance but also on memory requirements. If we would treat each matrix as a dense one we would not be able to work with matrices larger than 50,000x50,000 on common personal computers, because we would need tens of gigabytes of memory. At the same time, we see that the other matrix types can do the same job with only a few megabytes. In addition, other matrix types work with much fewer matrix elements and so operations like matrix-vector multiplication can be done with significantly fewer operations which means much faster. 
Since in the modern hardware architectures, the computing units are limited mainly by the performance of the memory chips (so [called memory wall](https://en.wikipedia.org/wiki/Random-access_memory#Memory_wall)), transferring less data from the memory increases the performance even more. -The following table shows the same but when storing a matrix which has only five nonzero elements in each row. Such matrices arises often from the finite difference method for solution of the partial differential equations: +The following table shows the same as the one above but when storing a matrix which has only five nonzero elements in each row. Such matrices arise often from the finite difference method for solution of the partial differential equations: | Matrix dimensions | Dense elems. | Dense mem. | Sparse elems. | Sparse mem. | Multidiag. elems. | Mutlidiag. mem. | |------------------:|---------------:|-----------:|--------------:|-------------:|------------------:|----------------:| @@ -87,9 +88,7 @@ The following table shows the same but when storing a matrix which has only five | 10,000x10,000 | 100,000,000 | 800 MB | >50,000 | >600,000 B | 50,000 | 400,020 B | | 100,000x100,000 | 10,000,000,000 | 80 GB | >500,000 | >6,000,000 B | 500,000 | 4,000,020 B | -There is no change in the dense matrix part of the table. The numbers grow proportionaly in case of sparse and mutlidiagonal matrix. We see that the multidiagonal matrix type is suitable for the finite difference method or similar numerical methods for solution of the partial differential equations. - -General sparse matrix formats needs to store columns indexes for each matrix element which is not true for the multidiagonal matrix. The following table shows how many bytes we need for storing of one matrix element with different matrix types depending on the type of the matrix elements (`Real`) and column indexes (`Index`): +There is no change in the dense matrix part of the table. 
The numbers grow proportionally in case of sparse and mutlidiagonal matrix. General sparse matrix formats need to store column indexes for each matrix element which is not true for the multidiagonal matrix. The following table shows how many bytes we need for storing of one matrix element with different matrix types depending on the type of the matrix elements (`Real`) and column indexes (`Index`): | Real | Index | Dense matrix | Multidiagonal matrix | Sparse matrix | Fill ratio | |:------:|:------:|:------------:|:--------------------:|:--------------:|:----------:| @@ -107,9 +106,11 @@ In this table: * **Sparse matrix** is number of bytes needed to store one matrix element in the sparse matrix. * **Fill ratio** is maximal percentage of the nonzero matrix elements until which the sparse matrix can perform better. -## Indexing of nonzero matrix elements in sparse matrices +The multidiagonal matrix type is especially suitable for the finite difference method or similar numerical methods for solution of the partial differential equations. -The sparse matrix formats usualy, in the first step, compress the matrix rows by omitting the zero matrix elements as follows +## Indexing of nonzero matrix elements in sparse matrices + +The sparse matrix formats usually, in the first step, compress the matrix rows by omitting the zero matrix elements as follows \f[ \left( @@ -133,7 +134,7 @@ The sparse matrix formats usualy, in the first step, compress the matrix rows by \right) \f] -In this case, it is more efficient to refer the nonzero matrix elements by their rank in the compressed matrix rather than by their column index in the original matrix. In methods for the sparse matrices, this parameter is called `localIdx`. Some sparse matrix formats adds some padding zeros for better alignment of data in memory. 
But if this is not the case, the variable `localIdx` of particular matrix elements would read as: +In such a form, it is more efficient to refer the nonzero matrix elements in given row by their rank in the compressed matrix row rather than by their column index in the original matrix. In methods for the sparse matrices, this parameter is called `localIdx`. Some sparse matrix formats add padding zeros for better alignment of data in memory. But if this is not the case, the variable `localIdx` of particular matrix elements would read as: \f[ \left( @@ -147,45 +148,52 @@ In this case, it is more efficient to refer the nonzero matrix elements by their \right) \f] - ## Matrix view -TODO: concept of matrix view. Add reference to general concepts +Matrix views are small reference objects which help accessing the matrix in GPU kernels or lambda functions being executed on GPUs. We describe this in details in section about [Shared pointers and views](tutorial_GeneralConcepts.html#shared-pointers-and-views). The problem lies in fact that we cannot pass references to GPU kernels and we do not want to pass there deep copies of matrices. Matrix view is some kind of reference to a matrix. A copy of matrix view is always shallow and so it behaves like a reference. The following example shows how to obtain the matrix view by means of method `getView` and pass it to a lambda function: + +\includelineno SparseMatrixViewExample_getRow.cpp + +Here we create sparse matrix `matrix` on the line 11, and use the method `getView` to get the matrix view on the line 12. The view is then used in the lambda function on the line 15 where it is captured by value (see `[=]` in the definition of the lambda function `f` on the line 14). + ## Allocation and setup of different matrix types -There are several ways how to create new matrix: +There are several ways how to create a new matrix: 1. 
**Initializer lists** allow to create matrix from the [C++ initializer lists](https://en.cppreference.com/w/cpp/utility/initializer_list). The matrix elements must be therefore encoded in the source code and so it is useful for rather smaller matrices. Methods and constructors with initializer lists are user friendly and simple to use. It is a good choice for tool problems with small matrices. -2. **STL map** can be used for creation of sparse matrices only. The user first insert all matrix elements together with their coordinates into [`std::map`](https://en.cppreference.com/w/cpp/container/map) based on which the sparse matrix is created in the next step. It is simple and user friendly approach suitable for creation of large matrices. An advantage is that we do not need to know the distribution of the matrix elements in matrix rows in advance like we do in other ways of matrix construction. This makes the use of STL map suitable for combining of sparse matrices in TNL with other numerical packages. However, the sparse matrix is constructed on the host and then copied on GPU if necessary. Therefor, this approach is not a good choice if fast and efficient matrix construction is required. -3. **Methods `setElement` and `addElement` called from the host** allows to change particular matrix elements. The methods can be called from host even for matrices allocated on GPU. In this case, however, the matrix elements are transferred on GPU one by one which is very inefficient. If the matrix is allocated on the host system (CPU), the efficiency is good. In case of sparse matrices, one must set row capacities (i.e. maximal number of nonzero elements in each row) before using these methods. If the row capacity is exceeded, the matrix has to be reallocated and all matrix elements are lost. -4. 
**Methods `setElement` and `addElement` called from the host and copy matrix on GPU** setting particular matrix elements by the methods `setElement` and `addElement` when the matrix is allocated on GPU can be time consuming for large matrices. Setting up the matrix on CPU using the same methods and copying it on GPU at once when the setup is finished can be significantly more efficient. A drawback is that we need allocate temporarily whole matrix on CPU. -5. **Methods `setElement` and `addElement` called from native device** allows to do efficient matrix elements setup even on devices (GPUs). In this case, the methods must be called from a GPU kernel or a lambda function combined with parallel for (\ref TNL::Algorithms::ParallelFor). The user get very good performance even when manipulating matrix allocated on GPU. On the other hand, only data structures allocated on GPUs can be used in the kernel or lambda function. The the matrix can be accessed in the GPU kernel or lambda function by means of [matrix view](#matrix_view) or the shared pointer (\ref TNL::Pointers::SharedPointer). -6. **Method `getRow` combined with `ParallelFor`** is very similar to the previous one. The difference is that with first fetch helper object called *matrix row* which is linked to particular matrix row. Using methods of this object, one may change the matrix elements in given matrix row. An advantage is that the access to the matrix row is resolved only once for all elements in the row. In some more sophisticated sparse matrix formats, this can be nontrivial operation and this approach may slightly improve the performance. Another advantage for sparse matrices is that we access the matrix elements based on their *local index* in the row which is something like a rank of the nonzero element in the row. This is more efficient than addressing the matrix elements by the column indexes which requires searching in the matrix row. 
So this may significantly improve the performance of setup of sparse matrices. When it comes to dense matrices, there should not be great difference in performance compared to use of the methods `setElement` and `getElement`. Note that when the method is called from GPU kernel or lambda function , only data structures allocated on GPU can be accessed and the matrix must be made accessible by the means of. +2. **STL map** can be used for creation of sparse matrices only. The user first insert all matrix elements together with their coordinates into [`std::map`](https://en.cppreference.com/w/cpp/container/map) based on which the sparse matrix is created in the next step. It is simple and user friendly approach suitable for creation of large matrices. An advantage is that we do not need to know the distribution of the nonzero matrix elements in matrix rows in advance like we do in other ways of construction of sparse matrices. This makes the use of STL map suitable for combining of sparse matrices from TNL with other numerical packages. However, the sparse matrix is constructed on the host and then copied on GPU if necessary. Therefore, this approach is not a good choice if fast and efficient matrix construction is required. +3. **Methods `setElement` and `addElement` called from the host** allow to change particular matrix elements. The methods can be called from host even for matrices allocated on GPU. In this case, however, the matrix elements are transferred on GPU one by one which is very inefficient. If the matrix is allocated on the host system (CPU), the efficiency is nearly optimal. In case of sparse matrices, one must set row capacities (i.e. maximal number of nonzero elements in each row) before using these methods. If the row capacity is exceeded, the matrix has to be reallocated and all matrix elements are lost. +4. 
**Methods `setElement` and `addElement` called on the host and copy matrix on GPU** setting particular matrix elements by the methods `setElement` and `addElement` when the matrix is allocated on GPU can be time consuming for large matrices. Setting up the matrix on CPU using the same methods and copying it on GPU at once when the setup is finished can be significantly more efficient. A drawback is that we need to allocate temporarily the whole matrix on CPU. +5. **Methods `setElement` and `addElement` called from native device** allow to do efficient matrix elements setup even on devices (GPUs). In this case, the methods must be called from a GPU kernel or a lambda function combined with the parallel for (\ref TNL::Algorithms::ParallelFor). The user gets very good performance even when manipulating a matrix allocated on GPU. On the other hand, only data structures allocated on GPUs can be accessed from the kernel or lambda function. The matrix can be accessed in the GPU kernel or lambda function by means of [matrix view](#matrix_view) or the shared pointer (\ref TNL::Pointers::SharedPointer). +6. **Method `getRow` combined with `ParallelFor`** is very similar to the previous one. The difference is that we first fetch a helper object called *matrix row* which is linked to particular matrix row. Using methods of this object, one may change the matrix elements in given matrix row. An advantage is that the access to the matrix row is resolved only once for all elements in the row. In some more sophisticated sparse matrix formats, this can be a nontrivial operation and this approach may slightly improve the performance. Another advantage for sparse matrices is that we access the matrix elements based on their *local index* (`localIdx`, see [Indexing of nonzero matrix elements in sparse matrices](#indexing_of_nonzero_matrix_elements_in_sparse_matrices)) in the row which is something like a rank of the nonzero element in the row. 
This is more efficient than addressing the matrix elements by the column indexes which requires searching in the matrix row. So this may significantly improve the performance of setup of sparse matrices. When it comes to dense matrices, there should not be great difference in performance compared to use of the methods `setElement` and `getElement`. Note that when the method is called from a GPU kernel or a lambda function, only data structures allocated on GPU can be accessed and the matrix must be made accessible by the means of matrix view. 7. **Method `forRows`** this approach is very similar to the previous one but it avoids using `ParallelFor` and necessity of passing the matrix to GPU kernels by matrix view or shared pointers. The following table shows pros and cons of particular methods: -| Method | Pros | Cons | -|:----------------------------------------|:-----------------------------------------------------------------------|:----------------------------------------------------------------------| -| **Initializer list** | Simple. | Only for small matrices. | -| **STL map** | Simplest of all methods for sparse matrices. | Higher memory requirements. | -| | Does not need setting of matrix rows capacities | Slow transfer on GPU. | -| **[set,add]Element on host** | Simple. | Requires setting of row capacities. | -| | | Extremely slow transfer on GPU. | -| **[set,add]Element on native device** | Good efficiency. | Requires setting of row capacities. | -| | | Requires writting GPU kernel or lambda function. | -| | | Allows accessing only data allocated on the same device/memory space. | -| **getRow and ParallelFor** | Best efficiency for sparse matrices. | Requires setting of row capacities. | -| | | Requires writting GPU kernel or lambda function. | -| | | Allows accessing only data allocated on the same device/memory space. | -| | | Use of matrix local indexes can be less intuitive. | -| **forRows** | Best efficiency for sparse matrices. 
| Requires setting of row capacities. | -| | | Requires writting GPU kernel or lambda function. | -| | | Allows accessing only data allocated on the same device/memory space. | -| | | Use of matrix local indexes can be less intuitive. | -| **forRows** | Best efficiency for sparse matrices. | Requires setting of row capacities. | -| | Avoid use of matrix view or shared pointer in kernels/lambda function. | Requires writting GPU kernel or lambda function. | -| | | Allows accessing only data allocated on the same device/memory space. | -| | | Use of matrix local indexes is less intuitive. | - -Though it may seem that the later methods come with more cons than pros they offer much higher performance and we believe they even them are still very user friendly. On the other hand, if the matrix setup performance is not a priority the use the simple but slow method can still be a good choice. The following tables demonstrate the performance of different methods. The tests were performed with the following setup: +| Method | Efficient | Easy to use | Pros | Cons | +|:----------------------------------------|:----------|:------------|:----------------------------------------------------------------------|:----------------------------------------------------------------------| +| **Initializer list** | ** | ***** | Very easy to use. | Only for small matrices. | +| | | | Does not need setting of matrix rows capacities | | +| **STL map** | ** | ***** | Very easy to use. | Higher memory requirements. | +| | | | Does not need setting of matrix rows capacities | Slow transfer on GPU. | +| **[set,add]Element on host** | ****/* | ***** | Very easy to use. | Requires setting of row capacities. | +| | | | | Extremely slow transfer on GPU. | +| **[set,add]Element on host & copy on GPU** | *** | **** | Easy to use. | Requires setting of row capacities. | +| | | | Reasonable efficiency. | Allocation of auxiliary matrix on CPU. | +| **[set,add]Element on native device** | **** | | Good efficiency. | Requires setting of row capacities. | +| | | | | Requires writing GPU kernel or lambda function. | +| | | | | Allows accessing only data allocated on the same device/memory space. | +| **getRow and ParallelFor** | ***** | ** | Best efficiency for sparse matrices. 
| Requires setting of row capacities. | +| | | | | Requires writing GPU kernel or lambda function. | +| | | | | Allows accessing only data allocated on the same device/memory space. | +| | | | | Use of matrix local indexes can be less intuitive. | +| **forRows** | ***** | ** | Best efficiency for sparse matrices. | Requires setting of row capacities. | +| | | | Avoid use of matrix view or shared pointer in kernels/lambda function.| Requires writing GPU kernel or lambda function. | +| | | | | Allows accessing only data allocated on the same device/memory space. | +| | | | | Use of matrix local indexes is less intuitive. | + +Though it may seem that the later methods come with more cons than pros, they offer much higher performance and we believe that even they are still user friendly. On the other hand, if the matrix setup performance is not a priority, the use easy-to-use but slow method can still be a good choice. The following tables demonstrate the performance of different methods. The tests were performed with the following setup: | | | |--------------|---------------------------------------------------| @@ -212,26 +220,40 @@ In the test of dense matrices, we set each matrix element to value equal to `row | 4096 | 0.08381450 | 0.0716542 | 0.00937997 | 0.0116771 | | 8192 | 0.51596800 | 0.3535530 | 0.03971900 | 0.0467374 | -The results on GPU looks as follows: +Here: + +* **setElement on host** tests run in one thread. Therefore they are faster for small matrices compared to "`setElement` with `ParallelFor`" tests. +* **setElement with ParallelFor** tests run in parallel in several OpenMP threads. This approach is faster for larger matrices. +* **getRow** tests run in parallel in several OpenMP threads mapping of which is more efficient compared to "`setElement` on host" tests. 
And the same on GPU is in the following table: -| Matrix rows and columns | `setElement` on host | `setElement` on host and copy | `setElement` with `ParallelFor` | `getRow` | `forRows` | -|----------------------------:|---------------------:|------------------------------:|--------------------------------:|-------------:|------------:| -| 16 | 0.027835 | 0.02675 | 0.000101198 | 0.00009903 | 0.000101214 | -| 32 | 0.002776 | 0.00018 | 0.000099197 | 0.00009901 | 0.000100481 | -| 64 | 0.010791 | 0.00015 | 0.000094446 | 0.00009493 | 0.000101796 | -| 128 | 0.043014 | 0.00021 | 0.000099397 | 0.00010024 | 0.000102729 | -| 256 | 0.171029 | 0.00056 | 0.000100469 | 0.00010448 | 0.000105893 | -| 512 | 0.683627 | 0.00192 | 0.000103346 | 0.00011034 | 0.000112752 | -| 1024 | 2.736680 | 0.00687 | 0.000158805 | 0.00016932 | 0.000170302 | -| 2048 | 10.930300 | 0.02474 | 0.000509000 | 0.00050917 | 0.000511183 | -| 4096 | 43.728700 | 0.13174 | 0.001557030 | 0.00156117 | 0.001557930 | -| 8192 | 174.923000 | 0.70602 | 0.005312470 | 0.00526658 | 0.005263870 | +| Matrix rows and columns | `setElement` on host | `setElement` on host and copy | `setElement` on GPU | `getRow` | `forRows` | +|----------------------------:|---------------------:|------------------------------:|--------------------:|-------------:|------------:| +| 16 | 0.027835 | 0.02675 | 0.000101198 | 0.00009903 | 0.000101214 | +| 32 | 0.002776 | 0.00018 | 0.000099197 | 0.00009901 | 0.000100481 | +| 64 | 0.010791 | 0.00015 | 0.000094446 | 0.00009493 | 0.000101796 | +| 128 | 0.043014 | 0.00021 | 0.000099397 | 0.00010024 | 0.000102729 | +| 256 | 0.171029 | 0.00056 | 0.000100469 | 0.00010448 | 0.000105893 | +| 512 | 0.683627 | 0.00192 | 0.000103346 | 0.00011034 | 0.000112752 | +| 1024 | 2.736680 | 0.00687 | 0.000158805 | 0.00016932 | 0.000170302 | +| 2048 | 10.930300 | 0.02474 | 0.000509000 | 0.00050917 | 0.000511183 | +| 4096 | 43.728700 | 0.13174 | 0.001557030 | 0.00156117 | 0.001557930 | +| 8192 | 174.923000 | 0.70602 | 
0.005312470 | 0.00526658 | 0.005263870 | + +Here: + +* **setElement on host** tests are very slow especially for large matrices since each matrix element is copied on GPU separately. +* **setElement on host and copy** tests are much faster because the matrix is copied from CPU to GPU on the whole which is more efficient. +* **setElement on GPU** tests are even more faster since there is no transfer of data between CPU and GPU. +* **getRow** tests have the same performance as "`setElement` on GPU". +* **forRows** tests have the same performance as both "`setElement` on GPU" and "`getRow`". + +You can see the source code of the previous benchmark in [Appendix](#benchmark-of-dense-matrix-setup). ### Sparse matrix -The sparse matrices are tested on computation of matrix approximating the Laplace operator in 2D. This matrix has at most five non-zero elements in each row. The times for sparse matrix (and CSR format) on CPU in seconds looks as follows: +The sparse matrices are tested on computation of matrix the [discrete Laplace operator in 2D](https://en.wikipedia.org/wiki/Discrete_Laplace_operator). This matrix has at most five nonzero elements in each row. The times for sparse matrix (with CSR format) on CPU in seconds looks as follows: | Matrix rows and columns | STL Map | `setElement` on host | `setElement` with `ParallelFor` | `getRow` | `forRows` | |----------------------------:|-------------:|---------------------:|--------------------------------:|------------:|-------------:| @@ -246,26 +268,45 @@ The sparse matrices are tested on computation of matrix approximating the Laplac | 16,777,216 | 38.95900 | 0.413823 | 0.125870 | 0.124588 | 0.123858 | | 67,108,864 | 185.75700 | 1.652580 | 0.505232 | 0.501003 | 0.500927 | -We see, that use of STL map makes sense only in situation when it is hard to estimate necessary row capacities. Otherwise very simple with `setElement` method is much faster. 
If the performance is the highest priority, `getRow` method should be preferred. And the same on GPU is in the following table: +Here: + +* **STL Map** tests show that use of STL Map can be very slow on large matrices and, of course, they need to allocate the map containing all the matrix elements. This can be memory consuming. On the other hand, it is the only way which does not require knowing the matrix row capacities in advance. +* **setElement on host** tests are much faster compared to STL map, it does not need to allocate anything else except the sparse matrix. However, matrix row capacities must be known in advance. +* **setElement with ParallelFor** tests run in parallel in several OpenMP threads and so this can be faster for larger matrices. +* **getRow** tests perform the same as "setElement with ParallelFor". +* **forRows** tests perform the same as both "setElement with ParallelFor" and "forRows". + +We see, that the use of STL map makes sense only in situation when it is hard to estimate necessary row capacities. Otherwise very easy setup with `setElement` method is much faster. If the performance is the highest priority, `getRow` method should be preferred. 
The results for GPU are in the following table: + +| Matrix rows and columns | STL Map | `setElement` on host | `setElement` on host and copy |`setElement` on GPU | `getRow` | `forRows` | +|----------------------------:|-------------:|---------------------:|------------------------------:|-------------------:|------------:|------------:| +| 256 | 0.002 | 0.036 | 0.0280 | 0.00017 | 0.00017 | 0.00017 | +| 1,024 | 0.001 | 0.161 | 0.0006 | 0.00017 | 0.00017 | 0.00017 | +| 4,096 | 0.003 | 0.680 | 0.0010 | 0.00020 | 0.00020 | 0.00020 | +| 16,384 | 0.015 | 2.800 | 0.0034 | 0.00021 | 0.00020 | 0.00021 | +| 65,536 | 0.074 | 11.356 | 0.0130 | 0.00048 | 0.00047 | 0.00048 | +| 262,144 | 0.350 | 45.745 | 0.0518 | 0.00088 | 0.00087 | 0.00088 | +| 1,048,576 | 1.630 | 183.632 | 0.2057 | 0.00247 | 0.00244 | 0.00245 | +| 4,194,304 | 8.036 | 735.848 | 0.8119 | 0.00794 | 0.00783 | 0.00788 | +| 16,777,216 | 41.057 | 2946.610 | 3.2198 | 0.02481 | 0.02429 | 0.02211 | +| 67,108,864 | 197.581 | 11791.601 | 12.7775 | 0.07196 | 0.06329 | 0.06308 | + +Here: -| Matrix rows and columns | STL Map | `setElement` on host | `setElement` on host and copy |`setElement` on native device | `getRow` | `forRows` | -|----------------------------:|-------------:|---------------------:|------------------------------:|-----------------------------:|------------:|------------:| -| 256 | 0.002 | 0.036 | 0.0280 | 0.00017 | 0.00017 | 0.00017 | -| 1,024 | 0.001 | 0.161 | 0.0006 | 0.00017 | 0.00017 | 0.00017 | -| 4,096 | 0.003 | 0.680 | 0.0010 | 0.00020 | 0.00020 | 0.00020 | -| 16,384 | 0.015 | 2.800 | 0.0034 | 0.00021 | 0.00020 | 0.00021 | -| 65,536 | 0.074 | 11.356 | 0.0130 | 0.00048 | 0.00047 | 0.00048 | -| 262,144 | 0.350 | 45.745 | 0.0518 | 0.00088 | 0.00087 | 0.00088 | -| 1,048,576 | 1.630 | 183.632 | 0.2057 | 0.00247 | 0.00244 | 0.00245 | -| 4,194,304 | 8.036 | 735.848 | 0.8119 | 0.00794 | 0.00783 | 0.00788 | -| 16,777,216 | 41.057 | 2946.610 | 3.2198 | 0.02481 | 0.02429 | 0.02211 | -| 67,108,864 | 187.581 
| 11791.601 | 12.7775 | 0.07196 | 0.06329 | 0.06308 | +* **STL Map** tests show that the times are comparable to CPU times which means the most of the time is spent by creating the matrix on CPU. +* **setElement on host** tests are again extremely slow for large matrices. It is even slower than the use of STL map. So in case of GPU, this is another reason for using the STL map. +* **setElement on host and copy** tests are, similar to the dense matrix, much faster compared to the previous approaches. So it is the best way when you need to use data structures available only on the host system (CPU). +* **setElement on GPU** tests exhibit the best performance together with `getRow` and `forRows` methods. Note, however, that this method can be slower that `getRow` and `forRows` if there would be more nonzero matrix elements in a row. +* **getRow** tests exhibit the best performance together with `setElement` on GPU and `forRows` methods. +* **forRows** tests exhibit the best performance together with `getRow` and `setElement` on GPU methods. -Here we see, the `setElement` methods performs extremely bad because all matrix elements are transferred to GPU one-by-one. Even STL map is much faster. Note, that the times for STL map are not much higher compared to CPU which indicates that the transfer of the matrix on GPU is not dominant. Another simple method could by to setup the matrix on CPU by the means of `setElement` method and transfer it on GPU. +Here we see, that the `setElement` methods performs extremely bad because all matrix elements are transferred to GPU one-by-one. Even STL map is much faster. Note, that the times for STL map are not much higher compared to CPU which indicates that the transfer of the matrix on GPU is not dominant. Setup of the matrix on CPU by the means of `setElement` method and transfer on GPU is even faster. 
However, the best performance can be obtained only when creating the matrix directly on GPU by the methods `setElement`, `getRow` and `forRows`.
0.001941 | 0.001942 | -| 16,777,216 | 1080.741 | 1.18849 | 0.003212 | 0.004185 | 0.004207 | -| 67,108,864 | 4326.120 | 4.74481 | 0.013672 | 0.022494 | 0.030369 | +Here: + +* **setElement on host** tests show that this method is fairly efficient. +* **setElement with ParallelFor** tests run in parallel in several OpenMP threads compared to "setElement on host" tests. For larger matrices, this way of matrix setup performs better. +* **getRow** tests perform more or less the same as "setElement with ParallelFor" and `forRows`. +* **forRows** tests perform more or less the same as "setElement with ParallelFor" and `getRow`. + +Note, that setup of multidiagonal matrix is faster compared to the same matrix stored in general sparse format. Results for GPU are in the following table: + +| Matrix rows and columns | `setElement` on host | `setElement` on host and copy | `setElement` on GPU | `getRow` | `forRows` | +|----------------------------:|---------------------:|------------------------------:|--------------------:|------------:|------------:| +| 256 | 0.035 | 0.02468 | 0.000048 | 0.000045 | 0.000047 | +| 1,024 | 0.059 | 0.00015 | 0.000047 | 0.000045 | 0.000047 | +| 4,096 | 0.251 | 0.00044 | 0.000048 | 0.000045 | 0.000047 | +| 16,384 | 1.030 | 0.00158 | 0.000049 | 0.000046 | 0.000048 | +| 65,536 | 4.169 | 0.00619 | 0.000053 | 0.000048 | 0.000052 | +| 262,144 | 16.807 | 0.02187 | 0.000216 | 0.000214 | 0.000217 | +| 1,048,576 | 67.385 | 0.08043 | 0.000630 | 0.000629 | 0.000634 | +| 4,194,304 | 270.025 | 0.31272 | 0.001939 | 0.001941 | 0.001942 | +| 16,777,216 | 1080.741 | 1.18849 | 0.003212 | 0.004185 | 0.004207 | +| 67,108,864 | 4326.120 | 4.74481 | 0.013672 | 0.022494 | 0.030369 | + +* **setElement on host** tests are extremely slow again, especially for large matrices. +* **setElement on host and copy** tests are much faster compared to the previous. +* **setElement with ParallelFor** tests offer the best performance. 
They are even faster than the `getRow` and `forRows` methods. This, however, does not have to be true for matrices having more nonzero elements in a row.
Be default it is the row-major order if the matrix is allocated on the host system and column major order if it is allocated on GPU. +* `ElementsOrganization` defines the organization of the matrix elements in memory. It can be \ref TNL::Algorithms::Segments::ColumnMajorOrder or \ref TNL::Algorithms::Segments::RowMajorOrder for column-major and row-major organization respectively. Be default, it is the row-major order if the matrix is allocated on the host system and column major order if it is allocated on GPU. * `RealAllocator` is a memory allocator (one from \ref TNL::Allocators) which shall be used for allocation of the matrix elements. By default, it is the default allocator for given `Real` type and `Device` type -- see \ref TNL::Allocators::Default. The following examples show how to allocate the dense matrix and how to initialize the matrix elements. @@ -327,25 +387,34 @@ As we can see, both methods can be called from the host no matter where the matr \include DenseMatrixExample_addElement.out -More efficient way of the matrix initialization on GPU consists of calling the methods `setElement` and `addElement` (\ref TNL::Matrices::DenseMatrix::setElement, \ref TNL::Matrices::DenseMatrix::addElement) directly from GPU. It is demonstrated in the following example (of course it works even for CPU): +More efficient way of the matrix initialization on GPU consists of calling the methods `setElement` and `addElement` (\ref TNL::Matrices::DenseMatrix::setElement, \ref TNL::Matrices::DenseMatrix::addElement) directly from GPU, for example by means of lambda function and `ParallelFor2D` (\ref TNL::Algorithms::ParallelFor2D). It is demonstrated in the following example (of course it works even on CPU): -\includelineno DenseMatrixExample_setElement.cpp +\includelineno DenseMatrixViewExample_setElement.cpp -Here we use `SharedPointer` (\ref TNL::Pointers::SharedPointer) to make the matrix accessible in lambda functions even on GPU. 
We first call the `setElement` method from CPU to set the `i`-th diagonal element to `i`. Next we iterate over the matrix rows with `ParallelFor`and for each row we call a lambda function `f`. This is done on the same device where the matrix is allocated and so it is more efficient for matrices allocated on GPU. In the lambda function we just set the `i`-th diagonal element to `-i`. The result looks as follows: +Here we get the matrix view (\ref TNL::Matrices::DenseMatrixView) (line 10) to make the matrix accessible in lambda function even on GPU (see [Shared pointers and views](tutorial_GeneralConcepts.html#shared-pointers-and-views) ). We first call the `setElement` method from CPU to set the `i`-th diagonal element to `i` (lines 11-12). Next we iterate over the matrix rows with `ParallelFor2D` (\ref TNL::Algorithms::ParallelFor2D) (line 20) and for each row we call the lambda function `f`. This is done on the same device where the matrix is allocated and so it we get optimal performance even for matrices on GPU. In the lambda function we add one to each matrix element (line 18). The result looks as follows: \include DenseMatrixExample_setElement.out #### Method `getRow` -This method is available for the dense matrix (\ref TNL::Matrices::DenseMatrix::getRow) but only for compatibility with the sparse matrices which the method was designed for. Use it only when you need unified code for both dense and sparse matrices. +This method is available for the dense matrix (\ref TNL::Matrices::DenseMatrix::getRow) mainly for two reasons: + +1. The method `getRow` is recommended for sparse matrices. In most cases, it is not optimal for dense matrices. However, if one needs to have one code for both dense and sparse matrices, this method is a good choice. +2. 
In general, use of `setElement` (\ref TNL::Matrices::DenseMatrix::setElement) combined with `ParallelFor2D` (\ref TNL::Algorithms::ParallelFor2D) is preferred, for dense matrices, since it offers more parallelism for GPUs. `ParallelFor2D` creates one CUDA thread per each matrix element which is desirable for GPUs. With the use of the method `getRow` we have only one CUDA thread per each matrix row. This makes sense only in situation when we need to setup each matrix row sequentially. + +Here we show an example: + +\includelineno DenseMatrixViewExample_getRow.cpp + +Here we create the matrix on the line 10 and get the matrix view on the line 16. Next we use `ParallelFor` (\ref TNL::Algorithms::ParallelFor) (line 26) to iterate over the matrix rows and the lambda function `f` (lines 18-21) for each of them. In the lambda function, we first fetch the matrix row by means of the merhod `getRow` (\ref TNL::Matrices::DenseMatrixView::getRow) and next we set the matrix elements by using the method `setElement` of the matrix row (\ref TNL::Matrices::DenseMatrixRowView::setElement). For the compatibility with the sparse matrices, use the variant of `setElement` with the parameter `localIdx`. It has no effect here, it is only for compatibility of the interface. #### Method `forRows` -If we want to set more matrix elements in each row, we can use inner for-loop in the lambda function `f`. This, however, is limiting the parallelization and it can be inefficient for larger matrices. The next example demonstrates a method `forRows` (\ref TNL::Matrices::DenseMatrix::forRows) which iterates over all matrix elements in parallel and it calls a lambda function defining an operation we want to do on the matrix elements. + The next example demonstrates the method `forRows` (\ref TNL::Matrices::DenseMatrix::forRows) which works in very similar way as the method `getRow` but it is slightly easier to use. It is also compatible with sparse matrices. 
See the following example: \includelineno DenseMatrixExample_forRows.cpp -Firstly note, that this is simpler since we do not need any `SharedPointer`. The lambda function `f` requires the following parameters: +We do not need any matrix view and instead of calling `ParallelFor` (\ref TNL::Algorithms::ParallelFor) we call just the method `forRows` (line 18). The lambda function `f` (line 11) must accept the following parameters: * `rowIdx` is the row index of given matrix element. * `columnIdx` is the column index of given matrix element. @@ -1565,8 +1634,19 @@ As we mentioned already, the multidiagonal matrix view offers almost all methods TODO: Move to explanation of the matrix view to introduction. +## Appendix + +### Benchmark of dense matrix setup + +\includelineno DenseMatrixSetup_Benchmark.cpp + +### Benchmark of sparse matrix setup + +\includelineno SparseMatrixSetup_Benchmark.cpp +### Benchmark of multidiagonal matrix setup +\includelineno MultidiagonalMatrixSetup_Benchmark.cpp -- GitLab From 769d471e85d0f161473e39e25993c90364e3af76 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= Date: Sat, 16 Jan 2021 20:27:21 +0100 Subject: [PATCH 43/53] Writing documentation on matrices. --- .../Tutorials/Matrices/tutorial_Matrices.md | 205 +++++++++--------- 1 file changed, 102 insertions(+), 103 deletions(-) diff --git a/Documentation/Tutorials/Matrices/tutorial_Matrices.md b/Documentation/Tutorials/Matrices/tutorial_Matrices.md index 0259039ff..778d12e44 100644 --- a/Documentation/Tutorials/Matrices/tutorial_Matrices.md +++ b/Documentation/Tutorials/Matrices/tutorial_Matrices.md @@ -108,7 +108,7 @@ In this table: The multidiagonal matrix type is especially suitable for the finite difference method or similar numerical methods for solution of the partial differential equations. 
-## Indexing of nonzero matrix elements in sparse matrices +## Indexing of nonzero matrix elements in sparse matrices The sparse matrix formats usually, in the first step, compress the matrix rows by omitting the zero matrix elements as follows @@ -427,138 +427,135 @@ The result looks as follows: ### Sparse matrices -[Sparse matrices](https://en.wikipedia.org/wiki/Sparse_matrix) are extremely important in a lot of numerical algorithms. They are used at situations when we need to operate with matrices having majority of the matrix elements equal to zero. In this case, only the non-zero matrix elements are stored with possible some *padding zeros* used for memory alignment. This is necessary mainly on GPUs. Consider just matrix having 50,000 rows and columns whih is 2,500,000,000 matrix elements. If we store each matrix element in double precision (it means eight bytes per element) we need 20,000,000,000 bytes which is nearly 20 GB of memory. If there are only five non-zero elements in each row we need only \f$8 \times 5 \times 50,000=2,000,000\f$ bytes and so nearly 200 MB. It is really great difference. +[Sparse matrices](https://en.wikipedia.org/wiki/Sparse_matrix) are extremely important in a lot of numerical algorithms. They are used at situations when we need to operate with matrices having majority of the matrix elements equal to zero. In this case, only the non-zero matrix elements are stored with possibly some *padding zeros* used for memory alignment. This is necessary mainly on GPUs. See the [Overview of matrix types](#overview_of_matrix_types) for the differences in memory requirements. -Major disadventage of sparse matrices is that there are a lot of different formats for storing such matrices. Though [CSR - Compressed Sparse Row](https://en.wikipedia.org/wiki/Sparse_matrix#Compressed_sparse_row_(CSR,_CRS_or_Yale_format)) format is the most popular of all, especially for GPUs there are many other formats which perform differently on various matrices. 
So it is a good idea to test several sparse matrix formats if you want to get the best performance. In TNL, there is one templated class \ref TNL::Matrices::SparseMatrix representing the sparse matrices. The change of underlying matrix format can be done just by changing one template parameter. The list of the template paramaters is as follows: +Major disadvantage of sparse matrices is that there are a lot of different formats for their storage in memory. Though [CSR (Compressed Sparse Row)](https://en.wikipedia.org/wiki/Sparse_matrix#Compressed_sparse_row_(CSR,_CRS_or_Yale_format)) format is the most popular of all, especially for GPUs, there are many other formats. Often their performance differ significantly for various matrices. So it is a good idea to test several sparse matrix formats if you want to get the best performance. In TNL, there is one templated class \ref TNL::Matrices::SparseMatrix representing general sparse matrices. The change of underlying matrix format can be done just by changing one template parameter. The list of the template paramaters is as follows: * `Real` is type if the matrix elements. It is `double` by default. -* `Device` is a device where the matrix is allocated. Currently it can be either \ref TNL::Devices::Host for CPU or \ref TNL::Devices::Cuda for GPU supporting CUDA. It is \ref TNL::Devices::Host by default. +* `Device` is a device where the matrix is allocated. Currently it can be either \ref TNL::Devices::Host for CPU or \ref TNL::Devices::Cuda for CUDA supporting GPUs. It is \ref TNL::Devices::Host by default. * `Index` is a type to be used for indexing of the matrix elements. It is `int` by default. * `MatrixType` tells if the matrix is symmetric (\ref TNL::Matrices::SymmetricMatrix) or general (\ref TNL::Matrices::GeneralMatrix). It is a \ref TNL::Matrices::GeneralMatrix by default. -* `Segments` define the format of the sparse matrix. 
It can be (by default, it is \ref TNL::Algorithms::Segments::CSR): +* `Segments` define the format of the sparse matrix. It can be one of the following (by default, it is \ref TNL::Algorithms::Segments::CSR): * \ref TNL::Algorithms::Segments::CSR for [CSR format](https://en.wikipedia.org/wiki/Sparse_matrix#Compressed_sparse_row_(CSR,_CRS_or_Yale_format)). * \ref TNL::Algorithms::Segments::Ellpack for [Ellpack format](http://mgarland.org/files/papers/nvr-2008-004.pdf). * \ref TNL::Algorithms::Segments::SlicedEllpack for [SlicedEllpack format](https://link.springer.com/chapter/10.1007/978-3-642-11515-8_10) which was also presented as [Row-grouped CSR format](https://arxiv.org/abs/1012.2270). * \ref TNL::Algorithms::Segments::ChunkedEllpack for [ChunkedEllpack format](http://geraldine.fjfi.cvut.cz/~oberhuber/data/vyzkum/publikace/12-heller-oberhuber-improved-rgcsr-format.pdf) which we reffered as Improved Row-grouped CSR and we renamed it to Ellpack format since it uses padding zeros. * \ref TNL::Algorithms::Segments::BiEllpack for [BiEllpack format](https://www.sciencedirect.com/science/article/pii/S0743731514000458?casa_token=2phrEj0Ef1gAAAAA:Lgf6rMBUN6T7TJne6mAgI_CSUJ-jR8jz7Eghdv6L0SJeGm4jfso-x6Wh8zgERk3Si7nFtTAJngg). * `ComputeReal` is type which is used for internal computations. By default it is the same as `Real` if `Real` is not `bool`. If `Real` is `bool`, `ComputeReal` is set to `Index` type. This can be changed, of course, by the user. -* `RealAllocator` is a memory allocator (one from \ref TNL::Allocators) which shall be used for allocation of the matrix elements. By default, it is the default allocator for given `Real` type and `Device` type – see TNL::Allocators::Default. +* `RealAllocator` is a memory allocator (one from \ref TNL::Allocators) which shall be used for allocation of the matrix elements. By default, it is the default allocator for given `Real` type and `Device` type – see \ref TNL::Allocators::Default. 
* `IndexAllocator` is a memory allocator (one from \ref TNL::Allocators) which shall be used for allocation of the column indexes of the matrix elements. By default, it is the default allocator for given `Index` type and `Device` type – see \ref TNL::Allocators::Default. -**If `Real` is set to `bool`, we get *a binary matrix* for which the non-zero elements can be equal only to one and so the matrix elements values are not stored explicitly in the memory.** +**If `Real` is set to `bool`, we get *a binary matrix* for which the non-zero elements can be equal only to one and so the matrix elements values are not stored explicitly in the memory.** This can significantly reduce the memory requirements and also increase performance. In the following text we will show how to create and setup sparse matrices. -#### Initializer list +#### Setting of row capacities -Small matrices can be initialized by a constructor with an [initializer list](https://en.cppreference.com/w/cpp/utility/initializer_list). We assume having the following sparse matrix +Larger sparse matrices are created in two steps: + +1. We use a method \ref TNL::Matrices::SparseMatrix::setRowCapacities to initialize the underlying matrix format and to allocate memory for the matrix elements. This method only needs to know how many non-zero elements are supposed to be in each row. Once this is set, it cannot be changed only by resetting the whole matrix. In most situations, it is not an issue to compute the number of nonzero elements in each row. Otherwise, we can currently only recommend the use of matrix setup with [STL map](#sparse-matrix-stl-map), which is, however, quite slow. +2. Now, the nonzero matrix elements can be set one after another by telling its coordinates and a value. Since majority of sparse matrix formats are designed to allow quick access to particular matrix rows the insertion can be done in parallel by mapping different threads to different matrix rows. 
This approach is usually optimal or nearly optimal when it comes to efficiency. + +See the following example which creates lower triangular matrix like this one \f[ \left( \begin{array}{ccccc} 1 & 0 & 0 & 0 & 0 \\ --1 & 2 & -1 & 0 & 0 \\ - 0 & -1 & 2 & -1 & 0 \\ - 0 & 0 & -1 & 2 & -1 \\ - 0 & 0 & 0 & -1 & 0 + 2 & 1 & 0 & 0 & 0 \\ + 3 & 2 & 1 & 0 & 0 \\ + 4 & 3 & 2 & 1 & 0 \\ + 5 & 4 & 3 & 2 & 1 \end{array} \right). \f] -The following example shows how to create it using the initializer list constructor: - -\includelineno SparseMatrixExample_Constructor_init_list_2.cpp - -The constructor accepts the following parameters: - -* `rows` is a number of matrix rows. -* `columns` is a number of matrix columns. -* `data` is definition of non-zero matrix elements. It is a initializer list of triples having a form `{ row_index, column_index, value }`. In fact, it is very much like the coordinates format - [COO](https://en.wikipedia.org/wiki/Sparse_matrix#Coordinate_list_(COO)). - -The constructor also accepts `Real` and `Index` allocators (\ref TNL::Allocators) but their are not important for this example. A method `setElements` works the same way: - -\includelineno SparseMatrixExample_setElements.cpp +\includelineno SparseMatrixExample_setRowCapacities.cpp -The result of both examples looks as follows: +The method \ref TNL::Matrices::SparseMatrix::setRowCapacities reads the required capacities of the matrix rows from a vector (or simmilar container - \ref TNL::Containers::Array, \ref TNL::Containers::ArrayView, \ref TNL::Containers::Vector and \ref TNL::Containers::VectorView) which has the same number of elements as the number of matrix rows and each element defines the capacity of the related row. The result looks as follows: -\include SparseMatrixExample_Constructor_init_list_2.out +\include SparseMatrixExample_setRowCapacities.out -#### STL map +There are constructors which also set the row capacities. 
The first one uses a vector: -Finaly, there is a constructor which creates the sparse matrix from [`std::map`](https://en.cppreference.com/w/cpp/container/map). It is usefull especially in situation when you cannot compute the matrix elements by rows but rather in random order. You can do it on CPU and store the matrix elements in [`std::map`](https://en.cppreference.com/w/cpp/container/map) data structure in a [COO](https://en.wikipedia.org/wiki/Sparse_matrix#Coordinate_list_(COO)) format manner. It means that each entry of the `map` is the following pair: +\includelineno SparseMatrixExample_Constructor_rowCapacities_vector.cpp -``` -std::pair( std::pair( row_index, column_index ), element_value ) -``` +The second one uses an initializer list: -which defines one matrix element at given coordinates with given value. Of course, you can insert such entries in any order into the `map`. When it is complete you can pass it the sparse matrix. See the following example: +\includelineno SparseMatrixExample_Constructor_init_list_1.cpp -\includelineno SparseMatrixExample_Constructor_std_map.cpp +The result of both examples looks as follows: -#### Setting of row capacities +\include SparseMatrixExample_Constructor_init_list_1.out -Larger matrices are created in two steps: -1. We use a method \ref TNL::Matrices::SparseMatrix::setRowCapacities to initialize the underlying matrix format and to allocate memory for the matrix elements. This method only needs to know how many non-zero elements are supposed to be in each row. Once this is set, it cannot be changed only by reseting the whole matrix. In most situations, this is not an issue to compute the number of non-zero elements in each row. Note, however, that we do not tell the positions of the non-zeto elements. If some matrix format needs this information it cannot be used with this implementation of the sparse matrix. -2. The non-zero matrix elements can be set-up. 
We insert one non-zero element after another by telling its coordinates and a value. Since probably all sparse matrix formats are designed to allow quick acces to particular matrix rows, this insertion is usualy quite efficient and can by done in parallel by mapping different threads to different matrix rows. +#### Initializer list -See the following example which creates lower triangular matrix like this one +Small matrices can be initialized by a constructor with an [initializer list](https://en.cppreference.com/w/cpp/utility/initializer_list). We assume having the following sparse matrix \f[ \left( \begin{array}{ccccc} 1 & 0 & 0 & 0 & 0 \\ - 2 & 1 & 0 & 0 & 0 \\ - 3 & 2 & 1 & 0 & 0 \\ - 4 & 3 & 2 & 1 & 0 \\ - 5 & 4 & 3 & 2 & 1 +-1 & 2 & -1 & 0 & 0 \\ + 0 & -1 & 2 & -1 & 0 \\ + 0 & 0 & -1 & 2 & -1 \\ + 0 & 0 & 0 & -1 & 0 \end{array} \right). \f] -\includelineno SparseMatrixExample_setRowCapacities.cpp +It can be created with the initializer list constructor like we shows in the following example: -The method \ref TNL::Matrices::SparseMatrix::setRowCapacities reads the required capacities of the matrix rows from a vector (or simmilar container - \ref TNL::Containers::Array, \ref TNL::Containers::ArrayView, \ref TNL::Containers::Vector and \ref TNL::Containers::VectorView) which has the same number of elements as the number of matrix rows and each element defines the capacity of the related row. The result looks as follows: +\includelineno SparseMatrixExample_Constructor_init_list_2.cpp -\include SparseMatrixExample_setRowCapacities.out +The constructor accepts the following parameters (lines 9-17): -There are constructors which also set the row capacities. The first one uses a vector: +* `rows` is a number of matrix rows. +* `columns` is a number of matrix columns. +* `data` is definition of nonzero matrix elements. It is a initializer list of triples having a form `{ row_index, column_index, value }`. 
In fact, it is very much like the Coordinate format - [COO](https://en.wikipedia.org/wiki/Sparse_matrix#Coordinate_list_(COO)). -\includelineno SparseMatrixExample_Constructor_rowCapacities_vector.cpp +The constructor also accepts `Real` and `Index` allocators (\ref TNL::Allocators) but the default ones are used in this example. A method `setElements` (\ref TNL::Matrices::SparseMatrix::setElements) works the same way: -The second one uses an initializer list: +\includelineno SparseMatrixExample_setElements.cpp -\includelineno SparseMatrixExample_Constructor_init_list_1.cpp +In this example, we create the matrix in two steps. Firstly we use constructor with only matrix dimensions as parameters (line 9) and next we set the matrix elements by `setElements` method (lines 10-15). The result of both examples looks as follows: -The result of both examples looks as follows: +\include SparseMatrixExample_Constructor_init_list_2.out -\include SparseMatrixExample_Constructor_init_list_1.out +#### STL map -#### Methods `setElement` and `addElement` +The constructor which creates the sparse matrix from [`std::map`](https://en.cppreference.com/w/cpp/container/map) is useful especially in situations when you cannot estimate the [matrix row capacities](#setting-of-matrix-row-capacities) in advance. You can first store the matrix elements in [`std::map`](https://en.cppreference.com/w/cpp/container/map) data structure in a [COO](https://en.wikipedia.org/wiki/Sparse_matrix#Coordinate_list_(COO)) format manner. It means that each entry of the `map` is the following pair: -A method `setElements` works the same way for already existing instances of sparse matrix: +``` +std::pair( std::pair( row_index, column_index ), element_value ) +``` + +which defines one matrix element at given coordinates `(row_index,column_index)` with given value (`element_value`). Of course, you can insert such entries into the `map` in arbitrary order. When it is complete, you pass the map to the sparse matrix. 
See the following example: + +\includelineno SparseMatrixExample_Constructor_std_map.cpp + +The method `setElements` (\ref TNL::Matrices::SparseMatrix::setElements) works the same way for already existing instances of sparse matrix: \includelineno SparseMatrixExample_setElements_map.cpp -The result of both examples looks as folows: +The result of both examples looks as follows: \include SparseMatrixExample_setElements_map.out -Another way of setting the sparse matrix is via the methods `setElement` and `addElement` (\ref TNL::Matrices::SparseMatrix::setElement, \ref TNL::Matrices::addElement). The procedure is as follows: +Note, however, that the map can be constructed only on CPU and not on GPU. It requires allocation of additional memory on the host system (CPU) and if the target sparse matrix resided on GPU, the matrix elements must be copied on GPU. This is the reason, why this way of the sparse matrix setup is inefficient compared to other methods. -1. Setup the matrix dimensions. -2. Setup the row capacities. -3. Setup the matrix elements. +#### Methods `setElement` and `addElement` -The method can be called from both host (CPU) and device (GPU) if the matrix is allocated there. Note, however, that if the matrix is allocated on GPU and the method is called from CPU there will be significant performance drop because the matrix elements will be transfered one after another. However, if the matrix elements setup is not a critical part of your algorithm this can be an easy way how to do it. See the following example: +Another way of setting the sparse matrix is by means of the methods `setElement` and `addElement` (\ref TNL::Matrices::SparseMatrix::setElement, \ref TNL::Matrices::SparseMatrix::addElement). The method can be called from both host (CPU) and device (GPU) if the matrix is allocated there. 
Note, however, that if the matrix is allocated on GPU and the methods are called from CPU there will be significant performance drop because the matrix elements will be transferred one-by-one separately. However, if the matrix elements setup is not a critical part of your algorithm this can be an easy way how to do it. See the following example: -\includelineno SparseMatrixExample_setElement.cpp +\includelineno SparseMatrixViewExample_setElement.cpp -Note that we use `SharedPointer` (\ref TNL::Pointers::SharedPointer) to pass the matrix easily into the lambda function when it runs on GPU. The first for-loop runs on CPU no matter where the matrix is allocated. Next we call the lambda function `f` from `ParallelFor` which is device sensitive and so it runs on CPU or GPU depending where the matrix is allocated. To avoid use of `SharedPointer`, which requires explicit synchronization of smart pointers, you may use `SparseMatrixView' (\ref TNL::Matrices::SparseMatrixView) to achieve the same. The result looks as follows: +We first allocate a matrix with five rows (it is given by the size of the [initializer list](https://en.cppreference.com/w/cpp/utility/initializer_list)) and columns and we set the capacity of each row to one (line 12). The first for-loop (lines 17-19) runs on CPU no matter where the matrix is allocated. After printing the matrix (lines 21-22), we call the lambda function `f` (lines 24-26) with the help of `ParallelFor` (\ref TNL::Algorithms::ParallelFor, line 28) which is device sensitive and so it runs on CPU or GPU depending on where the matrix is allocated. The result looks as follows: \include SparseMatrixExample_setElement.out -The method `addElement` adds a value to specific matrix element. Otherwise, it behaves the same as `setElement`. See the following example: +The method `addElement` (\ref TNL::Matrices::SparseMatrix::addElement) adds a value to a specific matrix element. Otherwise, it behaves the same as `setElement`. 
See the following example: \includelineno SparseMatrixExample_addElement.cpp @@ -568,11 +565,11 @@ The result looks as follows: #### Method `getRow` -More efficient method is to combine `getRow` (\ref TNL::Matrices::SparseMatrix::getRow) method with `ParallelFor` (\ref TNL::Algorithms::ParallelFor) and lambda function as the following example demonstrates: +More efficient method, especially for GPUs, is to combine `getRow` (\ref TNL::Matrices::SparseMatrix::getRow) method with `ParallelFor` (\ref TNL::Algorithms::ParallelFor) and lambda function as the following example demonstrates: \includelineno SparseMatrixViewExample_getRow.cpp -On the line 11, we create small matrix having five rows (number of rows is given by the size of the [initializer list](https://en.cppreference.com/w/cpp/utility/initializer_list) ) and columns (number of columns is given by the second parameter) and we set each row capacity to one (particular elements of the initalizer list). On the line 22, we call `ParallelFor` to iterate over all matrix elements. Each row is processed by the lambda function `f` (lines14-17). In the lambda function, we first fetch a sparse matrix row (\ref TNL::Matrices::SparseMatrixRowView) which is a proxy to matrix row. This object has a method `setElement` accepting three parameters: +On the line 11, we create small matrix having five rows (number of rows is given by the size of the [initializer list](https://en.cppreference.com/w/cpp/utility/initializer_list) ) and columns (number of columns is given by the second parameter) and we set each row capacity to one (particular elements of the initializer list). On the line 22, we call `ParallelFor` (\ref TNL::Algorithms::ParallelFor) to iterate over all matrix elements. Each row is processed by the lambda function `f` (lines 14-17). In the lambda function, we first fetch a sparse matrix row (\ref TNL::Matrices::SparseMatrixRowView) which serves for accessing particular matrix rows. 
This object has a method `setElement` (\ref TNL::Matrices::SparseMatrixRowView::setElement) accepting three parameters: 1. `localIdx` is a rank of the nonzero element in given matrix row. 2. `columnIdx` is the new column index of the matrix element. @@ -584,19 +581,19 @@ The result looks as follows: #### Method `forRows` -Finaly, for the most efficient way of setting the non-zero matrix elements, is use of a method `forRows`. It requires indexes of the range of rows (`begin` and `end`) to be processed and a lambda function `function` which is called for each non-zero element. The lambda functions provides the following data: +Finally, another efficient way of setting the nonzero matrix elements, is use of the method `forRows` (\ref TNL::Matrices::SparseMatrix::forRows). It requires indexes of the range of rows (`begin` and `end`) to be processed and a lambda function `function` which is called for each nonzero element. The lambda function provides the following data: * `rowIdx` is a row index of the matrix element. -* `localIdx` is an index of the non-zero matrix element within the matrix row. -* `columnIdx` is a column index of the matrix element. If the matrix element is suppsoed to be changed, this parameter can be a reference and so its value can be changed. -* `value` is a value of the matrix element. It the matrix element is supposed to be changed, this parameter can be a reference as well and so the element value can be changed. +* `localIdx` is an index of the nonzero matrix element within the matrix row. +* `columnIdx` is a column index of the matrix element. If the matrix element column index is supposed to be modified, this parameter can be a reference and so its value can be changed. +* `value` is a value of the matrix element. If the matrix element value is supposed to be modified, this parameter can be a reference as well and so the element value can be changed. * `compute` is a bool reference. 
When it is set to `false` the rest of the row can be omitted. This is, however, only a hint and it depends on the underlying matrix format if it is taken into account. See the following example: \includelineno SparseMatrixExample_forRows.cpp -On the line 9, we allocate a lower triangular matrix (because the row capacities `{1,2,3,4,5}` are equal to row index) using the `SparseMatrix`. On the line 11, we prepare lambda function `f` which we execute on the line 22 just by calling the method `forRows` (\ref TNL::Matrices::SpartseMatrix::forRows). This method takes the range of matrix rows as the first two parameters and the lambda function as the last parameter. The lambda function receives parameters metioned above (see the line 11). We first check if the matrix element coordinates (`rowIdx` and `localIdx`) points to an element lying before the matrix diagonal or on the diagonal. In case of the lower triangular matrix in our example, the local index is in fact the same as the column index +On the line 9, we allocate a lower triangular matrix by setting the row capacities as `{1,2,3,4,5}`. On the line 11, we prepare a lambda function `f` which we execute on the line 22 just by calling the method `forRows` (\ref TNL::Matrices::SparseMatrix::forRows). This method takes the range of matrix rows as the first two parameters and the lambda function as the last parameter. The lambda function receives the parameters mentioned above (see the line 11). We first check if the matrix element coordinates (`rowIdx` and `localIdx`) point to an element lying before the matrix diagonal or on the diagonal (line 12). 
In case of the lower triangular matrix in our example, the local index is in fact the same as the column index \f[ \left( @@ -610,7 +607,7 @@ On the line 9, we allocate a lower triangular matrix (because the row capacities \right) \f] -If we call the method `forRows` to setup the matrix elements for the first time, the parameter `columnIdx` has no sense because the matrix elements and their column indexes were not set yet. Therefore it is important that the test on the line 12 reads as +If we call the method `forRows` (\ref TNL::Matrices::SparseMatrix::forRows) to setup the matrix elements for the first time, the parameter `columnIdx` has no sense because the matrix elements and their column indexes were not set yet. Therefore it is important that the test on the line 12 reads as ``` if( rowIdx < localIdx ) @@ -643,8 +640,7 @@ Tridiagonal matrix format serves for specific matrix pattern when the nonzero ma \right) \f] -An advantage is that we do not store the column indexes explicitly as it is in \ref TNL::Matrices::SparseMatrix. This can reduce significantly the memory requirements which also means better performance. See the following table for the storage requirements comparison between \ref TNL::Matrices::TridiagonalMatrix and \ref TNL::Matrices::SparseMatrix. - +An advantage is that we do not store the column indexes explicitly as it is in \ref TNL::Matrices::SparseMatrix. This can reduce significantly the memory requirements which also means better performance. See the following table for the storage requirements comparison between \ref TNL::Matrices::TridiagonalMatrix and \ref TNL::Matrices::SparseMatrix. Real | Index | SparseMatrix | TridiagonalMatrix | Ratio --------|------------|----------------------|---------------------|-------- @@ -661,15 +657,7 @@ Tridiagonal matrix is a templated class defined in the namespace \ref TNL::Matri * `ElementsOrganization` defines the organization of the matrix elements in memory. 
It can be \ref TNL::Algorithms::Segments::ColumnMajorOrder or \ref TNL::Algorithms::Segments::RowMajorOrder for column-major and row-major organization respectively. Be default it is the row-major order if the matrix is allocated in the host system and column major order if it is allocated on GPU. * `RealAllocator` is a memory allocator (one from \ref TNL::Allocators) which shall be used for allocation of the matrix elements. By default, it is the default allocator for given `Real` type and `Device` type -- see \ref TNL::Allocators::Default. -In the following text we shows different methods for setup of tridiagonal matrices. - -#### Initializer list - -The tridiagonal matrix can be initialized by the means of the constructor with [initializer list](https://en.cppreference.com/w/cpp/utility/initializer_list). The matrix from the begining of this section can be constructed as the following example shows: - -\includelineno TridiagonalMatrixExample_Constructor_init_list_1.cpp - -For better alignment in the memory the tridiagonal format is organised like if there were three nonzero matrix elements in each row. This is not true for example in the first row where there is no matrix element on the left side of the diagonal. The same happens on the last row of the matrix. In our example, we have to add even the artificial matrix elements like this: +For better alignment in the memory the tridiagonal format is organized like if there were three nonzero matrix elements in each row. This is not true for example in the first row where there is no matrix element on the left side of the diagonal. The same happens on the last row of the matrix. 
We have to add even the artificial matrix elements like this: \f[ \begin{array}{c} @@ -700,7 +688,7 @@ For better alignment in the memory the tridiagonal format is organised like if t \end{array} \f] -If a matrix has more rows then columns, we have to extend the last two rows with nonzero elements in this way +If the tridiagonal matrix has more rows then columns, we have to extend the last two rows with nonzero elements in this way \f[ \left( @@ -746,6 +734,32 @@ If a matrix has more rows then columns, we have to extend the last two rows with \end{array} \f] +We also would like to remind the meaning of the local index (`localIdx`) of the matrix element within a matrix row. It is a rank of the nonzero matrix element in given row as we explained in section [Indexing of nonzero matrix elements in sparse matrices](#indexing-of-nonzero-matrix-elements-in-sparse-matrices). The values of the local index for tridiagonal matrix elements are as follows + +\f[ +\left( +\begin{array}{cccccc} +1 & 2 & & & & \\ +0 & 1 & 2 & & & \\ + & 0 & 1 & 2 & & \\ + & & 0 & 1 & 2 & \\ + & & & 0 & 1 & 2 \\ + & & & & 0 & 1 +\end{array} +\right) +\f] + + +In the following text we show different methods for setup of tridiagonal matrices. + +#### Initializer list + +The tridiagonal matrix can be initialized by the means of the constructor with [initializer list](https://en.cppreference.com/w/cpp/utility/initializer_list). The matrix from the beginning of this section can be constructed as the following example demonstrates: + +\includelineno TridiagonalMatrixExample_Constructor_init_list_1.cpp + +The matrix elements values are defined on lines 39-44. Each matrix row is represented by embedded an initializer list. We set three values in each row including the padding zeros. 
+ The output of the example looks as: \include TridiagonalMatrixExample_Constructor_init_list_1.out @@ -770,11 +784,11 @@ The result looks as follows: #### Method `getRow` - A slightly simpler way how to do the same with no need for shared pointer (\ref TNL::Pointers::SharedPointer), could be with the use of tridiagonal matrix view and the method `getRow` (\ref TNL::Matrices::TridiagonalMatrixView::getRow) as the following example demonstrates: + A bit different way how to do the same is the use of tridiagonal matrix view and the method `getRow` (\ref TNL::Matrices::TridiagonalMatrixView::getRow) as the following example demonstrates: \includelineno TridiagonalMatrixViewExample_getRow.cpp -We create a matrix with the same size (line 10-15) set ones on the diagonal (lines 15-16). Next, we fetch the tridiagonal matrix view (line 16) which we can refer in the lambda function for matrix elements modification (lines 18-26). Inside the lambda function, we first get a matrix row by calling the method `getRow` (\ref TNL::Matrices::TridiagonalMatrixView::getRow) using which we can acces the matrix elements (lines 21-25). The lambda function is called by the parallel for (\ref TNL::Algorithms::ParallelFor). +We create a matrix with the same size (lines 10-15). Next, we fetch the tridiagonal matrix view (\ref TNL::Matrices::TridiagonalMatrixView, line 16) which we use in the lambda function for matrix elements modification (lines 18-26). Inside the lambda function, we first get a matrix row by calling the method `getRow` (\ref TNL::Matrices::TridiagonalMatrixView::getRow) using which we can access the matrix elements (lines 21-25). We would like to stress that the method `setElement` addresses the matrix elements with the `localIdx` parameter which is a rank of the nonzero element in the matrix row - see [Indexing of nonzero matrix elements in sparse matrices](#indexing-of-nonzero-matrix-elements-in-sparse-matrices). 
The lambda function is called by the `ParallelFor` (\ref TNL::Algorithms::ParallelFor). The result looks as follows: @@ -782,26 +796,11 @@ The result looks as follows: #### Method `forRows` -Finaly, even a bit more simple and bit less flexible way of matrix elements manipulation with use of the method `forRows` (\ref TNL::Matrices::TridiagonalMatrix::forRows) is demonstrated in the following example: +Finally, even a bit more simple way of matrix elements manipulation with the method `forRows` (\ref TNL::Matrices::TridiagonalMatrix::forRows) is demonstrated in the following example: \includelineno TridiagonalMatrixViewExample_forRows.cpp -On the line 41 we call the method `forRows` (\ref TNL::Matrices::TridiagonalMatrix::forRows) instead of parallel for (\ref TNL::Algorithms::ParallelFor). This method iterates over all matrix rows and all nonzero matrix elements. The lambda function function on the line 24 therefore do not receive only the matrix row index but also local index of the matrix element (`localIdx`) which is a rank of the nonzero matrix element in given row. The values of the local index for given matrix elements is as follows - -\f[ -\left( -\begin{array}{cccccc} -1 & 2 & & & & \\ -0 & 1 & 2 & & & \\ - & 0 & 1 & 2 & & \\ - & & 0 & 1 & 2 & \\ - & & & 0 & 1 & 2 \\ - & & & & 0 & 1 -\end{array} -\right) -\f] - -Next parameter `columnIdx` received by the lambda function is the column index of the matrix element. The fourth parameter `value` is a reference on the matrix element which we use for its modification. If the last parameter `compute` is set to false, the iterations over the matrix rows is terminated. +On the line 41, we call the method `forRows` (\ref TNL::Matrices::TridiagonalMatrix::forRows) instead of parallel for (\ref TNL::Algorithms::ParallelFor). This method iterates over all matrix rows and all nonzero matrix elements. 
The lambda function on the line 24 therefore do not receive only the matrix row index but also local index of the matrix element (`localIdx`) which is a rank of the nonzero matrix element in given row - see [Indexing of nonzero matrix elements in sparse matrices](#indexing-of-nonzero-matrix-elements-in-sparse-matrices). Next parameter, `columnIdx` received by the lambda function, is the column index of the matrix element. The fourth parameter `value` is a reference on the matrix element which we use for its modification. If the last parameter `compute` is set to false, the iterations over the matrix rows is terminated. The result looks as follows: -- GitLab From 8f6e0f5423329ef42f657222afaa01dda736cafd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= Date: Sun, 17 Jan 2021 16:50:45 +0100 Subject: [PATCH 44/53] Renaming shifts to offsets in MultidiagonalMatrix code. --- .../MultidiagonalMatrixExample_Constructor.cpp | 4 ++-- src/TNL/Matrices/MultidiagonalMatrix.hpp | 8 ++++---- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/Documentation/Examples/Matrices/MultidiagonalMatrix/MultidiagonalMatrixExample_Constructor.cpp b/Documentation/Examples/Matrices/MultidiagonalMatrix/MultidiagonalMatrixExample_Constructor.cpp index 8f8b8139b..0744804ad 100644 --- a/Documentation/Examples/Matrices/MultidiagonalMatrix/MultidiagonalMatrixExample_Constructor.cpp +++ b/Documentation/Examples/Matrices/MultidiagonalMatrix/MultidiagonalMatrixExample_Constructor.cpp @@ -14,8 +14,8 @@ void laplaceOperatorMatrix() */ const int gridSize( 4 ); const int matrixSize = gridSize * gridSize; - TNL::Containers::Vector< int, Device > shifts { - gridSize, -1, 0, 1, gridSize }; - TNL::Matrices::MultidiagonalMatrix< double, Device > matrix( matrixSize, matrixSize, shifts ); + TNL::Containers::Vector< int, Device > offsets { - gridSize, -1, 0, 1, gridSize }; + TNL::Matrices::MultidiagonalMatrix< double, Device > matrix( matrixSize, matrixSize, offsets ); auto 
matrixView = matrix.getView(); auto f = [=] __cuda_callable__ ( int i, int j ) mutable { const int elementIdx = j * gridSize + i; diff --git a/src/TNL/Matrices/MultidiagonalMatrix.hpp b/src/TNL/Matrices/MultidiagonalMatrix.hpp index b0494ee1f..99cd518bc 100644 --- a/src/TNL/Matrices/MultidiagonalMatrix.hpp +++ b/src/TNL/Matrices/MultidiagonalMatrix.hpp @@ -41,7 +41,7 @@ MultidiagonalMatrix( const IndexType rows, const IndexType columns, const Vector& diagonalsOffsets ) { - TNL_ASSERT_GT( diagonalsOffsets.getSize(), 0, "Cannot construct mutltidiagonal matrix with no diagonals shifts." ); + TNL_ASSERT_GT( diagonalsOffsets.getSize(), 0, "Cannot construct multidiagonal matrix with no diagonals offsets." ); this->setDimensions( rows, columns, diagonalsOffsets ); } @@ -57,9 +57,9 @@ MultidiagonalMatrix( const IndexType rows, const IndexType columns, const std::initializer_list< ListIndex > diagonalsOffsets ) { - Containers::Vector< IndexType, DeviceType, IndexType > shifts( diagonalsOffsets ); - TNL_ASSERT_GT( shifts.getSize(), 0, "Cannot construct multidiagonal matrix with no diagonals shifts." ); - this->setDimensions( rows, columns, shifts ); + Containers::Vector< IndexType, DeviceType, IndexType > offsets( diagonalsOffsets ); + TNL_ASSERT_GT( offsets.getSize(), 0, "Cannot construct multidiagonal matrix with no diagonals offsets." ); + this->setDimensions( rows, columns, offsets ); } template< typename Real, -- GitLab From d6916fe6979af2906691570403d9b7164dc21f90 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= Date: Sun, 17 Jan 2021 16:51:12 +0100 Subject: [PATCH 45/53] Writing tutorial on MutldiagonalMatrix. 
--- .../Tutorials/Matrices/tutorial_Matrices.md | 211 +++++++----------- 1 file changed, 79 insertions(+), 132 deletions(-) diff --git a/Documentation/Tutorials/Matrices/tutorial_Matrices.md b/Documentation/Tutorials/Matrices/tutorial_Matrices.md index 778d12e44..648039686 100644 --- a/Documentation/Tutorials/Matrices/tutorial_Matrices.md +++ b/Documentation/Tutorials/Matrices/tutorial_Matrices.md @@ -808,7 +808,7 @@ The result looks as follows: ### Multidiagonal matrices -Multidiagonal matrices are generalization of the tridiagonal matrix. It is a special type of sparse matrices with specific pattern of the nonzero matrix elements which are positioned only parallel along diagonal. See the following example: +Multidiagonal matrices are generalization of the tridiagonal ones. It is a special type of sparse matrices with specific pattern of the nonzero matrix elements which are positioned only parallel along diagonal. See the following example: \f[ \left( @@ -823,7 +823,7 @@ Multidiagonal matrices are generalization of the tridiagonal matrix. It is a spe \right) \f] - We can see that the matrix elements lay on lines parallel to the main diagonal. Such lines can be expressed by their offsets from the main diagonal. On the following figure, each such line is depicted in different color: + We can see that the matrix elements lay on lines parallel to the main diagonal. Such lines can be characterized by their offsets from the main diagonal. On the following figure, each such line is depicted in different color: \f[ \begin{array}{ccc} @@ -850,7 +850,7 @@ Multidiagonal matrices are generalization of the tridiagonal matrix. It is a spe \right) \f] - In this matrix, the offsets reads as \f$\{-3, -1, 0, +1, +3\}\f$. It also means that the column indexes on \f$i-\f$th row are \f$\{i-3, i-1, i, i+1, i+3\}\f$ (where the resulting index is non-negative and smaller than the number of matrix columns). 
An advantage is that, similar to the tridiagonal matrix (\ref TNL::Matrices::TridiagonalMatrix), we do not store the column indexes explicitly as it is in \ref SparseMatrix. This can reduce significantly the memory requirements which also means better performance. See the following table for the storage requirements comparison between \ref TNL::Matrices::MultidiagonalMatrix and \ref TNL::Matrices::SparseMatrix. + In this matrix, the offsets read as \f$\{-3, -1, 0, +1, +3\}\f$. It also means that the column indexes on \f$i-\f$th row are \f$\{i-3, i-1, i, i+1, i+3\}\f$ (where we accept only nonnegative indexes smaller than the number of matrix columns). An advantage is that, similar to the tridiagonal matrix (\ref TNL::Matrices::TridiagonalMatrix), we do not store the column indexes explicitly as it is in \ref TNL::Matrices::SparseMatrix. This can significantly reduce the memory requirements which also means better performance. See the following table for the storage requirements comparison between multidiagonal matrix (\ref TNL::Matrices::MultidiagonalMatrix) and general sparse matrix (\ref TNL::Matrices::SparseMatrix). Real | Index | SparseMatrix | MultidiagonalMatrix | Ratio --------|-----------|----------------------|---------------------|-------- @@ -859,12 +859,73 @@ Multidiagonal matrices are generalization of the tridiagonal matrix. 
It is a spe float | 64-bit int| 12 bytes per element | 4 bytes per element | 30% double | 64-bit int| 16 bytes per element | 8 bytes per element | 50% + For the sake of better memory alignment and faster access to the matrix elements, we store all subdiagonals in complete form including the elements which are outside the matrix as depicted on the following figure where zeros stand for the padding artificial zero matrix elements + +\f[ +\begin{array}{cccc} +0 & & & 0 \\ + & 0 & & \\ + & & 0 & \\ + & & & 0 \\ + & & & \\ + & & & \\ + & & & \\ + & & & \\ + & & & \\ + & & & \\ + & & & \\ + & & & \\ + & & & \\ + & & & \\ + & & & \\ + & & & +\end{array} +\left( +\begin{array}{cccccccccccccccc} +1 & 0 & & & 0 & & & & & & & & & & & \\ +0 & 1 & 0 & & & 0 & & & & & & & & & & \\ + & 0 & 1 & 0 & & & 0 & & & & & & & & & \\ + & & 0 & 1 & 0 & & & 0 & & & & & & & & \\ +0 & & & 0 & 1 & 0 & & & 0 & & & & & & & \\ + & -1 & & & -1 & 1 & -1 & & & -1 & & & & & & \\ + & & -1 & & & -1 & 1 & -1 & & & -1 & & & & & \\ + & & & 0 & & & 0 & 1 & 0 & & & 0 & & & & \\ + & & & & 0 & & & 0 & 1 & 0 & & & 0 & & & \\ + & & & & & -1 & & & -1 & 1 & -1 & & & -1 & & \\ + & & & & & & -1 & & & -1 & 1 & -1 & & & -1 & \\ + & & & & & & & 0 & & & 0 & 1 & 0 & & & 0 \\ + & & & & & & & & 0 & & & 0 & 1 & 0 & & \\ + & & & & & & & & & 0 & & & 0 & 1 & 0 & \\ + & & & & & & & & & & 0 & & & 0 & 1 & 0 \\ + & & & & & & & & & & & 0 & & & 0 & 1 +\end{array} +\right) +\begin{array} + & & & \\ + & & & \\ + & & & \\ + & & & \\ + & & & \\ + & & & \\ + & & & \\ + & & & \\ + & & & \\ + & & & \\ + & & & \\ + & & & \\ +0 & & & \\ + & 0 & & \\ + & & 0 & \\ +0 & & & 0 +\end{array} +\f] + Multidiagonal matrix is a templated class defined in the namespace \ref TNL::Matrices. It has six template parameters: * `Real` is a type of the matrix elements. It is `double` by default. -* `Device` is a device where the matrix shall be allocated. 
Currently it can be either \ref TNL::Devices::Host for CPU or \ref TNL::Devices::Cuda for GPU supporting CUDA. It is \ref TNL::Devices::Host by default. +* `Device` is a device where the matrix shall be allocated. Currently it can be either \ref TNL::Devices::Host for CPU or \ref TNL::Devices::Cuda for CUDA supporting GPUs. It is \ref TNL::Devices::Host by default. * `Index` is a type to be used for indexing of the matrix elements. It is `int` by default. -* `ElementsOrganization` defines the organization of the matrix elements in memory. It can be \ref TNL::Algorithms::Segments::ColumnMajorOrder or \ref TNL::Algorithms::Segments::RowMajorOrder for column-major and row-major organization respectively. Be default it is the row-major order if the matrix is allocated in the host system and column major order if it is allocated on GPU. +* `ElementsOrganization` defines the organization of the matrix elements in memory. It can be \ref TNL::Algorithms::Segments::ColumnMajorOrder or \ref TNL::Algorithms::Segments::RowMajorOrder for column-major and row-major organization respectively. Be default, it is the row-major order if the matrix is allocated in the host system and column major order if it is allocated on GPU. * `RealAllocator` is a memory allocator (one from \ref TNL::Allocators) which shall be used for allocation of the matrix elements. By default, it is the default allocator for given `Real` type and `Device` type -- see \ref TNL::Allocators::Default. * `IndexAllocator` is a memory allocator (one from \ref TNL::Allocators) which shall be used for allocation of the matrix elements offsets. By default, it is the default allocator for given `Index` type and `Device` type -- see \ref TNL::Allocators::Default. 
@@ -872,19 +933,7 @@ In the following text we show different methods how to setup multidiagonal matri #### Initializer list -Smaller multidiagonal matrices can be constructed using the constructor of multidiagonal matrix taking the subdiagonals offsets as an initializer list: - -\includelineno MultidiagonalMatrixExample_Constructor_init_list_1.cpp - -The only change is on the line 17 which reads as - -``` -TNL::Matrices::MultidiagonalMatrix< double, Device > matrix( matrixSize, matrixSize, { - gridSize, -1, 0, 1, gridSize } ); -``` - -Here we call the mentioned constructor, which accepts the matrix dimensions (number of rows and columns) as first two parameters and the initializer list with the subdiagonal offsets as the last one. The result looks the same as in the previous example. - -There is also a constructor with initializer list for matrix elements values as demonstrated by the following example: +Smaller multidiagonal matrices can be created using the constructor with initializer lists (\ref std::initializer_list) as we demonstrate in the following example: \includelineno MultidiagonalMatrixExample_Constructor_init_list_2.cpp @@ -909,23 +958,17 @@ On the lines 25-46, we call the constructor which, in addition to matrix dimensi #### Methods `setElement` and `addElement` -The matrix elements values can be changed using the method method `setElements` (\ref TNL::Matrices::MutlidiagonalMatrix::setElements) which accepts the elements values in the same form of embedded initializer list. It just does not allow changing the subdiagonals offsets. For this purpose method `setDiagonalsOffsets` (\ref TNL::Matrices::MultidiagonalMatrix::setDiagonalsOffsets) can be used. Note, however, that this method deletes all current matrix elements. - -Another way of setting the matrix elements is by means of the method `setElement` (\ref TNL::Matrices::MutlidiagonalMatrix::setElement). 
It works the same way as with other matrix types as we can see in the follofwing example: - -\includelineno MultidiagonalMatrixExample_setElement.cpp - -This examples shows that the method `setElement` can be used both on the host (CPU) (line 17) as well as in the GPU kernels (lines 23-27). Here we use shared pointer (\ref TNL::Pointers::SharedPointer) (line 15) to pass the multidiagonal matrix to lambda function `f` (lines 22-28) which may run on GPU. In this case we have to synchronize to share pointer explicitly by calling the function \ref TNL::Pointers::synchronizeSmartPointersOnDevice. To avoid this inconvenience the same can be achieved with the multidiagonal matrix view: +Another and more efficient way of setting the matrix elements is by means of the method `setElement` (\ref TNL::Matrices::MultidiagonalMatrix::setElement). It is demonstrated in the following example: \includelineno MultidiagonalMatrixViewExample_setElement.cpp -In this example, we fetch the matrix view (line 16) immediately after creating the matrix itself (line 15). Note that the matrix view can be obtained from the matrix at any time while the shared pointer only at the time of the matrix creation. On the other hand, if the original matrix is changed, all matrix views become invalid which is not true for the shared pointers. So it is better to fetch the matrix view immediately before we use it to avoid the sitaution that you would use invalid matrix view. The method `setElement` (\ref TNL::Matrices::MutlidiagonalMatrixView::setElement) can be used on both host (CPU) (line 19) and the device (lines 25-29) if the lambda function `f` (lines 24-30) runs in GPU kernel. The result of both examles looks the same: +This example shows that the method `setElement` can be used both on the host (CPU) (line 19) as well as in the lambda functions that can be called from GPU kernels (lines 25-29). For this purpose, we fetch a matrix view on the line 16. 
The result looks as follows: \include MultidiagonalMatrixViewExample_setElement.out #### Method `getRow` -In this example we will create matrix of the following form: +Slightly more efficient way of the multidiagonal matrix setup is offered by the method `getRow` (\ref TNL::Matrices::MultidiagonalMatrix::getRow). We will use it to create a matrix of the following form: \f[ \left( @@ -935,12 +978,12 @@ In this example we will create matrix of the following form: & . & 1 & . & & & . & & & & & & & & & \\ & & . & 1 & . & & & . & & & & & & & & \\ . & & & . & 1 & . & & & . & & & & & & & \\ - & -1 & & & -1 & 1 & -1 & & & -1 & & & & & & \\ - & & -1 & & & -1 & 1 & -1 & & & -1 & & & & & \\ + & -1 & & & -1 & -4 & -1 & & & -1 & & & & & & \\ + & & -1 & & & -1 & -4 & -1 & & & -1 & & & & & \\ & & & . & & & . & 1 & . & & & . & & & & \\ & & & & . & & & . & 1 & . & & & . & & & \\ - & & & & & -1 & & & -1 & 1 & -1 & & & -1 & & \\ - & & & & & & -1 & & & -1 & 1 & -1 & & & -1 & \\ + & & & & & -1 & & & -1 & -4 & -1 & & & -1 & & \\ + & & & & & & -1 & & & -1 & -4 & -1 & & & -1 & \\ & & & & & & & . & & & . & 1 & . & & & . \\ & & & & & & & & . & & & . & 1 & . & & \\ & & & & & & & & & . & & & . & 1 & . & \\ @@ -950,15 +993,13 @@ In this example we will create matrix of the following form: \right) \f] -The code based on use of method 'getRow' reads as: +The matrices of this form arise from a discretization of the [Laplace operator in 2D](https://en.wikipedia.org/wiki/Discrete_Laplace_operator) by the [finite difference method](https://en.wikipedia.org/wiki/Discrete_Poisson_equation). We use this example because it is a frequent numerical problem and the multidiagonal format is very suitable for such matrices. If the reader, however, is not familiar with the finite difference method, please, do not be scared, we will just create the matrix mentioned above. 
The code based on use of method `getRow` reads as: \includelineno MultidiagonalMatrixExample_Constructor.cpp -The matrix from this example arises from a discretization of the [Laplace operator in 2D by the finite difference method](https://en.wikipedia.org/wiki/Discrete_Poisson_equation). We use this example because it is very frequent numerical problem. If the reader, however, is not familiar with the finite difference method, please, do not be scared, we will just create the matrix mentioned above. +We firstly compute the matrix size (`matrixSize`) based on the numerical grid dimensions on the line 16. The subdiagonals offsets are defined by the numerical grid size and since it is four in this example the offsets read as \f$\left\{-4,-1,0,1,4 \right\} \f$ or `{ -gridSize, -1, 0, 1, gridSize}` (line 17). Here we store the offsets in vector (\ref TNL::Containers::Vector) called `offsets`. Next we use a constructor with matrix dimensions and offsets passed via TNL vector (line 18) and we fetch a matrix view (\ref TNL::Matrices::MultidiagonalMatrixView, line 19). -We firstly compute the matrix size (`matrixSize`) based on the numerical grid dimensions on the line 16. The subdiagonals offsets are defined by the numerical grid size and since it is four in this example the offsets read as \f$\left\{-4,-1,0,1,4 \right\} \f$ or `{ -gridSize, -1, 0, 1, gridSize}` (line 17). Here we store the offsets (referred as `shifts`) in vector (\ref TNL::Containers::Vector). Next we use a constructor with matrix dimensions and offsets passed via TNL vector (line 18). Next we fetch matrix view (line 19) (see [Multidiagonal matrix view](#multidiagonal_matrix_view)). - -The matrix is constructed by iterating over particular nodes of the numerical grid. Each node corresponed to one matrix row. This is why the lambda function `f` (lines 20-35) take two indexes `i` and `j` (line 20). Their values are coordinates of the twodimensional numerical grid. 
Based on these coodrinates we compute index (`elementIdx`) of the corresponding matrix row (line 21). We fetch matrix row (`row`) by calling the `getRow` method (\ref TNL::Matrices::MutlidiagonalMatrix::getRow) (line 22). Depending on the grid node coordinates we set either the boundary conditions (lines 23-26) for the boundary nodes (those laying on the boundary of the grid and so their coordinates fulfil the condition `i == 0 || j == 0 || i == gridSize - 1 || j == gridSize - 1` ) for which se set onle diagonal element to 1. The inner nodes of the numerical grid are handled on the lines 29-33 where we set coefficients approximating the Laplace operator. We use the method `setElement` of the matrix row (\ref TNL::Matrices::MultidiagonalMatrixRow::setElement) which takes the local index of the nonzero matrix element as the first parametr and the new value of the element as the second parameter. The local indexes, in fact, refer to particular subdiagonals as depicted on the following figure (in blue): +The matrix is constructed by iterating over particular nodes of the numerical grid. Each node corresponds to one matrix row. This is why the lambda function `f` (lines 20-35) takes two indexes `i` and `j` (line 20). Their values are coordinates of the two-dimensional numerical grid. Based on these coordinates we compute index (`elementIdx`) of the corresponding matrix row (line 21). We fetch matrix row (`row`) by calling the `getRow` method (\ref TNL::Matrices::MultidiagonalMatrix::getRow) (line 22). Depending on the grid node coordinates we set either the boundary conditions (lines 23-26) for the boundary nodes (those laying on the boundary of the grid and so their coordinates fulfil the condition `i == 0 || j == 0 || i == gridSize - 1 || j == gridSize - 1` ) for which we set only the diagonal element to 1. The inner nodes of the numerical grid are handled on the lines 29-33 where we set coefficients approximating the Laplace operator. 
We use the method `setElement` of the matrix row (\ref TNL::Matrices::MultidiagonalMatrixRow::setElement) which takes the local index of the nonzero matrix element as the first parameter (see [Indexing of nonzero matrix elements in sparse matrices](#indexing-of-nonzero-matrix-elements-in-sparse-matrices)) and the new value of the element as the second parameter. The local indexes, in fact, refer to particular subdiagonals as depicted on the following figure (in blue): \f[ \begin{array}{cccc} @@ -1005,104 +1046,10 @@ The matrix is constructed by iterating over particular nodes of the numerical gr \right) \f] -We use `ParallelFor2D` (\ref TNL::Algorithms::ParallelFor2D) to iterate over all nodes of the numerical grid (line 36) and apply the lambda function. Also note that for the sake of better memory alignemnt and faster acces to the matrix elements, we store all subdiagonals in complete form including the elemenets which are outside the matrix as depicted on the following figure where zeros stand for the padding artificial zero matrix elements - -\f[ -\begin{array}{cccc} -0 & & & 0 \\ - & 0 & & \\ - & & 0 & \\ - & & & 0 \\ - & & & \\ - & & & \\ - & & & \\ - & & & \\ - & & & \\ - & & & \\ - & & & \\ - & & & \\ - & & & \\ - & & & \\ - & & & \\ - & & & -\end{array} -\left( -\begin{array}{cccccccccccccccc} -1 & 0 & & & 0 & & & & & & & & & & & \\ -0 & 1 & 0 & & & 0 & & & & & & & & & & \\ - & 0 & 1 & 0 & & & 0 & & & & & & & & & \\ - & & 0 & 1 & 0 & & & 0 & & & & & & & & \\ -0 & & & 0 & 1 & 0 & & & 0 & & & & & & & \\ - & -1 & & & -1 & 1 & -1 & & & -1 & & & & & & \\ - & & -1 & & & -1 & 1 & -1 & & & -1 & & & & & \\ - & & & 0 & & & 0 & 1 & 0 & & & 0 & & & & \\ - & & & & 0 & & & 0 & 1 & 0 & & & 0 & & & \\ - & & & & & -1 & & & -1 & 1 & -1 & & & -1 & & \\ - & & & & & & -1 & & & -1 & 1 & -1 & & & -1 & \\ - & & & & & & & 0 & & & 0 & 1 & 0 & & & 0 \\ - & & & & & & & & 0 & & & 0 & 1 & 0 & & \\ - & & & & & & & & & 0 & & & 0 & 1 & 0 & \\ - & & & & & & & & & & 0 & & & 0 & 1 & 0 \\ - & 
& & & & & & & & & & 0 & & & 0 & 1 -\end{array} -\right) -\begin{array} - & & & \\ - & & & \\ - & & & \\ - & & & \\ - & & & \\ - & & & \\ - & & & \\ - & & & \\ - & & & \\ - & & & \\ - & & & \\ - & & & \\ -0 & & & \\ - & 0 & & \\ - & & 0 & \\ -0 & & & 0 -\end{array} -\f] - -The result looks as follows: +We use `ParallelFor2D` (\ref TNL::Algorithms::ParallelFor2D) to iterate over all nodes of the numerical grid (line 36) and apply the lambda function. The result looks as follows: \include MultidiagonalMatrixExample_Constructor.out -Another way for setting the matrix elements is by means of the multidiagonal matrix row is as follows: - -\includelineno MultidiagonalMatrixViewExample_getRow.cpp - -Here we use the matrix view again (line 19) and in the lambda function `f` which serves for the matrix elements setting, we fetch the matrix row just at the beginning (line 22). Next we use the method `setElement` (\ref TNL::Matrices::MultidiagonalMatrixRow::setElement) which accepts two parameters. The first is the local index of the matrix element which in case of the multidiagonal matrix agrees with index of the subdiagonal as demonstrated on this figure which shows just the matrix we are creating in this example (the subdiagonal indexes are depicted in blue color): - -\f[ -\begin{array}{c} -\color{blue}{0} \\ -\hline -* \\ - \\ - \\ - \\ -~ -\end{array} -\left( -\begin{array}{ccccc} - \color{blue}{1} & \color{blue}{2} & & & \\ - \hline -2 & -1 & & & \\ --1 & 2 & -1 & & \\ - & -1 & 2 & -1 & \\ - & & -1 & 2 & -1 \\ - & & & -1 & 2 -\end{array} -\right) -\f] - -The second parameter of the method `setElement` is the new matrix elements value. An advantage of this method is that it can access the matrix elements faster. 
The output of this example looks as follows: - -\include MultidiagonalMatrixViewExample_getRow.out - #### Method `forRows` Similar and even a bit simpler way of setting the matrix elements is offered by the method `forRows` (\ref TNL::Matrices::MultidiagonalMatrix::forRows, \ref TNL::Matrices::MultidiagonalMatrixView::forRows) as demonstrated in the following example: @@ -1117,7 +1064,7 @@ In this case, we need to provide a lambda function `f` (lines 27-43) which is ca * `value` is a reference to the matrix element value. It can be used even for changing the value. * `compute` is a reference to boolean. If it is set to false, the iteration over the matrix row can be stopped. -In this example, the matrix element value depends only on the subdiagonal index `localIdx` as we can see on the line 42. The result looks as follows: +In this example, the matrix element value depends only on the subdiagonal index `localIdx` (see [Indexing of nonzero matrix elements in sparse matrices](#indexing-of-nonzero-matrix-elements-in-sparse-matrices)) as we can see on the line 42. The result looks as follows: \include MultidiagonalMatrixExample_forRows.out -- GitLab From be56f6dbb200b93df57d0e0ba5383295f907bdf5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= Date: Sun, 17 Jan 2021 17:29:26 +0100 Subject: [PATCH 46/53] Renaming rowLengths to compressedRowLengths in LambdaMatrix. 
--- .../LambdaMatrix/LambdaMatrixExample_Constructor.cpp | 8 ++++---- .../LambdaMatrix/LambdaMatrixExample_forAllRows.cpp | 4 ++-- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/Documentation/Examples/Matrices/LambdaMatrix/LambdaMatrixExample_Constructor.cpp b/Documentation/Examples/Matrices/LambdaMatrix/LambdaMatrixExample_Constructor.cpp index 2c418dd54..fd6d2e1df 100644 --- a/Documentation/Examples/Matrices/LambdaMatrix/LambdaMatrixExample_Constructor.cpp +++ b/Documentation/Examples/Matrices/LambdaMatrix/LambdaMatrixExample_Constructor.cpp @@ -6,7 +6,7 @@ int main( int argc, char* argv[] ) /*** * Lambda functions defining the matrix. */ - auto rowLengths = [=] __cuda_callable__ ( const int rows, const int columns, const int rowIdx ) -> int { return 1; }; + auto compressedRowLengths = [=] __cuda_callable__ ( const int rows, const int columns, const int rowIdx ) -> int { return 1; }; auto matrixElements1 = [=] __cuda_callable__ ( const int rows, const int columns, const int rowIdx, const int localIdx, int& columnIdx, double& value ) { columnIdx = rowIdx; value = 1.0; @@ -21,13 +21,13 @@ int main( int argc, char* argv[] ) /*** * Matrix construction with explicit type definition. */ - using MatrixType = decltype( TNL::Matrices::LambdaMatrixFactory< double, TNL::Devices::Host, int >::create( matrixElements1, rowLengths ) ); - MatrixType m1( size, size, matrixElements1, rowLengths ); + using MatrixType = decltype( TNL::Matrices::LambdaMatrixFactory< double, TNL::Devices::Host, int >::create( matrixElements1, compressedRowLengths ) ); + MatrixType m1( size, size, matrixElements1, compressedRowLengths ); /*** * Matrix construction using 'auto'. 
*/ - auto m2 = TNL::Matrices::LambdaMatrixFactory< double, TNL::Devices::Host, int >::create( matrixElements2, rowLengths ); + auto m2 = TNL::Matrices::LambdaMatrixFactory< double, TNL::Devices::Host, int >::create( matrixElements2, compressedRowLengths ); m2.setDimensions( size, size ); std::cout << "The first lambda matrix: " << std::endl << m1 << std::endl; diff --git a/Documentation/Examples/Matrices/LambdaMatrix/LambdaMatrixExample_forAllRows.cpp b/Documentation/Examples/Matrices/LambdaMatrix/LambdaMatrixExample_forAllRows.cpp index 72ff96101..88ceb5687 100644 --- a/Documentation/Examples/Matrices/LambdaMatrix/LambdaMatrixExample_forAllRows.cpp +++ b/Documentation/Examples/Matrices/LambdaMatrix/LambdaMatrixExample_forAllRows.cpp @@ -10,14 +10,14 @@ void forRowsExample() /*** * Lambda functions defining the matrix. */ - auto rowLengths = [=] __cuda_callable__ ( const int rows, const int columns, const int rowIdx ) -> int { return columns; }; + auto compressedRowLengths = [=] __cuda_callable__ ( const int rows, const int columns, const int rowIdx ) -> int { return columns; }; auto matrixElements = [=] __cuda_callable__ ( const int rows, const int columns, const int rowIdx, const int localIdx, int& columnIdx, double& value ) { columnIdx = localIdx; value = TNL::max( rowIdx - columnIdx + 1, 0 ); }; using MatrixFactory = TNL::Matrices::LambdaMatrixFactory< double, Device, int >; - auto matrix = MatrixFactory::create( 5, 5, matrixElements, rowLengths ); + auto matrix = MatrixFactory::create( 5, 5, matrixElements, compressedRowLengths ); TNL::Matrices::DenseMatrix< double, Device > denseMatrix( 5, 5 ); auto denseView = denseMatrix.getView(); -- GitLab From 87b2af4a4b8fd0a122d1a0d50c224ece51b34628 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= Date: Sun, 17 Jan 2021 17:29:55 +0100 Subject: [PATCH 47/53] Writing tutorial on LambdaMatrix. 
--- .vscode/settings.json | 28 +++++++++++ ...CompressedRowLengthsLambda_declaration.cpp | 4 ++ .../Tutorials/Matrices/tutorial_Matrices.md | 49 +++++++++---------- 3 files changed, 54 insertions(+), 27 deletions(-) create mode 100644 .vscode/settings.json create mode 100644 Documentation/Tutorials/Matrices/snippet_CompressedRowLengthsLambda_declaration.cpp diff --git a/.vscode/settings.json b/.vscode/settings.json new file mode 100644 index 000000000..2de193076 --- /dev/null +++ b/.vscode/settings.json @@ -0,0 +1,28 @@ +{ + "editor.tokenColorCustomizations": { + "textMateRules": [ + { + "scope": "googletest.failed", + "settings": { + "foreground": "#f00" + } + }, + { + "scope": "googletest.passed", + "settings": { + "foreground": "#0f0" + } + }, + { + "scope": "googletest.run", + "settings": { + "foreground": "#0f0" + } + } + ] + }, + "files.associations": { + "numeric": "cpp", + "ostream": "cpp" + } +} \ No newline at end of file diff --git a/Documentation/Tutorials/Matrices/snippet_CompressedRowLengthsLambda_declaration.cpp b/Documentation/Tutorials/Matrices/snippet_CompressedRowLengthsLambda_declaration.cpp new file mode 100644 index 000000000..ac3a9de94 --- /dev/null +++ b/Documentation/Tutorials/Matrices/snippet_CompressedRowLengthsLambda_declaration.cpp @@ -0,0 +1,4 @@ +auto compressedRowLengths = [=] __cuda_callable__ ( + Index rows, + Index columns, + Index row ) -> Index; \ No newline at end of file diff --git a/Documentation/Tutorials/Matrices/tutorial_Matrices.md b/Documentation/Tutorials/Matrices/tutorial_Matrices.md index 648039686..42b09f477 100644 --- a/Documentation/Tutorials/Matrices/tutorial_Matrices.md +++ b/Documentation/Tutorials/Matrices/tutorial_Matrices.md @@ -1068,44 +1068,34 @@ In this example, the matrix element value depends only on the subdiagonal index \include MultidiagonalMatrixExample_forRows.out -## Lambda matrices +### Lambda matrices -Lambda matrix (\ref TNL::Matrices::LambdaMatrix) is a special type of matrix which could be 
also called *** matrix-free matrix ***. Its elements are not stored in memory explicitlely but they are evaluated on-the-fly by means of user defined lambda functions. If the matrix elements can be expressed by computationaly not expansive formula, we can significantly reduce the memory consumptions which can be appriciated especially on GPU. Since the memory accesses are quite expensive even on CPU, we can get, at the end, even much faster code. +Lambda matrix (\ref TNL::Matrices::LambdaMatrix) is a special type of matrix which could be also called *matrix-free matrix*. The matrix elements are not stored in memory explicitly but they are evaluated **on-the-fly** by means of user defined lambda functions. If the matrix elements can be expressed by a computationally inexpensive formula, we can significantly reduce the memory consumption which is appreciated especially on GPUs. Since the memory accesses are quite expensive on both CPU and GPU, we can get, at the end, even much faster code. The lambda matrix (\ref TNL::Matrices::LambdaMatrix) is a templated class with the following template parameters: * `MatrixElementsLambda` is a lambda function which evaluates the matrix elements values and column indexes. * `CompressedRowLengthsLambda` is a lambda function telling how many nonzero elements are there in given matrix row. -* `Real` is a of matrix elements values. +* `Real` is a type of matrix elements values. * `Device` is a device on which the lambda functions mentioned above will be evaluated. * `Index` is a type to be used for indexing. 
The lambda function `MatrixElementsLambda` is supposed to have the following declaration: -``` -matrixElements( Index rows, - Index columns, - Index row, - Index localIdx, - Index& columnIdx, - Real& value ) -``` -where the particular parameterts have the following meaning: +\includelineno snippet_MatrixElementsLambda_declaration.cpp + +where the particular parameters have the following meaning: * `rows` tells the number of matrix rows. * `columns` tells the number of matrix columns. * `rowIdx` is index of the matrix row in which we are supposed to evaluate the matrix element. -* `localIdx` is a rank of the nonzero matrix element. +* `localIdx` is a rank of the nonzero matrix element, see [Indexing of nonzero matrix elements in sparse matrices](#indexing-of-nonzero-matrix-elements-in-sparse-matrices). * `columnIdx` is a reference on variable where we are supposed to store the matrix element column index. * `value` is a reference on variable where we are supposed to store the matrix element value. -The lambda function `CompressedRowLengthsLambda` is supposed to look like this: +The lambda function `CompressedRowLengthsLambda` (by compressed row length we mean the number of matrix elements in a row after ignoring/compressing the zero elements) is supposed to be declared like this: -``` -rowLengths( Index rows, - Index columns, - Index row ) -> Index -``` +\includelineno snippet_CompressedRowLengthsLambda_declaration.cpp where the parameters can be described as follows: @@ -1115,27 +1105,29 @@ where the parameters can be described as follows: The lambda function is supposed to return just the number of the nonzero matrix elements in given matrix row. 
-### Lambda matrix inititation +#### Lambda matrix initiation -See the following example which demonstrates how to create the lambda matrix: +How to put the lambda functions together with the lambda matrix is demonstrated in the following example: \includelineno LambdaMatrixExample_Constructor.cpp -Here we create two simple diagonal matrices. Therefore thay share the same lambda function `rowLengths` telling the the number of nonzero matrix elements in particular matrix rows which is always one (line 9). The first matrix, defined by the lambda function `matrixElements1`, is identity matrix and so its each diagonal element equals one. We set the matrix element value to `1.0` (line 12) and the column index equals the row index (line 15). The second matrix, defined by the lambda function `matrixElements2`, is also diagonal but not the identity matrix. The values of the diagonal elements equal to row index (line 16). +Here we create two simple diagonal matrices. Therefore they share the same lambda function `compressedRowLengths` telling the number of nonzero matrix elements in particular matrix rows which is always one (line 9). The first matrix, defined by the lambda function `matrixElements1`, is identity matrix and so its each diagonal element equals one. We set the matrix element value to `1.0` (line 12) and the column index equals the row index (line 15). The second matrix, defined by the lambda function `matrixElements2`, is also diagonal but not the identity matrix. The values of the diagonal elements equal to row index (line 16). -With the same lambda functions we can define matrices with different dimensions. In this example, we set the matrix size to five (line 19). It can be quite difficult to express the lambda matrix type because it depends on the types of the lambda functions. To make this easier, one may use a lambda-matrix factory (\ref TNL::Matrices::LambdaMatrixFactory). 
Using `decltype` one can deduce even the matrix type (line 24) followed by calling lambda matrix constructor with matrix dimensions and instances of the lambda functions (line 25). Or one can just simply employ the keyword `auto` (line 30) followed by setting the matrix dimensins (line 31). +With the same lambda functions we can define matrices with different dimensions. In this example, we set the matrix size to five (line 19). It could be quite difficult to express the lambda matrix type because it depends on the types of the lambda functions. To make this easier, one may use the lambda-matrix factory (\ref TNL::Matrices::LambdaMatrixFactory). Using `decltype` one can deduce even the matrix type (line 24) followed by calling lambda matrix constructor with matrix dimensions and instances of the lambda functions (line 25). Or one can just simply employ the keyword `auto` (line 30) followed by setting the matrix dimensions (line 31). The result looks as follows: \include LambdaMatrixExample_Constructor.out -Of course, the lambda matrix has the same interface as other matrix types. The following example demonstrates the use of the method `forRows` to copy the lambda matrix into the dense matrix: +#### Method `forRows` + +The lambda matrix has the same interface as other matrix types except of the method `getRow`. The following example demonstrates the use of the method `forRows` (\ref TNL::Matrices::LambdaMatrix::forRows) to copy the lambda matrix into the dense matrix: \includelineno LambdaMatrixExample_forRows.cpp -Here, we treat the lambda matrix as if it was dense matrix. The lambda function `rowLengths` returns the number of the nonzero elements equal to the number of matrix columns (line 13). However, the lambda function `matrixElements` (lines 14-17), sets nozero values only to lower triangular part of the matrix. The elements in the upper part are equal to zero (line 16). 
Next we create an instance of the lambda matrix with help of the lambda matrix factory (\ref TNL::Matrices::LambdaMatrixFactory) (lines 19-20) and an instance of the dense matrix (\ref TNL::Matrices::DenseMatrix) (lines 22-23). +Here, we treat the lambda matrix as if it were a dense matrix and so the lambda function `compressedRowLengths` returns the number of the nonzero elements equal to the number of matrix columns (line 13). However, the lambda function `matrixElements` (lines 14-17), sets nonzero values only to lower triangular part of the matrix. The elements in the upper part are equal to zero (line 16). Next we create an instance of the lambda matrix with the help of the lambda matrix factory (\ref TNL::Matrices::LambdaMatrixFactory) (lines 19-20) and an instance of the dense matrix (\ref TNL::Matrices::DenseMatrix) (lines 22-23). -Next we call the lambda function `f` by the method `forRows` (\ref TNL::Matrices::LambdaMatrix::forRows) to set the matrix elements of the dense matrix `denseMatrix` (line 26) via the dense matrix view (`denseView`) (\ref TNL::Matrices::DenseMatrixView). Note, that in the lambda function `f` we get the matrix element value already evaluated in the variable `value` as we are used to from other matrix types. So in fact, the same lambda function `f` woudl do the same job even for sparse matrix or any other. Also note, that in this case we iterate even over all zero matrix elements because the lambda function `rowLengths` (line 13) tells so. The result looks as follows: +Next we call the lambda function `f` by the method `forRows` (\ref TNL::Matrices::LambdaMatrix::forRows) to set the matrix elements of the dense matrix `denseMatrix` (line 26) via the dense matrix view (`denseView`) (\ref TNL::Matrices::DenseMatrixView). Note, that in the lambda function `f` we get the matrix element value already evaluated in the variable `value` as we are used to from other matrix types. 
So in fact, the same lambda function `f` would do the same job even for sparse matrix or any other. Also note, that in this case we iterate even over all zero matrix elements because the lambda function `compressedRowLengths` (line 13) tells so. The result looks as follows: \include LambdaMatrixExample_forRows.out @@ -1143,7 +1135,7 @@ At the end of this part, we show two more examples, how to express a matrix appr \includelineno LambdaMatrixExample_Laplace.cpp -The following is another way of doing the same but precomputed supporting vectors: +The following is another way of doing the same but with precomputed supporting vectors: \includelineno LambdaMatrixExample_Laplace_2.cpp @@ -1153,6 +1145,9 @@ The result of both examples looks as follows: ## Flexible reduction in matrix rows + + + ### Dense matrix Simillar operation to `forRows` is `rowsReduction` (\ref TNL::Matrices::DenseMatrix::rowsReduction) which performs given reduction in each matric row. For example, a matrix-vector product can be seen as a reduction of products of matrix elements and input vector in particular matrix rows. The first element of the result vector ios obtained as: -- GitLab From 752de0f1a3034f7adcfaea4b2d7670e38d858307 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= Date: Sun, 17 Jan 2021 20:53:23 +0100 Subject: [PATCH 48/53] Writing users guide about flexible reduction for matrices. 
--- ...ippet_MatrixElementsLambda_declaration.cpp | 8 ++ ...ippet_rows_reduction_fetch_declaration.cpp | 1 + ...nippet_rows_reduction_keep_declaration.cpp | 1 + ...ppet_rows_reduction_reduce_declaration.cpp | 1 + .../Tutorials/Matrices/tutorial_Matrices.md | 84 +++++++++++++++---- 5 files changed, 77 insertions(+), 18 deletions(-) create mode 100644 Documentation/Tutorials/Matrices/snippet_MatrixElementsLambda_declaration.cpp create mode 100644 Documentation/Tutorials/Matrices/snippet_rows_reduction_fetch_declaration.cpp create mode 100644 Documentation/Tutorials/Matrices/snippet_rows_reduction_keep_declaration.cpp create mode 100644 Documentation/Tutorials/Matrices/snippet_rows_reduction_reduce_declaration.cpp diff --git a/Documentation/Tutorials/Matrices/snippet_MatrixElementsLambda_declaration.cpp b/Documentation/Tutorials/Matrices/snippet_MatrixElementsLambda_declaration.cpp new file mode 100644 index 000000000..0db51ee48 --- /dev/null +++ b/Documentation/Tutorials/Matrices/snippet_MatrixElementsLambda_declaration.cpp @@ -0,0 +1,8 @@ +auto matrixElements = [=] __cuda_callable__ ( + Index rows, + Index columns, + Index row, + Index localIdx, + Index& columnIdx, + Real& value ); + diff --git a/Documentation/Tutorials/Matrices/snippet_rows_reduction_fetch_declaration.cpp b/Documentation/Tutorials/Matrices/snippet_rows_reduction_fetch_declaration.cpp new file mode 100644 index 000000000..57439abfa --- /dev/null +++ b/Documentation/Tutorials/Matrices/snippet_rows_reduction_fetch_declaration.cpp @@ -0,0 +1 @@ +auto fetch = [=] __cuda_callable__ ( Index rowIdx, Index columnIdx, const Real& value ) -> Real; \ No newline at end of file diff --git a/Documentation/Tutorials/Matrices/snippet_rows_reduction_keep_declaration.cpp b/Documentation/Tutorials/Matrices/snippet_rows_reduction_keep_declaration.cpp new file mode 100644 index 000000000..8428768ee --- /dev/null +++ b/Documentation/Tutorials/Matrices/snippet_rows_reduction_keep_declaration.cpp @@ -0,0 +1 @@ +auto keep = 
[=] __cuda_callable__ ( Index rowIdx, const Real& value ) mutable; \ No newline at end of file diff --git a/Documentation/Tutorials/Matrices/snippet_rows_reduction_reduce_declaration.cpp b/Documentation/Tutorials/Matrices/snippet_rows_reduction_reduce_declaration.cpp new file mode 100644 index 000000000..e2d63c014 --- /dev/null +++ b/Documentation/Tutorials/Matrices/snippet_rows_reduction_reduce_declaration.cpp @@ -0,0 +1 @@ +auto reduce = [] __cuda_callable__ ( const Real& a, const Real& b ) -> Real; \ No newline at end of file diff --git a/Documentation/Tutorials/Matrices/tutorial_Matrices.md b/Documentation/Tutorials/Matrices/tutorial_Matrices.md index 42b09f477..1f1ac3015 100644 --- a/Documentation/Tutorials/Matrices/tutorial_Matrices.md +++ b/Documentation/Tutorials/Matrices/tutorial_Matrices.md @@ -11,6 +11,11 @@ 4. [Multidiagonal matrices](#multidiagonal_matrices_setup) 5. [Lambda matrices](#lambda_matrices_setup) 5. [Flexible reduction in matrix rows](#flexible_reduction_in_matrix_rows) + 1. [Dense matrices example](#dense-matrices-flexible-reduction-example) + 2. [Sparse matrices example](#sparse-matrices-flexible-reduction-example) + 3. [Tridiagonal matrices example](#tridiagonal-matrices-flexible-reduction-example) + 4. [Multidiagonal matrices example](#multidiagonal-matrices-flexible-reduction-example) + 5. [Lambda matrices example](#lambda-matrices-flexible-reduction-example) 6. [Matrix-vector product](#matrix_vector_product) 7. [Matrix I/O operations](#matrix_io_operations) 8. [Appendix](#appendix) @@ -1145,12 +1150,8 @@ The result of both examples looks as follows: ## Flexible reduction in matrix rows - - - -### Dense matrix - -Simillar operation to `forRows` is `rowsReduction` (\ref TNL::Matrices::DenseMatrix::rowsReduction) which performs given reduction in each matric row. For example, a matrix-vector product can be seen as a reduction of products of matrix elements and input vector in particular matrix rows. 
The first element of the result vector ios obtained as: +Flexible reduction in matrix rows is a powerful tool for many different matrix operations. It is represented by the method `rowsReduction` (\ref TNL::Matrices::DenseMatrix::rowsReduction, +\ref TNL::Matrices::SparseMatrix::rowsReduction, \ref TNL::Matrices::TridiagonalMatrix::rowsReduction, \ref TNL::Matrices::MultidiagonalMatrix::rowsReduction, \ref TNL::Matrices::LambdaMatrix::rowsReduction) and similar to the method `forRows` it iterates over particular matrix rows. However, it performs *flexible parallel reduction* in addition. For example, the matrix-vector product can be seen as a reduction of products of matrix elements with the input vector in particular matrix rows. The first element of the result vector is obtained as: \f[ y_1 = a_{11} x_1 + a_{12} x_2 + \ldots + a_{1n} x_n = \sum_{j=1}^n a_{1j}x_j @@ -1162,12 +1163,64 @@ and in general i-th element of the result vector is computed as y_i = a_{i1} x_1 + a_{i2} x_2 + \ldots + a_{in} x_n = \sum_{j=1}^n a_{ij}x_j. \f] -We see that in i-th matrix row we have to compute the sum \f$\sum_{j=1}^n a_{ij}x_j\f$ which is reduction of products \f$ a_{ij}x_j\f$. Similar to *flexible parallel reduction* (\ref TNL::Algorithms::Reduction) we just need to design proper lambda functions. See the following example: +We see that in i-th matrix row we have to compute the sum \f$\sum_{j=1}^n a_{ij}x_j\f$ which is reduction of products \f$ a_{ij}x_j\f$. Similar to flexible parallel reduction (\ref TNL::Algorithms::Reduction) we just need to design proper lambda functions. There are three of them. + +1. `fetch` reads and preprocesses data entering the flexible parallel reduction. +2. `reduce` performs the reduction operation. +3. `keep` stores the results from each matrix row. 
+ +#### Lambda function fetch + +This lambda function has the same purpose as the lambda function `fetch` in flexible parallel reduction for arrays and vectors (see [Flexible Parallel Reduction](tutorial_ReductionAndScan.html#flexible_parallel_reduction)). It is supposed to be declared as follows: + +\includelineno snippet_rows_reduction_fetch_declaration.cpp + +The meaning of the particular parameters is as follows: + +1. `rowIdx` is the row index of the matrix element. +2. `columnIdx` is the column index of the matrix element. +3. `value` is the value of the matrix element. + +The lambda function returns a value of type `Real` based on the input data. + +#### Lambda function reduce + +The lambda function `reduce` expresses reduction operation (sum, product, minimum, maximum etc.) which is supposed to be done during the flexible reduction. + +\includelineno snippet_rows_reduction_reduce_declaration.cpp + +The meaning of the particular parameters is as follows: + +1. `a` is the first operand for the reduction operation. +2. `b` is the second operand for the reduction operation. +#### Lambda function keep + +The lambda function `keep` is a new one compared to the flexible reduction for arrays, vectors or other linear structures. The reason is that the result consists of as many numbers as there are matrix rows. Result obtained for each matrix row is processed by this lambda function. It is declared as follows: + +\includelineno snippet_rows_reduction_keep_declaration.cpp + +The meaning of the particular parameters is as follows: + +1. `rowIdx` is an index of the matrix row related to given result of flexible reduction. +2. `value` is the result of the flexible reduction in given matrix row. 
+ +The method `rowsReduction` (\ref TNL::Matrices::DenseMatrix::rowsReduction, \ref TNL::Matrices::SparseMatrix::rowsReduction, \ref TNL::Matrices::TridiagonalMatrix::rowsReduction, \ref TNL::Matrices::MultidiagonalMatrix::rowsReduction, \ref TNL::Matrices::LambdaMatrix::rowsReduction) accepts the following arguments: + +1. `begin` is the beginning of the matrix rows range on which the reduction will be performed. +2. `end` is the end of the matrix rows range on which the reduction will be performed. The last matrix row which is going to be processed has index `end-1`. +3. `fetch` is the lambda function for data fetching. +4. `reduce` is the lambda function performing the reduction. +5. `keep` is the lambda function responsible for processing the results from particular matrix rows. +6. `zero` is the "zero" element of given reduction operation also known as *idempotent*. + +Though the interface is the same for all matrix types, in the following part we will show several examples for different matrix types to better demonstrate possible ways of use of the flexible reduction for matrices. + +### Dense matrices example \includelineno DenseMatrixExample_rowsReduction_vectorProduct.cpp -The `fetch` lambda function computes the product \f$ a_{ij}x_j\f$ where \f$ a_{ij} \f$ is represented by `value` and \f$x_j \f$ is represented by `xView[columnIdx]`. The reduction is just sum of results particular products and it is represented by by the lambda function `reduce`. Finaly, the lambda function `keep` is responsible for storing the results of reduction in each matrix row (which is represented by the variable `value`) into the output vector `y`. +The `fetch` lambda function computes the product \f$ a_{ij}x_j\f$ where \f$ a_{ij} \f$ is represented by `value` and \f$x_j \f$ is represented by `xView[columnIdx]`. The reduction is just sum of particular products and it is represented by by the lambda function `reduce`. 
Finally, the lambda function `keep` is responsible for storing the results of reduction in each matrix row (which is represented by the variable `value`) into the output vector `y`. The result looks as: \include DenseMatrixExample_rowsReduction_vectorProduct.out @@ -1182,19 +1235,14 @@ See the following example: \includelineno DenseMatrixExample_rowsReduction_maxNorm.cpp - -The `fetch` lambda function just returns absolute value of \f$a_{ij} \f$ which is represented again by the varibale `value`. The `reduce` lambda function returns larger of given values and the lambda fuction 'keep' stores the results to the output vectro the same way as in the previous example. Of course, if we compute the maximum of all output vector elements we get some kined of max matrix norm. The output looks as: +The `fetch` lambda function just returns absolute value of \f$a_{ij} \f$ which is represented again by the varibale `value`. The `reduce` lambda function returns larger of given values and the lambda function 'keep' stores the results to the output vector the same way as in the previous example. Of course, if we compute the maximum of all output vector elements we get some kind of maximal matrix norm. The output looks as: \include DenseMatrixExample_rowsReduction_maxNorm.out -### Sparse matrix +### Sparse matrices example The *flexible parallel reduction* in rows for sparse matrices is very simmilar to the one for dense matrices. It consits of three lambda functions: -1. `fetch` reads and preproces data entering the flexible parallel reduction. -2. `reduce` performs the reduction operation. -3. `keep` stores the results from each matrix row. - See the following example: \includelineno SparseMatrixExample_rowsReduction_vectorProduct.cpp @@ -1237,7 +1285,7 @@ At the end we print the matrix, the input and the output vector -- lines 55-57. 
\include SparseMatrixExample_rowsReduction_vectorProduct.out -### Tridiagonal matrix +### Tridiagonal matrices example The *flexible parallel reduction* in rows for tridiagonal matrices is also simmilar as for dense and sparse matrices. It is represented by three lambda functions: @@ -1289,7 +1337,7 @@ The method `rowsReduction` (\ref TNL::Matrices::SparseMatrix::rowsReduction) act \include TridiagonalMatrixExample_rowsReduction.out -### Multidiagonal matrix +### Multidiagonal matrices example The flexible parallel reduction in rows for multidiagonal matrices works the same way as for other matrix types. It consits of three lambda functions: @@ -1325,7 +1373,7 @@ Finaly we call the method `rowsReduction` (\ref TNL::Matrices::MultidiagonalMatr \include MultidiagonalMatrixExample_rowsReduction.out -### Lambda matrix +### Lambda matrices example The reduction of matrix rows is available for the lambda matrices as well. See the follogin example: -- GitLab From b5b4a867b3c86000f2c4abb295804a443a43916f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= Date: Mon, 18 Jan 2021 18:38:46 +0100 Subject: [PATCH 49/53] Writing documentation on matrices. --- .../Tutorials/Matrices/tutorial_Matrices.md | 361 ++++-------------- 1 file changed, 70 insertions(+), 291 deletions(-) diff --git a/Documentation/Tutorials/Matrices/tutorial_Matrices.md b/Documentation/Tutorials/Matrices/tutorial_Matrices.md index 1f1ac3015..89eb551ca 100644 --- a/Documentation/Tutorials/Matrices/tutorial_Matrices.md +++ b/Documentation/Tutorials/Matrices/tutorial_Matrices.md @@ -10,6 +10,7 @@ 3. [Tridiagonal matrices](#tridiagonal_matrices_setup) 4. [Multidiagonal matrices](#multidiagonal_matrices_setup) 5. [Lambda matrices](#lambda_matrices_setup) + 6. [Distributed matrices](#distributed-matrices-setup) 5. [Flexible reduction in matrix rows](#flexible_reduction_in_matrix_rows) 1. [Dense matrices example](#dense-matrices-flexible-reduction-example) 2. 
[Sparse matrices example](#sparse-matrices-flexible-reduction-example) @@ -18,6 +19,8 @@ 5. [Lambda matrices example](#lambda-matrices-flexible-reduction-example) 6. [Matrix-vector product](#matrix_vector_product) 7. [Matrix I/O operations](#matrix_io_operations) + 1. [Matrix reader](#matrix-reader) + 2. [Matrix writer](#matrix-writer) 8. [Appendix](#appendix) ## Introduction @@ -1148,6 +1151,10 @@ The result of both examples looks as follows: \include LambdaMatrixExample_Laplace.out +### Distributed matrices + +TODO: Write documentation on distributed matrices. + ## Flexible reduction in matrix rows Flexible reduction in matrix rows is a powerful tool for many different matrix operations. It is represented by the method `rowsReduction` (\ref TNL::Matrices::DenseMatrix::rowsReduction, @@ -1218,14 +1225,25 @@ Though the interface is the same for all matrix types, in the following part we ### Dense matrices example +The following example demonstrates implementation of the dense matrix-vector product \f$ {\bf y} = A \vec {\bf x}\f$, i.e. + +\f[ + y_i = \sum_{j=0}^{columns - 1} a_{ij} x_j \text{ for } i = 0, \ldots, rows-1. +\f] + \includelineno DenseMatrixExample_rowsReduction_vectorProduct.cpp -The `fetch` lambda function computes the product \f$ a_{ij}x_j\f$ where \f$ a_{ij} \f$ is represented by `value` and \f$x_j \f$ is represented by `xView[columnIdx]`. The reduction is just sum of particular products and it is represented by by the lambda function `reduce`. Finally, the lambda function `keep` is responsible for storing the results of reduction in each matrix row (which is represented by the variable `value`) into the output vector `y`. +We set the following lambda functions: + +* `fetch` lambda function computes the product \f$ a_{ij}x_j\f$ where \f$ a_{ij} \f$ is represented by `value` and \f$x_j \f$ is represented by `xView[columnIdx]` (line 40). +* `reduce` - reduction is just sum of particular products and it is represented by \ref std::plus (line 53). 
+* `keep` is responsible for storing the results of reduction in each matrix row (which is represented by the variable `value`) into the output vector `y`. + The result looks as: \include DenseMatrixExample_rowsReduction_vectorProduct.out -We will show one more example which is computation of maximal absolute value in each matrix row. The results will be stored in a vector: +We will show one more example which is a computation of maximal absolute value in each matrix row. The results will be stored in a vector: \f[ y_i = \max_{j=1,\ldots,n} |a_{ij}|. @@ -1235,15 +1253,19 @@ See the following example: \includelineno DenseMatrixExample_rowsReduction_maxNorm.cpp -The `fetch` lambda function just returns absolute value of \f$a_{ij} \f$ which is represented again by the varibale `value`. The `reduce` lambda function returns larger of given values and the lambda function 'keep' stores the results to the output vector the same way as in the previous example. Of course, if we compute the maximum of all output vector elements we get some kind of maximal matrix norm. The output looks as: +The lambda functions are: + +* `fetch` lambda function just returns absolute value of \f$a_{ij} \f$ which is represented again by the variable `value`. +* `reduce` lambda function returns larger of given values. +* `keep` stores the results to the output vector the same way as in the previous example. + +Note, that the idempotent value for the reduction is \ref std::numeric_limits< double >::lowest. Of course, if we compute the maximum of all output vector elements, we get some kind of maximal matrix norm. The output looks as: \include DenseMatrixExample_rowsReduction_maxNorm.out ### Sparse matrices example -The *flexible parallel reduction* in rows for sparse matrices is very simmilar to the one for dense matrices. 
It consits of three lambda functions: - -See the following example: +The following example demonstrates sparse matrix-vector product: \includelineno SparseMatrixExample_rowsReduction_vectorProduct.cpp @@ -1261,43 +1283,13 @@ On the lines 11-16 we set the following matrix: \right) \f] -Next we prepare input (`x`) and output (`y`) vectors on the lines 21 and 22 and set all elements of the input vector to one (line 27). Since we will need to access these vectors in lambda functions we prepare their views on lines 32 and 33. On the lines 39-41, we define the `fetch` lambda function. It receives three arguments: - -1. `rowIdx` is a row index of the matrix element being currently processed. -2. `columnIdx` is a column index of the matrix elements being currently processed. -3. `value` is a value of the matrix element being currently procesed. - -We ommit the row index and take the column index which indicates index of the element of the input vector we need to fetch (`xView[ columnIdx ]`). We take its value and multiply it with the value (`value`) of the current matrix element. We do not need to write lambda function for reduction since it is only summation of the intermediate results from the `fetch` lamda and we can use `std::plus<>{}` (see the line 60). The `keep` lambda function offers two parameters: - -1. `rowIdx` tells the index of the matrix row for which we aim to store the result. -2. `value` is the result obtained in the given matrix row. - -In our example, we just write the result into appropriate element of the output vector `y` which is given just by the row index `rowIdx` -- see the line 47. On the line 53 we start the computation of the matrix-vector product. The method `rowsReduction` (\ref TNL::Matrices::SparseMatrix::rowsReduction) accepts the following arguments: - -1. `begin` is the begining of the matrix rows range on which the reduction will be performed. -2. `end` is the end of the matrix rows range on which the reduction will be performed. 
The last matrix row which is going to be processed has index `end-1`. -3. `fetch` is the fetch lambda function. -4. `reduce` is the the lmabda function performing the reduction. -5. `keep` is the lambda function responsible for processing the results from particular matrix rows. -6. `zero` is the "zero" element of given reduction opertation also known as *idempotent*. It is really 0 for summation in our example (adding zero to any number does not change the result). - -At the end we print the matrix, the input and the output vector -- lines 55-57. The result looks as follows: +The lambda functions on the lines 39-48 are the same as in the example with the dense matrix. The result looks as follows: \include SparseMatrixExample_rowsReduction_vectorProduct.out ### Tridiagonal matrices example -The *flexible parallel reduction* in rows for tridiagonal matrices is also simmilar as for dense and sparse matrices. It is represented by three lambda functions: - -1. `fetch` reads and preproces data entering the flexible parallel reduction. -2. `reduce` performs the reduction operation. -3. `keep` stores the results from each matrix row. - -See the following example: - -\includelineno TridiagonalMatrixExample_rowsReduction.cpp - -Here we first set tridiagonal matrix (lines 10-27) which looks as +In this example, we will compute maximal absolute value in each row of the following tridiagonal matrix: \f[ \left( @@ -1311,45 +1303,23 @@ Here we first set tridiagonal matrix (lines 10-27) which looks as \right). \f] -Next we want to compute maximal absolute value of the nonzero matrix elements in each row. We allocate the vector `rowMax` where we will store the results (line 32). The lambda function `fetch` (lines 42-44) is responsible for reading the matrix elements. It receives three arguments: - -1. `rowIdx` is a row index of the matrix element being currently processed. -2. `columnIdx` is a column index of the matrix elements being currently processed. -3. 
`value` is a value of the matrix element being currently procesed. - -In our example, the only thing this function has to do, is to compute the absolute value of each matrix element represented by variable `value`. The next lambda function, `reduce` (lines 49-51), performs reduction operation. In this case, it returns maximum of two input values `a` and `b`. Finaly, the lambda function `keep` (lines 56-58) is defined with the following parameters: +The source code reads as follows: -1. `rowIdx` tells the index of the matrix row for which we aim to store the result. -2. `value` is the result obtained in the given matrix row. +\includelineno TridiagonalMatrixExample_rowsReduction.cpp -In our example, it just takes the result of the reduction in variable `value` in each row and stores it into the vector `rowMax` via related vector view `rowMaxView`. +Here we first set the tridiagonal matrix (lines 10-27). Next we allocate the vector `rowMax` where we will store the results (line 32). The lambda function are: -The method `rowsReduction` (\ref TNL::Matrices::SparseMatrix::rowsReduction) activates all the mantioned lambda functions (line 63). It accepts the following arguments: +* `fetch` (lines 42-44) is responsible for reading the matrix elements. In our example, the only thing this function has to do, is to compute the absolute value of each matrix element represented by variable `value`. +* `reduce` (lines 49-51), performs reduction operation. In this case, it returns maximum of two input values `a` and `b`. +* `keep` (lines 56-58) takes the result of the reduction in variable `value` in each row and stores it into the vector `rowMax` via related vector view `rowMaxView`. -1. `begin` is the begining of the matrix rows range on which the reduction will be performed. -2. `end` is the end of the matrix rows range on which the reduction will be performed. The last matrix row which is going to be processed has index `end-1`. -3. `fetch` is the fetch lambda function. -4. 
`reduce` is the the lmabda function performing the reduction. -5. `keep` is the lambda function responsible for processing the results from particular matrix rows. -6. `zero` is the "zero" element of given reduction opertation also known as *idempotent*. In our example, the role of this element has the lowest number of given type which we can obtain using function `std::numeric_limits< double >::lowest()` from STL. - - The results looks as follows: +Note, that the idempotent value for the reduction is \ref std::numeric_limits< double >::lowest. The results looks as follows: \include TridiagonalMatrixExample_rowsReduction.out ### Multidiagonal matrices example -The flexible parallel reduction in rows for multidiagonal matrices works the same way as for other matrix types. It consits of three lambda functions: - -1. `fetch` reads and preproces data entering the flexible parallel reduction. -2. `reduce` performs the reduction operation. -3. `keep` stores the results from each matrix row. - -See the following example: - -\includelineno MultidiagonalMatrixExample_rowsReduction.cpp - -On the lines 10-29, we first create the following matrix +The next example computes again the maximal absolute value in each row. Now, we do it for multidiagonal matrix the following form: \f[ \left( @@ -1363,23 +1333,37 @@ On the lines 10-29, we first create the following matrix \right) \f] -and we aim to compute maximal value in each row. We first create vector `rowMax` into which we will store the results and fetch it view `rowMaxView` (line 39). Next we prepare necessary lambda functions: +We first create vector `rowMax` into which we will store the results and fetch it view `rowMaxView` (line 39). Next we prepare necessary lambda functions: * `fetch` (lines 44-46) is responsible for reading the matrix element value which is stored in the constant reference `value` and for returning its absolute value. 
The other parameters `rowIdx` and `columnIdx` correspond to row and column indexes respectively and they are omitted in our example. * `reduce` (lines 51-53) returns maximum value of the two input values `a` and `b`. -* `keep` (line 58-60) stores the input `value` at the corresponding position, given by the row index `rowIdx`, in the ouput vector view `rowMaxView`. +* `keep` (line 58-60) stores the input `value` at the corresponding position, given by the row index `rowIdx`, in the output vector view `rowMaxView`. -Finaly we call the method `rowsReduction` (\ref TNL::Matrices::MultidiagonalMatrix::rowsReduction) with parameters telling the interval of rows to be processed (the first and second parameter), the lambda functions `fetch`, `reduce` and `keep`, and the idempotent element for the reduction operation which is the lowest number of given type (\ref std::numeric_limits< double >::lowest ). The result looks as follows: +Finally, we call the method `rowsReduction` (\ref TNL::Matrices::MultidiagonalMatrix::rowsReduction) with parameters telling the interval of rows to be processed (the first and second parameter), the lambda functions `fetch`, `reduce` and `keep`, and the idempotent element for the reduction operation which is the lowest number of given type (\ref std::numeric_limits< double >::lowest ). The result looks as follows: \include MultidiagonalMatrixExample_rowsReduction.out ### Lambda matrices example -The reduction of matrix rows is available for the lambda matrices as well. See the follogin example: +The reduction of matrix rows is available for the lambda matrices as well. See the following example: \includelineno LambdaMatrixExample_rowsReduction.cpp -On the lines 14-21, we create the same lower trianguilar lambda matrix as in the previous example. As we did it in similar examples for other matrix types, we want to compute maximal absolute value of matrix elements in each row. 
For this purpose we define well known lambda functions: +On the lines 14-21, we create the lower triangular lambda matrix which looks as follows: + +\f[ +\left( +\begin{array}{ccccc} +1 & & & & \\ +2 & 1 & & & \\ +3 & 2 & 1 & & \\ +4 & 3 & 2 & 1 & \\ +5 & 4 & 3 & 2 & 1 +\end{array} +\right) +\f] + +We want to compute maximal absolute value of matrix elements in each row. For this purpose we define well known lambda functions: * `fetch` takes the value of the lambda matrix element and returns its absolute value. * `reduce` computes maximum value of two input variables. @@ -1391,237 +1375,32 @@ Note that the interface of the lambda functions is the same as for other matrix ## Matrix-vector product -### Dense matrix - -One of the most important matrix operation is the matrix-vector multiplication. It is represented by a method `vectorProduct` (\ref TNL::Matrices::DenseMatrix::vectorProduct). It is templated method with two template parameters `InVector` and `OutVector` telling the types of input and output vector respectively. Usually one will substitute some of \ref TNL::Containers::Array, \ref TNL::Containers::ArrayView, \ref TNL::Containers::Vector or \ref TNL::Containers::VectorView for these types. The method accepts the following parameters: +One of the most important matrix operation is the matrix-vector multiplication. It is represented by a method `vectorProduct` (\ref TNL::Matrices::DenseMatrix::vectorProduct, \ref TNL::Matrices::SparseMatrix::vectorProduct, \ref TNL::Matrices::TridiagonalMatrix::vectorProduct, \ref TNL::Matrices::MultidiagonalMatrix::vectorProduct, \ref TNL::Matrices::LambdaMatrix::vectorProduct). It is templated method with two template parameters `InVector` and `OutVector` telling the types of input and output vector respectively. Usually one will substitute some of \ref TNL::Containers::Array, \ref TNL::Containers::ArrayView, \ref TNL::Containers::Vector or \ref TNL::Containers::VectorView for these types. 
The method accepts the following parameters: -* `inVector` is the input vector having the same number of elements as the number of matrix columns. -* `outVector` is the output vector having the same number of elements as the number of matrix rows. -* `matrixMultiplicator` is a number by which the result of matrix-vector product is multiplied. -* `outVectorMultiplicator` is a number by which the output vector is multiplied before added to the result of matrix-vector product. -* `begin` is an index of the first matrix row that is involved in the multiplication. It is zero be default. -* `end` is an index of the last matrix row that is involved in the multiplication. It is the last matrix row by default. +1. `inVector` is the input vector having the same number of elements as the number of matrix columns. +2. `outVector` is the output vector having the same number of elements as the number of matrix rows. +3. `matrixMultiplicator` is a number by which the result of matrix-vector product is multiplied. +4. `outVectorMultiplicator` is a number by which the output vector is multiplied before added to the result of matrix-vector product. +5. `begin` is the beginning of the matrix rows range on which we compute the matrix-vector product. +6. `end` is the end of the matrix rows range on which the matrix-vector product will be evaluated. The last matrix row which is going to be processed has index `end-1`. -Note that the ouput vector dimension must be the same as the number of matrix rows no matter how we set `begin` and `end` parameters. These parameters just say that some matrix rows and the output vector elements are omitted. +Note that the output vector dimension must be the same as the number of matrix rows no matter how we set `begin` and `end` parameters. These parameters just say that some matrix rows and the output vector elements are omitted. 
To summarize, this method computes the following formula: `outVector = matrixMultiplicator * ( *this ) * inVector + outVectorMultiplicator * outVector.` -### Sparse matrix - -As we mentioned already in the part explaining the dense matrices, matrix-vector multiplication or in this case sparse matrix-vector multiplication ([SpMV](https://en.wikipedia.org/wiki/Sparse_matrix-vector_multiplication)) is one of the most important operations in numerical mathematics and high-performance computing. It is represented by a method `vectorProduct` (\ref TNL::Matrices::SparseMatrix::vectorProduct). It is templated method with two template parameters `InVector` and `OutVector` telling the types of input and output vector respectively. Usually one will substitute some of \ref TNL::Containers::Array, \ref TNL::Containers::ArrayView, \ref TNL::Containers::Vector or \ref TNL::Containers::VectorView for these types. The method computes the following formula - -``` -outVector = matrixMultiplicator * ( *this ) * inVector + outVectorMultiplicator * outVector -``` - -and it accepts the following parameters: - -* `inVector` is the input vector having the same number of elements as the number of matrix columns. -* `outVector` is the output vector having the same number of elements as the number of matrix rows. -* `matrixMultiplicator` is a number by which the result of matrix-vector product is multiplied. -* `outVectorMultiplicator` is a number by which the output vector is multiplied before added to the result of matrix-vector product. -* `begin` is an index of the first matrix row that is involved in the multiplication. It is zero be default. -* `end` is an index of the last matrix row that is involved in the multiplication. It is the last matrix row by default. - -Note that the ouput vector dimension must be the same as the number of matrix rows no matter how we set `begin` and `end` parameters. These parameters just say that some matrix rows and the output vector elements are omitted. 
- -### Tridiagonal matrix - -Similar to dense and sparse matrices, matrix-vector multiplication is represented by a method `vectorProduct` (\ref TNL::Matrices::TridiagonalMatrix::vectorProduct). It is templated method with two template parameters `InVector` and `OutVector` telling the types of input and output vector respectively. Usually one will substitute some of \ref TNL::Containers::Array, \ref TNL::Containers::ArrayView, \ref TNL::Containers::Vector or \ref TNL::Containers::VectorView for these types. The method computes the following formula - -``` -outVector = matrixMultiplicator * ( *this ) * inVector + outVectorMultiplicator * outVector -``` - -and it accepts the following parameters: - -* `inVector` is the input vector having the same number of elements as the number of matrix columns. -* `outVector` is the output vector having the same number of elements as the number of matrix rows. -* `matrixMultiplicator` is a number by which the result of matrix-vector product is multiplied. -* `outVectorMultiplicator` is a number by which the output vector is multiplied before added to the result of matrix-vector product. -* `begin` is an index of the first matrix row that is involved in the multiplication. It is zero be default. -* `end` is an index indicating the last matrix row that is involved in the multiplication which is `end - 1`. It is the number of matrix rows. - -Note that the output vector dimension must be the same as the number of matrix rows no -matter how we set `begin` and `end` parameters. These parameters just say that some matrix rows and the output vector elements are omitted. - -### Multidiagonal matrix - - -Similar to matrix types, matrix-vector multiplication is represented by the method `vectorProduct` (\ref TNL::Matrices::MultidiagonalMatrix::vectorProduct). It is templated method with two template parameters `InVector` and `OutVector` telling the types of the input and output vector respectively. 
Usually one will substitute some of \ref TNL::Containers::Array, \ref TNL::Containers::ArrayView, \ref TNL::Containers::Vector or \ref TNL::Containers::VectorView for these types. The method computes the following formula - -``` -outVector = matrixMultiplicator * ( *this ) * inVector + outVectorMultiplicator * outVector -``` - -and it accepts the following parameters: - -* `inVector` is the input vector having the same number of elements as the number of matrix columns. -* `outVector` is the output vector having the same number of elements as the number of matrix rows. -* `matrixMultiplicator` is a number by which the result of matrix-vector product is multiplied. -* `outVectorMultiplicator` is a number by which the output vector is multiplied before it is added to the result of matrix-vector product. -* `begin` is an index of the first matrix row that is involved in the multiplication. It is zero be default. -* `end` is an index indicating the last matrix row that is involved in the multiplication which is `end - 1`. It is the number of matrix rows. - -Note that the output vector dimension must be the same as the number of matrix rows no matter how we set `begin` and `end` parameters. These parameters just say that some matrix rows and the output vector elements are omitted. - -### Lambda matrix - -The matrix-vector multiplication is represented by the method `vectorProduct` (\ref TNL::Matrices::LambdaMatrix::vectorProduct). It is templated method with two template parameters `InVector` and `OutVector` telling the types of the input and output vector respectively. Usually one will substitute some of \ref TNL::Containers::Array, \ref TNL::Containers::ArrayView, \ref TNL::Containers::Vector or \ref TNL::Containers::VectorView for these types. 
The method computes the following formula - -``` -outVector = matrixMultiplicator * ( *this ) * inVector + outVectorMultiplicator * outVector -``` - -and it accepts the following parameters: - -* `inVector` is the input vector having the same number of elements as the number of matrix columns. -* `outVector` is the output vector having the same number of elements as the number of matrix rows. -* `matrixMultiplicator` is a number by which the result of matrix-vector product is multiplied. -* `outVectorMultiplicator` is a number by which the output vector is multiplied before it is added to the result of matrix-vector product. -* `begin` is an index of the first matrix row that is involved in the multiplication. It is zero be default. -* `end` is an index indicating the last matrix row that is involved in the multiplication which is `end - 1`. It is the number of matrix rows. - -Note that the output vector dimension must be the same as the number of matrix rows no matter how we set `begin` and `end` parameters. These parameters just say that some matrix rows and the output vector elements are omitted. - ## Matrix I/O operations -### Dense matrix - -The dense matrix can be saved to a file using a method `save` (\ref TNL::Matrices::DenseMatrix::save) and restored with a method `load` (\ref TNL::Matrices::DenseMatrix::load). To print the matrix, there is a method `print` (\ref TNL::Matrices::DenseMatrix::print) can be used. - -### Sparse matrix -The sparse matrix can be saved to a file using a method `save` (\ref TNL::Matrices::SparseMatrix::save) and restored with a method `load` (\ref TNL::Matrices::SparseMatrix::load). For printing the matrix, there is a method `print` (\ref TNL::Matrices::SparseMatrix::print) can be used. - -### Tridiagonal matrix IO - -The tridiagonal matrix can be saved to a file using a method `save` (\ref TNL::Matrices::TridiagonalMatrix::save) and restored with a method `load` (\ref TNL::Matrices::TridiagonalMatrix::load). 
For printing the matrix, there is a method `print` (\ref TNL::Matrices::TridiagonalMatrix::print) can be used. - -### Multidiagonal matrix IO - -The multidiagonal matrix can be saved to a file using a method `save` (\ref TNL::Matrices::MultiidiagonalMatrix::save) and restored with a method `load` (\ref TNL::Matrices::MultidiagonalMatrix::load). For printing the matrix, there is a method `print` (\ref TNL::Matrices::MultidiagonalMatrix::print) can be used. - -### Lambda matrix IO - -The lambda matrix, can be printed by the means of the method `print` (\ref TNL::Matrices::LambdaMatrix::print). The lambda matrix do not offer the methods `save` and `load` since it does not manage any data. Of course, the lambda function evaluating the matrix elements can use any supporting data containers but it is up these containers to manage the IO operations. - -## Matrix view - -### Dense matrix view - -Similar to array view (\ref TNL::Containers::ArayView) and vector view (\ref TNL::Containers::VectorView), matrices also offer their view for easier use with lambda functions. For the dense matrix there is a `DenseMatrixView` (\ref TNL::Matrices::DenseMatrixView) which is a templated class with the following template arguments (they are the same as for `DenseMatrix` -- \ref TNL::Matrices::DenseMatrix -- except of the allocator): - -* `Real` is a type of matrix elements. -* `Device` is a device on which the matrix is allocated. This can be \ref TNL::Devices::Host or \ref TNL::Devices::Cuda. -* `Index` is a type for indexing the matrix elements and also row and column indexes. -* `Organization` tells the ordering of matrix elements in memory. It is either RowMajorOrder or ColumnMajorOrder. - -The first main reason for using the dense matrix view is its ability to be captured by lambda functions since the copy constructor makes only shallow copy. We will demonstrate it on the example showing the method `setElement` (\ref TNL::Matrices::DenseMatrix::setElement). 
However, the `SharedPointer` will be replaced with the `DenseMatrixView`. The code looks as follows: - -\includelineno DenseMatrixViewExample_setElement.cpp - -You can see that we do not need to use the shared pointer (\ref TNL::Pointers::SharedPointer) as we did in the example demonstrating the method `setElement` for dense matrix. And the result is: - -\include DenseMatrixViewExample_setElement.out - -The second reason for using the `DenseMatrixView` is to encapsulate data allocated by some other library or program then TNL. The following example demonstrates how to do it: - -\includelineno DenseMatrixViewExample_data_encapsulation.cpp - -On the lines 18--34 we create matrix by allocating array `data` and filling the matrix using a formula \f$ a_{ij} = i * size + j + 1\f$. We do it first on the host (lines 18--21) in auxilliary array `host_data` to make initiation of the array `data` easier in case when `Device` is GPU. Next, depending on the argument `Device`, we allocate the array `data` on the host or on GPU and copy data from the arary `host_data` to the array `data`. To insert this array into the dense matrix view, we first need to encapsulate it with vector view (\ref TNL::Conatianers::VectorView) `dataView` on the line 39 which can be then used to create the dense matrix view `matrix` on the line 40. Note that wee must set proper matrix elements organizationa which is `RowMajorOrder` (\ref TNL::Algorithms::Segments::RowMajorOrder) in this example. Next, we print the matrix to see if the encapsulation was succesfull (lines 42 and 43) and finaly we demonstrate manipulation with matrix elements (lines 45--48) and we print the result (lines 50 and 51). - -The result looks as follows: - -\include DenseMatrixViewExample_data_encapsulation.out - -The dense matrix view offers almost all methods which the dense matrix does. So it can be easily used at almost any situation the same way as the dense matrix itself. 
- -### Sparse matrix view - -Sparse matrix view serves, simillar to other views in TNL, to data sharing and for use with lambda functions (views can be easily captured since they make only shallow copy). The sparse matrix view (\ref TNL::Matrices::SparseMatrixView) is templated class having the following template arguments (they are the same as for `SparseMatrix` -- \ref TNL::Matrices::SparseMatrix -- except of the allocators): - -* `Real` is type if the matrix elements. It is `double` by default. -* `Device` is a device where the matrix is allocated. Currently it can be either \ref TNL::Devices::Host for CPU or \ref TNL::Devices::Cuda for GPU supporting CUDA. It is \ref TNL::Devices::Host by default. -* `Index` is a type to be used for indexing of the matrix elements. It is `int` by default. -* `MatrixType` tells if the matrix is symmetric (\ref TNL::Matrices::SymmetricMatrix) or general (\ref TNL::Matrices::GeneralMatrix). It is a \ref TNL::Matrices::GeneralMatrix by default. -* `Segments` define the format of the sparse matrix. It can be (by default, it is \ref TNL::Algorithms::Segments::CSR): - * \ref TNL::Algorithms::Segments::CSR for [CSR format](https://en.wikipedia.org/wiki/Sparse_matrix#Compressed_sparse_row_(CSR,_CRS_or_Yale_format)). - * \ref TNL::Algorithms::Segments::Ellpack for [Ellpack format](http://mgarland.org/files/papers/nvr-2008-004.pdf). - * \ref TNL::Algorithms::Segments::SlicedEllpack for [SlicedEllpack format](https://link.springer.com/chapter/10.1007/978-3-642-11515-8_10) which was also presented as [Row-grouped CSR format](https://arxiv.org/abs/1012.2270). - * \ref TNL::Algorithms::Segments::ChunkedEllpack for [ChunkedEllpack format](http://geraldine.fjfi.cvut.cz/~oberhuber/data/vyzkum/publikace/12-heller-oberhuber-improved-rgcsr-format.pdf) which we reffered as Improved Row-grouped CSR and we renamed it to Ellpack format since it uses padding zeros. 
- * \ref TNL::Algorithms::Segments::BiEllpack for [BiEllpack format](https://www.sciencedirect.com/science/article/pii/S0743731514000458?casa_token=2phrEj0Ef1gAAAAA:Lgf6rMBUN6T7TJne6mAgI_CSUJ-jR8jz7Eghdv6L0SJeGm4jfso-x6Wh8zgERk3Si7nFtTAJngg). -* `ComputeReal` is type which is used for internal computations. By default it is the same as `Real` if `Real` is not `bool`. If `Real` is `bool`, `ComputeReal` is set to `Index` type. This can be changed, of course, by the user. - -**If `Real` is set to `bool`, we get *a binary matrix view*.** - -The following example shows the use of `SparseMatrixView` with lambda functions: - -\includelineno SparseMatrixViewExample_setElement.cpp - -The result looks as follows: - -\include SparseMatrixViewExample_setElement.out - -### Tridiagonal matrix view - -Similar to dense and sparse matrix view, tridiagonal matrix also offers its view for easier use with lambda functions. It is represented by a templated class \ref TNL::Matrices::TridiagonalMatrixView with the following template parameters: - -* `Real` is a type of matrix elements. -* `Device` is a device on which the matrix is allocated. This can be \ref TNL::Devices::Host or \ref TNL::Devices::Cuda. -* `Index` is a type for indexing the matrix elements and also row and column indexes. -* `Organization` tells the ordering of matrix elements in memory. It is either RowMajorOrder or ColumnMajorOrder. - -The first main reason for using the matrix view is its ability to be captured by lambda functions since the copy constructor makes only shallow copy. We can demonstrate it on the example showing the method `setElement` (\ref TNL::Matrices::TridiagonalMatrix::setElement). The code looks as follows: - -\includelineno TridiagonalMatrixViewExample_setElement.cpp - -The matrix view is obtained by the method `getView` (\ref TNL::Matrices::TridiagonalMatrix::getView) on the line 13. 
We firsrt show, that the view can be used the same way as common matrix (lines 14 and 15) but it can be used the same way even in lambda functions as we can see on the lines 20-26. Compare it with the same example using shared pointer instead of the matrix view: - -\includelineno TridiagonalMatrixExample_setElement.cpp - -The main disadventages are: - -1. The shared pointer must be created together with the matrix (line 14) and there is no way to get it later. The matrix view can be obtained from any matrix at any time. -2. We have to synchronize shared pointers explicitly by calling the function \ref TNL::Pointers::synchronizeSmartPointersOnDevice (line 34). - -So for the sake of using a matrix in lambda functions, the matrix view is better tool. The result of both examples looks as: - -\include TridiagonalMatrixExample_setElement.out - -As we mentioned already, the tridiagonal matrix view offers almost all methods which the tridiagonal matrix does. So it can be easily used at almost any situation the same way as the tridiagonal matrix itself. - -### Multidiagonal matrix view - -Multidiagonal matrix also offers its view for easier use with lambda functions. It is represented by a templated class \ref TNL::Matrices::MultidiagonalMatrixView with the following template parameters: - -* `Real` is a type of matrix elements. -* `Device` is a device on which the matrix is allocated. This can be \ref TNL::Devices::Host or \ref TNL::Devices::Cuda. -* `Index` is a type for indexing the matrix elements and also row and column indexes. -* `Organization` tells the ordering of matrix elements in memory. It is either RowMajorOrder or ColumnMajorOrder. - -The first main reason for using the matrix view is its ability to be captured by lambda functions since the copy constructor makes only shallow copy. We can demonstrate it on the example showing the method `setElement` (\ref TNL::Matrices::MultidiagonalMatrix::setElement). 
The code looks as follows: - -\includelineno MultidiagonalMatrixViewExample_setElement.cpp - -The matrix view is obtained by the method `getView` (\ref TNL::Matrices::MultidiagonalMatrix::getView) on the line 13. We firsrt show, that the view can be used the same way as common matrix (lines 14 and 15) but it can be used the same way even in lambda functions as we can see on the lines 20-26. Compare it with the same example using shared pointer instead of the matrix view: - -\includelineno MultidiagonalMatrixExample_setElement.cpp - -The main disadventages are: - -1. The shared pointer must be created together with the matrix (line 14) and there is no way to get it later. The matrix view can be obtained from any matrix at any time. -2. We have to synchronize shared pointers explicitly by calling the function \ref TNL::Pointers::synchronizeSmartPointersOnDevice (line 34). - -So for the sake of using a matrix in lambda functions, the matrix view is better tool. The result of both examples looks as: +All matrices can be saved to a file using a method `save` (\ref TNL::Matrices::DenseMatrix::save, \ref TNL::Matrices::SparseMatrix::save, \ref TNL::Matrices::TridiagonalMatrix::save, \ref TNL::Matrices::MultidiagonalMatrix::save, \ref TNL::Matrices::LambdaMatrix::save) and restored with a method `load` (\ref TNL::Matrices::DenseMatrix::load, \ref TNL::Matrices::SparseMatrix::load, \ref TNL::Matrices::TridiagonalMatrix::load, \ref TNL::Matrices::MultidiagonalMatrix::load, \ref TNL::Matrices::LambdaMatrix::load). To print the matrix, there is a method `print` (\ref TNL::Matrices::DenseMatrix::print, \ref TNL::Matrices::SparseMatrix::print, \ref TNL::Matrices::TridiagonalMatrix::print, \ref TNL::Matrices::MultidiagonalMatrix::print, \ref TNL::Matrices::LambdaMatrix::print) can be used. TNL also offers matrix reader (\ref TNL::Matrices::MatrixReader) and matrix writer (\ref TNL::Matrices::MatrixWriter) for import and export of matrices. 
We describe both in the following sections. -\include MultidiagonalMatrixExample_setElement.out +### Matrix reader +TODO: Write documentation on matrix writer. ## Appendix -- GitLab From 795dcf0f25ba920c391b1b4a801060d0b748daae Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= Date: Wed, 20 Jan 2021 11:12:07 +0100 Subject: [PATCH 50/53] CI: fixing jobs for the documentation build --- .gitlab-ci.yml | 42 +++++++++++++++++++++++++++--------------- 1 file changed, 27 insertions(+), 15 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index d9b660268..4ad81fd81 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -76,7 +76,7 @@ stages: - cmake ../.. -G Ninja -DCMAKE_BUILD_TYPE=${BUILD_TYPE} - -DCMAKE_INSTALL_PREFIX=$(pwd)/${BUILD_TYPE}_install_prefix + -DCMAKE_INSTALL_PREFIX="$(pwd)/${BUILD_TYPE}_install_prefix" -DWITH_OPENMP=${WITH_OPENMP} -DWITH_MPI=${WITH_MPI} -DWITH_CUDA=${WITH_CUDA} @@ -102,8 +102,6 @@ stages: only: changes: - src/**/*.{h,hpp,cpp,cu} - - Documentation/Examples/**/*.{h,hpp,cpp,cu} - - Documentation/Tutorials/**/*.{h,hpp,cpp,cu} - "**/CMakeLists.txt" - .gitlab-ci.yml interruptible: true @@ -117,11 +115,12 @@ dummy build job: - merge_requests except: changes: + # .build_template - src/**/*.{h,hpp,cpp,cu} - - Documentation/Examples/**/*.{h,hpp,cpp,cu} - - Documentation/Tutorials/**/*.{h,hpp,cpp,cu} - "**/CMakeLists.txt" - .gitlab-ci.yml + # build documentation + - Documentation/**/* # Cuda builds are specified first because they take more time than host-only builds, # which can be allocated on hosts whitout GPUs. 
@@ -189,12 +188,6 @@ cuda_examples_Debug: WITH_CUDA: "yes" BUILD_TYPE: Debug WITH_EXAMPLES: "yes" - # build output snippets for documentation - WITH_DOC: "yes" - # store output snippets for documentation - artifacts: - paths: - - Documentation/output_snippets/ cuda_examples_Release: extends: .build_template @@ -468,6 +461,28 @@ clang_mpi_benchmarks_tools_python_Release: +documentation examples: + extends: .build_template + stage: build:cuda + tags: + - docker + - nvidia + variables: + <<: *default_cmake_flags + WITH_CUDA: "yes" + BUILD_TYPE: Debug + # build output snippets for documentation + WITH_DOC: "yes" + only: + changes: + - Documentation/**/* + - src/TNL/**/*.{h,hpp} + - .gitlab-ci.yml + # store output snippets for documentation + artifacts: + paths: + - Documentation/output_snippets/ + build documentation: stage: build:doc only: @@ -477,16 +492,13 @@ build documentation: - .gitlab-ci.yml # use "needs" instead of "dependencies" to allow out-of-order start of this job needs: - # the job which builds Documentation/output_snippets/ - - job: cuda_examples_Debug + - job: documentation examples artifacts: true script: - ./Documentation/build artifacts: paths: - ./Documentation/html/ -# tags: -# - doxygen deploy documentation: stage: deploy -- GitLab From c6d5d4b82e2ba503b655cadda150617d95e3abc3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= Date: Wed, 20 Jan 2021 11:43:43 +0100 Subject: [PATCH 51/53] Fixed a typo --- .../Tutorials/GeneralConcepts/tutorial_GeneralConcepts.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Documentation/Tutorials/GeneralConcepts/tutorial_GeneralConcepts.md b/Documentation/Tutorials/GeneralConcepts/tutorial_GeneralConcepts.md index edab5bc8e..0e91cd6ee 100644 --- a/Documentation/Tutorials/GeneralConcepts/tutorial_GeneralConcepts.md +++ b/Documentation/Tutorials/GeneralConcepts/tutorial_GeneralConcepts.md @@ -98,7 +98,7 @@ View is a kind of lightweight reference object which makes only a 
shallow copy o The differences are on the line 5 where we fetch the view by means of method `getView` and on the line 7 where we work with the `view` and not with the array `a`. The view has very similar interface (see \ref TNL::Containers::ArrayView) as the array (\ref TNL::Containers::Array) and so mostly there is no difference in using array and its view for the programmer. In TNL, each data structure which can be accessed from GPU kernels (it means that it has methods defined as `__cuda_callable__`) provides also a method `getView` for getting appropriate view of the object. -Views are simple objects because they must be transferred to GPU in each kernel call. So there are no smart links between a view and the original object. In fact, the array view contains just a pointer the the data managed by the array and the size of the array. Therefore if the original object get changed, all views obtained from the object before may become invalid. See the following example: +Views are simple objects because they must be transferred to GPU in each kernel call. So there are no smart links between a view and the original object. In fact, the array view contains just a pointer the data managed by the array and the size of the array. Therefore if the original object get changed, all views obtained from the object before may become invalid. See the following example: \includelineno snippet_shared_pointers_and_views_capture_view_change.cpp @@ -112,4 +112,4 @@ On the line 6, we change value of the first element. This causes no data realloc ### Shared pointers -TNL offers smart pointers working across different devices (meaning CPU or GPU). \ No newline at end of file +TNL offers smart pointers working across different devices (meaning CPU or GPU). 
-- GitLab From c6405fec59d1dffcf7de0ddd0aeedda0002b37bd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= Date: Wed, 20 Jan 2021 12:17:04 +0100 Subject: [PATCH 52/53] Removed .vscode/ directory [skip ci] --- .gitignore | 3 +++ .vscode/settings.json | 28 ---------------------------- 2 files changed, 3 insertions(+), 28 deletions(-) delete mode 100644 .vscode/settings.json diff --git a/.gitignore b/.gitignore index c5045f356..15a758dbd 100644 --- a/.gitignore +++ b/.gitignore @@ -12,3 +12,6 @@ /.settings /.project /.pydevproject + +# VSCode +/.vscode diff --git a/.vscode/settings.json b/.vscode/settings.json deleted file mode 100644 index 2de193076..000000000 --- a/.vscode/settings.json +++ /dev/null @@ -1,28 +0,0 @@ -{ - "editor.tokenColorCustomizations": { - "textMateRules": [ - { - "scope": "googletest.failed", - "settings": { - "foreground": "#f00" - } - }, - { - "scope": "googletest.passed", - "settings": { - "foreground": "#0f0" - } - }, - { - "scope": "googletest.run", - "settings": { - "foreground": "#0f0" - } - } - ] - }, - "files.associations": { - "numeric": "cpp", - "ostream": "cpp" - } -} \ No newline at end of file -- GitLab From 5a1c87909ac0da7308bc9acc65945e44c6f31ae2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= Date: Wed, 20 Jan 2021 15:59:02 +0100 Subject: [PATCH 53/53] Fix of matrices documentation. 
--- Documentation/Tutorials/Matrices/tutorial_Matrices.md | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/Documentation/Tutorials/Matrices/tutorial_Matrices.md b/Documentation/Tutorials/Matrices/tutorial_Matrices.md index 89eb551ca..c6847e64c 100644 --- a/Documentation/Tutorials/Matrices/tutorial_Matrices.md +++ b/Documentation/Tutorials/Matrices/tutorial_Matrices.md @@ -1392,16 +1392,12 @@ To summarize, this method computes the following formula: ## Matrix I/O operations -All matrices can be saved to a file using a method `save` (\ref TNL::Matrices::DenseMatrix::save, \ref TNL::Matrices::SparseMatrix::save, \ref TNL::Matrices::TridiagonalMatrix::save, \ref TNL::Matrices::MultidiagonalMatrix::save, \ref TNL::Matrices::LambdaMatrix::save) and restored with a method `load` (\ref TNL::Matrices::DenseMatrix::load, \ref TNL::Matrices::SparseMatrix::load, \ref TNL::Matrices::TridiagonalMatrix::load, \ref TNL::Matrices::MultidiagonalMatrix::load, \ref TNL::Matrices::LambdaMatrix::load). To print the matrix, there is a method `print` (\ref TNL::Matrices::DenseMatrix::print, \ref TNL::Matrices::SparseMatrix::print, \ref TNL::Matrices::TridiagonalMatrix::print, \ref TNL::Matrices::MultidiagonalMatrix::print, \ref TNL::Matrices::LambdaMatrix::print) can be used. TNL also offers matrix reader (\ref TNL::Matrices::MatrixReader) and matrix writer (\ref TNL::Matrices::MatrixWriter) for import and export of matrices. We describe both in the following sections. 
+All matrices can be saved to a file using a method `save` (\ref TNL::Matrices::DenseMatrix::save, \ref TNL::Matrices::SparseMatrix::save, \ref TNL::Matrices::TridiagonalMatrix::save, \ref TNL::Matrices::MultidiagonalMatrix::save, \ref TNL::Matrices::LambdaMatrix::save) and restored with a method `load` (\ref TNL::Matrices::DenseMatrix::load, \ref TNL::Matrices::SparseMatrix::load, \ref TNL::Matrices::TridiagonalMatrix::load, \ref TNL::Matrices::MultidiagonalMatrix::load, \ref TNL::Matrices::LambdaMatrix::load). To print the matrix, there is a method `print` (\ref TNL::Matrices::DenseMatrix::print, \ref TNL::Matrices::SparseMatrix::print, \ref TNL::Matrices::TridiagonalMatrix::print, \ref TNL::Matrices::MultidiagonalMatrix::print, \ref TNL::Matrices::LambdaMatrix::print) can be used. TNL also offers matrix reader (\ref TNL::Matrices::MatrixReader) for import of matrices. We describe it in the following sections. ### Matrix reader - -TODO: Write documentation on matrix writer. - ## Appendix ### Benchmark of dense matrix setup -- GitLab