diff --git a/Documentation/Examples/CMakeLists.txt b/Documentation/Examples/CMakeLists.txt
index 45689f9e93ce4284ac5c622715bb9c8cf54961c4..ca8662ad0aaa59c7be58ea5c1db3b92fddadde28 100644
--- a/Documentation/Examples/CMakeLists.txt
+++ b/Documentation/Examples/CMakeLists.txt
@@ -1,6 +1,7 @@
 ADD_SUBDIRECTORY( Algorithms )
 ADD_SUBDIRECTORY( Containers )
 ADD_SUBDIRECTORY( Pointers )
+ADD_SUBDIRECTORY( Matrices )
 
 ADD_EXECUTABLE( FileExample FileExample.cpp )
 ADD_CUSTOM_COMMAND( COMMAND FileExample > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/FileExample.out OUTPUT FileExample.out )
diff --git a/Documentation/Examples/Matrices/CMakeLists.txt b/Documentation/Examples/Matrices/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..e0d7a6f4217ba7a210350b951e394abbfa06d66a
--- /dev/null
+++ b/Documentation/Examples/Matrices/CMakeLists.txt
@@ -0,0 +1,42 @@
+# Names of the dense matrix examples without the common DenseMatrixExample_ prefix.
+# Each entry NAME is built from DenseMatrixExample_NAME.cu when BUILD_CUDA is set
+# and from DenseMatrixExample_NAME.cpp otherwise.
+SET( DENSE_MATRIX_EXAMPLES
+   Constructor_init_list
+   setElements
+   getCompressedRowLengths
+   getElementsCount
+   getConstRow
+   getRow
+   setElement
+   addElement
+   getElement
+   rowsReduction
+   allRowsReduction
+   forRows
+   forAllRows
+)
+
+# OUTPUT names of all custom commands; RunMatricesExamples depends on them.
+SET( DENSE_MATRIX_EXAMPLES_OUTPUTS "" )
+
+FOREACH( EXAMPLE_NAME ${DENSE_MATRIX_EXAMPLES} )
+   SET( EXAMPLE DenseMatrixExample_${EXAMPLE_NAME} )
+   IF( BUILD_CUDA )
+      # CUDA targets carry the _cuda suffix; the output file name does not.
+      SET( EXAMPLE_TARGET ${EXAMPLE}_cuda )
+      CUDA_ADD_EXECUTABLE( ${EXAMPLE_TARGET} ${EXAMPLE}.cu )
+   ELSE()
+      SET( EXAMPLE_TARGET ${EXAMPLE} )
+      ADD_EXECUTABLE( ${EXAMPLE_TARGET} ${EXAMPLE}.cpp )
+   ENDIF()
+
+   # Run the example and redirect its standard output into the snippets directory.
+   ADD_CUSTOM_COMMAND( COMMAND ${EXAMPLE_TARGET} >
+                        ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/${EXAMPLE}.out
+                       OUTPUT ${EXAMPLE}.out )
+   LIST( APPEND DENSE_MATRIX_EXAMPLES_OUTPUTS ${EXAMPLE}.out )
+ENDFOREACH()
+
+ADD_CUSTOM_TARGET( RunMatricesExamples ALL DEPENDS ${DENSE_MATRIX_EXAMPLES_OUTPUTS} )
+
diff --git a/Documentation/Examples/Matrices/DenseMatrixExample_Constructor_init_list.cpp b/Documentation/Examples/Matrices/DenseMatrixExample_Constructor_init_list.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..91426a6f12a3ee4c3bcd3f736524eda93986f826
--- /dev/null
+++ b/Documentation/Examples/Matrices/DenseMatrixExample_Constructor_init_list.cpp
@@ -0,0 +1,47 @@
+#include <iostream>
+#include <TNL/Matrices/DenseMatrix.h>
+#include <TNL/Devices/Host.h>
+#include <TNL/Devices/Cuda.h>
+
+/***
+ * Creates two dense matrices on the given device from initializer lists and
+ * prints them to the standard output.
+ */
+template< typename Device >
+void initializerListExample()
+{
+   /***
+    * Every row of the initializer list has the same number of elements.
+    */
+   TNL::Matrices::DenseMatrix< double, Device > matrix {
+      {  1,  2,  3,  4,  5,  6 },
+      {  7,  8,  9, 10, 11, 12 },
+      { 13, 14, 15, 16, 17, 18 }
+   };
+
+   std::cout << "General dense matrix: " << std::endl << matrix << std::endl;
+
+   /***
+    * The rows of the initializer list may also differ in length.
+    */
+   TNL::Matrices::DenseMatrix< double, Device > triangularMatrix {
+      {  1 },
+      {  2,  3 },
+      {  4,  5,  6 },
+      {  7,  8,  9, 10 },
+      { 11, 12, 13, 14, 15 }
+   };
+
+   std::cout << "Triangular dense matrix: " << std::endl << triangularMatrix << std::endl;
+}
+
+int main( int argc, char* argv[] )
+{
+   std::cout << "Creating matrices on CPU ... " << std::endl;
+   initializerListExample< TNL::Devices::Host >();
+
+#ifdef HAVE_CUDA
+   std::cout << "Creating matrices on CUDA GPU ... " << std::endl;
+   initializerListExample< TNL::Devices::Cuda >();
+#endif
+}
diff --git a/Documentation/Examples/Matrices/DenseMatrixExample_Constructor_init_list.cu b/Documentation/Examples/Matrices/DenseMatrixExample_Constructor_init_list.cu
new file mode 120000
index 0000000000000000000000000000000000000000..91fa4f073fdd92c38ccf252d84f3ec4b953aa9dc
--- /dev/null
+++ b/Documentation/Examples/Matrices/DenseMatrixExample_Constructor_init_list.cu
@@ -0,0 +1 @@
+DenseMatrixExample_Constructor_init_list.cpp
\ No newline at end of file
diff --git a/Documentation/Examples/Matrices/DenseMatrixExample_addElement.cpp b/Documentation/Examples/Matrices/DenseMatrixExample_addElement.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..32e39e6a3ec5fb55618ed54523db22f34ed0ebbd
--- /dev/null
+++ b/Documentation/Examples/Matrices/DenseMatrixExample_addElement.cpp
@@ -0,0 +1,39 @@
+#include <iostream>
+#include <TNL/Matrices/DenseMatrix.h>
+#include <TNL/Devices/Host.h>
+#include <TNL/Devices/Cuda.h>
+
+/***
+ * Sets the diagonal of a 5x5 dense matrix, then calls addElement on every
+ * element and prints the matrix before and after.
+ */
+template< typename Device >
+void addElements()
+{
+   TNL::Matrices::DenseMatrix< double, Device > matrix( 5, 5 );
+   for( int i = 0; i < 5; i++ )
+      matrix.setElement( i, i, i );
+
+   std::cout << "Initial matrix is: " << matrix << std::endl;
+
+   /***
+    * NOTE(review): the last two arguments are presumably the value to add and
+    * a multiplicator of the current element - confirm with DenseMatrix::addElement.
+    */
+   for( int i = 0; i < 5; i++ )
+      for( int j = 0; j < 5; j++ )
+         matrix.addElement( i, j, 1.0, 5.0 );
+
+   std::cout << "Matrix after addition is: " << matrix << std::endl;
+}
+
+int main( int argc, char* argv[] )
+{
+   std::cout << "Add elements on host:" << std::endl;
+   addElements< TNL::Devices::Host >();
+
+#ifdef HAVE_CUDA
+   std::cout << "Add elements on CUDA device:" << std::endl;
+   addElements< TNL::Devices::Cuda >();
+#endif
+}
diff --git a/Documentation/Examples/Matrices/DenseMatrixExample_addElement.cu b/Documentation/Examples/Matrices/DenseMatrixExample_addElement.cu
new file mode 120000
index 0000000000000000000000000000000000000000..dd83670e4536d2d9cb33cccce30ca397c60587ac
--- /dev/null
+++ b/Documentation/Examples/Matrices/DenseMatrixExample_addElement.cu
@@ -0,0 +1 @@
+DenseMatrixExample_addElement.cpp
\ No newline at end of file
diff --git a/Documentation/Examples/Matrices/DenseMatrixExample_allRowsReduction.cpp b/Documentation/Examples/Matrices/DenseMatrixExample_allRowsReduction.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..ce323671fc86e77ae19c513487b41148afc9cf84
--- /dev/null
+++ b/Documentation/Examples/Matrices/DenseMatrixExample_allRowsReduction.cpp
@@ -0,0 +1,71 @@
+#include <iostream>
+#include <iomanip>
+#include <functional>
+#include <limits>
+#include <TNL/Matrices/DenseMatrix.h>
+#include <TNL/Devices/Host.h>
+#include <TNL/Devices/Cuda.h>
+
+/***
+ * Finds the largest absolute value in each row of a dense matrix by means of
+ * DenseMatrix::allRowsReduction and prints the resulting vector.
+ */
+template< typename Device >
+void allRowsReduction()
+{
+   TNL::Matrices::DenseMatrix< double, Device > matrix {
+      {  1,  0,  0,  0,  0 },
+      {  1,  2,  0,  0,  0 },
+      {  0,  1,  8,  0,  0 },
+      {  0,  0,  1,  9,  0 },
+      {  0,  0,  0,  0,  1 } };
+
+   /***
+    * Vector for the largest element of each row.
+    */
+   TNL::Containers::Vector< double, Device > rowMax( matrix.getRows() );
+
+   /***
+    * Prepare vector view for the keep lambda.
+    */
+   auto rowMaxView = rowMax.getView();
+
+   /***
+    * Fetch lambda just returns absolute value of matrix elements.
+    */
+   auto fetch = [=] __cuda_callable__ ( int rowIdx, int columnIdx, const double& value ) -> double {
+      return TNL::abs( value );
+   };
+
+   /***
+    * Reduce lambda returns maximum of given values.
+    */
+   auto reduce = [=] __cuda_callable__ ( double& a, const double& b ) -> double {
+      return TNL::max( a, b );
+   };
+
+   /***
+    * Keep lambda stores the largest value in each row to the vector rowMax.
+    */
+   auto keep = [=] __cuda_callable__ ( int rowIdx, const double& value ) mutable {
+      rowMaxView[ rowIdx ] = value;
+   };
+
+   /***
+    * Compute the largest values in each row.
+    */
+   matrix.allRowsReduction( fetch, reduce, keep, std::numeric_limits< double >::lowest() );
+
+   std::cout << "Max. elements in rows are: " << rowMax << std::endl;
+}
+
+int main( int argc, char* argv[] )
+{
+   std::cout << "All rows reduction on host:" << std::endl;
+   allRowsReduction< TNL::Devices::Host >();
+
+#ifdef HAVE_CUDA
+   std::cout << "All rows reduction on CUDA device:" << std::endl;
+   allRowsReduction< TNL::Devices::Cuda >();
+#endif
+}
diff --git a/Documentation/Examples/Matrices/DenseMatrixExample_allRowsReduction.cu b/Documentation/Examples/Matrices/DenseMatrixExample_allRowsReduction.cu
new file mode 120000
index 0000000000000000000000000000000000000000..70f517f68bde7c679d39e1e315879355f0f366a8
--- /dev/null
+++ b/Documentation/Examples/Matrices/DenseMatrixExample_allRowsReduction.cu
@@ -0,0 +1 @@
+DenseMatrixExample_allRowsReduction.cpp
\ No newline at end of file
diff --git a/Documentation/Examples/Matrices/DenseMatrixExample_forAllRows.cpp b/Documentation/Examples/Matrices/DenseMatrixExample_forAllRows.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..5fddf0f34ff789591802da01c143c3600baeadf3
--- /dev/null
+++ b/Documentation/Examples/Matrices/DenseMatrixExample_forAllRows.cpp
@@ -0,0 +1,42 @@
+#include <iostream>
+#include <TNL/Matrices/DenseMatrix.h>
+#include <TNL/Devices/Host.h>
+#include <TNL/Devices/Cuda.h>
+
+/***
+ * Fills a 5x5 dense matrix on the given device by means of
+ * DenseMatrix::forAllRows and prints it.
+ */
+template< typename Device >
+void forAllRowsExample()
+{
+   using MatrixType = TNL::Matrices::DenseMatrix< double, Device >;
+   MatrixType matrix( 5, 5 );
+
+   /***
+    * Elements on and below the diagonal are set to rowIdx + columnIdx. For
+    * elements above the diagonal the lambda only sets compute to false.
+    */
+   auto setLowerPart = [=] __cuda_callable__ ( int rowIdx, int columnIdx, int globalIdx, double& value, bool& compute ) {
+      if( columnIdx <= rowIdx )
+         value = rowIdx + columnIdx;
+      else
+         compute = false;
+   };
+
+   matrix.forAllRows( setLowerPart );
+   std::cout << matrix << std::endl;
+}
+
+int main( int argc, char* argv[] )
+{
+   // The host variant is always built.
+   std::cout << "Creating matrix on host: " << std::endl;
+   forAllRowsExample< TNL::Devices::Host >();
+
+#ifdef HAVE_CUDA
+   // The CUDA variant is compiled only when HAVE_CUDA is defined.
+   std::cout << "Creating matrix on CUDA device: " << std::endl;
+   forAllRowsExample< TNL::Devices::Cuda >();
+#endif
+}
diff --git a/Documentation/Examples/Matrices/DenseMatrixExample_forAllRows.cu b/Documentation/Examples/Matrices/DenseMatrixExample_forAllRows.cu
new file mode 120000
index 0000000000000000000000000000000000000000..589520f796db5b9d4d637a922f8d433d79c987c7
--- /dev/null
+++ b/Documentation/Examples/Matrices/DenseMatrixExample_forAllRows.cu
@@ -0,0 +1 @@
+DenseMatrixExample_forAllRows.cpp
\ No newline at end of file
diff --git a/Documentation/Examples/Matrices/DenseMatrixExample_forRows.cpp b/Documentation/Examples/Matrices/DenseMatrixExample_forRows.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..f3e45a006adbe3f73ec5f37b89afc7f0aed81cce
--- /dev/null
+++ b/Documentation/Examples/Matrices/DenseMatrixExample_forRows.cpp
@@ -0,0 +1,43 @@
+#include <iostream>
+#include <TNL/Matrices/DenseMatrix.h>
+#include <TNL/Devices/Host.h>
+#include <TNL/Devices/Cuda.h>
+
+/***
+ * Fills a 5x5 dense matrix on the given device by means of
+ * DenseMatrix::forRows called for all rows and prints it.
+ */
+template< typename Device >
+void forRowsExample()
+{
+   using MatrixType = TNL::Matrices::DenseMatrix< double, Device >;
+   MatrixType matrix( 5, 5 );
+
+   /***
+    * Elements on and below the diagonal are set to rowIdx + columnIdx. For
+    * elements above the diagonal the lambda only sets compute to false.
+    */
+   auto setLowerPart = [=] __cuda_callable__ ( int rowIdx, int columnIdx, int globalIdx, double& value, bool& compute ) {
+      if( columnIdx <= rowIdx )
+         value = rowIdx + columnIdx;
+      else
+         compute = false;
+   };
+
+   const auto rowsCount = matrix.getRows();
+   matrix.forRows( 0, rowsCount, setLowerPart );
+   std::cout << matrix << std::endl;
+}
+
+int main( int argc, char* argv[] )
+{
+   // The host variant is always built.
+   std::cout << "Creating matrix on host: " << std::endl;
+   forRowsExample< TNL::Devices::Host >();
+
+#ifdef HAVE_CUDA
+   // The CUDA variant is compiled only when HAVE_CUDA is defined.
+   std::cout << "Creating matrix on CUDA device: " << std::endl;
+   forRowsExample< TNL::Devices::Cuda >();
+#endif
+}
diff --git a/Documentation/Examples/Matrices/DenseMatrixExample_forRows.cu b/Documentation/Examples/Matrices/DenseMatrixExample_forRows.cu
new file mode 120000
index 0000000000000000000000000000000000000000..f97a66ee329635c4522ad123e16e3a173f5d8884
--- /dev/null
+++ b/Documentation/Examples/Matrices/DenseMatrixExample_forRows.cu
@@ -0,0 +1 @@
+DenseMatrixExample_forRows.cpp
\ No newline at end of file
diff --git a/Documentation/Examples/Matrices/DenseMatrixExample_getCompressedRowLengths.cpp b/Documentation/Examples/Matrices/DenseMatrixExample_getCompressedRowLengths.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..e89992d9fafcada3738b37639c6cdb81e761ff55
--- /dev/null
+++ b/Documentation/Examples/Matrices/DenseMatrixExample_getCompressedRowLengths.cpp
@@ -0,0 +1,44 @@
+#include <iostream>
+#include <TNL/Matrices/DenseMatrix.h>
+#include <TNL/Devices/Host.h>
+#include <TNL/Devices/Cuda.h>
+
+/***
+ * Prints a dense matrix built from a triangular initializer list together
+ * with the vector filled by DenseMatrix::getCompressedRowLengths.
+ */
+template< typename Device >
+void getCompressedRowLengthsExample()
+{
+   using MatrixType = TNL::Matrices::DenseMatrix< double, Device >;
+   using RowLengthsVector = TNL::Containers::Vector< int, Device >;
+
+   MatrixType triangularMatrix {
+      {  1 },
+      {  2,  3 },
+      {  4,  5,  6 },
+      {  7,  8,  9, 10 },
+      { 11, 12, 13, 14, 15 }
+   };
+   std::cout << triangularMatrix << std::endl;
+
+   /***
+    * A default-constructed vector is handed over to getCompressedRowLengths.
+    */
+   RowLengthsVector rowLengths;
+   triangularMatrix.getCompressedRowLengths( rowLengths );
+   std::cout << "Compressed row lengths are: " << rowLengths << std::endl;
+}
+
+int main( int argc, char* argv[] )
+{
+   // The host variant is always built.
+   std::cout << "Getting compressed row lengths on host: " << std::endl;
+   getCompressedRowLengthsExample< TNL::Devices::Host >();
+
+#ifdef HAVE_CUDA
+   // The CUDA variant is compiled only when HAVE_CUDA is defined.
+   std::cout << "Getting compressed row lengths on CUDA device: " << std::endl;
+   getCompressedRowLengthsExample< TNL::Devices::Cuda >();
+#endif
+}
diff --git a/Documentation/Examples/Matrices/DenseMatrixExample_getCompressedRowLengths.cu b/Documentation/Examples/Matrices/DenseMatrixExample_getCompressedRowLengths.cu
new file mode 120000
index 0000000000000000000000000000000000000000..2b3cd6c1377adc000b69c7124e73f09d73df8176
--- /dev/null
+++ b/Documentation/Examples/Matrices/DenseMatrixExample_getCompressedRowLengths.cu
@@ -0,0 +1 @@
+DenseMatrixExample_getCompressedRowLengths.cpp
\ No newline at end of file
diff --git a/Documentation/Examples/Matrices/DenseMatrixExample_getConstRow.cpp b/Documentation/Examples/Matrices/DenseMatrixExample_getConstRow.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..08b655e55c5ecb6625dfa84e2d24f3eac65fae53
--- /dev/null
+++ b/Documentation/Examples/Matrices/DenseMatrixExample_getConstRow.cpp
@@ -0,0 +1,57 @@
+#include <iostream>
+#include <functional>
+#include <TNL/Algorithms/ParallelFor.h>
+#include <TNL/Algorithms/Reduction.h>
+#include <TNL/Matrices/DenseMatrix.h>
+#include <TNL/Devices/Host.h>
+#include <TNL/Devices/Cuda.h>
+#include <TNL/Pointers/SharedPointer.h>
+
+/***
+ * Computes the trace of a dense matrix by reading the diagonal element of
+ * each row inside a reduction and prints the result.
+ */
+template< typename Device >
+void getRowExample()
+{
+   using MatrixType = TNL::Matrices::DenseMatrix< double, Device >;
+   TNL::Pointers::SharedPointer< MatrixType > matrix {
+      { 1, 0, 0, 0, 0 },
+      { 1, 2, 0, 0, 0 },
+      { 1, 2, 3, 0, 0 },
+      { 1, 2, 3, 4, 0 },
+      { 1, 2, 3, 4, 5 }
+   };
+
+   /***
+    * Fetch lambda function returns diagonal element in each row.
+    */
+   auto fetch = [=] __cuda_callable__ ( int rowIdx ) mutable -> double {
+      auto row = matrix->getRow( rowIdx );
+      return row.getElement( rowIdx );
+   };
+
+   /***
+    * For the case when Device is CUDA device we need to synchronize smart
+    * pointers. To avoid this you may use DenseMatrixView. See
+    * DenseMatrixView::getConstRow example for details.
+    */
+   TNL::Pointers::synchronizeSmartPointersOnDevice< Device >();
+
+   /***
+    * Compute the matrix trace.
+    */
+   int trace = TNL::Algorithms::Reduction< Device >::reduce( matrix->getRows(), std::plus<>{}, fetch, 0 );
+   std::cout << "Matrix trace is " << trace << "." << std::endl;
+}
+
+int main( int argc, char* argv[] )
+{
+   std::cout << "Getting matrix rows on host: " << std::endl;
+   getRowExample< TNL::Devices::Host >();
+
+#ifdef HAVE_CUDA
+   std::cout << "Getting matrix rows on CUDA device: " << std::endl;
+   getRowExample< TNL::Devices::Cuda >();
+#endif
+}
diff --git a/Documentation/Examples/Matrices/DenseMatrixExample_getConstRow.cu b/Documentation/Examples/Matrices/DenseMatrixExample_getConstRow.cu
new file mode 120000
index 0000000000000000000000000000000000000000..c78f9cfdc5f6586165d96c44201f944de0aaa263
--- /dev/null
+++ b/Documentation/Examples/Matrices/DenseMatrixExample_getConstRow.cu
@@ -0,0 +1 @@
+DenseMatrixExample_getConstRow.cpp
\ No newline at end of file
diff --git a/Documentation/Examples/Matrices/DenseMatrixExample_getElement.cpp b/Documentation/Examples/Matrices/DenseMatrixExample_getElement.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..72a5d0af44682b84c8e503c5bcc561666eb43088
--- /dev/null
+++ b/Documentation/Examples/Matrices/DenseMatrixExample_getElement.cpp
@@ -0,0 +1,41 @@
+#include <iostream>
+#include <iomanip>
+#include <TNL/Matrices/DenseMatrix.h>
+#include <TNL/Devices/Host.h>
+#include <TNL/Devices/Cuda.h>
+
+/***
+ * Prints all elements of a 5x5 dense matrix one by one using
+ * DenseMatrix::getElement.
+ */
+template< typename Device >
+void getElements()
+{
+   TNL::Matrices::DenseMatrix< double, Device > matrix {
+      {  1,  0,  0,  0,  0 },
+      { -1,  2, -1,  0,  0 },
+      {  0, -1,  2, -1,  0 },
+      {  0,  0, -1,  2, -1 },
+      {  0,  0,  0,  0,  1 } };
+
+   /***
+    * Print element ( i, j ) right-aligned in a field five characters wide.
+    */
+   for( int i = 0; i < 5; i++ )
+   {
+      for( int j = 0; j < 5; j++ )
+         std::cout << std::setw( 5 ) << std::right << matrix.getElement( i, j );
+      std::cout << std::endl;
+   }
+}
+
+int main( int argc, char* argv[] )
+{
+   std::cout << "Get elements on host:" << std::endl;
+   getElements< TNL::Devices::Host >();
+
+#ifdef HAVE_CUDA
+   std::cout << "Get elements on CUDA device:" << std::endl;
+   getElements< TNL::Devices::Cuda >();
+#endif
+}
diff --git a/Documentation/Examples/Matrices/DenseMatrixExample_getElement.cu b/Documentation/Examples/Matrices/DenseMatrixExample_getElement.cu
new file mode 120000
index 0000000000000000000000000000000000000000..bad6f2fabe82e81bd5319d4f30992c2ab8a93d73
--- /dev/null
+++ b/Documentation/Examples/Matrices/DenseMatrixExample_getElement.cu
@@ -0,0 +1 @@
+DenseMatrixExample_getElement.cpp
\ No newline at end of file
diff --git a/Documentation/Examples/Matrices/DenseMatrixExample_getElementsCount.cpp b/Documentation/Examples/Matrices/DenseMatrixExample_getElementsCount.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..a95fa00e77e9f6b40de672a21931005c17862549
--- /dev/null
+++ b/Documentation/Examples/Matrices/DenseMatrixExample_getElementsCount.cpp
@@ -0,0 +1,40 @@
+#include <iostream>
+#include <TNL/Matrices/DenseMatrix.h>
+#include <TNL/Devices/Host.h>
+#include <TNL/Devices/Cuda.h>
+
+/***
+ * Prints the number of all elements and the number of non-zero elements
+ * reported by a dense matrix built from a triangular initializer list.
+ */
+template< typename Device >
+void getElementsCountExample()
+{
+   using MatrixType = TNL::Matrices::DenseMatrix< double, Device >;
+   MatrixType triangularMatrix {
+      {  1 },
+      {  2,  3 },
+      {  4,  5,  6 },
+      {  7,  8,  9, 10 },
+      { 11, 12, 13, 14, 15 }
+   };
+
+   const auto elementsCount = triangularMatrix.getElementsCount();
+   const auto nonzeroElementsCount = triangularMatrix.getNonzeroElementsCount();
+
+   std::cout << "Matrix elements count is " << elementsCount << "." << std::endl;
+   std::cout << "Non-zero matrix elements count is " << nonzeroElementsCount << "." << std::endl;
+}
+
+int main( int argc, char* argv[] )
+{
+   // The host variant is always built.
+   std::cout << "Computing matrix elements on host: " << std::endl;
+   getElementsCountExample< TNL::Devices::Host >();
+
+#ifdef HAVE_CUDA
+   // The CUDA variant is compiled only when HAVE_CUDA is defined.
+   std::cout << "Computing matrix elements on CUDA device: " << std::endl;
+   getElementsCountExample< TNL::Devices::Cuda >();
+#endif
+}
diff --git a/Documentation/Examples/Matrices/DenseMatrixExample_getElementsCount.cu b/Documentation/Examples/Matrices/DenseMatrixExample_getElementsCount.cu
new file mode 120000
index 0000000000000000000000000000000000000000..6e8348f73f57a6111d25da0cc47fa67730b42546
--- /dev/null
+++ b/Documentation/Examples/Matrices/DenseMatrixExample_getElementsCount.cu
@@ -0,0 +1 @@
+DenseMatrixExample_getElementsCount.cpp
\ No newline at end of file
diff --git a/Documentation/Examples/Matrices/DenseMatrixExample_getRow.cpp b/Documentation/Examples/Matrices/DenseMatrixExample_getRow.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..00a6b11192a5f7fdedfc5964db674ed5fc4c05b7
--- /dev/null
+++ b/Documentation/Examples/Matrices/DenseMatrixExample_getRow.cpp
@@ -0,0 +1,53 @@
+#include <iostream>
+#include <TNL/Algorithms/ParallelFor.h>
+#include <TNL/Matrices/DenseMatrix.h>
+#include <TNL/Devices/Host.h>
+#include <TNL/Devices/Cuda.h>
+#include <TNL/Pointers/SharedPointer.h>
+
+/***
+ * Sets the diagonal of a 5x5 dense matrix through rows obtained by
+ * DenseMatrix::getRow inside a parallel for loop and prints the matrix.
+ */
+template< typename Device >
+void getRowExample()
+{
+   using MatrixType = TNL::Matrices::DenseMatrix< double, Device >;
+   TNL::Pointers::SharedPointer< MatrixType > matrix( 5, 5 );
+
+   /***
+    * The lambda sets the diagonal element of the given row to 10 * ( rowIdx + 1 ).
+    */
+   auto f = [=] __cuda_callable__ ( int rowIdx ) mutable {
+      auto row = matrix->getRow( rowIdx );
+      row.setElement( rowIdx, 10* ( rowIdx + 1 ) );
+   };
+
+   /***
+    * For the case when Device is CUDA device we need to synchronize smart
+    * pointers. To avoid this you may use DenseMatrixView. See
+    * DenseMatrixView::getRow example for details.
+    */
+   TNL::Pointers::synchronizeSmartPointersOnDevice< Device >();
+
+   /***
+    * Set the matrix elements.
+    */
+   TNL::Algorithms::ParallelFor< Device >::exec( 0, matrix->getRows(), f );
+
+   /***
+    * Dereference the smart pointer so that the matrix itself is printed.
+    */
+   std::cout << *matrix << std::endl;
+}
+
+int main( int argc, char* argv[] )
+{
+   std::cout << "Getting matrix rows on host: " << std::endl;
+   getRowExample< TNL::Devices::Host >();
+
+#ifdef HAVE_CUDA
+   std::cout << "Getting matrix rows on CUDA device: " << std::endl;
+   getRowExample< TNL::Devices::Cuda >();
+#endif
+}
diff --git a/Documentation/Examples/Matrices/DenseMatrixExample_getRow.cu b/Documentation/Examples/Matrices/DenseMatrixExample_getRow.cu
new file mode 120000
index 0000000000000000000000000000000000000000..58a55f2fd463b846b541dc4fe566dd19b97c35e0
--- /dev/null
+++ b/Documentation/Examples/Matrices/DenseMatrixExample_getRow.cu
@@ -0,0 +1 @@
+DenseMatrixExample_getRow.cpp
\ No newline at end of file
diff --git a/Documentation/Examples/Matrices/DenseMatrixExample_rowsReduction.cpp b/Documentation/Examples/Matrices/DenseMatrixExample_rowsReduction.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..dbc44f85486191b7c09e9b5e38a9a9951a0bfaca
--- /dev/null
+++ b/Documentation/Examples/Matrices/DenseMatrixExample_rowsReduction.cpp
@@ -0,0 +1,71 @@
+#include <iostream>
+#include <iomanip>
+#include <functional>
+#include <limits>
+#include <TNL/Matrices/DenseMatrix.h>
+#include <TNL/Devices/Host.h>
+#include <TNL/Devices/Cuda.h>
+
+/***
+ * Finds the largest absolute value in each row of a dense matrix by means of
+ * DenseMatrix::rowsReduction called for all rows and prints the result.
+ */
+template< typename Device >
+void rowsReduction()
+{
+   TNL::Matrices::DenseMatrix< double, Device > matrix {
+      {  1,  0,  0,  0,  0 },
+      {  1,  2,  0,  0,  0 },
+      {  0,  1,  8,  0,  0 },
+      {  0,  0,  1,  9,  0 },
+      {  0,  0,  0,  0,  1 } };
+
+   /***
+    * Vector for the largest element of each row.
+    */
+   TNL::Containers::Vector< double, Device > rowMax( matrix.getRows() );
+
+   /***
+    * Prepare vector view for the keep lambda.
+    */
+   auto rowMaxView = rowMax.getView();
+
+   /***
+    * Fetch lambda just returns absolute value of matrix elements.
+    */
+   auto fetch = [=] __cuda_callable__ ( int rowIdx, int columnIdx, const double& value ) -> double {
+      return TNL::abs( value );
+   };
+
+   /***
+    * Reduce lambda returns maximum of given values.
+    */
+   auto reduce = [=] __cuda_callable__ ( double& a, const double& b ) -> double {
+      return TNL::max( a, b );
+   };
+
+   /***
+    * Keep lambda stores the largest value in each row to the vector rowMax.
+    */
+   auto keep = [=] __cuda_callable__ ( int rowIdx, const double& value ) mutable {
+      rowMaxView[ rowIdx ] = value;
+   };
+
+   /***
+    * Compute the largest values in each row.
+    */
+   matrix.rowsReduction( 0, matrix.getRows(), fetch, reduce, keep, std::numeric_limits< double >::lowest() );
+
+   std::cout << "Max. elements in rows are: " << rowMax << std::endl;
+}
+
+int main( int argc, char* argv[] )
+{
+   std::cout << "Rows reduction on host:" << std::endl;
+   rowsReduction< TNL::Devices::Host >();
+
+#ifdef HAVE_CUDA
+   std::cout << "Rows reduction on CUDA device:" << std::endl;
+   rowsReduction< TNL::Devices::Cuda >();
+#endif
+}
diff --git a/Documentation/Examples/Matrices/DenseMatrixExample_rowsReduction.cu b/Documentation/Examples/Matrices/DenseMatrixExample_rowsReduction.cu
new file mode 120000
index 0000000000000000000000000000000000000000..41bf46ebc4242f1d736981068acab56004a1786d
--- /dev/null
+++ b/Documentation/Examples/Matrices/DenseMatrixExample_rowsReduction.cu
@@ -0,0 +1 @@
+DenseMatrixExample_rowsReduction.cpp
\ No newline at end of file
diff --git a/Documentation/Examples/Matrices/DenseMatrixExample_setElement.cpp b/Documentation/Examples/Matrices/DenseMatrixExample_setElement.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..0b5498adf64428472e75d701bf62634f57e60616
--- /dev/null
+++ b/Documentation/Examples/Matrices/DenseMatrixExample_setElement.cpp
@@ -0,0 +1,24 @@
+#include <iostream>
+#include <TNL/Matrices/DenseMatrix.h>
+#include <TNL/Devices/Host.h>
+
+template< typename Device >
+void setElements()
+{
+   TNL::Matrices::DenseMatrix< double, Device > matrix( 5, 5 );
+   for( int diagIdx = 0; diagIdx < matrix.getRows(); ++diagIdx )
+      matrix.setElement( diagIdx, diagIdx, diagIdx );
+   // Each diagonal entry ( k, k ) was set to the value k above; print the result.
+   std::cout << matrix << std::endl;
+}
+
+int main( int argc, char* argv[] )   // command-line arguments are not used
+{
+   std::cout << "Set elements on host:" << std::endl;
+   setElements< TNL::Devices::Host >();
+
+#ifdef HAVE_CUDA   // the CUDA run is compiled only in builds that define HAVE_CUDA
+   std::cout << "Set elements on CUDA device:" << std::endl;
+   setElements< TNL::Devices::Cuda >();
+#endif   // HAVE_CUDA
+}
diff --git a/Documentation/Examples/Matrices/DenseMatrixExample_setElement.cu b/Documentation/Examples/Matrices/DenseMatrixExample_setElement.cu
new file mode 120000
index 0000000000000000000000000000000000000000..5128052c215326d2a5ec1409df334fa842acc0af
--- /dev/null
+++ b/Documentation/Examples/Matrices/DenseMatrixExample_setElement.cu
@@ -0,0 +1 @@
+DenseMatrixExample_setElement.cpp
\ No newline at end of file
diff --git a/Documentation/Examples/Matrices/DenseMatrixExample_setElements.cpp b/Documentation/Examples/Matrices/DenseMatrixExample_setElements.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..0eb0610a44d9e4fe24be025407b18f0e4728b42e
--- /dev/null
+++ b/Documentation/Examples/Matrices/DenseMatrixExample_setElements.cpp
@@ -0,0 +1,39 @@
+#include <iostream>
+#include <TNL/Matrices/DenseMatrix.h>
+#include <TNL/Devices/Host.h>
+#include <TNL/Devices/Cuda.h>
+
+template< typename Device >
+void setElementsExample()
+{
+   TNL::Matrices::DenseMatrix< double, Device > rectangular;
+   rectangular.setElements( {
+      {  1,  2,  3,  4,  5,  6 },
+      {  7,  8,  9, 10, 11, 12 },
+      { 13, 14, 15, 16, 17, 18 }
+   } );
+   // Three rows with six values each were passed in; print the matrix.
+   std::cout << rectangular << std::endl;
+   // Rows of different lengths; presumably missing entries become zeros - TODO confirm.
+   TNL::Matrices::DenseMatrix< double, Device > lowerTriangular;
+   lowerTriangular.setElements( {
+      {  1 },
+      {  2,  3 },
+      {  4,  5,  6 },
+      {  7,  8,  9, 10 },
+      { 11, 12, 13, 14, 15 }
+   } );
+   // Print the second matrix as well.
+   std::cout << lowerTriangular << std::endl;
+}
+
+int main( int argc, char* argv[] )   // command-line arguments are not used
+{
+   std::cout << "Setting matrix elements on host: " << std::endl;
+   setElementsExample< TNL::Devices::Host >();
+
+#ifdef HAVE_CUDA   // the CUDA run is compiled only in builds that define HAVE_CUDA
+   std::cout << "Setting matrix elements on CUDA device: " << std::endl;
+   setElementsExample< TNL::Devices::Cuda >();
+#endif   // HAVE_CUDA
+}
diff --git a/Documentation/Examples/Matrices/DenseMatrixExample_setElements.cu b/Documentation/Examples/Matrices/DenseMatrixExample_setElements.cu
new file mode 120000
index 0000000000000000000000000000000000000000..fa2487e278ec58ceb13a2b5c20aafb8142ccb920
--- /dev/null
+++ b/Documentation/Examples/Matrices/DenseMatrixExample_setElements.cu
@@ -0,0 +1 @@
+DenseMatrixExample_setElements.cpp
\ No newline at end of file
diff --git a/Documentation/Examples/Matrices/DenseMatrixViewExample_allRowsReduction.cpp b/Documentation/Examples/Matrices/DenseMatrixViewExample_allRowsReduction.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..b65cb3ea9d798184812aff48c15cf46d5f3321a0
--- /dev/null
+++ b/Documentation/Examples/Matrices/DenseMatrixViewExample_allRowsReduction.cpp
@@ -0,0 +1,66 @@
+#include <iostream>
+#include <iomanip>
+#include <functional>
+#include <TNL/Matrices/DenseMatrix.h>
+#include <TNL/Devices/Host.h>
+
+template< typename Device >
+void allRowsReduction()
+{
+   TNL::Matrices::DenseMatrix< double, Device > matrix {
+      {  1,  0,  0,  0,  0 },
+      {  1,  2,  0,  0,  0 },
+      {  0,  1,  8,  0,  0 },
+      {  0,  0,  1,  9,  0 },
+      {  0,  0,  0,  0,  1 } };
+   auto matrixView = matrix.getView();
+
+   /***
+    * Vector that will hold the largest absolute value found in each row.
+    */
+   TNL::Containers::Vector< double, Device > rowMax( matrix.getRows() );
+
+   /***
+    * Prepare a vector view for the lambdas (the matrix view was created above).
+    */
+   auto rowMaxView = rowMax.getView();
+
+   /***
+    * Fetch lambda just returns the absolute value of a matrix element.
+    */
+   auto fetch = [=] __cuda_callable__ ( int rowIdx, int columnIdx, const double& value ) -> double {
+      return TNL::abs( value );
+   };
+
+   /***
+    * Reduce lambda returns the maximum of the given values.
+    */
+   auto reduce = [=] __cuda_callable__ ( double& a, const double& b ) -> double {
+      return TNL::max( a, b );
+   };
+
+   /***
+    * Keep lambda stores the largest value of each row in the vector rowMax.
+    */
+   auto keep = [=] __cuda_callable__ ( int rowIdx, const double& value ) mutable {
+      rowMaxView[ rowIdx ] = value;
+   };
+
+   /***
+    * Compute the largest values in all rows, starting from the lowest double.
+    */
+   matrixView.allRowsReduction( fetch, reduce, keep, std::numeric_limits< double >::lowest() );
+
+   std::cout << "Max. elements in rows are: " << rowMax << std::endl;
+}
+
+int main( int argc, char* argv[] )   // command-line arguments are not used
+{
+   std::cout << "All rows reduction on host:" << std::endl;
+   allRowsReduction< TNL::Devices::Host >();
+
+#ifdef HAVE_CUDA   // the CUDA run is compiled only in builds that define HAVE_CUDA
+   std::cout << "All rows reduction on CUDA device:" << std::endl;
+   allRowsReduction< TNL::Devices::Cuda >();
+#endif   // HAVE_CUDA
+}
diff --git a/Documentation/Examples/Matrices/DenseMatrixViewExample_allRowsReduction.cu b/Documentation/Examples/Matrices/DenseMatrixViewExample_allRowsReduction.cu
new file mode 120000
index 0000000000000000000000000000000000000000..61dd891255d95692787f5742566c2bcdc0872190
--- /dev/null
+++ b/Documentation/Examples/Matrices/DenseMatrixViewExample_allRowsReduction.cu
@@ -0,0 +1 @@
+DenseMatrixViewExample_allRowsReduction.cpp
\ No newline at end of file
diff --git a/Documentation/Examples/Matrices/DenseMatrixViewExample_getConstRow.cpp b/Documentation/Examples/Matrices/DenseMatrixViewExample_getConstRow.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..ab155185e3077ee210390ee16b72369c5161ee7a
--- /dev/null
+++ b/Documentation/Examples/Matrices/DenseMatrixViewExample_getConstRow.cpp
@@ -0,0 +1,45 @@
+#include <iostream>
+#include <functional>
+#include <TNL/Algorithms/ParallelFor.h>
+#include <TNL/Matrices/DenseMatrix.h>
+#include <TNL/Devices/Host.h>
+#include <TNL/Devices/Cuda.h>
+
+template< typename Device >
+void getRowExample()
+{
+   TNL::Matrices::DenseMatrix< double, Device > matrix {
+      { 1, 0, 0, 0, 0 },
+      { 1, 2, 0, 0, 0 },
+      { 1, 2, 3, 0, 0 },
+      { 1, 2, 3, 4, 0 },
+      { 1, 2, 3, 4, 5 }
+   };
+
+   /***
+    * A constant matrix view is used to pass the matrix to the lambda function, even on a CUDA device.
+    */
+   const auto matrixView = matrix.getConstView();
+
+   /***
+    * Fetch lambda function returns the diagonal element of the given row.
+    */
+   auto fetch = [=] __cuda_callable__ ( int rowIdx ) mutable -> double {
+      auto row = matrixView.getRow( rowIdx );
+      return row.getElement( rowIdx );
+   };
+   // Sum the diagonal with a double accumulator and identity so fractional entries are not truncated.
+   double trace = TNL::Algorithms::Reduction< Device >::reduce( matrix.getRows(), std::plus<>{}, fetch, 0.0 );
+   std::cout << "Matrix trace is " << trace << "." << std::endl;
+}
+
+int main( int argc, char* argv[] )   // command-line arguments are not used
+{
+   std::cout << "Getting matrix rows on host: " << std::endl;
+   getRowExample< TNL::Devices::Host >();
+
+#ifdef HAVE_CUDA   // the CUDA run is compiled only in builds that define HAVE_CUDA
+   std::cout << "Getting matrix rows on CUDA device: " << std::endl;
+   getRowExample< TNL::Devices::Cuda >();
+#endif   // HAVE_CUDA
+}
diff --git a/Documentation/Examples/Matrices/DenseMatrixViewExample_getRow.cpp b/Documentation/Examples/Matrices/DenseMatrixViewExample_getRow.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..30d893bc18ebc27f35c55566b900cb506084122c
--- /dev/null
+++ b/Documentation/Examples/Matrices/DenseMatrixViewExample_getRow.cpp
@@ -0,0 +1,34 @@
+#include <iostream>
+#include <TNL/Algorithms/ParallelFor.h>
+#include <TNL/Matrices/DenseMatrix.h>
+#include <TNL/Devices/Host.h>
+#include <TNL/Devices/Cuda.h>
+
+template< typename Device >
+void getRowExample()
+{
+   TNL::Matrices::DenseMatrix< double, Device > matrix( 5, 5 );
+
+   /***
+    * The lambda captures by value, so the matrix is accessed through its view.
+    */
+   auto view = matrix.getView();
+   auto setDiagonalEntry = [=] __cuda_callable__ ( int rowIdx ) mutable {
+      const int diagonalValue = 10 * ( rowIdx + 1 );
+      view.getRow( rowIdx ).setElement( rowIdx, diagonalValue );
+   };
+   // Run the lambda for row indexes from 0 up to the number of rows, then print the matrix.
+   TNL::Algorithms::ParallelFor< Device >::exec( 0, matrix.getRows(), setDiagonalEntry );
+   std::cout << matrix << std::endl;
+}
+
+int main( int argc, char* argv[] )   // command-line arguments are not used
+{
+   std::cout << "Getting matrix rows on host: " << std::endl;
+   getRowExample< TNL::Devices::Host >();
+
+#ifdef HAVE_CUDA   // the CUDA run is compiled only in builds that define HAVE_CUDA
+   std::cout << "Getting matrix rows on CUDA device: " << std::endl;
+   getRowExample< TNL::Devices::Cuda >();
+#endif   // HAVE_CUDA
+}
diff --git a/Documentation/Tutorials/Arrays/tutorial_Arrays.md b/Documentation/Tutorials/Arrays/tutorial_Arrays.md
index 0d728935e35b5edbdf491b94be27d359c166a249..8050c1867f0e646048cde2fb2c3cfe10155d6e35 100644
--- a/Documentation/Tutorials/Arrays/tutorial_Arrays.md
+++ b/Documentation/Tutorials/Arrays/tutorial_Arrays.md
@@ -104,7 +104,7 @@ In general in TNL, each method defined as `__cuda_callable__` can be called from
 
 #### Accessing the array element with `setElement` and `getElement` <a name="accessing_the_array_elements_with_set_get_element"></a>
 
-On the other hand, the methods `setElement` and `getElement` can be called **from the host only** no matter where the array is allocated. None of the methods can be used in CUDA kernels. `getElement` returns copy of an element rather than a reference. Therefore it is slightly slower. If the array is on GPU, the array element is copied from the device on the host (or vice versa) which is significantly slower. In the parts of code where the performance matters, these methods shall not be called. Their use is, however, much easier and they allow to write one simple code for both CPU and GPU. Both methods are good candidates for:
+On the other hand, the methods `setElement` and `getElement` can be called from the host **no matter where the array is allocated**. In addition, they can be called from kernels running on the device where the array is allocated. `getElement` returns a copy of an element rather than a reference, therefore it is slightly slower. If the array is on the GPU and the methods are called from the host, the array element is copied from the device to the host (or vice versa), which is significantly slower. In the parts of code where performance matters, these methods should not be called from the host when the array is allocated on the device. Their use is, however, easier than that of `operator[]`, and they make it possible to write one simple code for both CPU and GPU. Both methods are good candidates for:
 
 * reading/writing of only few elements in the array
 * arrays initiation which is done only once and it is not time critical part of a code
diff --git a/src/Benchmarks/DistSpMV/tnl-benchmark-distributed-spmv.h b/src/Benchmarks/DistSpMV/tnl-benchmark-distributed-spmv.h
index 53dfb07f4018b62efc2e936bd4efcefa7d7b7bca..4f7b07dee9b6a780bcb7665a2014658b7a20cbaf 100644
--- a/src/Benchmarks/DistSpMV/tnl-benchmark-distributed-spmv.h
+++ b/src/Benchmarks/DistSpMV/tnl-benchmark-distributed-spmv.h
@@ -25,11 +25,16 @@
 #include <TNL/Containers/Partitioner.h>
 #include <TNL/Containers/DistributedVector.h>
 #include <TNL/Matrices/DistributedMatrix.h>
+#include <TNL/Matrices/SparseOperations.h>
 
 #include "../Benchmarks.h"
 #include "ordering.h"
 
-#include <TNL/Matrices/Legacy/SlicedEllpack.h>
+#include <TNL/Matrices/SparseMatrix.h>
+#include <TNL/Containers/Segments/SlicedEllpack.h>
+// Sliced Ellpack segments as a three-parameter template; passed to SparseMatrix as its segments argument.
+template< typename Device, typename Index, typename IndexAllocator >
+using SegmentsType = TNL::Containers::Segments::SlicedEllpack< Device, Index, IndexAllocator >;
 
 using namespace TNL;
 using namespace TNL::Benchmarks;
@@ -234,7 +239,7 @@ struct SpmvBenchmark
       DistributedRowLengths distributedRowLengths( localRange, matrix.getRows(), group );
       for( IndexType i = 0; i < distributedMatrix.getLocalMatrix().getRows(); i++ ) {
          const auto gi = distributedMatrix.getLocalRowRange().getGlobalIndex( i );
-         distributedRowLengths[ gi ] = matrix.getRowLength( gi );
+         distributedRowLengths[ gi ] = matrix.getRowCapacity( gi );
       }
       distributedMatrix.setCompressedRowLengths( distributedRowLengths );
 
@@ -243,11 +248,15 @@ struct SpmvBenchmark
          const auto gi = distributedMatrix.getLocalRowRange().getGlobalIndex( i );
          distributedVector[ gi ] = vector[ gi ];
 
-         const IndexType rowLength = matrix.getRowLength( i );
-         IndexType columns[ rowLength ];
-         RealType values[ rowLength ];
-         matrix.getRowFast( gi, columns, values );
-         distributedMatrix.setRowFast( gi, columns, values, rowLength );
+//         const IndexType rowLength = matrix.getRowLength( i );
+//         IndexType columns[ rowLength ];
+//         RealType values[ rowLength ];
+//         matrix.getRowFast( gi, columns, values );
+//         distributedMatrix.setRowFast( gi, columns, values, rowLength );
+         const auto global_row = matrix.getRow( gi );
+         auto local_row = distributedMatrix.getRow( gi );
+         for( IndexType j = 0; j < global_row.getSize(); j++ )
+            local_row.setElement( j, global_row.getColumnIndex( j ), global_row.getValue( j ) );
       }
 
       benchmarkDistributedSpmv( benchmark, distributedMatrix, distributedVector );
@@ -339,7 +348,12 @@ main( int argc, char* argv[] )
 //   return ! Matrices::resolveMatrixType< MainConfig,
 //                                         Devices::Host,
 //                                         SpmvBenchmark >( benchmark, metadata, parameters );
-   using MatrixType = Matrices::Legacy::SlicedEllpack< double, Devices::Host, int >;
+   using MatrixType = TNL::Matrices::SparseMatrix< double,
+                                                   Devices::Host,
+                                                   int,
+                                                   TNL::Matrices::GeneralMatrix,
+                                                   SegmentsType
+                                                 >;
    const bool status = SpmvBenchmark< MatrixType >::run( benchmark, metadata, parameters );
 
    if( rank == 0 )
diff --git a/src/Benchmarks/LinearSolvers/tnl-benchmark-linear-solvers.h b/src/Benchmarks/LinearSolvers/tnl-benchmark-linear-solvers.h
index 7e275244ee2925c1c3c0b000ee007a42b3e819a5..36ca471e165121e31caf3057e72eb60e6b5fd6cd 100644
--- a/src/Benchmarks/LinearSolvers/tnl-benchmark-linear-solvers.h
+++ b/src/Benchmarks/LinearSolvers/tnl-benchmark-linear-solvers.h
@@ -30,6 +30,7 @@
 #include <TNL/Containers/Partitioner.h>
 #include <TNL/Containers/DistributedVector.h>
 #include <TNL/Matrices/DistributedMatrix.h>
+#include <TNL/Matrices/SparseOperations.h>
 #include <TNL/Matrices/MatrixReader.h>
 #include <TNL/Solvers/Linear/Preconditioners/Diagonal.h>
 #include <TNL/Solvers/Linear/Preconditioners/ILU0.h>
@@ -54,7 +55,12 @@
    #define HAVE_CUSOLVER
 #endif
 
-#include <TNL/Matrices/Legacy/SlicedEllpack.h>
+#include <TNL/Matrices/SparseMatrix.h>
+#include <TNL/Containers/Segments/CSR.h>
+#include <TNL/Containers/Segments/SlicedEllpack.h>
+// Sliced Ellpack segments as a three-parameter template; passed to SparseMatrix as its segments argument.
+template< typename Device, typename Index, typename IndexAllocator >
+using SegmentsType = TNL::Containers::Segments::SlicedEllpack< Device, Index, IndexAllocator >;
 
 using namespace TNL;
 using namespace TNL::Benchmarks;
@@ -455,7 +461,7 @@ struct LinearSolversBenchmark
       DistributedRowLengths distributedRowLengths( localRange, matrixPointer->getRows(), group );
       for( IndexType i = 0; i < distMatrixPointer->getLocalMatrix().getRows(); i++ ) {
          const auto gi = distMatrixPointer->getLocalRowRange().getGlobalIndex( i );
-         distributedRowLengths[ gi ] = matrixPointer->getRowLength( gi );
+         distributedRowLengths[ gi ] = matrixPointer->getRowCapacity( gi );
       }
       distMatrixPointer->setCompressedRowLengths( distributedRowLengths );
 
@@ -465,11 +471,15 @@ struct LinearSolversBenchmark
          dist_x0[ gi ] = x0[ gi ];
          dist_b[ gi ] = b[ gi ];
 
-         const IndexType rowLength = matrixPointer->getRowLength( i );
-         IndexType columns[ rowLength ];
-         RealType values[ rowLength ];
-         matrixPointer->getRowFast( gi, columns, values );
-         distMatrixPointer->setRowFast( gi, columns, values, rowLength );
+//         const IndexType rowLength = matrixPointer->getRowLength( i );
+//         IndexType columns[ rowLength ];
+//         RealType values[ rowLength ];
+//         matrixPointer->getRowFast( gi, columns, values );
+//         distMatrixPointer->setRowFast( gi, columns, values, rowLength );
+         const auto global_row = matrixPointer->getRow( gi );
+         auto local_row = distMatrixPointer->getRow( gi );
+         for( IndexType j = 0; j < global_row.getSize(); j++ )
+            local_row.setElement( j, global_row.getColumnIndex( j ), global_row.getValue( j ) );
       }
 
       std::cout << "Iterative solvers:" << std::endl;
@@ -488,7 +498,12 @@ struct LinearSolversBenchmark
    {
       // direct solvers
       if( parameters.getParameter< bool >( "with-direct" ) ) {
-         using CSR = Matrices::Legacy::CSR< RealType, DeviceType, IndexType >;
+         using CSR = TNL::Matrices::SparseMatrix< RealType,
+                                                  DeviceType,
+                                                  IndexType,
+                                                  TNL::Matrices::GeneralMatrix,
+                                                  Containers::Segments::CSR
+                                                >;
          SharedPointer< CSR > matrixCopy;
          Matrices::copySparseMatrix( *matrixCopy, *matrixPointer );
 
@@ -511,11 +526,21 @@ struct LinearSolversBenchmark
 #ifdef HAVE_CUSOLVER
       std::cout << "CuSOLVER:" << std::endl;
       {
-         using CSR = Matrices::CSR< RealType, DeviceType, IndexType >;
+         using CSR = TNL::Matrices::SparseMatrix< RealType,
+                                                  DeviceType,
+                                                  IndexType,
+                                                  TNL::Matrices::GeneralMatrix,
+                                                  Containers::Segments::CSR
+                                                >;
          SharedPointer< CSR > matrixCopy;
          Matrices::copySparseMatrix( *matrixCopy, *matrixPointer );
 
-         using CudaCSR = Matrices::CSR< RealType, Devices::Cuda, IndexType >;
+         using CudaCSR = TNL::Matrices::SparseMatrix< RealType,
+                                                      Devices::Cuda,
+                                                      IndexType,
+                                                      TNL::Matrices::GeneralMatrix,
+                                                      Containers::Segments::CSR
+                                                    >;
          using CudaVector = typename VectorType::template Self< RealType, Devices::Cuda >;
          SharedPointer< CudaCSR > cuda_matrixCopy;
          *cuda_matrixCopy = *matrixCopy;
@@ -567,7 +592,7 @@ configSetup( Config::ConfigDescription& config )
 
    config.addDelimiter( "Linear solver settings:" );
    Solvers::IterativeSolver< double, int >::configSetup( config );
-   using Matrix = Matrices::Legacy::SlicedEllpack< double, Devices::Host, int >;
+   using Matrix = Matrices::SparseMatrix< double >;
    using GMRES = Solvers::Linear::GMRES< Matrix >;
    GMRES::configSetup( config );
    using BiCGstabL = Solvers::Linear::BICGStabL< Matrix >;
@@ -621,7 +646,12 @@ main( int argc, char* argv[] )
 //   return ! Matrices::resolveMatrixType< MainConfig,
 //                                         Devices::Host,
 //                                         LinearSolversBenchmark >( benchmark, metadata, parameters );
-   using MatrixType = Matrices::Legacy::SlicedEllpack< double, Devices::Host, int >;
+   using MatrixType = TNL::Matrices::SparseMatrix< double,
+                                                   Devices::Host,
+                                                   int,
+                                                   TNL::Matrices::GeneralMatrix,
+                                                   SegmentsType
+                                                 >;
    const bool status = LinearSolversBenchmark< MatrixType >::run( benchmark, metadata, parameters );
 
    if( rank == 0 )
diff --git a/src/Benchmarks/SpMV/spmv-legacy.h b/src/Benchmarks/SpMV/spmv-legacy.h
index f690a50c614b3853ac91a770159787b9f056c726..a066b461ef018232023873a4787948a4fb3aba60 100644
--- a/src/Benchmarks/SpMV/spmv-legacy.h
+++ b/src/Benchmarks/SpMV/spmv-legacy.h
@@ -33,6 +33,8 @@
 #include <TNL/Containers/Segments/CSR.h>
 #include <TNL/Containers/Segments/Ellpack.h>
 #include <TNL/Containers/Segments/SlicedEllpack.h>
+#include <TNL/Containers/Segments/ChunkedEllpack.h>
+#include <TNL/Containers/Segments/BiEllpack.h>
 using namespace TNL::Matrices;
 
 #include "cusparseCSRMatrix.h"
@@ -61,6 +63,18 @@ using SlicedEllpackSegments = Containers::Segments::SlicedEllpack< Device, Index
 template< typename Real, typename Device, typename Index >
 using SparseMatrix_SlicedEllpack = Matrices::SparseMatrix< Real, Device, Index, Matrices::GeneralMatrix, SlicedEllpackSegments >;
 
+template< typename Device, typename Index, typename IndexAllocator >   // forwards all three parameters to Segments::ChunkedEllpack
+using ChunkedEllpackSegments = Containers::Segments::ChunkedEllpack< Device, Index, IndexAllocator >;
+
+template< typename Real, typename Device, typename Index >   // SparseMatrix with ChunkedEllpackSegments as its segments template
+using SparseMatrix_ChunkedEllpack = Matrices::SparseMatrix< Real, Device, Index, Matrices::GeneralMatrix, ChunkedEllpackSegments >;
+
+template< typename Device, typename Index, typename IndexAllocator >   // forwards all three parameters to Segments::BiEllpack
+using BiEllpackSegments = Containers::Segments::BiEllpack< Device, Index, IndexAllocator >;
+
+template< typename Real, typename Device, typename Index >   // SparseMatrix with BiEllpackSegments as its segments template
+using SparseMatrix_BiEllpack = Matrices::SparseMatrix< Real, Device, Index, Matrices::GeneralMatrix, BiEllpackSegments >;
+
 // Legacy formats
 template< typename Real, typename Device, typename Index >
 using SparseMatrixLegacy_CSR_Scalar = Matrices::Legacy::CSR< Real, Device, Index, Matrices::Legacy::CSRScalar >;
@@ -218,7 +232,7 @@ benchmarkSpmvSynthetic( Benchmark& benchmark,
    // Perform benchmark on host with CSR as a reference CPU format
    //
    benchmark.setMetadataColumns( Benchmark::MetadataColumns({
-         { "matrix name", convertToString( getMatrixFileName( inputFileName ) ) },
+         { "matrix name", convertToString( inputFileName ) },
          { "non-zeros", convertToString( csrHostMatrix.getNumberOfNonzeroMatrixElements() ) },
          { "rows", convertToString( csrHostMatrix.getRows() ) },
          { "columns", convertToString( csrHostMatrix.getColumns() ) },
@@ -243,7 +257,7 @@ benchmarkSpmvSynthetic( Benchmark& benchmark,
    //
 #ifdef HAVE_CUDA
    benchmark.setMetadataColumns( Benchmark::MetadataColumns({
-         { "matrix name", convertToString( getMatrixFileName( inputFileName ) ) },
+         { "matrix name", convertToString( inputFileName ) },
          { "non-zeros", convertToString( csrHostMatrix.getNumberOfNonzeroMatrixElements() ) },
          { "rows", convertToString( csrHostMatrix.getRows() ) },
          { "columns", convertToString( csrHostMatrix.getColumns() ) },
@@ -276,17 +290,19 @@ benchmarkSpmvSynthetic( Benchmark& benchmark,
 #endif
 
    benchmarkSpMV< Real, SparseMatrixLegacy_CSR_Scalar    >( benchmark, hostOutVector, inputFileName, verboseMR );
-   benchmarkSpMV< Real, SparseMatrixLegacy_CSR_Vector    >( benchmark, hostOutVector, inputFileName, verboseMR );
-   benchmarkSpMV< Real, SparseMatrixLegacy_CSR_Light     >( benchmark, hostOutVector, inputFileName, verboseMR );
-   benchmarkSpMV< Real, SparseMatrixLegacy_CSR_Adaptive  >( benchmark, hostOutVector, inputFileName, verboseMR );
-   benchmarkSpMV< Real, SparseMatrixLegacy_CSR_Stream    >( benchmark, hostOutVector, inputFileName, verboseMR );
+   //benchmarkSpMV< Real, SparseMatrixLegacy_CSR_Vector    >( benchmark, hostOutVector, inputFileName, verboseMR );
+   //benchmarkSpMV< Real, SparseMatrixLegacy_CSR_Light     >( benchmark, hostOutVector, inputFileName, verboseMR );
+   //benchmarkSpMV< Real, SparseMatrixLegacy_CSR_Adaptive  >( benchmark, hostOutVector, inputFileName, verboseMR );
+   //benchmarkSpMV< Real, SparseMatrixLegacy_CSR_Stream    >( benchmark, hostOutVector, inputFileName, verboseMR );
    benchmarkSpMV< Real, SparseMatrix_CSR                 >( benchmark, hostOutVector, inputFileName, verboseMR );
    benchmarkSpMV< Real, Matrices::Legacy::Ellpack        >( benchmark, hostOutVector, inputFileName, verboseMR );
    benchmarkSpMV< Real, SparseMatrix_Ellpack             >( benchmark, hostOutVector, inputFileName, verboseMR );
    benchmarkSpMV< Real, SlicedEllpackAlias               >( benchmark, hostOutVector, inputFileName, verboseMR );
    benchmarkSpMV< Real, SparseMatrix_SlicedEllpack       >( benchmark, hostOutVector, inputFileName, verboseMR );
    benchmarkSpMV< Real, Matrices::Legacy::ChunkedEllpack >( benchmark, hostOutVector, inputFileName, verboseMR );
+   benchmarkSpMV< Real, SparseMatrix_ChunkedEllpack      >( benchmark, hostOutVector, inputFileName, verboseMR );
    benchmarkSpMV< Real, Matrices::Legacy::BiEllpack      >( benchmark, hostOutVector, inputFileName, verboseMR );
+   benchmarkSpMV< Real, SparseMatrix_BiEllpack           >( benchmark, hostOutVector, inputFileName, verboseMR );
    /* AdEllpack is broken
    benchmarkSpMV< Real, Matrices::AdEllpack              >( benchmark, hostOutVector, inputFileName, verboseMR );
     */
diff --git a/src/Python/pytnl/tnl/SparseMatrix.cpp b/src/Python/pytnl/tnl/SparseMatrix.cpp
index 4307155976e58a2790ca8601cf6af8586c10cb7b..f4b1772a706bbfd8d7171cc5a50f93e765b4169d 100644
--- a/src/Python/pytnl/tnl/SparseMatrix.cpp
+++ b/src/Python/pytnl/tnl/SparseMatrix.cpp
@@ -21,10 +21,11 @@ void export_SparseMatrices( py::module & m )
     export_Matrix< E_host   >( m, "Ellpack" );
     export_Matrix< SE_host  >( m, "SlicedEllpack" );
 
+    // TODO: copySparseMatrix does not work with Legacy matrices anymore
     //m.def("copySparseMatrix", &TNL::Matrices::copySparseMatrix< CSR_host, E_host >);
     //m.def("copySparseMatrix", &TNL::Matrices::copySparseMatrix< E_host, CSR_host >);
     //m.def("copySparseMatrix", &TNL::Matrices::copySparseMatrix< CSR_host, SE_host >);
     //m.def("copySparseMatrix", &TNL::Matrices::copySparseMatrix< SE_host, CSR_host >);
-    m.def("copySparseMatrix", &TNL::Matrices::copySparseMatrix< E_host, SE_host >);
-    m.def("copySparseMatrix", &TNL::Matrices::copySparseMatrix< SE_host, E_host >);
+    //m.def("copySparseMatrix", &TNL::Matrices::copySparseMatrix< E_host, SE_host >);
+    //m.def("copySparseMatrix", &TNL::Matrices::copySparseMatrix< SE_host, E_host >);
 }
diff --git a/src/TNL/Algorithms/MemoryOperations.h b/src/TNL/Algorithms/MemoryOperations.h
index 59da324028c513853fdc6da81ba21d877bb98334..85b44a4657bb331bbdb3bb9adddf7c1ccd837d5c 100644
--- a/src/TNL/Algorithms/MemoryOperations.h
+++ b/src/TNL/Algorithms/MemoryOperations.h
@@ -132,10 +132,12 @@ template<>
 struct MemoryOperations< Devices::Cuda >
 {
    template< typename Element >
+   __cuda_callable__
    static void setElement( Element* data,
                            const Element& value );
 
    template< typename Element >
+   __cuda_callable__
    static Element getElement( const Element* data );
 
    template< typename Element, typename Index >
diff --git a/src/TNL/Algorithms/MemoryOperationsCuda.hpp b/src/TNL/Algorithms/MemoryOperationsCuda.hpp
index ea4b92b61ba5d52fdc6ea98f656d25a97db02ab9..31e442a3f7503cd5a9d58a1f263d8fb377bee7a0 100644
--- a/src/TNL/Algorithms/MemoryOperationsCuda.hpp
+++ b/src/TNL/Algorithms/MemoryOperationsCuda.hpp
@@ -24,24 +24,52 @@ namespace TNL {
 namespace Algorithms {
 
 template< typename Element >
-void
+__cuda_callable__ void
 MemoryOperations< Devices::Cuda >::
 setElement( Element* data,
             const Element& value )
 {
    TNL_ASSERT_TRUE( data, "Attempted to set data through a nullptr." );
-   MemoryOperations< Devices::Cuda >::set( data, value, 1 );
+#ifdef __CUDA_ARCH__
+   *data = value;   // device compilation pass: store the value directly
+#else
+#ifdef HAVE_CUDA
+   cudaMemcpy( ( void* ) data, ( const void* ) &value, sizeof( Element ), cudaMemcpyHostToDevice );
+   TNL_CHECK_CUDA_DEVICE;
+#else
+   throw Exceptions::CudaSupportMissing();
+#endif
+   // TODO: The original implementation (kept below) stopped working after
+   // #ifdef __CUDA_ARCH__ was added to Array::setElement and ArrayView::setElement.
+   // The suspected cause is the lambda function 'kernel', which nvcc probably
+   // does not handle properly.
+   //MemoryOperations< Devices::Cuda >::set( data, value, 1 );
+#endif
 }
 
 template< typename Element >
-Element
+__cuda_callable__ Element
 MemoryOperations< Devices::Cuda >::
 getElement( const Element* data )
 {
    TNL_ASSERT_TRUE( data, "Attempted to get data through a nullptr." );
+#ifdef __CUDA_ARCH__
+   return *data;   // device compilation pass: read the value directly
+#else
    Element result;
-   MultiDeviceMemoryOperations< void, Devices::Cuda >::template copy< Element, Element, int >( &result, data, 1 );
+#ifdef HAVE_CUDA
+   cudaMemcpy( ( void* ) &result, ( const void* ) data, sizeof( Element ), cudaMemcpyDeviceToHost );
+   TNL_CHECK_CUDA_DEVICE;
+#else
+   throw Exceptions::CudaSupportMissing();
+#endif
+   // TODO: The original implementation (kept below) stopped working after
+   // #ifdef __CUDA_ARCH__ was added to Array::getElement and ArrayView::getElement.
+   // The suspected cause is the lambda function 'kernel', which nvcc probably
+   // does not handle properly.
+   //MultiDeviceMemoryOperations< void, Devices::Cuda >::template copy< Element, Element, int >( &result, data, 1 );
    return result;
+#endif
 }
 
 template< typename Element, typename Index >
diff --git a/src/TNL/Containers/Array.h b/src/TNL/Containers/Array.h
index a73385eb1aab92d095406976c23d1b4bdb711728..0888cdf9ba3d617c6e60a9930f3927402ed80521 100644
--- a/src/TNL/Containers/Array.h
+++ b/src/TNL/Containers/Array.h
@@ -396,7 +396,7 @@ class Array
        * \brief Resets the array to the empty state.
        *
        * The current data will be deallocated, thus all pointers and views to
-       * the array alements will become invalid.
+       * the array elements will become invalid.
        */
       void reset();
 
@@ -446,22 +446,24 @@ class Array
       /**
        * \brief Sets the value of the \e i-th element to \e v.
        *
-       * This method can be called only from the host, but even for arrays
-       * allocated in a different memory space (e.g. GPU global memory).
+       * This method can be called from the host for an array allocated in any
+       * memory space, and from code running on the device where it is allocated.
        *
        * \param i The index of the element to be set.
        * \param v The new value of the element.
        */
+      __cuda_callable__
       void setElement( const Index& i, const Value& v );
 
       /**
        * \brief Returns the value of the \e i-th element.
        *
-       * This method can be called only from the host, but even for arrays
-       * allocated in a different memory space (e.g. GPU global memory).
+       * This method can be called from the host for an array allocated in any
+       * memory space, and from code running on the device where it is allocated.
        *
        * \param i The index of the element to be returned.
        */
+      __cuda_callable__
       Value getElement( const Index& i ) const;
 
       /**
diff --git a/src/TNL/Containers/Array.hpp b/src/TNL/Containers/Array.hpp
index f45b7370de7fb3152df2facd377dbced37ef0466..b2f377ff43b79406083914303d3dcfa20a2014d0 100644
--- a/src/TNL/Containers/Array.hpp
+++ b/src/TNL/Containers/Array.hpp
@@ -504,20 +504,20 @@ template< typename Value,
           typename Device,
           typename Index,
           typename Allocator >
-void
+__cuda_callable__ void
 Array< Value, Device, Index, Allocator >::
 setElement( const Index& i, const Value& x )
 {
    TNL_ASSERT_GE( i, (Index) 0, "Element index must be non-negative." );
    TNL_ASSERT_LT( i, this->getSize(), "Element index is out of bounds." );
-   return Algorithms::MemoryOperations< Device >::setElement( &( this->data[ i ] ), x );
+   Algorithms::MemoryOperations< Device >::setElement( &( this->data[ i ] ), x );
 }
 
 template< typename Value,
           typename Device,
           typename Index,
           typename Allocator >
-Value
+__cuda_callable__ Value
 Array< Value, Device, Index, Allocator >::
 getElement( const Index& i ) const
 {
diff --git a/src/TNL/Containers/ArrayView.h b/src/TNL/Containers/ArrayView.h
index 5b9766ffd2b07d00318edff0c3c24080a020d3b6..d1d1c1177066d3f36039b802d7e6201716375043 100644
--- a/src/TNL/Containers/ArrayView.h
+++ b/src/TNL/Containers/ArrayView.h
@@ -313,22 +313,24 @@ public:
    /**
     * \brief Sets the value of the \e i-th element to \e v.
     *
-    * This method can be called only from the host, but even for array views
-    * allocated in a different memory space (e.g. GPU global memory).
+    * This method can be called from both the host system and the device
+    * where the array is allocated.
     *
     * \param i The index of the element to be set.
     * \param v The new value of the element.
     */
+   __cuda_callable__
    void setElement( Index i, Value value );
 
    /**
     * \brief Returns the value of the \e i-th element.
     *
-    * This method can be called only from the host, but even for array views
-    * allocated in a different memory space (e.g. GPU global memory).
+    * This method can be called from both the host system and the device
+    * where the array is allocated.
     *
     * \param i The index of the element to be returned.
     */
+   __cuda_callable__
    Value getElement( Index i ) const;
 
    /**
diff --git a/src/TNL/Containers/ArrayView.hpp b/src/TNL/Containers/ArrayView.hpp
index e36182cd54acfc17075a73944cb72df7ed3eb042..7ab7915e6abe26bc742ba25ac37ab809ee2166f9 100644
--- a/src/TNL/Containers/ArrayView.hpp
+++ b/src/TNL/Containers/ArrayView.hpp
@@ -211,19 +211,20 @@ getSize() const
 template< typename Value,
           typename Device,
           typename Index >
+__cuda_callable__
 void
 ArrayView< Value, Device, Index >::
 setElement( Index i, Value value )
 {
    TNL_ASSERT_GE( i, 0, "Element index must be non-negative." );
    TNL_ASSERT_LT( i, this->getSize(), "Element index is out of bounds." );
-   return Algorithms::MemoryOperations< Device >::setElement( &data[ i ], value );
+   Algorithms::MemoryOperations< Device >::setElement( &this->data[ i ], value );
 }
 
 template< typename Value,
           typename Device,
           typename Index >
-Value
+__cuda_callable__ Value
 ArrayView< Value, Device, Index >::
 getElement( Index i ) const
 {
diff --git a/src/TNL/Containers/Segments/BiEllpack.h b/src/TNL/Containers/Segments/BiEllpack.h
new file mode 100644
index 0000000000000000000000000000000000000000..5f16011c5418865de5c9d6f3f394136598cad5b6
--- /dev/null
+++ b/src/TNL/Containers/Segments/BiEllpack.h
@@ -0,0 +1,174 @@
+/***************************************************************************
+                          BiEllpack.h -  description
+                             -------------------
+    begin                : Apr 5, 2020
+    copyright            : (C) 2020 by Tomas Oberhuber
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+#pragma once
+
+#include <TNL/Allocators/Default.h>
+#include <TNL/Containers/Vector.h>
+#include <TNL/Containers/Segments/BiEllpackView.h>
+#include <TNL/Containers/Segments/SegmentView.h>
+
+namespace TNL {
+   namespace Containers {
+      namespace Segments {
+
+/**
+ * \brief Segments in the BiEllpack format; segments are grouped into strips of \e WarpSize segments.
+ */
+template< typename Device, typename Index,
+          typename IndexAllocator = typename Allocators::Default< Device >::template Allocator< Index >,
+          bool RowMajorOrder = std::is_same< Device, Devices::Host >::value, int WarpSize = 32 >
+class BiEllpack
+{
+   public:
+
+      using DeviceType = Device;
+      using IndexType = std::remove_const_t< Index >;
+      using OffsetsHolder = Containers::Vector< Index, DeviceType, IndexType, IndexAllocator >;
+      static constexpr bool getRowMajorOrder() { return RowMajorOrder; }
+      using ViewType = BiEllpackView< Device, Index, RowMajorOrder >;
+      template< typename Device_, typename Index_ >
+      using ViewTemplate = BiEllpackView< Device_, Index_, RowMajorOrder >;
+      using ConstViewType = BiEllpackView< Device, std::add_const_t< IndexType >, RowMajorOrder >;
+      using SegmentViewType = BiEllpackSegmentView< IndexType, RowMajorOrder >;
+
+      BiEllpack() = default;
+
+      BiEllpack( const Vector< IndexType, DeviceType, IndexType >& sizes );
+
+      BiEllpack( const BiEllpack& segments );
+
+      // NOTE(review): the parameter is a const rvalue reference, so its members cannot really be moved from.
+      BiEllpack( const BiEllpack&& segments );
+
+      static String getSerializationType();
+      static String getSegmentsType();
+
+      ViewType getView();
+      const ConstViewType getConstView() const;
+
+      /**
+       * \brief Number of segments.
+       */
+      __cuda_callable__
+      IndexType getSegmentsCount() const;
+
+      /**
+       * \brief Set sizes of particular segments.
+       */
+      template< typename SizesHolder = OffsetsHolder >
+      void setSegmentsSizes( const SizesHolder& sizes );
+
+      void reset();
+
+      IndexType getSegmentSize( const IndexType segmentIdx ) const;
+
+      /**
+       * \brief Number of segments.
+       */
+      __cuda_callable__
+      IndexType getSize() const;
+
+      __cuda_callable__
+      IndexType getStorageSize() const;
+
+      __cuda_callable__
+      IndexType getGlobalIndex( const IndexType segmentIdx, const IndexType localIdx ) const;
+
+      __cuda_callable__
+      SegmentViewType getSegmentView( const IndexType segmentIdx ) const;
+
+      /**
+       * \brief Go over all segments and for each segment element call
+       * function 'f' with arguments 'args'. The return type of 'f' is bool.
+       * When its true, the for-loop continues. Once 'f' returns false, the for-loop
+       * is terminated.
+       */
+      template< typename Function, typename... Args >
+      void forSegments( IndexType first, IndexType last, Function& f, Args... args ) const;
+
+      template< typename Function, typename... Args >
+      void forAll( Function& f, Args... args ) const;
+
+      /**
+       * \brief Go over all segments and perform a reduction in each of them.
+       */
+      template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args >
+      void segmentsReduction( IndexType first, IndexType last, Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const;
+
+      template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args >
+      void allReduction( Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const;
+
+      BiEllpack& operator=( const BiEllpack& source ) = default;
+
+      template< typename Device_, typename Index_, typename IndexAllocator_, bool RowMajorOrder_ >
+      BiEllpack& operator=( const BiEllpack< Device_, Index_, IndexAllocator_, RowMajorOrder_, WarpSize >& source );
+
+      void save( File& file ) const;
+      void load( File& file );
+
+      void printStructure( std::ostream& str ) const;
+
+      // TODO: nvcc needs this public because of lambda function used inside
+      template< typename SizesHolder = OffsetsHolder >
+      void performRowBubbleSort( const SizesHolder& segmentsSize );
+
+      // TODO: the same as above
+      template< typename SizesHolder = OffsetsHolder >
+      void computeColumnSizes( const SizesHolder& segmentsSizes );
+
+   protected:
+
+      static constexpr int getWarpSize() { return WarpSize; }
+
+      // Integer (floor) base-2 logarithm of WarpSize; std::log2 is not constexpr in standard C++.
+      static constexpr int getLogWarpSize()
+      {
+         int logSize = 0;
+         for( int n = WarpSize; n > 1; n /= 2 )
+            logSize++;
+         return logSize;
+      }
+
+      template< typename SizesHolder = OffsetsHolder >
+      void verifyRowPerm( const SizesHolder& segmentsSizes );
+      template< typename SizesHolder = OffsetsHolder >
+      void verifyRowLengths( const SizesHolder& segmentsSizes );
+
+      IndexType getStripLength( const IndexType stripIdx ) const;
+      IndexType getGroupLength( const IndexType strip, const IndexType group ) const;
+
+      IndexType size = 0, storageSize = 0;
+
+      IndexType virtualRows = 0;
+
+      OffsetsHolder rowPermArray;
+      OffsetsHolder groupPointers;
+
+      // TODO: Replace later
+      __cuda_callable__ Index power( const IndexType number, const IndexType exponent ) const
+      {
+         if( exponent < 0 )
+            return 0;
+         IndexType result = 1;
+         for( IndexType i = 0; i < exponent; i++ )
+            result *= number;
+         return result;
+      }
+
+      template< typename Device_, typename Index_, typename IndexAllocator_, bool RowMajorOrder_, int WarpSize_ >
+      friend class BiEllpack;
+};
+
+      } // namespace Segments
+   }  // namespace Containers
+} // namespace TNL
+
+#include <TNL/Containers/Segments/BiEllpack.hpp>
diff --git a/src/TNL/Containers/Segments/BiEllpack.hpp b/src/TNL/Containers/Segments/BiEllpack.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..032543d1ab51dc0fad970b2aee31ebe4cde70ab9
--- /dev/null
+++ b/src/TNL/Containers/Segments/BiEllpack.hpp
@@ -0,0 +1,581 @@
+/***************************************************************************
+                          BiEllpack.hpp -  description
+                             -------------------
+    begin                : Apr 5, 2020
+    copyright            : (C) 2020 by Tomas Oberhuber
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+#pragma once
+
+#include <math.h>
+#include <TNL/Containers/Vector.h>
+#include <TNL/Algorithms/ParallelFor.h>
+#include <TNL/Containers/Segments/BiEllpack.h>
+#include <TNL/Containers/Segments/Ellpack.h>
+
+namespace TNL {
+   namespace Containers {
+      namespace Segments {
+
+/**
+ * \brief Creates the segments and immediately sets their sizes from \e sizes.
+ */
+template< typename Device, typename Index, typename IndexAllocator,
+          bool RowMajorOrder, int WarpSize >
+BiEllpack< Device, Index, IndexAllocator, RowMajorOrder, WarpSize >::
+BiEllpack( const Vector< IndexType, DeviceType, IndexType >& sizes )
+{
+   setSegmentsSizes( sizes );
+}
+
+/**
+ * \brief Copy constructor; copies the scalar attributes and both index arrays.
+ */
+template< typename Device, typename Index, typename IndexAllocator,
+          bool RowMajorOrder, int WarpSize >
+BiEllpack< Device, Index, IndexAllocator, RowMajorOrder, WarpSize >::
+BiEllpack( const BiEllpack& other )
+: size( other.size ),
+  storageSize( other.storageSize ),
+  virtualRows( other.virtualRows ),
+  rowPermArray( other.rowPermArray ),
+  groupPointers( other.groupPointers )
+{
+}
+
+/**
+ * \brief Constructor taking a const rvalue reference.
+ * NOTE(review): the source is const, so std::move cannot steal from it — presumably the arrays are copied; confirm.
+ */
+template< typename Device, typename Index, typename IndexAllocator,
+          bool RowMajorOrder, int WarpSize >
+BiEllpack< Device, Index, IndexAllocator, RowMajorOrder, WarpSize >::
+BiEllpack( const BiEllpack&& other )
+: size( other.size ),
+  storageSize( other.storageSize ),
+  virtualRows( other.virtualRows ),
+  rowPermArray( std::move( other.rowPermArray ) ),
+  groupPointers( std::move( other.groupPointers ) )
+{}
+
+/**
+ * \brief Returns the type string used for serialization; the device is reported as "[any_device]".
+ */
+template< typename Device, typename Index, typename IndexAllocator,
+          bool RowMajorOrder, int WarpSize >
+String
+BiEllpack< Device, Index, IndexAllocator, RowMajorOrder, WarpSize >::
+getSerializationType()
+{
+   return "BiEllpack< [any_device], " + TNL::getSerializationType< IndexType >() + " >";
+}
+
+/**
+ * \brief Returns the segments type string; delegates to ViewType::getSegmentsType().
+ */
+template< typename Device, typename Index, typename IndexAllocator,
+          bool RowMajorOrder, int WarpSize >
+String
+BiEllpack< Device, Index, IndexAllocator, RowMajorOrder, WarpSize >::
+getSegmentsType()
+{
+   return ViewType::getSegmentsType();
+}
+
+/**
+ * \brief Returns a modifiable view built from the current sizes and from the
+ * views of both index arrays.
+ */
+template< typename Device, typename Index, typename IndexAllocator,
+          bool RowMajorOrder, int WarpSize >
+auto BiEllpack< Device, Index, IndexAllocator, RowMajorOrder, WarpSize >::
+getView() -> ViewType
+{
+   return ViewType( size, storageSize, virtualRows, rowPermArray.getView(), groupPointers.getView() );
+}
+
+/**
+ * \brief Returns a constant view built from the current sizes and the constant views of both index arrays.
+ */
+template< typename Device, typename Index, typename IndexAllocator,
+          bool RowMajorOrder, int WarpSize >
+auto BiEllpack< Device, Index, IndexAllocator, RowMajorOrder, WarpSize >::
+getConstView() const -> const ConstViewType
+{
+   return ConstViewType( size, storageSize, virtualRows, rowPermArray.getConstView(), groupPointers.getConstView() );
+}
+
+/**
+ * \brief Returns the number of segments; marked __cuda_callable__ to match its declaration.
+ */
+template< typename Device, typename Index, typename IndexAllocator,
+          bool RowMajorOrder, int WarpSize >
+__cuda_callable__ auto BiEllpack< Device, Index, IndexAllocator, RowMajorOrder, WarpSize >::
+getSegmentsCount() const -> IndexType
+{
+   return this->size;
+}
+
+template< typename Device,
+          typename Index,
+          typename IndexAllocator,
+          bool RowMajorOrder,
+          int WarpSize >
+   template< typename SizesHolder >
+void BiEllpack< Device, Index, IndexAllocator, RowMajorOrder, WarpSize >::
+performRowBubbleSort( const SizesHolder& segmentsSizes )
+{ // Ranks the segments of every strip by size (largest first) and stores the ranks in rowPermArray.
+   if( segmentsSizes.getSize() == 0 )
+      return;
+   // Initialize rowPermArray from the mapping i -> i, i.e. rank of segment i is i.
+   this->rowPermArray.evaluate( [] __cuda_callable__ ( const IndexType i ) -> IndexType { return i; } );
+   // The passes below access rowPermArray and segmentsSizes one element at a time via getElement/setElement.
+   //if( std::is_same< DeviceType, Devices::Host >::value )
+   {
+      IndexType strips = this->virtualRows / getWarpSize();
+      for( IndexType i = 0; i < strips; i++ )
+      {
+         IndexType begin = i * getWarpSize();
+         IndexType end = ( i + 1 ) * getWarpSize() - 1; // inclusive index of the last segment of the strip
+         if(this->getSize() - 1 < end)
+            end = this->getSize() - 1; // the last strip may hold fewer real segments
+         bool sorted = false;
+         IndexType permIndex1, permIndex2, offset = 0;
+         while( !sorted )
+         {
+            sorted = true;
+            for( IndexType j = begin + offset; j < end - offset; j++ ) // forward pass over ranks j, j + 1
+            {
+               for( IndexType k = begin; k < end + 1; k++ ) // locate the segments currently holding both ranks
+               {
+                  if( this->rowPermArray.getElement( k ) == j )
+                     permIndex1 = k;
+                  if( this->rowPermArray.getElement( k ) == j + 1 )
+                     permIndex2 = k;
+               }
+               if( segmentsSizes.getElement( permIndex1 ) < segmentsSizes.getElement( permIndex2 ) )
+               {
+                  IndexType temp = this->rowPermArray.getElement( permIndex1 ); // swap the two ranks
+                  this->rowPermArray.setElement( permIndex1, this->rowPermArray.getElement( permIndex2 ) );
+                  this->rowPermArray.setElement( permIndex2, temp );
+                  sorted = false;
+               }
+            }
+            for( IndexType j = end - 1 - offset; j > begin + offset; j-- ) // backward pass over ranks j, j - 1
+            {
+               for( IndexType k = begin; k < end + 1; k++ )
+               {
+                  if( this->rowPermArray.getElement( k ) == j )
+                     permIndex1 = k;
+                  if( this->rowPermArray.getElement( k ) == j - 1 )
+                     permIndex2 = k;
+               }
+               if( segmentsSizes.getElement( permIndex2 ) < segmentsSizes.getElement( permIndex1 ) )
+               {
+                  IndexType temp = this->rowPermArray.getElement( permIndex1 );
+                  this->rowPermArray.setElement( permIndex1, this->rowPermArray.getElement( permIndex2 ) );
+                  this->rowPermArray.setElement( permIndex2, temp );
+                  sorted = false;
+               }
+            }
+            offset++; // every round narrows the range of ranks scanned by both passes
+         }
+      }
+   }
+}
+
+template< typename Device,
+          typename Index,
+          typename IndexAllocator,
+          bool RowMajorOrder,
+          int WarpSize >
+   template< typename SizesHolder >
+void BiEllpack< Device, Index, IndexAllocator, RowMajorOrder, WarpSize >::
+computeColumnSizes( const SizesHolder& segmentsSizes )
+{ // Writes the size of every group of every strip into groupPointers (one lambda call per strip).
+   IndexType numberOfStrips = this->virtualRows / getWarpSize();
+   auto groupPointersView = this->groupPointers.getView();
+   auto segmentsPermutationView = this->rowPermArray.getView();
+   auto segmentsSizesView = segmentsSizes.getConstView();
+   const IndexType size = this->getSize();
+   auto createGroups = [=] __cuda_callable__ ( const IndexType strip ) mutable {
+      IndexType firstSegment = strip * getWarpSize();
+      IndexType groupBegin = strip * ( getLogWarpSize() + 1 );
+      IndexType emptyGroups = 0;
+      // Each strip owns getLogWarpSize() + 1 consecutive entries of groupPointers, starting at groupBegin.
+      // The last strip can be shorter: count the leading groups to skip
+      // (emptyGroups) and set their sizes to zero.
+      if( strip == numberOfStrips - 1 )
+      {
+         IndexType segmentsCount = size - firstSegment;
+         while( !( segmentsCount > TNL::pow( 2, getLogWarpSize() - 1 - emptyGroups ) ) )
+            emptyGroups++;
+         for( IndexType group = groupBegin; group < groupBegin + emptyGroups; group++ )
+            groupPointersView[ group ] = 0;
+      }
+      // Remaining groups: size = width * height; the width counts only columns not covered by previous groups.
+      IndexType allocatedColumns = 0;
+      for( IndexType groupIdx = emptyGroups; groupIdx < getLogWarpSize(); groupIdx++ )
+      {
+         IndexType segmentIdx = TNL::pow( 2, getLogWarpSize() - 1 - groupIdx ) - 1;
+         IndexType permSegm = 0;
+         while( segmentsPermutationView[ permSegm + firstSegment ] != segmentIdx + firstSegment ) // find the segment ranked segmentIdx in this strip
+            permSegm++;
+         const IndexType groupWidth = segmentsSizesView[ permSegm + firstSegment ] - allocatedColumns;
+         const IndexType groupHeight = TNL::pow( 2, getLogWarpSize() - groupIdx );
+         const IndexType groupSize = groupWidth * groupHeight;
+         allocatedColumns = segmentsSizesView[ permSegm + firstSegment ];
+         groupPointersView[ groupIdx + groupBegin ] = groupSize;
+      }
+   };
+   Algorithms::ParallelFor< DeviceType >::exec( ( IndexType ) 0, this->virtualRows / getWarpSize(), createGroups );
+}
+
+/**
+ * \brief Verifies that within each strip \e rowPermArray ranks the segments by non-increasing size.
+ * \throws std::logic_error when a rank is missing in some strip or the ordering is violated.
+ */
+template< typename Device, typename Index, typename IndexAllocator,
+          bool RowMajorOrder, int WarpSize >
+   template< typename SizesHolder >
+void BiEllpack< Device, Index, IndexAllocator, RowMajorOrder, WarpSize >::
+verifyRowPerm( const SizesHolder& segmentsSizes )
+{
+   bool ok = true;
+   const IndexType numberOfStrips = this->virtualRows / getWarpSize();
+   for( IndexType strip = 0; strip < numberOfStrips; strip++ )
+   {
+      const IndexType begin = strip * getWarpSize();
+      IndexType end = ( strip + 1 ) * getWarpSize();
+      if( this->getSize() < end )
+         end = this->getSize();
+      for( IndexType i = begin; i < end - 1; i++ )
+      {
+         IndexType permIndex1 = 0, permIndex2 = 0;
+         bool first = false, second = false;
+         for( IndexType j = begin; j < end; j++ )
+         {
+            const IndexType rank = this->rowPermArray.getElement( j );
+            if( rank == i )
+            {
+               permIndex1 = j;
+               first = true;
+            }
+            if( rank == i + 1 )
+            {
+               permIndex2 = j;
+               second = true;
+            }
+         }
+         if( !first || !second )
+            std::cout << "Wrong permutation!" << std::endl;
+         // Sizes are compared only when both ranks were found; a missing rank is a failure by itself.
+         if( !first || !second || segmentsSizes.getElement( permIndex1 ) < segmentsSizes.getElement( permIndex2 ) )
+            ok = false;
+      }
+   }
+   if( !ok )
+      throw( std::logic_error( "Segments permutation verification failed." ) );
+}
+
+/**
+ * \brief Verifies that every segment got at least as much storage as requested.
+ *
+ * The capacity of a segment is accumulated over its active groups: a group
+ * of length \e l with index \e g contributes l * 2^g elements.
+ *
+ * \throws std::logic_error when some capacity is smaller than the size in \e segmentsSizes.
+ */
+template< typename Device, typename Index, typename IndexAllocator,
+          bool RowMajorOrder, int WarpSize >
+   template< typename SizesHolder >
+void BiEllpack< Device, Index, IndexAllocator, RowMajorOrder, WarpSize >::
+verifyRowLengths( const SizesHolder& segmentsSizes )
+{
+   bool ok = true;
+   for( IndexType segmentIdx = 0; segmentIdx < this->getSize(); segmentIdx++ )
+   {
+      const IndexType strip = segmentIdx / getWarpSize();
+      const IndexType groupsCount =
+         details::BiEllpack< Index, Device, RowMajorOrder, WarpSize >::getActiveGroupsCount(
+            this->rowPermArray.getConstView(), segmentIdx );
+
+      // Only the total count matters here, so it is computed per group
+      // instead of stepping through the individual element positions.
+      IndexType rowLength = 0;
+      for( IndexType group = 0; group < groupsCount; group++ )
+      {
+         const IndexType groupLength = this->getGroupLength( strip, group );
+         if( groupLength > 0 )
+            rowLength += groupLength * this->power( 2, group );
+      }
+
+      if( segmentsSizes.getElement( segmentIdx ) > rowLength )
+         ok = false;
+   }
+
+   if( ! ok )
+      throw( std::logic_error( "Segments capacities verification failed." ) );
+}
+
+/**
+ * \brief Sets the sizes of all segments; the layout is computed on the host and copied to other devices.
+ */
+template< typename Device, typename Index, typename IndexAllocator,
+          bool RowMajorOrder, int WarpSize >
+   template< typename SizesHolder >
+void BiEllpack< Device, Index, IndexAllocator, RowMajorOrder, WarpSize >::
+setSegmentsSizes( const SizesHolder& segmentsSizes )
+{
+   if( std::is_same< DeviceType, Devices::Host >::value )
+   {
+      this->size = segmentsSizes.getSize();
+      if( this->size % getWarpSize() != 0 )
+         this->virtualRows = this->size + getWarpSize() - ( this->size % getWarpSize() );
+      else
+         this->virtualRows = this->size;
+      IndexType strips = this->virtualRows / getWarpSize();
+      this->rowPermArray.setSize( this->size );
+      this->groupPointers.setSize( strips * ( getLogWarpSize() + 1 ) + 1 );
+      this->groupPointers = 0;
+
+      this->performRowBubbleSort( segmentsSizes );
+      this->computeColumnSizes( segmentsSizes );
+
+      this->groupPointers.template scan< Algorithms::ScanType::Exclusive >();
+
+      this->verifyRowPerm( segmentsSizes );
+      this->verifyRowLengths( segmentsSizes );
+      this->storageSize =  getWarpSize() * this->groupPointers.getElement( strips * ( getLogWarpSize() + 1 ) );
+   }
+   else
+   {
+      // The host helper must use the same WarpSize, otherwise the cross-device assignment below does not match.
+      BiEllpack< Devices::Host, Index, typename Allocators::Default< Devices::Host >::template Allocator< IndexType >, RowMajorOrder, WarpSize > hostSegments;
+      Containers::Vector< IndexType, Devices::Host, IndexType > hostSegmentsSizes;
+      hostSegmentsSizes = segmentsSizes;
+      hostSegments.setSegmentsSizes( hostSegmentsSizes );
+      *this = hostSegments;
+   }
+}
+
+/**
+ * \brief Returns the segments to the empty state: zero sizes and both index arrays reset.
+ */
+template< typename Device, typename Index, typename IndexAllocator,
+          bool RowMajorOrder, int WarpSize >
+void
+BiEllpack< Device, Index, IndexAllocator, RowMajorOrder, WarpSize >::
+reset()
+{
+   size = 0;
+   storageSize = 0;
+   virtualRows = 0;
+   this->rowPermArray.reset();
+   this->groupPointers.reset();
+}
+
+/**
+ * \brief Returns the size of segment \e segmentIdx; WarpSize is forwarded to the details helper.
+ */
+template< typename Device, typename Index, typename IndexAllocator,
+          bool RowMajorOrder, int WarpSize >
+auto BiEllpack< Device, Index, IndexAllocator, RowMajorOrder, WarpSize >::
+getSegmentSize( const IndexType segmentIdx ) const -> IndexType
+{
+   return details::BiEllpack< IndexType, DeviceType, RowMajorOrder, WarpSize >::getSegmentSize(
+      rowPermArray.getConstView(),
+      groupPointers.getConstView(),
+      segmentIdx );
+}
+
+/**
+ * \brief Returns the stored number of segments.
+ */
+template< typename Device, typename Index, typename IndexAllocator,
+          bool RowMajorOrder, int WarpSize >
+__cuda_callable__ auto BiEllpack< Device, Index, IndexAllocator, RowMajorOrder, WarpSize >::
+getSize() const -> IndexType
+{
+   return size;
+}
+
+/**
+ * \brief Returns the stored storage size, as computed by setSegmentsSizes().
+ */
+template< typename Device, typename Index, typename IndexAllocator,
+          bool RowMajorOrder, int WarpSize >
+__cuda_callable__ auto BiEllpack< Device, Index, IndexAllocator, RowMajorOrder, WarpSize >::
+getStorageSize() const -> IndexType
+{
+   return storageSize;
+}
+
+/**
+ * \brief Maps ( segmentIdx, localIdx ) to a global index; WarpSize is forwarded to the details helper.
+ */
+template< typename Device, typename Index, typename IndexAllocator,
+          bool RowMajorOrder, int WarpSize >
+__cuda_callable__ auto BiEllpack< Device, Index, IndexAllocator, RowMajorOrder, WarpSize >::
+getGlobalIndex( const IndexType segmentIdx, const IndexType localIdx ) const -> IndexType
+{
+   return details::BiEllpack< IndexType, DeviceType, RowMajorOrder, WarpSize >::getGlobalIndex(
+      rowPermArray.getConstView(),
+      groupPointers.getConstView(),
+      segmentIdx,
+      localIdx );
+}
+
+template< typename Device,
+          typename Index,
+          typename IndexAllocator,
+          bool RowMajorOrder,
+          int WarpSize >
+__cuda_callable__ auto BiEllpack< Device, Index, IndexAllocator, RowMajorOrder, WarpSize >::
+getSegmentView( const IndexType segmentIdx ) const -> SegmentViewType
+{ // FIXME(review): empty body — nothing is returned although the return type is SegmentViewType.
+}
+
+/**
+ * \brief Forwards the traversal of segments [ first, last ) to the constant view of these segments.
+ */
+template< typename Device, typename Index, typename IndexAllocator,
+          bool RowMajorOrder, int WarpSize >
+   template< typename Function, typename... Args >
+void
+BiEllpack< Device, Index, IndexAllocator, RowMajorOrder, WarpSize >::
+forSegments( IndexType first, IndexType last, Function& f, Args... args ) const
+{
+   getConstView().forSegments( first, last, f, args... );
+}
+
+/**
+ * \brief Calls forSegments() for the full range of segments, 0 to getSegmentsCount().
+ */
+template< typename Device, typename Index, typename IndexAllocator,
+          bool RowMajorOrder, int WarpSize >
+   template< typename Function, typename... Args >
+void
+BiEllpack< Device, Index, IndexAllocator, RowMajorOrder, WarpSize >::
+forAll( Function& f, Args... args ) const
+{
+   forSegments( 0, getSegmentsCount(), f, args... );
+}
+
+/**
+ * \brief Forwards the reduction over segments [ first, last ) to the constant view of these segments.
+ */
+template< typename Device, typename Index, typename IndexAllocator,
+          bool RowMajorOrder, int WarpSize >
+   template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args >
+void
+BiEllpack< Device, Index, IndexAllocator, RowMajorOrder, WarpSize >::
+segmentsReduction( IndexType first, IndexType last, Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const
+{
+   getConstView().segmentsReduction( first, last, fetch, reduction, keeper, zero, args... );
+}
+
+/**
+ * \brief Calls segmentsReduction() for the full range of segments, 0 to getSegmentsCount().
+ */
+template< typename Device, typename Index, typename IndexAllocator,
+          bool RowMajorOrder, int WarpSize >
+   template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args >
+void
+BiEllpack< Device, Index, IndexAllocator, RowMajorOrder, WarpSize >::
+allReduction( Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const
+{
+   segmentsReduction( 0, getSegmentsCount(), fetch, reduction, keeper, zero, args... );
+}
+
+/**
+ * \brief Assigns from segments with other template parameters but the same WarpSize, member by member.
+ */
+template< typename Device, typename Index, typename IndexAllocator,
+          bool RowMajorOrder, int WarpSize >
+   template< typename Device_, typename Index_, typename IndexAllocator_, bool RowMajorOrder_ >
+BiEllpack< Device, Index, IndexAllocator, RowMajorOrder, WarpSize >&
+BiEllpack< Device, Index, IndexAllocator, RowMajorOrder, WarpSize >::
+operator=( const BiEllpack< Device_, Index_, IndexAllocator_, RowMajorOrder_, WarpSize >& source )
+{
+   size = source.size;
+   storageSize = source.storageSize;
+   virtualRows = source.virtualRows;
+   rowPermArray = source.rowPermArray;
+   groupPointers = source.groupPointers;
+   return *this;
+}
+
+/**
+ * \brief Writes size, storageSize and virtualRows, then both index arrays, into \e file.
+ */
+template< typename Device, typename Index, typename IndexAllocator,
+          bool RowMajorOrder, int WarpSize >
+void
+BiEllpack< Device, Index, IndexAllocator, RowMajorOrder, WarpSize >::
+save( File& file ) const
+{
+   file.save( &size );
+   file.save( &storageSize );
+   file.save( &virtualRows );
+   file << rowPermArray
+        << groupPointers;
+}
+
+/**
+ * \brief Reads size, storageSize and virtualRows, then both index arrays, from \e file (same order as save()).
+ */
+template< typename Device, typename Index, typename IndexAllocator,
+          bool RowMajorOrder, int WarpSize >
+void
+BiEllpack< Device, Index, IndexAllocator, RowMajorOrder, WarpSize >::
+load( File& file )
+{
+   file.load( &size );
+   file.load( &storageSize );
+   file.load( &virtualRows );
+   file >> rowPermArray
+        >> groupPointers;
+}
+
+/**
+ * \brief Prints the structure via the constant view; this class has no \e view member to call directly.
+ */
+template< typename Device, typename Index, typename IndexAllocator,
+          bool RowMajorOrder, int WarpSize >
+void
+BiEllpack< Device, Index, IndexAllocator, RowMajorOrder, WarpSize >::
+printStructure( std::ostream& str ) const
+{
+   this->getConstView().printStructure( str );
+}
+
+/**
+ * \brief Returns the length of strip \e stripIdx as computed by the details helper from groupPointers.
+ */
+template< typename Device, typename Index, typename IndexAllocator,
+          bool RowMajorOrder, int WarpSize >
+auto BiEllpack< Device, Index, IndexAllocator, RowMajorOrder, WarpSize >::
+getStripLength( const IndexType stripIdx ) const -> IndexType
+{
+   return details::BiEllpack< Index, Device, RowMajorOrder, WarpSize >::getStripLength( groupPointers.getConstView(), stripIdx );
+}
+
+/**
+ * \brief Returns the difference of the two consecutive groupPointers entries of the given group.
+ */
+template< typename Device, typename Index, typename IndexAllocator,
+          bool RowMajorOrder, int WarpSize >
+auto BiEllpack< Device, Index, IndexAllocator, RowMajorOrder, WarpSize >::
+getGroupLength( const IndexType strip, const IndexType group ) const -> IndexType
+{
+   const IndexType groupIdx = strip * ( getLogWarpSize() + 1 ) + group;
+   return this->groupPointers.getElement( groupIdx + 1 ) - this->groupPointers.getElement( groupIdx );
+}
+
+      } // namespace Segments
+   }  // namespace Containers
+} // namespace TNL
diff --git a/src/TNL/Containers/Segments/BiEllpackSegmentView.h b/src/TNL/Containers/Segments/BiEllpackSegmentView.h
new file mode 100644
index 0000000000000000000000000000000000000000..5f5e720491f2372ad1207ca5c8548eb625cf81d0
--- /dev/null
+++ b/src/TNL/Containers/Segments/BiEllpackSegmentView.h
@@ -0,0 +1,90 @@
+/***************************************************************************
+                          BiEllpackSegmentView.h -  description
+                             -------------------
+    begin                : Apr 7, 2020
+    copyright            : (C) 2020 by Tomas Oberhuber
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+#pragma once
+
+#include <math.h>
+#include <TNL/Containers/StaticVector.h>
+
+namespace TNL {
+   namespace Containers {
+      namespace Segments {
+
+/**
+ * \brief View of one segment of the BiEllpack segments; it translates local
+ * indexes within the segment to global indexes.
+ */
+template< typename Index, bool RowMajorOrder = false, int WarpSize = 32 >
+class BiEllpackSegmentView
+{
+   public:
+
+      static constexpr int getWarpSize() { return WarpSize; }
+
+      // The logarithm is hard-coded (nvcc does not allow constexpr log2), hence only WarpSize == 32 is accepted. TODO: return std::log2( WarpSize );
+      static constexpr int getLogWarpSize() { static_assert( WarpSize == 32, "nvcc does not allow constexpr log2" ); return 5; }
+
+      static constexpr int getGroupsCount() { return getLogWarpSize() + 1; }
+
+      using IndexType = Index;
+      using GroupsWidthType = Containers::StaticVector< getGroupsCount(), IndexType >;
+
+      /**
+       * \brief Constructor.
+       *
+       * \param offset is offset of the first group of the strip the segment belongs to.
+       * \param inStripIdx is index of the segment within its strip.
+       * \param groupsWidth is a static vector containing widths of the strip groups;
+       *        the segment size is computed as the sum of these widths.
+       */
+      __cuda_callable__
+      BiEllpackSegmentView( const IndexType offset,
+                            const IndexType inStripIdx,
+                            const GroupsWidthType& groupsWidth )
+      : groupOffset( offset ), inStripIdx( inStripIdx ), segmentSize( TNL::sum( groupsWidth ) ), groupsWidth( groupsWidth ) {}
+
+      /// \brief Returns the segment size, i.e. the sum of the group widths passed to the constructor.
+      __cuda_callable__
+      IndexType getSize() const
+      {
+         return this->segmentSize;
+      }
+
+      /**
+       * \brief Returns the global index of element \e localIdx of the segment; \e localIdx must be less than getSize().
+       */
+      __cuda_callable__
+      IndexType getGlobalIndex( IndexType localIdx ) const
+      {
+         TNL_ASSERT_LT( localIdx, segmentSize, "Local index exceeds segment bounds." );
+         IndexType groupIdx( 0 ), offset( groupOffset ), groupHeight( getWarpSize() );
+         // Skip the groups preceding the one holding localIdx; never step behind the last group of groupsWidth.
+         while( groupIdx < getGroupsCount() - 1 && localIdx >= groupsWidth[ groupIdx ] )
+         {
+            localIdx -= groupsWidth[ groupIdx ];
+            offset += groupsWidth[ groupIdx++ ] * groupHeight;
+            groupHeight /= 2;   // each group is half as high as the previous one
+         }
+         TNL_ASSERT_LE( groupIdx, TNL::log2( getWarpSize() - inStripIdx + 1 ), "Local index exceeds segment bounds." );
+         if( RowMajorOrder )
+            return offset + inStripIdx * groupsWidth[ groupIdx ] + localIdx;
+         return offset + inStripIdx + localIdx * groupHeight;
+      }
+
+      protected:
+
+         IndexType groupOffset, inStripIdx, segmentSize;
+
+         GroupsWidthType groupsWidth;
+};
+
+      } //namespace Segments
+   } //namespace Containers
+} //namespace TNL
diff --git a/src/TNL/Containers/Segments/BiEllpackView.h b/src/TNL/Containers/Segments/BiEllpackView.h
new file mode 100644
index 0000000000000000000000000000000000000000..e4807bef81157da338ab015e230cc92853de8a57
--- /dev/null
+++ b/src/TNL/Containers/Segments/BiEllpackView.h
@@ -0,0 +1,209 @@
+/***************************************************************************
+                          BiEllpackView.h -  description
+                             -------------------
+    begin                : Apr 5, 2020
+    copyright            : (C) 2020 by Tomas Oberhuber
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+#pragma once
+
+#include <type_traits>
+
+#include <TNL/Containers/Vector.h>
+#include <TNL/Containers/Segments/BiEllpackSegmentView.h>
+#include <TNL/Containers/Segments/details/BiEllpack.h>
+
+namespace TNL {
+   namespace Containers {
+      namespace Segments {
+
+
+/// \brief View of BiEllpack segments; it keeps the sizes and views of the row permutation and group pointers.
+template< typename Device, typename Index,
+          bool RowMajorOrder = std::is_same< Device, Devices::Host >::value,
+          int WarpSize = 32 >
+class BiEllpackView
+{
+   public:
+
+      using DeviceType = Device;
+      using IndexType = std::remove_const_t< Index >;
+      using OffsetsView = typename Containers::VectorView< Index, DeviceType, IndexType >;
+      using ConstOffsetsView = typename OffsetsView::ConstViewType;
+      using ViewType = BiEllpackView;
+      template< typename Device_, typename Index_ >
+      using ViewTemplate = BiEllpackView< Device_, Index_ >;
+      using ConstViewType = BiEllpackView< Device, std::add_const_t< Index >, RowMajorOrder, WarpSize >; // same layout and warp size as this view
+      using SegmentViewType = BiEllpackSegmentView< IndexType, RowMajorOrder >;
+
+      __cuda_callable__
+      BiEllpackView() = default;
+
+      __cuda_callable__
+      BiEllpackView( const IndexType size,
+                     const IndexType storageSize,
+                     const IndexType virtualRows,
+                     const OffsetsView& rowPermArray,
+                     const OffsetsView& groupPointers );
+
+      __cuda_callable__
+      BiEllpackView( const IndexType size,
+                     const IndexType storageSize,
+                     const IndexType virtualRows,
+                     const OffsetsView&& rowPermArray,
+                     const OffsetsView&& groupPointers );
+
+      __cuda_callable__
+      BiEllpackView( const BiEllpackView& bi_ellpack_view );
+
+      __cuda_callable__
+      BiEllpackView( const BiEllpackView&& bi_ellpack_view );
+
+      static String getSerializationType();
+
+      static String getSegmentsType();
+
+      __cuda_callable__
+      ViewType getView();
+
+      __cuda_callable__
+      const ConstViewType getConstView() const;
+
+      /**
+       * \brief Number of segments.
+       */
+      __cuda_callable__
+      IndexType getSegmentsCount() const;
+
+      /***
+       * \brief Returns size of the segment number \e segmentIdx
+       */
+      __cuda_callable__
+      IndexType getSegmentSize( const IndexType segmentIdx ) const;
+
+      /***
+       * \brief Returns the same value as getSegmentsCount(). NOTE(review): formerly documented as the number of elements managed by all segments - confirm.
+       */
+      __cuda_callable__
+      IndexType getSize() const;
+
+      /***
+       * \brief Returns number of elements that needs to be allocated.
+       */
+      __cuda_callable__
+      IndexType getStorageSize() const;
+
+      __cuda_callable__
+      IndexType getGlobalIndex( const Index segmentIdx, const Index localIdx ) const;
+
+      __cuda_callable__
+      SegmentViewType getSegmentView( const IndexType segmentIdx ) const;
+
+      /***
+       * \brief Go over the segments in the range [first, last) and for each segment
+       * element call 'f( segmentIdx, localIdx, globalIdx, compute )'. The last
+       * argument is a reference to bool; once 'f' sets it to false, the traversal
+       * of the segment is terminated.
+       */
+      template< typename Function, typename... Args >
+      void forSegments( IndexType first, IndexType last, Function& f, Args... args ) const;
+
+      /// \brief Calls forSegments for all segments.
+      template< typename Function, typename... Args >
+      void forAll( Function& f, Args... args ) const;
+
+      /***
+       * \brief Go over all segments and perform a reduction in each of them.
+       */
+      template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args >
+      void segmentsReduction( IndexType first, IndexType last, Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const;
+
+      template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args >
+      void allReduction( Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const;
+
+      BiEllpackView& operator=( const BiEllpackView& view );
+
+      void save( File& file ) const;
+
+      void load( File& file ); // NOTE(review): no definition is visible in BiEllpackView.hpp - confirm it is defined or drop the declaration.
+
+      void printStructure( std::ostream& str ) const;
+
+   protected:
+
+      static constexpr int getWarpSize() { return WarpSize; }
+      // Integer floor( log2( WarpSize ) ); std::log2 is not constexpr and <cmath> is not included here.
+      static constexpr int getLogWarpSize() { int log = 0; for( int w = WarpSize; w > 1; w /= 2 ) log++; return log; }
+
+      IndexType size = 0, storageSize = 0; // size is the value returned by getSegmentsCount()
+
+      IndexType virtualRows = 0;
+
+      OffsetsView rowPermArray;
+
+      OffsetsView groupPointers;
+
+#ifdef HAVE_CUDA
+      template< typename Fetch,
+                typename Reduction,
+                typename ResultKeeper,
+                typename Real,
+                int BlockDim,
+                typename... Args >
+      __device__
+      void segmentsReductionKernelWithAllParameters( IndexType gridIdx,
+                                                     IndexType first,
+                                                     IndexType last,
+                                                     Fetch fetch,
+                                                     Reduction reduction,
+                                                     ResultKeeper keeper,
+                                                     Real zero,
+                                                     Args... args ) const;
+
+      template< typename Fetch,
+                typename Reduction,
+                typename ResultKeeper,
+                typename Real_,
+                int BlockDim,
+                typename... Args >
+      __device__
+      void segmentsReductionKernel( IndexType gridIdx,
+                                    IndexType first,
+                                    IndexType last,
+                                    Fetch fetch,
+                                    Reduction reduction,
+                                    ResultKeeper keeper,
+                                    Real_ zero,
+                                    Args... args ) const;
+
+      template< typename View_,
+                typename Index_,
+                typename Fetch_,
+                typename Reduction_,
+                typename ResultKeeper_,
+                typename Real_,
+                int BlockDim,
+                typename... Args_ >
+      friend __global__
+      void BiEllpackSegmentsReductionKernel( View_ biEllpack,
+                                             Index_ gridIdx,
+                                             Index_ first,
+                                             Index_ last,
+                                             Fetch_ fetch,
+                                             Reduction_ reduction,
+                                             ResultKeeper_ keeper,
+                                             Real_ zero,
+                                             Args_... args );
+
+      template< typename Index_, typename Fetch_, int BlockDim_, int WarpSize_, bool B_ >
+      friend struct details::BiEllpackSegmentsReductionDispatcher;
+#endif
+};
+      } // namespace Segments
+   }  // namespace Containers
+} // namespace TNL
+
+#include <TNL/Containers/Segments/BiEllpackView.hpp>
diff --git a/src/TNL/Containers/Segments/BiEllpackView.hpp b/src/TNL/Containers/Segments/BiEllpackView.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..1cb5ce7c0b700a9263aae1fcd58190b83fa4bdc0
--- /dev/null
+++ b/src/TNL/Containers/Segments/BiEllpackView.hpp
@@ -0,0 +1,620 @@
+/***************************************************************************
+                          BiEllpackView.hpp -  description
+                             -------------------
+    begin                : Apr 5, 2020
+    copyright            : (C) 2020 by Tomas Oberhuber
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+#pragma once
+
+#include <TNL/Containers/Vector.h>
+#include <TNL/Algorithms/ParallelFor.h>
+#include <TNL/Containers/Segments/BiEllpackView.h>
+#include <TNL/Containers/Segments/details/LambdaAdapter.h>
+//#include <TNL/Containers/Segments/details/BiEllpack.h>
+
+namespace TNL {
+   namespace Containers {
+      namespace Segments {
+
+/**
+ * \brief Creates the view from the sizes and from views of the row permutation and group pointers.
+ *
+ * The views \e rowPermArray and \e groupPointers are copied into the members of the same name.
+ */
+template< typename Device, typename Index, bool RowMajorOrder, int WarpSize >
+__cuda_callable__
+BiEllpackView< Device, Index, RowMajorOrder, WarpSize >::
+BiEllpackView( const IndexType size,
+               const IndexType storageSize,
+               const IndexType virtualRows,
+               const OffsetsView& rowPermArray,
+               const OffsetsView& groupPointers )
+: size( size ), storageSize( storageSize ), virtualRows( virtualRows ),
+  rowPermArray( rowPermArray ),
+  groupPointers( groupPointers )
+{
+}
+
+/**
+ * \brief Creates the view from the sizes and from temporary views of the row permutation and group pointers.
+ *
+ * NOTE(review): the views are const rvalue references, so std::move yields const rvalues - presumably they get copied; confirm.
+ */
+template< typename Device, typename Index, bool RowMajorOrder, int WarpSize >
+__cuda_callable__
+BiEllpackView< Device, Index, RowMajorOrder, WarpSize >::
+BiEllpackView( const IndexType size,
+               const IndexType storageSize,
+               const IndexType virtualRows,
+               const OffsetsView&& rowPermArray,
+               const OffsetsView&& groupPointers )
+: size( size ), storageSize( storageSize ), virtualRows( virtualRows ),
+  rowPermArray( std::move( rowPermArray ) ),
+  groupPointers( std::move( groupPointers ) )
+{
+}
+
+/**
+ * \brief Copy constructor; copies the sizes and the views held by \e bi_ellpack_view.
+ */
+template< typename Device, typename Index, bool RowMajorOrder, int WarpSize >
+__cuda_callable__
+BiEllpackView< Device, Index, RowMajorOrder, WarpSize >::
+BiEllpackView( const BiEllpackView& bi_ellpack_view )
+: size( bi_ellpack_view.size ),
+  storageSize( bi_ellpack_view.storageSize ),
+  virtualRows( bi_ellpack_view.virtualRows ),
+  rowPermArray( bi_ellpack_view.rowPermArray ),
+  groupPointers( bi_ellpack_view.groupPointers )
+{
+}
+
+/**
+ * \brief Constructor from a const rvalue view. NOTE(review): std::move on members of a const object yields const rvalues - presumably copies; confirm.
+ */
+template< typename Device, typename Index, bool RowMajorOrder, int WarpSize >
+__cuda_callable__
+BiEllpackView< Device, Index, RowMajorOrder, WarpSize >::
+BiEllpackView( const BiEllpackView&& bi_ellpack_view )
+: size( bi_ellpack_view.size ),
+  storageSize( bi_ellpack_view.storageSize ),
+  virtualRows( bi_ellpack_view.virtualRows ),
+  rowPermArray( std::move( bi_ellpack_view.rowPermArray ) ),
+  groupPointers( std::move( bi_ellpack_view.groupPointers ) )
+{
+}
+
+/**
+ * \brief Returns the serialization type name; it names the index type and uses the placeholder [any_device] for the device.
+ */
+template< typename Device, typename Index, bool RowMajorOrder, int WarpSize >
+String
+BiEllpackView< Device, Index, RowMajorOrder, WarpSize >::
+getSerializationType()
+{
+   return "BiEllpack< [any_device], " + TNL::getSerializationType< IndexType >() + " >";
+}
+
+/**
+ * \brief Returns the name of the segments type, which is "BiEllpack" regardless of the template parameters.
+ */
+template< typename Device, typename Index, bool RowMajorOrder, int WarpSize >
+String
+BiEllpackView< Device, Index, RowMajorOrder, WarpSize >::
+getSegmentsType()
+{
+   return "BiEllpack";
+}
+
+/**
+ * \brief Returns a view assembled from this view's sizes and from the views of both arrays.
+ */
+template< typename Device, typename Index, bool RowMajorOrder, int WarpSize >
+__cuda_callable__ auto BiEllpackView< Device, Index, RowMajorOrder, WarpSize >::
+getView() -> ViewType
+{
+   return ViewType( size, storageSize, virtualRows,
+                    rowPermArray.getView(),
+                    groupPointers.getView() );
+}
+
+/// \brief Returns a constant view assembled from this view's sizes and the constant views of both arrays.
+template< typename Device, typename Index, bool RowMajorOrder, int WarpSize >
+__cuda_callable__ auto BiEllpackView< Device, Index, RowMajorOrder, WarpSize >::
+getConstView() const -> const ConstViewType
+{
+   return ConstViewType( size, storageSize, virtualRows,
+                         rowPermArray.getConstView(),
+                         groupPointers.getConstView() );
+}
+
+/**
+ * \brief Returns the number of segments, i.e. the member \e size.
+ */
+template< typename Device, typename Index, bool RowMajorOrder, int WarpSize >
+__cuda_callable__ auto BiEllpackView< Device, Index, RowMajorOrder, WarpSize >::
+getSegmentsCount() const -> IndexType
+{
+   return size;
+}
+
+/**
+ * \brief Returns the size of the segment \e segmentIdx.
+ *
+ * details::BiEllpack::getSegmentSizeDirect is called for Devices::Host and in
+ * CUDA device code, details::BiEllpack::getSegmentSize in all other cases, so a
+ * value is returned for every device type.
+ */
+template< typename Device, typename Index, bool RowMajorOrder, int WarpSize >
+__cuda_callable__ auto BiEllpackView< Device, Index, RowMajorOrder, WarpSize >::
+getSegmentSize( const IndexType segmentIdx ) const -> IndexType
+{
+   using Details = details::BiEllpack< IndexType, DeviceType, RowMajorOrder, WarpSize >;
+#ifdef __CUDA_ARCH__
+   return Details::getSegmentSizeDirect(
+      rowPermArray,
+      groupPointers,
+      segmentIdx );
+#else
+   if( std::is_same< DeviceType, Devices::Host >::value )
+      return Details::getSegmentSizeDirect(
+         rowPermArray,
+         groupPointers,
+         segmentIdx );
+   // Fallback for Devices::Cuda called from the host and for any other device.
+   return Details::getSegmentSize( rowPermArray, groupPointers, segmentIdx );
+#endif
+}
+
+/**
+ * \brief Returns the member \e size, i.e. the same value as getSegmentsCount().
+ */
+template< typename Device, typename Index, bool RowMajorOrder, int WarpSize >
+__cuda_callable__ auto BiEllpackView< Device, Index, RowMajorOrder, WarpSize >::
+getSize() const -> IndexType
+{
+   return size;
+}
+
+/**
+ * \brief Returns the member \e storageSize passed to the constructor.
+ */
+template< typename Device, typename Index, bool RowMajorOrder, int WarpSize >
+__cuda_callable__ auto BiEllpackView< Device, Index, RowMajorOrder, WarpSize >::
+getStorageSize() const -> IndexType
+{
+   return storageSize;
+}
+
+/**
+ * \brief Returns the global index of the element \e localIdx of the segment \e segmentIdx.
+ *
+ * details::BiEllpack::getGlobalIndexDirect is called for Devices::Host and in
+ * CUDA device code, details::BiEllpack::getGlobalIndex in all other cases, so a
+ * value is returned for every device type.
+ */
+template< typename Device, typename Index, bool RowMajorOrder, int WarpSize >
+__cuda_callable__ auto BiEllpackView< Device, Index, RowMajorOrder, WarpSize >::
+getGlobalIndex( const Index segmentIdx, const Index localIdx ) const -> IndexType
+{
+   using Details = details::BiEllpack< IndexType, DeviceType, RowMajorOrder, WarpSize >;
+#ifdef __CUDA_ARCH__
+   return Details::getGlobalIndexDirect(
+      rowPermArray,
+      groupPointers,
+      segmentIdx,
+      localIdx );
+#else
+   if( std::is_same< DeviceType, Devices::Host >::value )
+      return Details::getGlobalIndexDirect(
+         rowPermArray,
+         groupPointers,
+         segmentIdx,
+         localIdx );
+   // Fallback for Devices::Cuda called from the host and for any other device.
+   return Details::getGlobalIndex(
+      rowPermArray, groupPointers, segmentIdx, localIdx );
+#endif
+}
+
+/**
+ * \brief Returns the segment view of the segment \e segmentIdx.
+ *
+ * details::BiEllpack::getSegmentViewDirect is called for Devices::Host and in
+ * CUDA device code, details::BiEllpack::getSegmentView in all other cases, so a
+ * value is returned for every device type.
+ */
+template< typename Device, typename Index, bool RowMajorOrder, int WarpSize >
+__cuda_callable__
+auto
+BiEllpackView< Device, Index, RowMajorOrder, WarpSize >::
+getSegmentView( const IndexType segmentIdx ) const -> SegmentViewType
+{
+   using Details = details::BiEllpack< IndexType, DeviceType, RowMajorOrder, WarpSize >;
+#ifdef __CUDA_ARCH__
+   return Details::getSegmentViewDirect(
+      rowPermArray,
+      groupPointers,
+      segmentIdx );
+#else
+   if( std::is_same< DeviceType, Devices::Host >::value )
+      return Details::getSegmentViewDirect(
+         rowPermArray,
+         groupPointers,
+         segmentIdx );
+   // Fallback for Devices::Cuda called from the host and for any other device.
+   return Details::getSegmentView( rowPermArray, groupPointers, segmentIdx );
+#endif
+}
+
+/**
+ * \brief Calls \e f for each element of the segments in the range [ \e first, \e last ).
+ *
+ * \e f is called as f( segmentIdx, localIdx, globalIdx, compute ); when it sets
+ * \e compute to false, the traversal of the segment stops immediately.
+ */
+template< typename Device, typename Index, bool RowMajorOrder, int WarpSize >
+   template< typename Function, typename... Args >
+void
+BiEllpackView< Device, Index, RowMajorOrder, WarpSize >::
+forSegments( IndexType first, IndexType last, Function& f, Args... args ) const
+{
+   const auto segmentsPermutationView = this->rowPermArray.getConstView();
+   const auto groupPointersView = this->groupPointers.getConstView();
+   auto work = [=] __cuda_callable__ ( IndexType segmentIdx, Args... args ) mutable {
+      // A strip consists of getWarpSize() consecutive segments and owns
+      // getLogWarpSize() + 1 consecutive entries of the group pointers.
+      const IndexType strip = segmentIdx / getWarpSize();
+      const IndexType firstGroupInStrip = strip * ( getLogWarpSize() + 1 );
+      const IndexType rowStripPerm = segmentsPermutationView[ segmentIdx ] - strip * getWarpSize();
+      const IndexType groupsCount = details::BiEllpack< IndexType, DeviceType, RowMajorOrder, getWarpSize() >::getActiveGroupsCountDirect( segmentsPermutationView, segmentIdx );
+      // The first group is getWarpSize() rows high, each following one is half as high.
+      IndexType groupHeight = getWarpSize();
+      bool compute( true );
+      IndexType localIdx( 0 );
+      for( IndexType groupIdx = firstGroupInStrip; groupIdx < firstGroupInStrip + groupsCount && compute; groupIdx++ )
+      {
+         IndexType groupOffset = groupPointersView[ groupIdx ];
+         const IndexType groupSize = groupPointersView[ groupIdx + 1 ] - groupOffset;
+         if( groupSize )
+         {
+            const IndexType groupWidth = groupSize / groupHeight;
+            // Checking compute here stops right after f asks for it and not only at the end of the group.
+            for( IndexType i = 0; i < groupWidth && compute; i++ )
+            {
+               // Row-major: elements of a row are adjacent; otherwise they are groupHeight apart.
+               if( RowMajorOrder )
+                  f( segmentIdx, localIdx, groupOffset + rowStripPerm * groupWidth + i, compute );
+               else
+                  f( segmentIdx, localIdx, groupOffset + rowStripPerm + i * groupHeight, compute );
+               localIdx++;
+            }
+         }
+         groupHeight /= 2;
+      }
+   };
+   Algorithms::ParallelFor< DeviceType >::exec( first, last , work, args... );
+}
+
+/**
+ * \brief Calls forSegments for all segments, i.e. for the range [ 0, getSegmentsCount() ).
+ */
+template< typename Device, typename Index, bool RowMajorOrder, int WarpSize >
+   template< typename Function, typename... Args >
+void
+BiEllpackView< Device, Index, RowMajorOrder, WarpSize >::
+forAll( Function& f, Args... args ) const
+{
+   forSegments( 0, getSegmentsCount(), f, args... );
+}
+
+/**
+ * \brief Reduces the elements of each segment in [ first, last ) and passes the result to \e keeper.
+ */
+template< typename Device, typename Index, bool RowMajorOrder, int WarpSize >
+   template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args >
+void
+BiEllpackView< Device, Index, RowMajorOrder, WarpSize >::
+segmentsReduction( IndexType first, IndexType last, Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const
+{
+   using RealType = typename details::FetchLambdaAdapter< IndexType, Fetch >::ReturnType;
+   if( std::is_same< DeviceType, Devices::Host >::value )
+      for( IndexType segmentIdx = first; segmentIdx < last; segmentIdx++ )
+      {
+         const IndexType stripIdx = segmentIdx / getWarpSize();
+         const IndexType groupIdx = stripIdx * ( getLogWarpSize() + 1 );
+         const IndexType inStripIdx = rowPermArray[ segmentIdx ] - stripIdx * getWarpSize();
+         const IndexType groupsCount = details::BiEllpack< IndexType, DeviceType, RowMajorOrder, getWarpSize() >::getActiveGroupsCount( rowPermArray, segmentIdx );
+         IndexType globalIdx = groupPointers[ groupIdx ];
+         IndexType groupHeight = getWarpSize();
+         IndexType localIdx( 0 );
+         RealType aux( zero );
+         bool compute( true );
+         for( IndexType group = 0; group < groupsCount && compute; group++ )
+         {
+            const IndexType groupSize = details::BiEllpack< IndexType, DeviceType, RowMajorOrder, getWarpSize() >::getGroupSize( groupPointers, stripIdx, group );
+            IndexType groupWidth = groupSize / groupHeight;
+            const IndexType globalIdxBack = globalIdx;
+            if( RowMajorOrder )
+               globalIdx += inStripIdx * groupWidth;
+            else
+               globalIdx += inStripIdx;
+            for( IndexType j = 0; j < groupWidth && compute; j++ )
+            {
+               // fetch gets the segment index, local index, global index and the compute flag
+               // through FetchLambdaAdapter; setting compute to false ends the reduction of
+               // this segment. In row-major order the elements of a segment are adjacent,
+               // otherwise they are groupHeight apart.
+               aux = reduction( aux, details::FetchLambdaAdapter< IndexType, Fetch >::call( fetch, segmentIdx, localIdx++, globalIdx, compute ) );
+               if( RowMajorOrder )
+                  globalIdx ++;
+               else
+                  globalIdx += groupHeight;
+            }
+            globalIdx = globalIdxBack + groupSize;
+            groupHeight /= 2;
+         }
+         keeper( segmentIdx, aux );
+      }
+   if( std::is_same< DeviceType, Devices::Cuda >::value )
+   {
+#ifdef HAVE_CUDA
+      constexpr int BlockDim = 256;//getWarpSize();
+      dim3 cudaBlockSize = BlockDim;
+      const IndexType stripsCount = roundUpDivision( last - first, getWarpSize() );
+      const IndexType cudaBlocks = roundUpDivision( stripsCount * getWarpSize(), cudaBlockSize.x );
+      const IndexType cudaGrids = roundUpDivision( cudaBlocks, Cuda::getMaxGridSize() );
+      IndexType sharedMemory = 0;
+      if( ! RowMajorOrder )
+         sharedMemory = cudaBlockSize.x * sizeof( RealType );
+
+      for( IndexType gridIdx = 0; gridIdx < cudaGrids; gridIdx++ )
+      {
+         dim3 cudaGridSize = Cuda::getMaxGridSize();
+         if( gridIdx == cudaGrids - 1 )
+            cudaGridSize.x = cudaBlocks - gridIdx * Cuda::getMaxGridSize(); // remaining blocks; never zero, unlike cudaBlocks % getMaxGridSize()
+         details::BiEllpackSegmentsReductionKernel< ViewType, IndexType, Fetch, Reduction, ResultKeeper, Real, BlockDim, Args...  >
+            <<< cudaGridSize, cudaBlockSize, sharedMemory  >>>
+            ( *this, gridIdx, first, last, fetch, reduction, keeper, zero, args... );
+         cudaDeviceSynchronize();
+         TNL_CHECK_CUDA_DEVICE;
+      }
+#endif
+   }
+}
+
+/**
+ * \brief Calls segmentsReduction for all segments, i.e. for the range [ 0, getSegmentsCount() ).
+ */
+template< typename Device, typename Index, bool RowMajorOrder, int WarpSize >
+   template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args >
+void
+BiEllpackView< Device, Index, RowMajorOrder, WarpSize >::
+allReduction( Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const
+{
+   segmentsReduction( 0, getSegmentsCount(), fetch, reduction, keeper, zero, args... );
+}
+
+/**
+ * \brief Copies the sizes of \e source and binds both array views to the views of \e source.
+ */
+template< typename Device, typename Index, bool RowMajorOrder, int WarpSize >
+BiEllpackView< Device, Index, RowMajorOrder, WarpSize >&
+BiEllpackView< Device, Index, RowMajorOrder, WarpSize >::
+operator=( const BiEllpackView& source )
+{
+   size = source.size;
+   storageSize = source.storageSize;
+   virtualRows = source.virtualRows;
+   rowPermArray.bind( source.rowPermArray );
+   groupPointers.bind( source.groupPointers );
+   return *this;
+}
+
+/**
+ * \brief Saves size, storageSize and virtualRows followed by the row permutation and the group pointers to \e file.
+ */
+template< typename Device, typename Index, bool RowMajorOrder, int WarpSize >
+void
+BiEllpackView< Device, Index, RowMajorOrder, WarpSize >::
+save( File& file ) const
+{
+   file.save( &size );
+   file.save( &storageSize );
+   file.save( &virtualRows );
+   file << rowPermArray;
+   file << groupPointers;
+}
+
+/**
+ * \brief Prints, strip by strip, the size, width and height of every group to \e str.
+ */
+template< typename Device, typename Index, bool RowMajorOrder, int WarpSize >
+void
+BiEllpackView< Device, Index, RowMajorOrder, WarpSize >::
+printStructure( std::ostream& str ) const
+{
+   const IndexType groupsPerStrip = getLogWarpSize() + 1;
+   const IndexType stripsCount = roundUpDivision( this->getSize(), getWarpSize() );
+   for( IndexType stripIdx = 0; stripIdx < stripsCount; stripIdx++ )
+   {
+      str << "Strip: " << stripIdx << std::endl;
+      IndexType groupIdx = stripIdx * groupsPerStrip;
+      // Heights go getWarpSize(), getWarpSize() / 2, ... as the groups advance.
+      for( IndexType groupHeight = getWarpSize(), group = 0; group < groupsPerStrip; group++, groupIdx++, groupHeight /= 2 )
+      {
+         const IndexType groupBegin = groupPointers.getElement( groupIdx );
+         const IndexType groupSize = groupPointers.getElement( groupIdx + 1 ) - groupBegin;
+         const IndexType groupWidth = groupSize / groupHeight;
+         str << "\tGroup: " << groupIdx << " size = " << groupSize << " width = " << groupWidth << " height = " << groupHeight << std::endl;
+      }
+   }
+}
+
+#ifdef HAVE_CUDA
+/**
+ * \brief Device code reducing one segment per thread; \e fetch is called with
+ * the segment index, local index, global index and the \e compute flag.
+ *
+ * The thread handling the segment walks through the active groups of its strip
+ * and stops as soon as \e fetch sets \e compute to false.
+ */
+template< typename Device, typename Index, bool RowMajorOrder, int WarpSize >
+   template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real, int BlockDim, typename... Args >
+__device__
+void
+BiEllpackView< Device, Index, RowMajorOrder, WarpSize >::
+segmentsReductionKernelWithAllParameters( IndexType gridIdx,
+                                          IndexType first,
+                                          IndexType last,
+                                          Fetch fetch,
+                                          Reduction reduction,
+                                          ResultKeeper keeper,
+                                          Real zero,
+                                          Args... args ) const
+{
+   using RealType = decltype( fetch( IndexType(), IndexType(), IndexType(), std::declval< bool& >(), args... ) );
+   // Index of the segment processed by this thread.
+   const IndexType segmentIdx = ( gridIdx * Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x + first;
+   if( segmentIdx >= last )
+      return;
+
+   const IndexType strip = segmentIdx / getWarpSize();
+   const IndexType firstGroupInStrip = strip * ( getLogWarpSize() + 1 );
+   const IndexType rowStripPerm = rowPermArray[ segmentIdx ] - strip * getWarpSize();
+   const IndexType groupsCount = details::BiEllpack< IndexType, DeviceType, RowMajorOrder, getWarpSize() >::getActiveGroupsCountDirect( rowPermArray, segmentIdx );
+   IndexType groupHeight = getWarpSize();
+   bool compute( true );
+   IndexType localIdx( 0 );
+   RealType result( zero );
+   for( IndexType groupIdx = firstGroupInStrip; groupIdx < firstGroupInStrip + groupsCount && compute; groupIdx++ )
+   {
+      IndexType groupOffset = groupPointers[ groupIdx ];
+      const IndexType groupSize = groupPointers[ groupIdx + 1 ] - groupOffset;
+      if( groupSize )
+      {
+         const IndexType groupWidth = groupSize / groupHeight;
+         for( IndexType i = 0; i < groupWidth && compute; i++ )
+         {
+            if( RowMajorOrder )
+               result = reduction( result, fetch( segmentIdx, localIdx, groupOffset + rowStripPerm * groupWidth + i, compute ) );
+            else
+               result = reduction( result, fetch( segmentIdx, localIdx, groupOffset + rowStripPerm + i * groupHeight, compute ) );
+            localIdx++;
+         }
+      }
+      groupHeight /= 2;
+   }
+   keeper( segmentIdx, result );
+}
+
+template< typename Device,
+          typename Index,
+          bool RowMajorOrder,
+          int WarpSize >
+   template< typename Fetch,
+             typename Reduction,
+             typename ResultKeeper,
+             typename Real,
+             int BlockDim,
+             typename... Args >
+__device__
+void // Device code of the segments reduction for fetch functions called as fetch( globalIdx, compute ).
+BiEllpackView< Device, Index, RowMajorOrder, WarpSize >::
+segmentsReductionKernel( IndexType gridIdx,
+                         IndexType first,
+                         IndexType last,
+                         Fetch fetch,
+                         Reduction reduction,
+                         ResultKeeper keeper,
+                         Real zero,
+                         Args... args ) const
+{
+   using RealType = decltype( fetch( IndexType(), std::declval< bool& >(), args... ) );
+   Index segmentIdx = ( gridIdx * Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x + first;
+
+   const IndexType strip = segmentIdx >> getLogWarpSize(); // strip the thread's segment falls into
+   const IndexType warpStart = strip << getLogWarpSize(); // first segment of that strip
+   const IndexType inWarpIdx = segmentIdx & ( getWarpSize() - 1 ); // position of the segment within the strip
+
+   if( warpStart >= last ) // NOTE(review): threads returning here never reach the __syncthreads() calls below - confirm this is safe.
+      return;
+
+   IndexType groupHeight = getWarpSize();
+   IndexType firstGroupIdx = strip * ( getLogWarpSize() + 1 );
+
+   __shared__ RealType results[ BlockDim ]; // one partial result per thread of the block
+   results[ threadIdx.x ] = zero;
+   __shared__ IndexType sharedGroupPointers[ 7 ]; // TODO: getLogWarpSize() + 1 ];
+   // NOTE(review): only threads 0 .. getLogWarpSize() + 1 of the block fill the pointers, using their own strip; with BlockDim larger than the warp size the other warps presumably read the pointers of a different strip - confirm.
+   if( threadIdx.x <= getLogWarpSize() + 1 )
+      sharedGroupPointers[ threadIdx.x ] = this->groupPointers[ firstGroupIdx + threadIdx.x ];
+   __syncthreads();
+
+   bool compute( true );
+   if( RowMajorOrder )
+   {
+      for( IndexType group = 0; group < getLogWarpSize() + 1; group++ )
+      {
+         IndexType groupBegin = sharedGroupPointers[ group ];
+         IndexType groupEnd = sharedGroupPointers[ group + 1 ];
+         if( groupEnd - groupBegin > 0 )
+         {
+
+               if( inWarpIdx < groupHeight ) // only the first groupHeight rows of the strip take part in this group
+               {
+                  const IndexType groupWidth = ( groupEnd - groupBegin ) / groupHeight;
+                  IndexType globalIdx = groupBegin + inWarpIdx * groupWidth;
+                  for( IndexType i = 0; i < groupWidth && compute; i++ )
+                     results[ threadIdx.x ] = reduction( results[ threadIdx.x ], fetch( globalIdx++, compute ) );
+               }
+            }
+         groupHeight >>= 1;
+      }
+   }
+   else
+   {
+      RealType* temp = Cuda::getSharedMemory< RealType >(); // presumably the dynamic shared memory requested by segmentsReduction - confirm
+      for( IndexType group = 0; group < getLogWarpSize() + 1; group++ )
+      {
+         IndexType groupBegin = sharedGroupPointers[ group ];
+         IndexType groupEnd = sharedGroupPointers[ group + 1 ];
+         if( groupEnd - groupBegin > 0 )
+         {
+            temp[ threadIdx.x ] = zero;
+            IndexType globalIdx = groupBegin + inWarpIdx;
+            while( globalIdx < groupEnd ) // each lane walks the group with the stride getWarpSize()
+            {
+               temp[ threadIdx.x ] = reduction( temp[ threadIdx.x ], fetch( globalIdx, compute ) );
+               globalIdx += getWarpSize();
+            }
+            // TODO: reduction via templates
+            IndexType bisection2 = getWarpSize();
+            for( IndexType i = 0; i < group; i++ ) // NOTE(review): no barrier separates these shared-memory writes and the reads of other lanes - presumably relies on warp-synchronous execution; confirm.
+            {
+               bisection2 >>= 1;
+               if( inWarpIdx < bisection2 )
+                  temp[ threadIdx.x ] = reduction( temp[ threadIdx.x ], temp[ threadIdx.x + bisection2 ] );
+            }
+            if( inWarpIdx < groupHeight )
+               results[ threadIdx.x ] = reduction( results[ threadIdx.x ], temp[ threadIdx.x ] );
+         }
+         groupHeight >>= 1;
+      }
+   }
+   __syncthreads();
+   if( warpStart + inWarpIdx >= last )
+      return;
+
+   keeper( warpStart + inWarpIdx, results[ this->rowPermArray[ warpStart + inWarpIdx ] & ( blockDim.x - 1 ) ] ); // the result is read from the slot selected by the row permutation
+}
+#endif
+
+      } // namespace Segments
+   }  // namespace Containers
+} // namespace TNL
diff --git a/src/TNL/Containers/Segments/CSR.h b/src/TNL/Containers/Segments/CSR.h
index 89cad0c6af3d80d9b9b78d336f4dfb95ff69cfc6..c5c0ce68f5c7a5e6fc55d36b1a7410175b325968 100644
--- a/src/TNL/Containers/Segments/CSR.h
+++ b/src/TNL/Containers/Segments/CSR.h
@@ -28,13 +28,13 @@ class CSR
    public:
 
       using DeviceType = Device;
-      using IndexType = Index;
-      using OffsetsHolder = Containers::Vector< IndexType, DeviceType, typename std::remove_const< IndexType >::type, IndexAllocator >;
+      using IndexType = std::remove_const_t< Index >;
+      using OffsetsHolder = Containers::Vector< Index, DeviceType, IndexType, IndexAllocator >;
       using SegmentsSizes = OffsetsHolder;
       template< typename Device_, typename Index_ >
       using ViewTemplate = CSRView< Device_, Index_ >;
       using ViewType = CSRView< Device, Index >;
-      using ConstViewType = CSRView< Device, std::add_const_t< Index > >;
+      using ConstViewType = CSRView< Device, std::add_const_t< IndexType > >;
       using SegmentViewType = SegmentView< IndexType, true >;
 
       CSR();
@@ -55,12 +55,14 @@ class CSR
       template< typename SizesHolder = OffsetsHolder >
       void setSegmentsSizes( const SizesHolder& sizes );
 
+      void reset();
+
       ViewType getView();
 
-      ConstViewType getConstView() const;
+      const ConstViewType getConstView() const;
 
       /**
-       * \brief Number segments.
+       * \brief Number of segments.
        */
       __cuda_callable__
       IndexType getSegmentsCount() const;
@@ -108,10 +110,10 @@ class CSR
        * \brief Go over all segments and perform a reduction in each of them.
        */
       template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args >
-      void segmentsReduction( IndexType first, IndexType last, Fetch& fetch, Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const;
+      void segmentsReduction( IndexType first, IndexType last, Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const;
 
       template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args >
-      void allReduction( Fetch& fetch, Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const;
+      void allReduction( Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const;
 
       CSR& operator=( const CSR& rhsSegments ) = default;
 
diff --git a/src/TNL/Containers/Segments/CSR.hpp b/src/TNL/Containers/Segments/CSR.hpp
index 9a948b04e2dac7311c6eab9f4149cf779256c59f..685f6ef54a6d7ad90ec69e5d45d83d78c0e1f337 100644
--- a/src/TNL/Containers/Segments/CSR.hpp
+++ b/src/TNL/Containers/Segments/CSR.hpp
@@ -85,6 +85,18 @@ setSegmentsSizes( const SizesHolder& sizes )
    details::CSR< Device, Index >::setSegmentsSizes( sizes, this->offsets );
 }
 
+template< typename Device,
+          typename Index,
+          typename IndexAllocator >
+void
+CSR< Device, Index, IndexAllocator >::
+reset()  // Bring the segments back to the empty state (zero segments).
+{
+   this->offsets.setSize( 1 );  // one offset left, so getSegmentsCount() (= offsets.getSize() - 1) yields 0
+   this->offsets = 0;           // presumably assigns 0 to that single offset -- TODO confirm Vector's scalar assignment
+}
+
+
 template< typename Device,
           typename Index,
           typename IndexAllocator >
@@ -98,9 +110,9 @@ getView()
 template< typename Device,
           typename Index,
           typename IndexAllocator >
-typename CSR< Device, Index, IndexAllocator >::ConstViewType
+auto
 CSR< Device, Index, IndexAllocator >::
-getConstView() const
+getConstView() const -> const ConstViewType
 {
    return ConstViewType( this->offsets.getConstView() );
 }
@@ -108,10 +120,8 @@ getConstView() const
 template< typename Device,
           typename Index,
           typename IndexAllocator >
-__cuda_callable__
-Index
-CSR< Device, Index, IndexAllocator >::
-getSegmentsCount() const
+__cuda_callable__ auto CSR< Device, Index, IndexAllocator >::
+getSegmentsCount() const -> IndexType
 {
    return this->offsets.getSize() - 1;
 }
@@ -119,10 +129,8 @@ getSegmentsCount() const
 template< typename Device,
           typename Index,
           typename IndexAllocator >
-__cuda_callable__
-Index
-CSR< Device, Index, IndexAllocator >::
-getSegmentSize( const IndexType segmentIdx ) const
+__cuda_callable__ auto CSR< Device, Index, IndexAllocator >::
+getSegmentSize( const IndexType segmentIdx ) const -> IndexType
 {
    return details::CSR< Device, Index >::getSegmentSize( this->offsets, segmentIdx );
 }
@@ -130,10 +138,8 @@ getSegmentSize( const IndexType segmentIdx ) const
 template< typename Device,
           typename Index,
           typename IndexAllocator >
-__cuda_callable__
-Index
-CSR< Device, Index, IndexAllocator >::
-getSize() const
+__cuda_callable__ auto CSR< Device, Index, IndexAllocator >::
+getSize() const -> IndexType
 {
    return this->getStorageSize();
 }
@@ -141,10 +147,8 @@ getSize() const
 template< typename Device,
           typename Index,
           typename IndexAllocator >
-__cuda_callable__
-Index
-CSR< Device, Index, IndexAllocator >::
-getStorageSize() const
+__cuda_callable__ auto CSR< Device, Index, IndexAllocator >::
+getStorageSize() const -> IndexType
 {
    return details::CSR< Device, Index >::getStorageSize( this->offsets );
 }
@@ -152,10 +156,8 @@ getStorageSize() const
 template< typename Device,
           typename Index,
           typename IndexAllocator >
-__cuda_callable__
-Index
-CSR< Device, Index, IndexAllocator >::
-getGlobalIndex( const Index segmentIdx, const Index localIdx ) const
+__cuda_callable__ auto CSR< Device, Index, IndexAllocator >::
+getGlobalIndex( const Index segmentIdx, const Index localIdx ) const -> IndexType
 {
    if( ! std::is_same< DeviceType, Devices::Host >::value )
    {
@@ -197,16 +199,7 @@ void
 CSR< Device, Index, IndexAllocator >::
 forSegments( IndexType first, IndexType last, Function& f, Args... args ) const
 {
-   const auto offsetsView = this->offsets.getConstView();
-   auto l = [=] __cuda_callable__ ( const IndexType segmentIdx, Args... args ) mutable {
-      const IndexType begin = offsetsView[ segmentIdx ];
-      const IndexType end = offsetsView[ segmentIdx + 1 ];
-      IndexType localIdx( 0 );
-      for( IndexType globalIdx = begin; globalIdx < end; globalIdx++  )
-         if( ! f( segmentIdx, localIdx++, globalIdx, args... ) )
-            break;
-   };
-   Algorithms::ParallelFor< Device >::exec( first, last, l, args... );
+   this->getConstView().forSegments( first, last, f, args... );  // the container no longer loops itself; it forwards to its const view
 }
 
 template< typename Device,
@@ -226,21 +219,9 @@ template< typename Device,
    template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args >
 void
 CSR< Device, Index, IndexAllocator >::
-segmentsReduction( IndexType first, IndexType last, Fetch& fetch, Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const
+segmentsReduction( IndexType first, IndexType last, Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const
 {
-   using RealType = decltype( fetch( IndexType(), IndexType(), IndexType(), std::declval< bool& >(), args... ) );
-   const auto offsetsView = this->offsets.getConstView();
-   auto l = [=] __cuda_callable__ ( const IndexType i, Args... args ) mutable {
-      const IndexType begin = offsetsView[ i ];
-      const IndexType end = offsetsView[ i + 1 ];
-      RealType aux( zero );
-      bool compute( true );
-      IndexType localIdx( 0 );
-      for( IndexType j = begin; j < end && compute; j++  )
-         reduction( aux, fetch( i, localIdx++, j, compute, args... ) );
-      keeper( i, aux );
-   };
-   Algorithms::ParallelFor< Device >::exec( first, last, l, args... );
+   this->getConstView().segmentsReduction( first, last, fetch, reduction, keeper, zero, args... );  // per-segment loop now lives only in CSRView::segmentsReduction
 }
 
 template< typename Device,
@@ -249,7 +230,7 @@ template< typename Device,
    template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args >
 void
 CSR< Device, Index, IndexAllocator >::
-allReduction( Fetch& fetch, Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const
+allReduction( Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const
 {
    this->segmentsReduction( 0, this->getSegmentsCount(), fetch, reduction, keeper, zero, args... );
 }
@@ -279,7 +260,7 @@ save( File& file ) const
 template< typename Device,
           typename Index,
           typename IndexAllocator >
-void
+void 
 CSR< Device, Index, IndexAllocator >::
 load( File& file )
 {
diff --git a/src/TNL/Containers/Segments/CSRView.h b/src/TNL/Containers/Segments/CSRView.h
index 4e53bd204f35ac301b3382d6e2e43da19095af41..b00b012d42df466c9760ce8973201da31af41ae7 100644
--- a/src/TNL/Containers/Segments/CSRView.h
+++ b/src/TNL/Containers/Segments/CSRView.h
@@ -26,9 +26,9 @@ class CSRView
    public:
 
       using DeviceType = Device;
-      using IndexType = Index;
-      using OffsetsView = typename Containers::VectorView< IndexType, DeviceType, typename std::remove_const< IndexType >::type >;
-      using ConstOffsetsView = typename Containers::Vector< IndexType, DeviceType, typename std::remove_const< IndexType >::type >::ConstViewType;
+      using IndexType = std::remove_const_t< Index >;
+      using OffsetsView = typename Containers::VectorView< Index, DeviceType, IndexType >;
+      using ConstOffsetsView = typename Containers::Vector< Index, DeviceType,IndexType >::ConstViewType;
       using ViewType = CSRView;
       template< typename Device_, typename Index_ >
       using ViewTemplate = CSRView< Device_, Index_ >;
@@ -58,7 +58,7 @@ class CSRView
       ViewType getView();
 
       __cuda_callable__
-      ConstViewType getConstView() const;
+      const ConstViewType getConstView() const;
 
       /**
        * \brief Number segments.
@@ -110,10 +110,10 @@ class CSRView
        * \brief Go over all segments and perform a reduction in each of them.
        */
       template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args >
-      void segmentsReduction( IndexType first, IndexType last, Fetch& fetch, Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const;
+      void segmentsReduction( IndexType first, IndexType last, Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const;
 
       template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args >
-      void allReduction( Fetch& fetch, Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const;
+      void allReduction( Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const;
 
       CSRView& operator=( const CSRView& view );
 
diff --git a/src/TNL/Containers/Segments/CSRView.hpp b/src/TNL/Containers/Segments/CSRView.hpp
index b94db8c88b5df8b14c719e099a7cc525682e6d45..7599327d1085f40a3cbedb24b297dc8300f202fa 100644
--- a/src/TNL/Containers/Segments/CSRView.hpp
+++ b/src/TNL/Containers/Segments/CSRView.hpp
@@ -14,6 +14,7 @@
 #include <TNL/Algorithms/ParallelFor.h>
 #include <TNL/Containers/Segments/CSRView.h>
 #include <TNL/Containers/Segments/details/CSR.h>
+#include <TNL/Containers/Segments/details/LambdaAdapter.h>
 
 namespace TNL {
    namespace Containers {
@@ -95,59 +96,49 @@ getView()
 template< typename Device,
           typename Index >
 __cuda_callable__
-typename CSRView< Device, Index >::ConstViewType
+auto
 CSRView< Device, Index >::
-getConstView() const
+getConstView() const -> const ConstViewType
 {
    return ConstViewType( this->offsets.getConstView() );
 }
 
 template< typename Device,
           typename Index >
-__cuda_callable__
-Index
-CSRView< Device, Index >::
-getSegmentsCount() const
+__cuda_callable__ auto CSRView< Device, Index >::
+getSegmentsCount() const -> IndexType
 {
    return this->offsets.getSize() - 1;
 }
 
 template< typename Device,
           typename Index >
-__cuda_callable__
-Index
-CSRView< Device, Index >::
-getSegmentSize( const IndexType segmentIdx ) const
+__cuda_callable__ auto CSRView< Device, Index >::
+getSegmentSize( const IndexType segmentIdx ) const -> IndexType
 {
    return details::CSR< Device, Index >::getSegmentSize( this->offsets, segmentIdx );
 }
 
 template< typename Device,
           typename Index >
-__cuda_callable__
-Index
-CSRView< Device, Index >::
-getSize() const
+__cuda_callable__ auto CSRView< Device, Index >::
+getSize() const -> IndexType
 {
    return this->getStorageSize();
 }
 
 template< typename Device,
           typename Index >
-__cuda_callable__
-Index
-CSRView< Device, Index >::
-getStorageSize() const
+__cuda_callable__ auto CSRView< Device, Index >::
+getStorageSize() const -> IndexType
 {
    return details::CSR< Device, Index >::getStorageSize( this->offsets );
 }
 
 template< typename Device,
           typename Index >
-__cuda_callable__
-Index
-CSRView< Device, Index >::
-getGlobalIndex( const Index segmentIdx, const Index localIdx ) const
+__cuda_callable__ auto CSRView< Device, Index >::
+getGlobalIndex( const Index segmentIdx, const Index localIdx ) const -> IndexType
 {
    if( ! std::is_same< DeviceType, Devices::Host >::value )
    {
@@ -213,19 +204,19 @@ template< typename Device,
    template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args >
 void
 CSRView< Device, Index >::
-segmentsReduction( IndexType first, IndexType last, Fetch& fetch, Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const
+segmentsReduction( IndexType first, IndexType last, Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const
 {
-   using RealType = decltype( fetch( IndexType(), IndexType(), IndexType(), std::declval< bool& >(), args... ) );
+   using RealType = typename details::FetchLambdaAdapter< IndexType, Fetch >::ReturnType;  // same adapter instantiation as the call below (IndexType, not the possibly-const Index)
    const auto offsetsView = this->offsets.getConstView();
-   auto l = [=] __cuda_callable__ ( const IndexType i, Args... args ) mutable {
-      const IndexType begin = offsetsView[ i ];
-      const IndexType end = offsetsView[ i + 1 ];
+   auto l = [=] __cuda_callable__ ( const IndexType segmentIdx, Args... args ) mutable {  // executed by ParallelFor below; presumably once per segment index in [first, last)
+      const IndexType begin = offsetsView[ segmentIdx ];    // global index of the segment's first element
+      const IndexType end = offsetsView[ segmentIdx + 1 ];  // one past the segment's last element
       RealType aux( zero );
       IndexType localIdx( 0 );
       bool compute( true );
-      for( IndexType j = begin; j < end && compute; j++  )
-         reduction( aux, fetch( i, localIdx++, j, compute, args... ) );
-      keeper( i, aux );
+      for( IndexType globalIdx = begin; globalIdx < end && compute; globalIdx++  )  // NOTE(review): args... is no longer forwarded to fetch -- confirm this is intended
+         aux = reduction( aux, details::FetchLambdaAdapter< IndexType, Fetch >::call( fetch, segmentIdx, localIdx++, globalIdx, compute ) );  // reduction's return value is now the accumulator
+      keeper( segmentIdx, aux );  // hand the reduced value of this segment to the caller's keeper
    };
    Algorithms::ParallelFor< Device >::exec( first, last, l, args... );
 }
@@ -235,7 +226,7 @@ template< typename Device,
    template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args >
 void
 CSRView< Device, Index >::
-allReduction( Fetch& fetch, Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const
+allReduction( Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const
 {
    this->segmentsReduction( 0, this->getSegmentsCount(), fetch, reduction, keeper, zero, args... );
 }
diff --git a/src/TNL/Containers/Segments/ChunkedEllpack.h b/src/TNL/Containers/Segments/ChunkedEllpack.h
index c6c7812dba1ae98d9a75618bc5d4df93510c3a30..dd4805887d7fd4b8c19bd79c162deaaaa45ed669 100644
--- a/src/TNL/Containers/Segments/ChunkedEllpack.h
+++ b/src/TNL/Containers/Segments/ChunkedEllpack.h
@@ -28,13 +28,13 @@ class ChunkedEllpack
    public:
 
       using DeviceType = Device;
-      using IndexType = Index;
-      using OffsetsHolder = Containers::Vector< IndexType, DeviceType, typename std::remove_const< IndexType >::type, IndexAllocator >;
+      using IndexType = std::remove_const_t< Index >;
+      using OffsetsHolder = Containers::Vector< Index, DeviceType, IndexType, IndexAllocator >;
       static constexpr bool getRowMajorOrder() { return RowMajorOrder; }
       using ViewType = ChunkedEllpackView< Device, Index, RowMajorOrder >;
       template< typename Device_, typename Index_ >
       using ViewTemplate = ChunkedEllpackView< Device_, Index_, RowMajorOrder >;
-      using ConstViewType = ChunkedEllpackView< Device, std::add_const_t< Index >, RowMajorOrder >;
+      using ConstViewType = ChunkedEllpackView< Device, std::add_const_t< IndexType >, RowMajorOrder >;
       using SegmentViewType = ChunkedEllpackSegmentView< IndexType, RowMajorOrder >;
       using ChunkedEllpackSliceInfoType = details::ChunkedEllpackSliceInfo< IndexType >;
       //TODO: using ChunkedEllpackSliceInfoAllocator = typename IndexAllocatorType::retype< ChunkedEllpackSliceInfoType >;
@@ -55,7 +55,13 @@ class ChunkedEllpack
 
       ViewType getView();
 
-      ConstViewType getConstView() const;
+      const ConstViewType getConstView() const;
+
+      /**
+       * \brief Number of segments.
+       */
+      __cuda_callable__
+      IndexType getSegmentsCount() const;
 
       /**
        * \brief Set sizes of particular segments.
@@ -63,8 +69,7 @@ class ChunkedEllpack
       template< typename SizesHolder = OffsetsHolder >
       void setSegmentsSizes( const SizesHolder& sizes );
 
-      __cuda_callable__
-      IndexType getSegmentsCount() const;
+      void reset();
 
       IndexType getSegmentSize( const IndexType segmentIdx ) const;
 
@@ -100,10 +105,10 @@ class ChunkedEllpack
        * \brief Go over all segments and perform a reduction in each of them.
        */
       template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args >
-      void segmentsReduction( IndexType first, IndexType last, Fetch& fetch, Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const;
+      void segmentsReduction( IndexType first, IndexType last, Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const;
 
       template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args >
-      void allReduction( Fetch& fetch, Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const;
+      void allReduction( Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const;
 
       ChunkedEllpack& operator=( const ChunkedEllpack& source ) = default;
 
@@ -150,7 +155,7 @@ class ChunkedEllpack
 
       ChunkedEllpackSliceInfoContainer slices;
 
-      IndexType numberOfSlices;
+      IndexType numberOfSlices = 0;
 
       template< typename Device_, typename Index_, typename IndexAllocator_, bool RowMajorOrder_ >
       friend class ChunkedEllpack;
diff --git a/src/TNL/Containers/Segments/ChunkedEllpack.hpp b/src/TNL/Containers/Segments/ChunkedEllpack.hpp
index 6d0cf6fe7ea1d517feac4edcd4071a22c28f84ae..005b22a786853ffce68ecaa9655bfe610d8da041 100644
--- a/src/TNL/Containers/Segments/ChunkedEllpack.hpp
+++ b/src/TNL/Containers/Segments/ChunkedEllpack.hpp
@@ -25,7 +25,6 @@ template< typename Device,
           bool RowMajorOrder >
 ChunkedEllpack< Device, Index, IndexAllocator, RowMajorOrder >::
 ChunkedEllpack( const Vector< IndexType, DeviceType, IndexType >& sizes )
-   : size( 0 ), storageSize( 0 ), chunksInSlice( 0 ), desiredChunkSize( 0 )
 {
    this->setSegmentsSizes( sizes );
 }
@@ -41,7 +40,7 @@ ChunkedEllpack( const ChunkedEllpack& chunkedEllpack )
      chunksInSlice( chunkedEllpack.chunksInSlice ), 
      desiredChunkSize( chunkedEllpack.desiredChunkSize ),
      rowToChunkMapping( chunkedEllpack.rowToChunkMapping ),
-     rowToSliceMapping( chunkedEllpack.rowTopSliceMapping ),
+     rowToSliceMapping( chunkedEllpack.rowToSliceMapping ),
      chunksToSegmentsMapping( chunkedEllpack. chunksToSegmentsMapping ),
      rowPointers( chunkedEllpack.rowPointers ),
      slices( chunkedEllpack.slices ),
@@ -111,9 +110,8 @@ template< typename Device,
           typename Index,
           typename IndexAllocator,
           bool RowMajorOrder >
-typename ChunkedEllpack< Device, Index, IndexAllocator, RowMajorOrder >::ConstViewType
-ChunkedEllpack< Device, Index, IndexAllocator, RowMajorOrder >::
-getConstView() const
+auto ChunkedEllpack< Device, Index, IndexAllocator, RowMajorOrder >::
+getConstView() const -> const ConstViewType
 {
    return ConstViewType( size, storageSize, chunksInSlice, desiredChunkSize,
                          rowToChunkMapping.getConstView(),
@@ -224,7 +222,6 @@ setSlice( SegmentsSizes& rowLengths,
       maxChunkInSlice = TNL::max( maxChunkInSlice,
                               roundUpDivision( rowLengths[ i ], this->rowToChunkMapping[ i ] ) );
    }
-   TNL_ASSERT_GT( maxChunkInSlice, 0, "" );
 
    /****
     * Set-up the slice info.
@@ -296,7 +293,8 @@ setSegmentsSizes( const SizesHolder& segmentsSizes )
    else
    {
       ChunkedEllpack< Devices::Host, Index, typename Allocators::Default< Devices::Host >::template Allocator< Index >, RowMajorOrder > hostSegments;
-      Containers::Vector< IndexType, Devices::Host, IndexType > hostSegmentsSizes( segmentsSizes );
+      Containers::Vector< IndexType, Devices::Host, IndexType > hostSegmentsSizes;  // host-side staging vector for the requested sizes
+      hostSegmentsSizes = segmentsSizes;  // NOTE(review): assignment replaces the former converting constructor -- presumably to accept any SizesHolder type/device; confirm
       hostSegments.setSegmentsSizes( hostSegmentsSizes );
       *this = hostSegments;
    }
@@ -306,21 +304,36 @@ template< typename Device,
           typename Index,
           typename IndexAllocator,
           bool RowMajorOrder >
-__cuda_callable__
-Index
+void
 ChunkedEllpack< Device, Index, IndexAllocator, RowMajorOrder >::
-getSegmentsCount() const
+reset()  // Zero the counters and reset every mapping container of this object.
 {
-   return this->segmentsCount;
+   this->size = 0;         // getSegmentsCount() and getSize() both return this member
+   this->storageSize = 0;  // value returned by getStorageSize()
+   this->rowToSliceMapping.reset();        // presumably releases the container's storage -- TODO confirm reset() semantics
+   this->rowToChunkMapping.reset();
+   this->chunksToSegmentsMapping.reset();
+   this->rowPointers.reset();
+   this->slices.reset();
+   this->numberOfSlices = 0;  // NOTE(review): chunksInSlice / desiredChunkSize are not touched here -- confirm they are meant to survive reset()
 }
 
 template< typename Device,
           typename Index,
           typename IndexAllocator,
           bool RowMajorOrder >
-Index
-ChunkedEllpack< Device, Index, IndexAllocator, RowMajorOrder >::
-getSegmentSize( const IndexType segmentIdx ) const
+__cuda_callable__ auto ChunkedEllpack< Device, Index, IndexAllocator, RowMajorOrder >::
+getSegmentsCount() const -> IndexType  // trailing return type names the non-const IndexType alias
+{
+   return this->size;  // the removed definition returned this->segmentsCount instead
+}
+
+template< typename Device,
+          typename Index,
+          typename IndexAllocator,
+          bool RowMajorOrder >
+auto ChunkedEllpack< Device, Index, IndexAllocator, RowMajorOrder >::
+getSegmentSize( const IndexType segmentIdx ) const -> IndexType  // not marked __cuda_callable__, unlike getSegmentsCount() above
 {
    return details::ChunkedEllpack< IndexType, DeviceType, RowMajorOrder >::getSegmentSize(
       rowToSliceMapping.getView(),
@@ -333,10 +346,8 @@ template< typename Device,
           typename Index,
           typename IndexAllocator,
           bool RowMajorOrder >
-__cuda_callable__
-Index
-ChunkedEllpack< Device, Index, IndexAllocator, RowMajorOrder >::
-getSize() const
+__cuda_callable__ auto ChunkedEllpack< Device, Index, IndexAllocator, RowMajorOrder >::
+getSize() const -> IndexType
 {
    return this->size;
 }
@@ -345,10 +356,8 @@ template< typename Device,
           typename Index,
           typename IndexAllocator,
           bool RowMajorOrder >
-__cuda_callable__
-Index
-ChunkedEllpack< Device, Index, IndexAllocator, RowMajorOrder >::
-getStorageSize() const
+__cuda_callable__ auto ChunkedEllpack< Device, Index, IndexAllocator, RowMajorOrder >::
+getStorageSize() const -> IndexType
 {
    return this->storageSize;
 }
@@ -357,10 +366,8 @@ template< typename Device,
           typename Index,
           typename IndexAllocator,
           bool RowMajorOrder >
-__cuda_callable__
-Index
-ChunkedEllpack< Device, Index, IndexAllocator, RowMajorOrder >::
-getGlobalIndex( const Index segmentIdx, const Index localIdx ) const
+__cuda_callable__ auto ChunkedEllpack< Device, Index, IndexAllocator, RowMajorOrder >::
+getGlobalIndex( const Index segmentIdx, const Index localIdx ) const -> IndexType
 {
       return details::ChunkedEllpack< IndexType, DeviceType, RowMajorOrder >::getGlobalIndex(
          rowToSliceMapping,
@@ -375,9 +382,7 @@ template< typename Device,
           typename Index,
           typename IndexAllocator,
           bool RowMajorOrder >
-__cuda_callable__
-auto
-ChunkedEllpack< Device, Index, IndexAllocator, RowMajorOrder >::
+__cuda_callable__ auto ChunkedEllpack< Device, Index, IndexAllocator, RowMajorOrder >::
 getSegmentView( const IndexType segmentIdx ) const -> SegmentViewType
 {
 }
@@ -413,7 +418,7 @@ template< typename Device,
    template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args >
 void
 ChunkedEllpack< Device, Index, IndexAllocator, RowMajorOrder >::
-segmentsReduction( IndexType first, IndexType last, Fetch& fetch, Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const
+segmentsReduction( IndexType first, IndexType last, Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const
 {
    this->getConstView().segmentsReduction( first, last, fetch, reduction, keeper, zero, args... );
 }
@@ -425,7 +430,7 @@ template< typename Device,
    template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args >
 void
 ChunkedEllpack< Device, Index, IndexAllocator, RowMajorOrder >::
-allReduction( Fetch& fetch, Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const
+allReduction( Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const
 {
    this->segmentsReduction( 0, this->getSegmentsCount(), fetch, reduction, keeper, zero, args... );
 }
diff --git a/src/TNL/Containers/Segments/ChunkedEllpackSegmentView.h b/src/TNL/Containers/Segments/ChunkedEllpackSegmentView.h
index 9eba9dd6867fe023ba418d70ff2c616e5f1e1e3b..93da55927df70cdda2b913f01d23363271283cff 100644
--- a/src/TNL/Containers/Segments/ChunkedEllpackSegmentView.h
+++ b/src/TNL/Containers/Segments/ChunkedEllpackSegmentView.h
@@ -1,5 +1,5 @@
 /***************************************************************************
-                          ChunkedEllpackChunkedEllpackSegmentView.h -  description
+                          ChunkedEllpackSegmentView.h -  description
                              -------------------
     begin                : Mar 24, 2020
     copyright            : (C) 2020 by Tomas Oberhuber
diff --git a/src/TNL/Containers/Segments/ChunkedEllpackView.h b/src/TNL/Containers/Segments/ChunkedEllpackView.h
index eaf2450b5e3cd9b70ea632026ba3d7b045da121a..a840447b9b45a4b13037b20faeca189c71d9d194 100644
--- a/src/TNL/Containers/Segments/ChunkedEllpackView.h
+++ b/src/TNL/Containers/Segments/ChunkedEllpackView.h
@@ -29,9 +29,9 @@ class ChunkedEllpackView
    public:
 
       using DeviceType = Device;
-      using IndexType = Index;
-      using OffsetsView = typename Containers::VectorView< IndexType, DeviceType, typename std::remove_const< IndexType >::type >;
-      using ConstOffsetsView = typename Containers::Vector< IndexType, DeviceType, typename std::remove_const< IndexType >::type >::ConstViewType;
+      using IndexType = std::remove_const_t< Index >;
+      using OffsetsView = typename Containers::VectorView< Index, DeviceType, IndexType >;
+      using ConstOffsetsView = typename OffsetsView::ConstViewType;
       using ViewType = ChunkedEllpackView;
       template< typename Device_, typename Index_ >
       using ViewTemplate = ChunkedEllpackView< Device_, Index_ >;
@@ -83,10 +83,10 @@ class ChunkedEllpackView
       ViewType getView();
 
       __cuda_callable__
-      ConstViewType getConstView() const;
+      const ConstViewType getConstView() const;
 
       /**
-       * \brief Number segments.
+       * \brief Number of segments.
        */
       __cuda_callable__
       IndexType getSegmentsCount() const;
@@ -132,22 +132,35 @@ class ChunkedEllpackView
        * \brief Go over all segments and perform a reduction in each of them.
        */
       template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args >
-      void segmentsReduction( IndexType first, IndexType last, Fetch& fetch, Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const;
+      void segmentsReduction( IndexType first, IndexType last, Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const;
 
       template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args >
-      void allReduction( Fetch& fetch, Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const;
+      void allReduction( Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const;
 
       ChunkedEllpackView& operator=( const ChunkedEllpackView& view );
 
       void save( File& file ) const;
 
-      void load( File& file );
-
       void printStructure( std::ostream& str ) const;
 
    protected:
 
 #ifdef HAVE_CUDA
+      template< typename Fetch,
+                typename Reduction,
+                typename ResultKeeper,
+                typename Real,
+                typename... Args >
+      __device__  // device-side member; this declaration sits inside the HAVE_CUDA section
+      void segmentsReductionKernelWithAllParameters( IndexType gridIdx,  // presumably the variant for fetch functors taking every index argument -- TODO confirm
+                                                     IndexType first,
+                                                     IndexType last,
+                                                     Fetch fetch,            // functors are taken by value here, unlike the reference parameters of segmentsReduction()
+                                                     Reduction reduction,
+                                                     ResultKeeper keeper,
+                                                     Real zero,
+                                                     Args... args ) const;
+
       template< typename Fetch,
                 typename Reduction,
                 typename ResultKeeper,
@@ -206,6 +219,9 @@ class ChunkedEllpackView
                                                   ResultKeeper_ keeper,
                                                   Real_ zero,
                                                   Args_... args );
+
+      template< typename Index_, typename Fetch_, bool B_ >
+      friend struct details::ChunkedEllpackSegmentsReductionDispatcher;
 #endif
 };
       } // namespace Segements
diff --git a/src/TNL/Containers/Segments/ChunkedEllpackView.hpp b/src/TNL/Containers/Segments/ChunkedEllpackView.hpp
index b3b151624f500282451ff0ea4a643c9011790b6c..19ae6f6723d74bb49c09fe9c3711094271f29706 100644
--- a/src/TNL/Containers/Segments/ChunkedEllpackView.hpp
+++ b/src/TNL/Containers/Segments/ChunkedEllpackView.hpp
@@ -13,36 +13,13 @@
 #include <TNL/Containers/Vector.h>
 #include <TNL/Algorithms/ParallelFor.h>
 #include <TNL/Containers/Segments/ChunkedEllpackView.h>
+#include <TNL/Containers/Segments/details/LambdaAdapter.h>
 //#include <TNL/Containers/Segments/details/ChunkedEllpack.h>
 
 namespace TNL {
    namespace Containers {
       namespace Segments {
 
-#ifdef HAVE_CUDA
-template< typename View,
-          typename Index,
-          typename Fetch,
-          typename Reduction,
-          typename ResultKeeper,
-          typename Real,
-          typename... Args >
-__global__
-void ChunkedEllpackSegmentsReductionKernel( View chunkedEllpack,
-                                            Index gridIdx,
-                                            Index first,
-                                            Index last,
-                                            Fetch fetch,
-                                            Reduction reduction,
-                                            ResultKeeper keeper,
-                                            Real zero,
-                                            Args... args )
-{
-   chunkedEllpack.segmentsReductionKernel( gridIdx, first, last, fetch, reduction, keeper, zero, args... );
-}
-#endif
-
-
 template< typename Device,
           typename Index,
           bool RowMajorOrder >
@@ -177,10 +154,8 @@ getView()
 template< typename Device,
           typename Index,
           bool RowMajorOrder >
-__cuda_callable__
-typename ChunkedEllpackView< Device, Index, RowMajorOrder >::ConstViewType
-ChunkedEllpackView< Device, Index, RowMajorOrder >::
-getConstView() const
+__cuda_callable__ auto ChunkedEllpackView< Device, Index, RowMajorOrder >::
+getConstView() const -> const ConstViewType
 {
    return ConstViewType( size, chunksInSlice, desiredChunkSize,
                          rowToChunkMapping.getConstView(),
@@ -194,10 +169,8 @@ getConstView() const
 template< typename Device,
           typename Index,
           bool RowMajorOrder >
-__cuda_callable__
-Index
-ChunkedEllpackView< Device, Index, RowMajorOrder >::
-getSegmentsCount() const
+__cuda_callable__ auto ChunkedEllpackView< Device, Index, RowMajorOrder >::
+getSegmentsCount() const -> IndexType
 {
    return this->size;
 }
@@ -205,10 +178,8 @@ getSegmentsCount() const
 template< typename Device,
           typename Index,
           bool RowMajorOrder >
-__cuda_callable__
-Index
-ChunkedEllpackView< Device, Index, RowMajorOrder >::
-getSegmentSize( const IndexType segmentIdx ) const
+__cuda_callable__ auto ChunkedEllpackView< Device, Index, RowMajorOrder >::
+getSegmentSize( const IndexType segmentIdx ) const -> IndexType
 {
    if( std::is_same< DeviceType, Devices::Host >::value )
       return details::ChunkedEllpack< IndexType, DeviceType, RowMajorOrder >::getSegmentSizeDirect(
@@ -237,10 +208,8 @@ getSegmentSize( const IndexType segmentIdx ) const
 template< typename Device,
           typename Index,
           bool RowMajorOrder >
-__cuda_callable__
-Index
-ChunkedEllpackView< Device, Index, RowMajorOrder >::
-getSize() const
+__cuda_callable__ auto ChunkedEllpackView< Device, Index, RowMajorOrder >::
+getSize() const -> IndexType
 {
    return this->size;
 }
@@ -248,10 +217,8 @@ getSize() const
 template< typename Device,
           typename Index,
           bool RowMajorOrder >
-__cuda_callable__
-Index
-ChunkedEllpackView< Device, Index, RowMajorOrder >::
-getStorageSize() const
+__cuda_callable__ auto ChunkedEllpackView< Device, Index, RowMajorOrder >::
+getStorageSize() const -> IndexType
 {
    return this->storageSize;
 }
@@ -259,10 +226,8 @@ getStorageSize() const
 template< typename Device,
           typename Index,
           bool RowMajorOrder >
-__cuda_callable__
-Index
-ChunkedEllpackView< Device, Index, RowMajorOrder >::
-getGlobalIndex( const Index segmentIdx, const Index localIdx ) const
+__cuda_callable__ auto ChunkedEllpackView< Device, Index, RowMajorOrder >::
+getGlobalIndex( const Index segmentIdx, const Index localIdx ) const -> IndexType
 {
    if( std::is_same< DeviceType, Devices::Host >::value )
       return details::ChunkedEllpack< IndexType, DeviceType, RowMajorOrder >::getGlobalIndexDirect(
@@ -399,9 +364,9 @@ template< typename Device,
    template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args >
 void
 ChunkedEllpackView< Device, Index, RowMajorOrder >::
-segmentsReduction( IndexType first, IndexType last, Fetch& fetch, Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const
+segmentsReduction( IndexType first, IndexType last, Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const
 {
-   using RealType = decltype( fetch( IndexType(), IndexType(), IndexType(), std::declval< bool& >(), args... ) );
+   using RealType = typename details::FetchLambdaAdapter< Index, Fetch >::ReturnType;
    if( std::is_same< DeviceType, Devices::Host >::value )
    {
       //segmentsReductionKernel( 0, first, last, fetch, reduction, keeper, zero, args... );
@@ -428,8 +393,8 @@ segmentsReduction( IndexType first, IndexType last, Fetch& fetch, Reduction& red
             IndexType begin = sliceOffset + firstChunkOfSegment * chunkSize;
             IndexType end = begin + segmentSize;
             IndexType localIdx( 0 );
-            for( IndexType j = begin; j < end && compute; j++ )
-               reduction( aux, fetch( segmentIdx, localIdx++, j, compute, args...) );
+            for( IndexType globalIdx = begin; globalIdx < end && compute; globalIdx++ )
+               aux = reduction( aux, details::FetchLambdaAdapter< IndexType, Fetch >::call( fetch, segmentIdx, localIdx++, globalIdx, compute ) );
          }
          else
          {
@@ -438,8 +403,8 @@ segmentsReduction( IndexType first, IndexType last, Fetch& fetch, Reduction& red
                IndexType begin = sliceOffset + firstChunkOfSegment + chunkIdx;
                IndexType end = begin + chunksInSlice * chunkSize;
                IndexType localIdx( 0 );
-               for( IndexType j = begin; j < end && compute; j += chunksInSlice )
-                  reduction( aux, fetch( segmentIdx, localIdx++, j, compute, args...) );
+               for( IndexType globalIdx = begin; globalIdx < end && compute; globalIdx += chunksInSlice )
+                  aux = reduction( aux, details::FetchLambdaAdapter< IndexType, Fetch >::call( fetch, segmentIdx, localIdx++, globalIdx, compute ) );
             }
          }
          keeper( segmentIdx, aux );
@@ -459,7 +424,7 @@ segmentsReduction( IndexType first, IndexType last, Fetch& fetch, Reduction& red
       {
          if( gridIdx == cudaGrids - 1 )
             cudaGridSize.x = cudaBlocks % Cuda::getMaxGridSize();
-         ChunkedEllpackSegmentsReductionKernel< ViewType, IndexType, Fetch, Reduction, ResultKeeper, Real, Args...  >
+         details::ChunkedEllpackSegmentsReductionKernel< ViewType, IndexType, Fetch, Reduction, ResultKeeper, Real, Args...  >
             <<< cudaGridSize, cudaBlockSize, sharedMemory  >>>
             ( *this, gridIdx, first, last, fetch, reduction, keeper, zero, args... );
       }
@@ -473,7 +438,7 @@ template< typename Device,
    template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args >
 void
 ChunkedEllpackView< Device, Index, RowMajorOrder >::
-allReduction( Fetch& fetch, Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const
+allReduction( Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const
 {
    this->segmentsReduction( 0, this->getSegmentsCount(), fetch, reduction, keeper, zero, args... );
 }
@@ -517,25 +482,6 @@ save( File& file ) const
    file.save( &this->numberOfSlices );
 }
 
-template< typename Device,
-          typename Index,
-          bool RowMajorOrder >
-void
-ChunkedEllpackView< Device, Index, RowMajorOrder >::
-load( File& file )
-{
-   file.load( &this->size );
-   file.load( &this->storageSize );
-   file.load( &this->chunksInSlice );
-   file.load( &this->desiredChunkSize );
-   file >> this->rowToChunkMapping
-        >> this->chunksToSegmentsMapping
-        >> this->rowToSliceMapping
-        >> this->rowPointers
-        >> this->slices;
-   file.load( &this->numberOfSlices );
-}
-
 template< typename Device,
           typename Index,
           bool RowMajorOrder >
@@ -559,6 +505,84 @@ printStructure( std::ostream& str ) const
 }
 
 #ifdef HAVE_CUDA
+template< typename Device,
+          typename Index,
+          bool RowMajorOrder >
+   template< typename Fetch,
+             typename Reduction,
+             typename ResultKeeper,
+             typename Real,
+             typename... Args >
+__device__
+void
+ChunkedEllpackView< Device, Index, RowMajorOrder >::
+segmentsReductionKernelWithAllParameters( IndexType gridIdx,
+                                          IndexType first,
+                                          IndexType last,
+                                          Fetch fetch,
+                                          Reduction reduction,
+                                          ResultKeeper keeper,
+                                          Real zero,
+                                          Args... args ) const
+{
+   using RealType = decltype( fetch( IndexType(), IndexType(), IndexType(), std::declval< bool& >(), args... ) );
+   // Device-side segments reduction for fetch functors called as fetch( segmentIdx, localIdx, globalIdx, compute ); one CUDA block handles one slice.
+   const IndexType firstSlice = rowToSliceMapping[ first ];
+   const IndexType lastSlice = rowToSliceMapping[ last - 1 ];
+   // gridIdx shifts the slice index by whole batches of Cuda::getMaxGridSize() blocks; blockIdx.x picks the slice inside the batch.
+   const IndexType sliceIdx = firstSlice + gridIdx * Cuda::getMaxGridSize() + blockIdx.x;
+   if( sliceIdx > lastSlice )
+      return; // sliceIdx does not depend on threadIdx, so the whole block leaves together before any barrier
+   // chunksResults holds one partial result per thread (buffer presumably the dynamic shared memory behind Cuda::getSharedMemory - confirm).
+   RealType* chunksResults = Cuda::getSharedMemory< RealType >();
+   __shared__ details::ChunkedEllpackSliceInfo< IndexType > sliceInfo;
+   if( threadIdx.x == 0 )
+      sliceInfo = this->slices[ sliceIdx ];
+   chunksResults[ threadIdx.x ] = zero;
+   __syncthreads();
+   // Past the barrier every thread may read the sliceInfo that thread 0 stored above.
+
+
+   const IndexType sliceOffset = sliceInfo.pointer;
+   const IndexType chunkSize = sliceInfo.chunkSize;
+   const IndexType chunkIdx = sliceIdx * chunksInSlice + threadIdx.x;
+   const IndexType segmentIdx = this->chunksToSegmentsMapping[ chunkIdx ];
+   IndexType firstChunkOfSegment( 0 );
+   if( segmentIdx != sliceInfo.firstSegment )
+      firstChunkOfSegment = rowToChunkMapping[ segmentIdx - 1 ];
+   IndexType localIdx = ( threadIdx.x - firstChunkOfSegment ) * chunkSize; // local index of this chunk's first element within its segment
+   bool compute( true ); // fetch receives it as bool&; the loops below stop as soon as it becomes false
+   // NOTE(review): args... takes part in the RealType deduction above but is not forwarded to the fetch calls below - confirm this is intended.
+   if( RowMajorOrder )
+   {
+      IndexType begin = sliceOffset + threadIdx.x * chunkSize; // threadIdx.x = chunkIdx within the slice
+      IndexType end = begin + chunkSize;
+      for( IndexType j = begin; j < end && compute; j++ )
+         chunksResults[ threadIdx.x ] = reduction( chunksResults[ threadIdx.x ], fetch( segmentIdx, localIdx++, j, compute ) );
+   }
+   else
+   {
+      const IndexType begin = sliceOffset + threadIdx.x; // threadIdx.x = chunkIdx within the slice
+      const IndexType end = begin + chunksInSlice * chunkSize;
+         for( IndexType j = begin; j < end && compute; j += chunksInSlice )
+            chunksResults[ threadIdx.x ] = reduction( chunksResults[ threadIdx.x ], fetch( segmentIdx, localIdx++, j, compute ) );
+   }
+   __syncthreads(); // every per-chunk partial result must be stored before the chunks are combined below
+   if( threadIdx.x < sliceInfo.size ) // second phase: thread i combines the chunks of segment firstSegment + i
+   {
+      const IndexType row = sliceInfo.firstSegment + threadIdx.x;
+      IndexType chunkIndex( 0 );
+      if( threadIdx.x != 0 )
+         chunkIndex = this->rowToChunkMapping[ row - 1 ];
+      const IndexType lastChunk = this->rowToChunkMapping[ row ];
+      RealType result( zero );
+      while( chunkIndex < lastChunk )
+         result = reduction( result,  chunksResults[ chunkIndex++ ] );
+      if( row >= first && row < last ) // the slice can hold segments outside [first, last); their results are not reported
+         keeper( row, result );
+   }
+}
+
 template< typename Device,
           typename Index,
           bool RowMajorOrder >
@@ -579,73 +603,57 @@ segmentsReductionKernel( IndexType gridIdx,
                          Real zero,
                          Args... args ) const
 {
-   using RealType = decltype( fetch( IndexType(), IndexType(), IndexType(), std::declval< bool& >(), args... ) );
+   using RealType = decltype( fetch( IndexType(), std::declval< bool& >(), args... ) );
+
+   const IndexType firstSlice = rowToSliceMapping[ first ];
+   const IndexType lastSlice = rowToSliceMapping[ last - 1 ];
+
+   const IndexType sliceIdx = firstSlice + gridIdx * Cuda::getMaxGridSize() + blockIdx.x;
+   if( sliceIdx > lastSlice )
+      return;
+
+   RealType* chunksResults = Cuda::getSharedMemory< RealType >();
+   __shared__ details::ChunkedEllpackSliceInfo< IndexType > sliceInfo;
 
-   const IndexType firstSlice = rowToChunkMapping[ first ] / chunksInSlice;
-   const IndexType lastSlice = rowToChunkMapping[ last - 1 ] / chunksInSlice;
-   //for( IndexType sliceIdx = firstSlice; sliceIdx < lastSlice; sliceIdx++ )
+   if( threadIdx.x == 0 )
+      sliceInfo = this->slices[ sliceIdx ];
+   chunksResults[ threadIdx.x ] = zero;
+   __syncthreads();
+
+   const IndexType sliceOffset = sliceInfo.pointer;
+   const IndexType chunkSize = sliceInfo.chunkSize;
+   const IndexType chunkIdx = sliceIdx * chunksInSlice + threadIdx.x;
+   bool compute( true );
+
+   if( RowMajorOrder )
    {
-      const IndexType sliceIdx = gridIdx * Cuda::getMaxGridSize() + blockIdx.x;
-      if( sliceIdx >= lastSlice )
-         return;
-
-      RealType* chunksResults = Cuda::getSharedMemory< RealType >();
-      //for( IndexType threadIdx = 0; threadIdx < 256; threadIdx++ )
-      //{
-         __shared__ details::ChunkedEllpackSliceInfo< IndexType > sliceInfo;
-         if( threadIdx.x == 0 )
-            sliceInfo = this->slices[ sliceIdx ];
-         chunksResults[ threadIdx.x ] = zero;
-         __syncthreads();
-
-   
-
-         const IndexType sliceOffset = sliceInfo.pointer;
-         const IndexType chunkSize = sliceInfo.chunkSize;
-         const IndexType chunkIdx = sliceIdx * chunksInSlice + threadIdx.x;
-         const IndexType segmentIdx = this->chunksToSegmentsMapping[ chunkIdx ];
-         IndexType firstChunkOfSegment( 0 );
-         if( segmentIdx != sliceInfo.firstSegment )
-            firstChunkOfSegment = rowToChunkMapping[ segmentIdx - 1 ];
-         IndexType localIdx = ( threadIdx.x - firstChunkOfSegment ) * chunkSize;
-         bool compute( true );
-          
-         if( RowMajorOrder )
-         {
-            IndexType begin = sliceOffset + threadIdx.x * chunkSize; // threadIdx.x = chunkIdx within the slice
-            IndexType end = begin + chunkSize;
-            for( IndexType j = begin; j < end && compute; j++ )
-               reduction( chunksResults[ threadIdx.x ], fetch( segmentIdx, localIdx++, j, compute, args...) );
-         }
-         else
-         {
-            const IndexType begin = sliceOffset + threadIdx.x; // threadIdx.x = chunkIdx within the slice
-            const IndexType end = begin + chunksInSlice * chunkSize;
-               for( IndexType j = begin; j < end && compute; j += chunksInSlice )
-                  reduction( chunksResults[ threadIdx.x ], fetch( segmentIdx, localIdx++, j, compute, args...) );
-         }
-         __syncthreads();
-      //}
-
-      //details::ChunkedEllpackSliceInfo< IndexType > sliceInfo;
-      //for( IndexType threadIdx = 0; threadIdx < 256; threadIdx++ )
-      //{
-         //if( threadIdx == 0 )
-         //   sliceInfo = this->slices[ sliceIdx ];
-         if( threadIdx.x < sliceInfo.size )
-         {
-            const IndexType row = sliceInfo.firstSegment + threadIdx.x;
-            IndexType chunkIndex( 0 );
-            if( threadIdx.x != 0 )
-               chunkIndex = this->rowToChunkMapping[ row - 1 ];
-            const IndexType lastChunk = this->rowToChunkMapping[ row ];
-            RealType result( zero );
-            while( chunkIndex < lastChunk )
-               reduction( result,  chunksResults[ chunkIndex++ ] );
-            keeper( row, result );
-         }
-      //} // threadIdx
-   } // sliceIdx
+      IndexType begin = sliceOffset + threadIdx.x * chunkSize; // threadIdx.x = chunkIdx within the slice
+      IndexType end = begin + chunkSize;
+      for( IndexType j = begin; j < end && compute; j++ )
+         chunksResults[ threadIdx.x ] = reduction( chunksResults[ threadIdx.x ], fetch( j, compute ) );
+   }
+   else
+   {
+      const IndexType begin = sliceOffset + threadIdx.x; // threadIdx.x = chunkIdx within the slice
+      const IndexType end = begin + chunksInSlice * chunkSize;
+         for( IndexType j = begin; j < end && compute; j += chunksInSlice )
+            chunksResults[ threadIdx.x ] = reduction( chunksResults[ threadIdx.x ], fetch( j, compute ) );
+   }
+   __syncthreads();
+
+   if( threadIdx.x < sliceInfo.size )
+   {
+      const IndexType row = sliceInfo.firstSegment + threadIdx.x;
+      IndexType chunkIndex( 0 );
+      if( threadIdx.x != 0 )
+         chunkIndex = this->rowToChunkMapping[ row - 1 ];
+      const IndexType lastChunk = this->rowToChunkMapping[ row ];
+      RealType result( zero );
+      while( chunkIndex < lastChunk )
+         result = reduction( result,  chunksResults[ chunkIndex++ ] );
+      if( row >= first && row < last )
+         keeper( row, result );
+   }
 }
 #endif
 
diff --git a/src/TNL/Containers/Segments/Ellpack.h b/src/TNL/Containers/Segments/Ellpack.h
index a1188a854e952f15f1f2449cc33c9535ccfac10a..63ca556a4b97099c0d24b97d311b079a30bf1b7a 100644
--- a/src/TNL/Containers/Segments/Ellpack.h
+++ b/src/TNL/Containers/Segments/Ellpack.h
@@ -28,7 +28,7 @@ class Ellpack
    public:
 
       using DeviceType = Device;
-      using IndexType = Index;
+      using IndexType = std::remove_const_t< Index >;
       static constexpr int getAlignment() { return Alignment; }
       static constexpr bool getRowMajorOrder() { return RowMajorOrder; }
       using OffsetsHolder = Containers::Vector< IndexType, DeviceType, IndexType >;
@@ -36,7 +36,7 @@ class Ellpack
       template< typename Device_, typename Index_ >
       using ViewTemplate = EllpackView< Device_, Index_, RowMajorOrder, Alignment >;
       using ViewType = EllpackView< Device, Index, RowMajorOrder, Alignment >;
-      //using ConstViewType = EllpackView< Device, std::add_const_t< Index >, RowMajorOrder, Alignment >;
+      using ConstViewType = typename ViewType::ConstViewType;
       using SegmentViewType = SegmentView< IndexType, RowMajorOrder >;
 
       Ellpack();
@@ -55,7 +55,7 @@ class Ellpack
 
       ViewType getView();
 
-      //ConstViewType getConstView() const;
+      const ConstViewType getConstView() const;
 
       /**
        * \brief Set sizes of particular segments.
@@ -64,6 +64,9 @@ class Ellpack
       void setSegmentsSizes( const SizesHolder& sizes );
 
       void setSegmentsSizes( const IndexType segmentsCount, const IndexType segmentSize );
+
+      void reset();
+
       /**
        * \brief Number segments.
        */
@@ -105,10 +108,10 @@ class Ellpack
        * \brief Go over all segments and perform a reduction in each of them.
        */
       template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args >
-      void segmentsReduction( IndexType first, IndexType last, Fetch& fetch, Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const;
+      void segmentsReduction( IndexType first, IndexType last, Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const;
 
       template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args >
-      void allReduction( Fetch& fetch, Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const;
+      void allReduction( Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const;
 
       Ellpack& operator=( const Ellpack& source ) = default;
 
diff --git a/src/TNL/Containers/Segments/Ellpack.hpp b/src/TNL/Containers/Segments/Ellpack.hpp
index 9c59c5529eada436df075bd130ddaa16f0ef20ea..e4e2180adf74c80bb9d1d2fbfa651f9b3f570805 100644
--- a/src/TNL/Containers/Segments/Ellpack.hpp
+++ b/src/TNL/Containers/Segments/Ellpack.hpp
@@ -105,23 +105,24 @@ template< typename Device,
           typename IndexAllocator,
           bool RowMajorOrder,
           int Alignment >
-typename Ellpack< Device, Index, IndexAllocator, RowMajorOrder, Alignment >::ViewType
+auto
 Ellpack< Device, Index, IndexAllocator, RowMajorOrder, Alignment >::
-getView()
+getView() -> ViewType
 {
    return ViewType( segmentSize, size, alignedSize );
 }
 
-/*template< typename Device,
+template< typename Device,
           typename Index,
+          typename IndexAllocator,
           bool RowMajorOrder,
           int Alignment >
-typename Ellpack< Device, Index, IndexAllocator, RowMajorOrder, Alignment >::ConstViewType
+auto
 Ellpack< Device, Index, IndexAllocator, RowMajorOrder, Alignment >::
-getConstView() const
+getConstView() const -> const ConstViewType
 {
    return ConstViewType( segmentSize, size, alignedSize );
-}*/
+}
 
 template< typename Device,
           typename Index,
@@ -141,6 +142,20 @@ setSegmentsSizes( const SizesHolder& sizes )
       this->alignedSize = roundUpDivision( size, this->getAlignment() ) * this->getAlignment();
 }
 
+template< typename Device,
+          typename Index,
+          typename IndexAllocator,
+          bool RowMajorOrder,
+          int Alignment >
+void
+Ellpack< Device, Index, IndexAllocator, RowMajorOrder, Alignment >::
+reset()
+{ // Zero segmentSize, size and alignedSize so the object describes no segments.
+   this->segmentSize = 0; // value reported by getSegmentSize() for every segment
+   this->size = 0; // value reported by getSegmentsCount()
+   this->alignedSize = 0; // getStorageSize() is alignedSize * segmentSize
+}
+
 template< typename Device,
           typename Index,
           typename IndexAllocator,
@@ -164,10 +179,8 @@ template< typename Device,
           typename IndexAllocator,
           bool RowMajorOrder,
           int Alignment >
-__cuda_callable__
-Index
-Ellpack< Device, Index, IndexAllocator, RowMajorOrder, Alignment >::
-getSegmentsCount() const
+__cuda_callable__ auto Ellpack< Device, Index, IndexAllocator, RowMajorOrder, Alignment >::
+getSegmentsCount() const -> IndexType
 {
    return this->size;
 }
@@ -177,10 +190,8 @@ template< typename Device,
           typename IndexAllocator,
           bool RowMajorOrder,
           int Alignment >
-__cuda_callable__
-Index
-Ellpack< Device, Index, IndexAllocator, RowMajorOrder, Alignment >::
-getSegmentSize( const IndexType segmentIdx ) const
+__cuda_callable__ auto Ellpack< Device, Index, IndexAllocator, RowMajorOrder, Alignment >::
+getSegmentSize( const IndexType segmentIdx ) const -> IndexType
 {
    return this->segmentSize;
 }
@@ -190,10 +201,8 @@ template< typename Device,
           typename IndexAllocator,
           bool RowMajorOrder,
           int Alignment >
-__cuda_callable__
-Index
-Ellpack< Device, Index, IndexAllocator, RowMajorOrder, Alignment >::
-getSize() const
+__cuda_callable__ auto Ellpack< Device, Index, IndexAllocator, RowMajorOrder, Alignment >::
+getSize() const  -> IndexType
 {
    return this->size * this->segmentSize;
 }
@@ -204,10 +213,8 @@ template< typename Device,
           typename IndexAllocator,
           bool RowMajorOrder,
           int Alignment >
-__cuda_callable__
-Index
-Ellpack< Device, Index, IndexAllocator, RowMajorOrder, Alignment >::
-getStorageSize() const
+__cuda_callable__ auto Ellpack< Device, Index, IndexAllocator, RowMajorOrder, Alignment >::
+getStorageSize() const -> IndexType
 {
    return this->alignedSize * this->segmentSize;
 }
@@ -217,10 +224,8 @@ template< typename Device,
           typename IndexAllocator,
           bool RowMajorOrder,
           int Alignment >
-__cuda_callable__
-Index
-Ellpack< Device, Index, IndexAllocator, RowMajorOrder, Alignment >::
-getGlobalIndex( const Index segmentIdx, const Index localIdx ) const
+__cuda_callable__ auto Ellpack< Device, Index, IndexAllocator, RowMajorOrder, Alignment >::
+getGlobalIndex( const Index segmentIdx, const Index localIdx ) const -> IndexType
 {
    if( RowMajorOrder )
       return segmentIdx * this->segmentSize + localIdx;
@@ -233,9 +238,8 @@ template< typename Device,
           typename IndexAllocator,
           bool RowMajorOrder,
           int Alignment >
-__cuda_callable__
-void
-Ellpack< Device, Index, IndexAllocator, RowMajorOrder, Alignment >::
+__cuda_callable__ 
+void Ellpack< Device, Index, IndexAllocator, RowMajorOrder, Alignment >::
 getSegmentAndLocalIndex( const Index globalIdx, Index& segmentIdx, Index& localIdx ) const
 {
 }
@@ -245,9 +249,7 @@ template< typename Device,
           typename IndexAllocator,
           bool RowMajorOrder,
           int Alignment >
-__cuda_callable__
-auto
-Ellpack< Device, Index, IndexAllocator, RowMajorOrder, Alignment >::
+__cuda_callable__ auto Ellpack< Device, Index, IndexAllocator, RowMajorOrder, Alignment >::
 getSegmentView( const IndexType segmentIdx ) const -> SegmentViewType
 {
    if( RowMajorOrder )
@@ -266,33 +268,7 @@ void
 Ellpack< Device, Index, IndexAllocator, RowMajorOrder, Alignment >::
 forSegments( IndexType first, IndexType last, Function& f, Args... args ) const
 {
-   if( RowMajorOrder )
-   {
-      const IndexType segmentSize = this->segmentSize;
-      auto l = [=] __cuda_callable__ ( const IndexType segmentIdx, Args... args ) mutable {
-         const IndexType begin = segmentIdx * segmentSize;
-         const IndexType end = begin + segmentSize;
-         IndexType localIdx( 0 );
-         for( IndexType globalIdx = begin; globalIdx < end; globalIdx++  )
-            if( ! f( segmentIdx, localIdx++, globalIdx,  args... ) )
-               break;
-      };
-      Algorithms::ParallelFor< Device >::exec( first, last, l, args... );
-   }
-   else
-   {
-      const IndexType storageSize = this->getStorageSize();
-      const IndexType alignedSize = this->alignedSize;
-      auto l = [=] __cuda_callable__ ( const IndexType segmentIdx, Args... args ) mutable {
-         const IndexType begin = segmentIdx;
-         const IndexType end = storageSize;
-         IndexType localIdx( 0 );
-         for( IndexType globalIdx = begin; globalIdx < end; globalIdx += alignedSize )
-            if( ! f( segmentIdx, localIdx++, globalIdx, args... ) )
-               break;
-      };
-      Algorithms::ParallelFor< Device >::exec( first, last, l, args... );
-   }
+   this->getConstView().forSegments( first, last, f, args... );
 }
 
 template< typename Device,
@@ -316,38 +292,9 @@ template< typename Device,
    template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args >
 void
 Ellpack< Device, Index, IndexAllocator, RowMajorOrder, Alignment >::
-segmentsReduction( IndexType first, IndexType last, Fetch& fetch, Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const
+segmentsReduction( IndexType first, IndexType last, Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const
 {
-   using RealType = decltype( fetch( IndexType(), IndexType(), IndexType(), std::declval< bool& >(), args... ) );
-   if( RowMajorOrder )
-   {
-      const IndexType segmentSize = this->segmentSize;
-      auto l = [=] __cuda_callable__ ( const IndexType i, Args... args ) mutable {
-         const IndexType begin = i * segmentSize;
-         const IndexType end = begin + segmentSize;
-         RealType aux( zero );
-         bool compute( true );
-         for( IndexType j = begin, localIdx = 0; j < end && compute; j++, localIdx++  )
-            reduction( aux, fetch( i, localIdx, j, compute, args... ) );
-         keeper( i, aux );
-      };
-      Algorithms::ParallelFor< Device >::exec( first, last, l, args... );
-   }
-   else
-   {
-      const IndexType storageSize = this->getStorageSize();
-      const IndexType alignedSize = this->alignedSize;
-      auto l = [=] __cuda_callable__ ( const IndexType i, Args... args ) mutable {
-         const IndexType begin = i;
-         const IndexType end = storageSize;
-         RealType aux( zero );
-         bool compute( true );
-         for( IndexType j = begin, localIdx = 0; j < end && compute; j += alignedSize, localIdx++  )
-            reduction( aux, fetch( i, localIdx, j, compute, args... ) );
-         keeper( i, aux );
-      };
-      Algorithms::ParallelFor< Device >::exec( first, last, l, args... );
-   }
+   this->getConstView().segmentsReduction( first, last, fetch, reduction, keeper, zero, args... );
 }
 
 template< typename Device,
@@ -358,7 +305,7 @@ template< typename Device,
    template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args >
 void
 Ellpack< Device, Index, IndexAllocator, RowMajorOrder, Alignment >::
-allReduction( Fetch& fetch, Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const
+allReduction( Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const
 {
    this->segmentsReduction( 0, this->getSegmentsCount(), fetch, reduction, keeper, zero, args... );
 }
diff --git a/src/TNL/Containers/Segments/EllpackView.h b/src/TNL/Containers/Segments/EllpackView.h
index 10a89bd7bafd7de62a8e5f37f567478a3d4af1ee..846e75cf47ebdc6bcad64c36c4db84adb1e10952 100644
--- a/src/TNL/Containers/Segments/EllpackView.h
+++ b/src/TNL/Containers/Segments/EllpackView.h
@@ -29,7 +29,7 @@ class EllpackView
    public:
 
       using DeviceType = Device;
-      using IndexType = Index;
+      using IndexType = std::remove_const_t< Index >;
       static constexpr int getAlignment() { return Alignment; }
       static constexpr bool getRowMajorOrder() { return RowMajorOrder; }
       using OffsetsHolder = Containers::Vector< IndexType, DeviceType, IndexType >;
@@ -37,7 +37,7 @@ class EllpackView
       template< typename Device_, typename Index_ >
       using ViewTemplate = EllpackView< Device_, Index_, RowMajorOrder, Alignment >;
       using ViewType = EllpackView;
-      using ConstViewType = EllpackView< Device, std::add_const_t< Index > >;
+      using ConstViewType = ViewType;
       using SegmentViewType = SegmentView< IndexType, RowMajorOrder >;
 
       __cuda_callable__
@@ -60,7 +60,7 @@ class EllpackView
       ViewType getView();
 
       __cuda_callable__
-      ConstViewType getConstView() const;
+      const ConstViewType getConstView() const;
 
       /**
        * \brief Number segments.
@@ -103,10 +103,10 @@ class EllpackView
        * \brief Go over all segments and perform a reduction in each of them.
        */
       template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args >
-      void segmentsReduction( IndexType first, IndexType last, Fetch& fetch, Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const;
+      void segmentsReduction( IndexType first, IndexType last, Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const;
 
       template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args >
-      void allReduction( Fetch& fetch, Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const;
+      void allReduction( Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const;
 
       EllpackView& operator=( const EllpackView& view );
 
diff --git a/src/TNL/Containers/Segments/EllpackView.hpp b/src/TNL/Containers/Segments/EllpackView.hpp
index 84086f380bfb12ac86113f82a76e40db1fbabdef..fa40227f88e7f822e9a48e9d91683992c85da032 100644
--- a/src/TNL/Containers/Segments/EllpackView.hpp
+++ b/src/TNL/Containers/Segments/EllpackView.hpp
@@ -13,6 +13,7 @@
 #include <TNL/Containers/Vector.h>
 #include <TNL/Algorithms/ParallelFor.h>
 #include <TNL/Containers/Segments/EllpackView.h>
+#include <TNL/Containers/Segments/details/LambdaAdapter.h>
 
 namespace TNL {
    namespace Containers {
@@ -102,9 +103,9 @@ template< typename Device,
           bool RowMajorOrder,
           int Alignment >
 __cuda_callable__
-typename EllpackView< Device, Index, RowMajorOrder, Alignment >::ConstViewType
+auto
 EllpackView< Device, Index, RowMajorOrder, Alignment >::
-getConstView() const
+getConstView() const -> const ConstViewType
 {
    return ConstViewType( segmentSize, size, alignedSize );
 }
@@ -113,10 +114,8 @@ template< typename Device,
           typename Index,
           bool RowMajorOrder,
           int Alignment >
-__cuda_callable__
-Index
-EllpackView< Device, Index, RowMajorOrder, Alignment >::
-getSegmentsCount() const
+__cuda_callable__ auto EllpackView< Device, Index, RowMajorOrder, Alignment >::
+getSegmentsCount() const -> IndexType
 {
    return this->size;
 }
@@ -125,10 +124,8 @@ template< typename Device,
           typename Index,
           bool RowMajorOrder,
           int Alignment >
-__cuda_callable__
-Index
-EllpackView< Device, Index, RowMajorOrder, Alignment >::
-getSegmentSize( const IndexType segmentIdx ) const
+__cuda_callable__ auto EllpackView< Device, Index, RowMajorOrder, Alignment >::
+getSegmentSize( const IndexType segmentIdx ) const -> IndexType
 {
    return this->segmentSize;
 }
@@ -137,10 +134,8 @@ template< typename Device,
           typename Index,
           bool RowMajorOrder,
           int Alignment >
-__cuda_callable__
-Index
-EllpackView< Device, Index, RowMajorOrder, Alignment >::
-getSize() const
+__cuda_callable__ auto EllpackView< Device, Index, RowMajorOrder, Alignment >::
+getSize() const -> IndexType
 {
    return this->size * this->segmentSize;
 }
@@ -150,10 +145,8 @@ template< typename Device,
           typename Index,
           bool RowMajorOrder,
           int Alignment >
-__cuda_callable__
-Index
-EllpackView< Device, Index, RowMajorOrder, Alignment >::
-getStorageSize() const
+__cuda_callable__ auto EllpackView< Device, Index, RowMajorOrder, Alignment >::
+getStorageSize() const -> IndexType
 {
    return this->alignedSize * this->segmentSize;
 }
@@ -162,10 +155,8 @@ template< typename Device,
           typename Index,
           bool RowMajorOrder,
           int Alignment >
-__cuda_callable__
-Index
-EllpackView< Device, Index, RowMajorOrder, Alignment >::
-getGlobalIndex( const Index segmentIdx, const Index localIdx ) const
+__cuda_callable__ auto EllpackView< Device, Index, RowMajorOrder, Alignment >::
+getGlobalIndex( const Index segmentIdx, const Index localIdx ) const -> IndexType
 {
    if( RowMajorOrder )
       return segmentIdx * this->segmentSize + localIdx;
@@ -177,9 +168,7 @@ template< typename Device,
           typename Index,
           bool RowMajorOrder,
           int Alignment >
-__cuda_callable__
-void
-EllpackView< Device, Index, RowMajorOrder, Alignment >::
+__cuda_callable__ void EllpackView< Device, Index, RowMajorOrder, Alignment >::
 getSegmentAndLocalIndex( const Index globalIdx, Index& segmentIdx, Index& localIdx ) const
 {
 }
@@ -188,9 +177,7 @@ template< typename Device,
           typename Index,
           bool RowMajorOrder,
           int Alignment >
-__cuda_callable__
-auto
-EllpackView< Device, Index, RowMajorOrder, Alignment >::
+__cuda_callable__ auto EllpackView< Device, Index, RowMajorOrder, Alignment >::
 getSegmentView( const IndexType segmentIdx ) const -> SegmentViewType
 {
    if( RowMajorOrder )
@@ -204,8 +191,7 @@ template< typename Device,
           bool RowMajorOrder,
           int Alignment >
    template< typename Function, typename... Args >
-void
-EllpackView< Device, Index, RowMajorOrder, Alignment >::
+void EllpackView< Device, Index, RowMajorOrder, Alignment >::
 forSegments( IndexType first, IndexType last, Function& f, Args... args ) const
 {
    if( RowMajorOrder )
@@ -217,7 +203,7 @@ forSegments( IndexType first, IndexType last, Function& f, Args... args ) const
          IndexType localIdx( 0 );
          bool compute( true );
          for( IndexType globalIdx = begin; globalIdx < end && compute; globalIdx++  )
-            f( segmentIdx, localIdx++, globalIdx, compute, args... );
+            f( segmentIdx, localIdx++, globalIdx, compute );
       };
       Algorithms::ParallelFor< Device >::exec( first, last, l, args... );
    }
@@ -242,8 +228,7 @@ template< typename Device,
           bool RowMajorOrder,
           int Alignment >
    template< typename Function, typename... Args >
-void
-EllpackView< Device, Index, RowMajorOrder, Alignment >::
+void EllpackView< Device, Index, RowMajorOrder, Alignment >::
 forAll( Function& f, Args... args ) const
 {
    this->forSegments( 0, this->getSegmentsCount(), f, args... );
@@ -254,23 +239,23 @@ template< typename Device,
           bool RowMajorOrder,
           int Alignment >
    template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args >
-void
-EllpackView< Device, Index, RowMajorOrder, Alignment >::
-segmentsReduction( IndexType first, IndexType last, Fetch& fetch, Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const
+void EllpackView< Device, Index, RowMajorOrder, Alignment >::
+segmentsReduction( IndexType first, IndexType last, Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const
 {
-   using RealType = decltype( fetch( IndexType(), IndexType(), IndexType(), std::declval< bool& >(), args... ) );
+   //using RealType = decltype( fetch( IndexType(), IndexType(), IndexType(), std::declval< bool& >(), args... ) );
+   using RealType = typename details::FetchLambdaAdapter< Index, Fetch >::ReturnType;
    if( RowMajorOrder )
    {
       const IndexType segmentSize = this->segmentSize;
-      auto l = [=] __cuda_callable__ ( const IndexType i, Args... args ) mutable {
-         const IndexType begin = i * segmentSize;
+      auto l = [=] __cuda_callable__ ( const IndexType segmentIdx, Args... args ) mutable {
+         const IndexType begin = segmentIdx * segmentSize;
          const IndexType end = begin + segmentSize;
          RealType aux( zero );
          IndexType localIdx( 0 );
          bool compute( true );
          for( IndexType j = begin; j < end && compute; j++  )
-            reduction( aux, fetch( i, localIdx++, j, compute, args... ) );
-         keeper( i, aux );
+            aux = reduction( aux, details::FetchLambdaAdapter< IndexType, Fetch >::call( fetch, segmentIdx, localIdx++, j, compute ) );
+         keeper( segmentIdx, aux );
       };
       Algorithms::ParallelFor< Device >::exec( first, last, l, args... );
    }
@@ -278,15 +263,15 @@ segmentsReduction( IndexType first, IndexType last, Fetch& fetch, Reduction& red
    {
       const IndexType storageSize = this->getStorageSize();
       const IndexType alignedSize = this->alignedSize;
-      auto l = [=] __cuda_callable__ ( const IndexType i, Args... args ) mutable {
-         const IndexType begin = i;
+      auto l = [=] __cuda_callable__ ( const IndexType segmentIdx, Args... args ) mutable {
+         const IndexType begin = segmentIdx;
          const IndexType end = storageSize;
          RealType aux( zero );
          IndexType localIdx( 0 );
          bool compute( true );
          for( IndexType j = begin; j < end && compute; j += alignedSize  )
-            reduction( aux, fetch( i, localIdx++, j, compute, args... ) );
-         keeper( i, aux );
+            aux = reduction( aux, details::FetchLambdaAdapter< IndexType, Fetch >::call( fetch, segmentIdx, localIdx++, j, compute ) );
+         keeper( segmentIdx, aux );
       };
       Algorithms::ParallelFor< Device >::exec( first, last, l, args... );
    }
@@ -297,9 +282,8 @@ template< typename Device,
           bool RowMajorOrder,
           int Alignment >
    template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args >
-void
-EllpackView< Device, Index, RowMajorOrder, Alignment >::
-allReduction( Fetch& fetch, Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const
+void EllpackView< Device, Index, RowMajorOrder, Alignment >::
+allReduction( Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const
 {
    this->segmentsReduction( 0, this->getSegmentsCount(), fetch, reduction, keeper, zero, args... );
 }
@@ -322,8 +306,7 @@ template< typename Device,
           typename Index,
           bool RowMajorOrder,
           int Alignment >
-void
-EllpackView< Device, Index, RowMajorOrder, Alignment >::
+void EllpackView< Device, Index, RowMajorOrder, Alignment >::
 save( File& file ) const
 {
    file.save( &segmentSize );
@@ -335,8 +318,7 @@ template< typename Device,
           typename Index,
           bool RowMajorOrder,
           int Alignment >
-void
-EllpackView< Device, Index, RowMajorOrder, Alignment >::
+void EllpackView< Device, Index, RowMajorOrder, Alignment >::
 load( File& file )
 {
    file.load( &segmentSize );
diff --git a/src/TNL/Containers/Segments/SlicedEllpack.h b/src/TNL/Containers/Segments/SlicedEllpack.h
index 2027f1d78a96d685806f2715257098a38a624800..19c1b8eb4ad3b823cb4b216e3c47bbd291d740cc 100644
--- a/src/TNL/Containers/Segments/SlicedEllpack.h
+++ b/src/TNL/Containers/Segments/SlicedEllpack.h
@@ -29,8 +29,8 @@ class SlicedEllpack
    public:
 
       using DeviceType = Device;
-      using IndexType = Index;
-      using OffsetsHolder = Containers::Vector< IndexType, DeviceType, typename std::remove_const< IndexType >::type, IndexAllocator >;
+      using IndexType = std::remove_const_t< Index >;
+      using OffsetsHolder = Containers::Vector< Index, DeviceType, IndexType, IndexAllocator >;
       static constexpr int getSliceSize() { return SliceSize; }
       static constexpr bool getRowMajorOrder() { return RowMajorOrder; }
       using ViewType = SlicedEllpackView< Device, Index, RowMajorOrder, SliceSize >;
@@ -53,7 +53,7 @@ class SlicedEllpack
 
       ViewType getView();
 
-      ConstViewType getConstView() const;
+      const ConstViewType getConstView() const;
 
       /**
        * \brief Set sizes of particular segments.
@@ -61,6 +61,8 @@ class SlicedEllpack
       template< typename SizesHolder = OffsetsHolder >
       void setSegmentsSizes( const SizesHolder& sizes );
 
+      void reset();
+
       __cuda_callable__
       IndexType getSegmentsCount() const;
 
@@ -103,10 +105,10 @@ class SlicedEllpack
        * \brief Go over all segments and perform a reduction in each of them.
        */
       template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args >
-      void segmentsReduction( IndexType first, IndexType last, Fetch& fetch, Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const;
+      void segmentsReduction( IndexType first, IndexType last, Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const;
 
       template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args >
-      void allReduction( Fetch& fetch, Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const;
+      void allReduction( Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const;
 
       SlicedEllpack& operator=( const SlicedEllpack& source ) = default;
 
diff --git a/src/TNL/Containers/Segments/SlicedEllpack.hpp b/src/TNL/Containers/Segments/SlicedEllpack.hpp
index 9ba1276e3eaea3fdf39261c99e7376c6122d4f8b..e76e6d43099c3949996cc151dbab162ba861fa35 100644
--- a/src/TNL/Containers/Segments/SlicedEllpack.hpp
+++ b/src/TNL/Containers/Segments/SlicedEllpack.hpp
@@ -110,9 +110,9 @@ template< typename Device,
           typename IndexAllocator,
           bool RowMajorOrder,
           int SliceSize >
-typename SlicedEllpack< Device, Index, IndexAllocator, RowMajorOrder, SliceSize >::ConstViewType
+auto
 SlicedEllpack< Device, Index, IndexAllocator, RowMajorOrder, SliceSize >::
-getConstView() const
+getConstView() const -> const ConstViewType
 {
    return ConstViewType( size, alignedSize, segmentsCount, sliceOffsets.getConstView(), sliceSegmentSizes.getConstView() );
 }
@@ -144,8 +144,8 @@ setSegmentsSizes( const SizesHolder& sizes )
          return sizes_view[ globalIdx ];
       return 0;
    };
-   auto reduce = [] __cuda_callable__ ( IndexType& aux, const IndexType i ) {
-      aux = TNL::max( aux, i );
+   auto reduce = [] __cuda_callable__ ( IndexType& aux, const IndexType i ) -> IndexType {
+      return TNL::max( aux, i );
    };
    auto keep = [=] __cuda_callable__ ( IndexType i, IndexType res ) mutable {
       slices_view[ i ] = res * SliceSize;
@@ -162,10 +162,24 @@ template< typename Device,
           typename IndexAllocator,
           bool RowMajorOrder,
           int SliceSize >
-__cuda_callable__
-Index
+void
 SlicedEllpack< Device, Index, IndexAllocator, RowMajorOrder, SliceSize >::
-getSegmentsCount() const
+reset()
+{
+   this->size = 0;
+   this->alignedSize = 0;
+   this->segmentsCount = 0;
+   this->sliceOffsets.reset();
+   this->sliceSegmentSizes.reset();
+}
+
+template< typename Device,
+          typename Index,
+          typename IndexAllocator,
+          bool RowMajorOrder,
+          int SliceSize >
+__cuda_callable__ auto SlicedEllpack< Device, Index, IndexAllocator, RowMajorOrder, SliceSize >::
+getSegmentsCount() const -> IndexType
 {
    return this->segmentsCount;
 }
@@ -175,10 +189,8 @@ template< typename Device,
           typename IndexAllocator,
           bool RowMajorOrder,
           int SliceSize >
-__cuda_callable__
-Index
-SlicedEllpack< Device, Index, IndexAllocator, RowMajorOrder, SliceSize >::
-getSegmentSize( const IndexType segmentIdx ) const
+__cuda_callable__ auto SlicedEllpack< Device, Index, IndexAllocator, RowMajorOrder, SliceSize >::
+getSegmentSize( const IndexType segmentIdx ) const -> IndexType
 {
    const Index sliceIdx = segmentIdx / SliceSize;
    if( std::is_same< DeviceType, Devices::Host >::value )
@@ -198,10 +210,8 @@ template< typename Device,
           typename IndexAllocator,
           bool RowMajorOrder,
           int SliceSize >
-__cuda_callable__
-Index
-SlicedEllpack< Device, Index, IndexAllocator, RowMajorOrder, SliceSize >::
-getSize() const
+__cuda_callable__ auto SlicedEllpack< Device, Index, IndexAllocator, RowMajorOrder, SliceSize >::
+getSize() const -> IndexType
 {
    return this->size;
 }
@@ -211,10 +221,8 @@ template< typename Device,
           typename IndexAllocator,
           bool RowMajorOrder,
           int SliceSize >
-__cuda_callable__
-Index
-SlicedEllpack< Device, Index, IndexAllocator, RowMajorOrder, SliceSize >::
-getStorageSize() const
+__cuda_callable__ auto SlicedEllpack< Device, Index, IndexAllocator, RowMajorOrder, SliceSize >::
+getStorageSize() const -> IndexType
 {
    return this->alignedSize;
 }
@@ -224,10 +232,8 @@ template< typename Device,
           typename IndexAllocator,
           bool RowMajorOrder,
           int SliceSize >
-__cuda_callable__
-Index
-SlicedEllpack< Device, Index, IndexAllocator, RowMajorOrder, SliceSize >::
-getGlobalIndex( const Index segmentIdx, const Index localIdx ) const
+__cuda_callable__ auto SlicedEllpack< Device, Index, IndexAllocator, RowMajorOrder, SliceSize >::
+getGlobalIndex( const Index segmentIdx, const Index localIdx ) const -> IndexType
 {
    const IndexType sliceIdx = segmentIdx / SliceSize;
    const IndexType segmentInSliceIdx = segmentIdx % SliceSize;
@@ -296,38 +302,7 @@ void
 SlicedEllpack< Device, Index, IndexAllocator, RowMajorOrder, SliceSize >::
 forSegments( IndexType first, IndexType last, Function& f, Args... args ) const
 {
-   const auto sliceSegmentSizes_view = this->sliceSegmentSizes.getConstView();
-   const auto sliceOffsets_view = this->sliceOffsets.getConstView();
-   if( RowMajorOrder )
-   {
-      auto l = [=] __cuda_callable__ ( const IndexType segmentIdx, Args... args ) mutable {
-         const IndexType sliceIdx = segmentIdx / SliceSize;
-         const IndexType segmentInSliceIdx = segmentIdx % SliceSize;
-         const IndexType segmentSize = sliceSegmentSizes_view[ sliceIdx ];
-         const IndexType begin = sliceOffsets_view[ sliceIdx ] + segmentInSliceIdx * segmentSize;
-         const IndexType end = begin + segmentSize;
-         IndexType localIdx( 0 );
-         for( IndexType globalIdx = begin; globalIdx < end; globalIdx++  )
-            if( ! f( segmentIdx, localIdx++, globalIdx, args... ) )
-               break;
-      };
-      Algorithms::ParallelFor< Device >::exec( first, last, l, args... );
-   }
-   else
-   {
-      auto l = [=] __cuda_callable__ ( const IndexType segmentIdx, Args... args ) mutable {
-         const IndexType sliceIdx = segmentIdx / SliceSize;
-         const IndexType segmentInSliceIdx = segmentIdx % SliceSize;
-         const IndexType segmentSize = sliceSegmentSizes_view[ sliceIdx ];
-         const IndexType begin = sliceOffsets_view[ sliceIdx ] + segmentInSliceIdx;
-         const IndexType end = sliceOffsets_view[ sliceIdx + 1 ];
-         IndexType localIdx( 0 );
-         for( IndexType globalIdx = begin; globalIdx < end; globalIdx += SliceSize )
-            if( ! f( segmentIdx, localIdx++, globalIdx, args... ) )
-               break;
-      };
-      Algorithms::ParallelFor< Device >::exec( first, last, l, args... );
-   }
+   this->getConstView().forSegments( first, last, f, args... );
 }
 
 template< typename Device,
@@ -351,45 +326,9 @@ template< typename Device,
    template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args >
 void
 SlicedEllpack< Device, Index, IndexAllocator, RowMajorOrder, SliceSize >::
-segmentsReduction( IndexType first, IndexType last, Fetch& fetch, Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const
+segmentsReduction( IndexType first, IndexType last, Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const
 {
-   using RealType = decltype( fetch( IndexType(), IndexType(), IndexType(), std::declval< bool& >(), args... ) );
-   const auto sliceSegmentSizes_view = this->sliceSegmentSizes.getConstView();
-   const auto sliceOffsets_view = this->sliceOffsets.getConstView();
-   if( RowMajorOrder )
-   {
-      auto l = [=] __cuda_callable__ ( const IndexType segmentIdx, Args... args ) mutable {
-         const IndexType sliceIdx = segmentIdx / SliceSize;
-         const IndexType segmentInSliceIdx = segmentIdx % SliceSize;
-         const IndexType segmentSize = sliceSegmentSizes_view[ sliceIdx ];
-         const IndexType begin = sliceOffsets_view[ sliceIdx ] + segmentInSliceIdx * segmentSize;
-         const IndexType end = begin + segmentSize;
-         RealType aux( zero );
-         bool compute( true );
-         IndexType localIdx( 0 );
-         for( IndexType globalIdx = begin; globalIdx< end; globalIdx++  )
-            reduction( aux, fetch( segmentIdx, localIdx++, globalIdx, compute, args... ) );
-         keeper( segmentIdx, aux );
-      };
-      Algorithms::ParallelFor< Device >::exec( first, last, l, args... );
-   }
-   else
-   {
-      auto l = [=] __cuda_callable__ ( const IndexType segmentIdx, Args... args ) mutable {
-         const IndexType sliceIdx = segmentIdx / SliceSize;
-         const IndexType segmentInSliceIdx = segmentIdx % SliceSize;
-         const IndexType segmentSize = sliceSegmentSizes_view[ sliceIdx ];
-         const IndexType begin = sliceOffsets_view[ sliceIdx ] + segmentInSliceIdx;
-         const IndexType end = sliceOffsets_view[ sliceIdx + 1 ];
-         RealType aux( zero );
-         bool compute( true );
-         IndexType localIdx( 0 );
-         for( IndexType globalIdx = begin; globalIdx < end; globalIdx += SliceSize  )
-            reduction( aux, fetch( segmentIdx, localIdx++, globalIdx, compute, args... ) );
-         keeper( segmentIdx, aux );
-      };
-      Algorithms::ParallelFor< Device >::exec( first, last, l, args... );
-   }
+   this->getConstView().segmentsReduction( first, last, fetch, reduction, keeper, zero, args... );
 }
 
 template< typename Device,
@@ -400,7 +339,7 @@ template< typename Device,
    template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args >
 void
 SlicedEllpack< Device, Index, IndexAllocator, RowMajorOrder, SliceSize >::
-allReduction( Fetch& fetch, Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const
+allReduction( Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const
 {
    this->segmentsReduction( 0, this->getSegmentsCount(), fetch, reduction, keeper, zero, args... );
 }
diff --git a/src/TNL/Containers/Segments/SlicedEllpackView.h b/src/TNL/Containers/Segments/SlicedEllpackView.h
index 6e2e55bbc3ef040c9b15f42a41b7e0a4bc14f7d7..4ed62ebef3c270106be54dc5726575bf595b4d8e 100644
--- a/src/TNL/Containers/Segments/SlicedEllpackView.h
+++ b/src/TNL/Containers/Segments/SlicedEllpackView.h
@@ -28,14 +28,14 @@ class SlicedEllpackView
    public:
 
       using DeviceType = Device;
-      using IndexType = Index;
-      using OffsetsView = typename Containers::VectorView< IndexType, DeviceType, typename std::remove_const < IndexType >::type >;
+      using IndexType = std::remove_const_t< Index >;
+      using OffsetsView = typename Containers::VectorView< Index, DeviceType, IndexType >;
       static constexpr int getSliceSize() { return SliceSize; }
       static constexpr bool getRowMajorOrder() { return RowMajorOrder; }
       template< typename Device_, typename Index_ >
       using ViewTemplate = SlicedEllpackView< Device_, Index_, RowMajorOrder, SliceSize >;
       using ViewType = SlicedEllpackView;
-      using ConstViewType = SlicedEllpackView< Device, std::add_const_t< Index > >;
+      using ConstViewType = ViewType;
       using SegmentViewType = SegmentView< IndexType, RowMajorOrder >;
 
       __cuda_callable__
@@ -62,7 +62,7 @@ class SlicedEllpackView
       ViewType getView();
 
       __cuda_callable__
-      ConstViewType getConstView() const;
+      const ConstViewType getConstView() const;
 
       __cuda_callable__
       IndexType getSegmentsCount() const;
@@ -105,10 +105,10 @@ class SlicedEllpackView
        * \brief Go over all segments and perform a reduction in each of them.
        */
       template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args >
-      void segmentsReduction( IndexType first, IndexType last, Fetch& fetch, Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const;
+      void segmentsReduction( IndexType first, IndexType last, Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const;
 
       template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args >
-      void allReduction( Fetch& fetch, Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const;
+      void allReduction( Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const;
 
       SlicedEllpackView& operator=( const SlicedEllpackView& view );
 
diff --git a/src/TNL/Containers/Segments/SlicedEllpackView.hpp b/src/TNL/Containers/Segments/SlicedEllpackView.hpp
index c4e03aada22f877f1b6d0e14498193de8df9cdbd..258b877541d76a417cf2429339f7a34fcd53f887 100644
--- a/src/TNL/Containers/Segments/SlicedEllpackView.hpp
+++ b/src/TNL/Containers/Segments/SlicedEllpackView.hpp
@@ -13,6 +13,7 @@
 #include <TNL/Containers/Vector.h>
 #include <TNL/Algorithms/ParallelFor.h>
 #include <TNL/Containers/Segments/SlicedEllpackView.h>
+#include <TNL/Containers/Segments/details/LambdaAdapter.h>
 
 #include "SlicedEllpackView.h"
 
@@ -113,9 +114,9 @@ template< typename Device,
           bool RowMajorOrder,
           int SliceSize >
 __cuda_callable__
-typename SlicedEllpackView< Device, Index, RowMajorOrder, SliceSize >::ConstViewType
+auto
 SlicedEllpackView< Device, Index, RowMajorOrder, SliceSize >::
-getConstView() const
+getConstView() const -> const ConstViewType
 {
    return ConstViewType( size, alignedSize, segmentsCount, sliceOffsets.getConstView(), sliceSegmentSizes.getConstView() );
 }
@@ -124,10 +125,8 @@ template< typename Device,
           typename Index,
           bool RowMajorOrder,
           int SliceSize >
-__cuda_callable__
-Index
-SlicedEllpackView< Device, Index, RowMajorOrder, SliceSize >::
-getSegmentsCount() const
+__cuda_callable__ auto SlicedEllpackView< Device, Index, RowMajorOrder, SliceSize >::
+getSegmentsCount() const -> IndexType
 {
    return this->segmentsCount;
 }
@@ -136,10 +135,8 @@ template< typename Device,
           typename Index,
           bool RowMajorOrder,
           int SliceSize >
-__cuda_callable__
-Index
-SlicedEllpackView< Device, Index, RowMajorOrder, SliceSize >::
-getSegmentSize( const IndexType segmentIdx ) const
+__cuda_callable__ auto SlicedEllpackView< Device, Index, RowMajorOrder, SliceSize >::
+getSegmentSize( const IndexType segmentIdx ) const -> IndexType
 {
    const Index sliceIdx = segmentIdx / SliceSize;
    if( std::is_same< DeviceType, Devices::Host >::value )
@@ -158,10 +155,8 @@ template< typename Device,
           typename Index,
           bool RowMajorOrder,
           int SliceSize >
-__cuda_callable__
-Index
-SlicedEllpackView< Device, Index, RowMajorOrder, SliceSize >::
-getSize() const
+__cuda_callable__ auto SlicedEllpackView< Device, Index, RowMajorOrder, SliceSize >::
+getSize() const -> IndexType
 {
    return this->size;
 }
@@ -170,10 +165,8 @@ template< typename Device,
           typename Index,
           bool RowMajorOrder,
           int SliceSize >
-__cuda_callable__
-Index
-SlicedEllpackView< Device, Index, RowMajorOrder, SliceSize >::
-getStorageSize() const
+__cuda_callable__ auto SlicedEllpackView< Device, Index, RowMajorOrder, SliceSize >::
+getStorageSize() const -> IndexType
 {
    return this->alignedSize;
 }
@@ -182,10 +175,8 @@ template< typename Device,
           typename Index,
           bool RowMajorOrder,
           int SliceSize >
-__cuda_callable__
-Index
-SlicedEllpackView< Device, Index, RowMajorOrder, SliceSize >::
-getGlobalIndex( const Index segmentIdx, const Index localIdx ) const
+__cuda_callable__ auto SlicedEllpackView< Device, Index, RowMajorOrder, SliceSize >::
+getGlobalIndex( const Index segmentIdx, const Index localIdx ) const -> IndexType
 {
    const IndexType sliceIdx = segmentIdx / SliceSize;
    const IndexType segmentInSliceIdx = segmentIdx % SliceSize;
@@ -304,9 +295,10 @@ template< typename Device,
    template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args >
 void
 SlicedEllpackView< Device, Index, RowMajorOrder, SliceSize >::
-segmentsReduction( IndexType first, IndexType last, Fetch& fetch, Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const
+segmentsReduction( IndexType first, IndexType last, Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const
 {
-   using RealType = decltype( fetch( IndexType(), IndexType(), IndexType(), std::declval< bool& >(), args... ) );
+   using RealType = typename details::FetchLambdaAdapter< Index, Fetch >::ReturnType;
+   //using RealType = decltype( fetch( IndexType(), IndexType(), IndexType(), std::declval< bool& >(), args... ) );
    const auto sliceSegmentSizes_view = this->sliceSegmentSizes.getConstView();
    const auto sliceOffsets_view = this->sliceOffsets.getConstView();
    if( RowMajorOrder )
@@ -321,7 +313,7 @@ segmentsReduction( IndexType first, IndexType last, Fetch& fetch, Reduction& red
          IndexType localIdx( 0 );
          bool compute( true );
          for( IndexType globalIdx = begin; globalIdx< end; globalIdx++  )
-            reduction( aux, fetch( segmentIdx, localIdx++, globalIdx, compute, args... ) );
+            aux = reduction( aux, details::FetchLambdaAdapter< IndexType, Fetch >::call( fetch, segmentIdx, localIdx++, globalIdx, compute ) );
          keeper( segmentIdx, aux );
       };
       Algorithms::ParallelFor< Device >::exec( first, last, l, args... );
@@ -338,7 +330,7 @@ segmentsReduction( IndexType first, IndexType last, Fetch& fetch, Reduction& red
          IndexType localIdx( 0 );
          bool compute( true );
          for( IndexType globalIdx = begin; globalIdx < end; globalIdx += SliceSize  )
-            reduction( aux, fetch( segmentIdx, localIdx++, globalIdx, compute, args... ) );
+            aux = reduction( aux, details::FetchLambdaAdapter< IndexType, Fetch >::call( fetch, segmentIdx, localIdx++, globalIdx, compute ) );
          keeper( segmentIdx, aux );
       };
       Algorithms::ParallelFor< Device >::exec( first, last, l, args... );
@@ -352,7 +344,7 @@ template< typename Device,
    template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args >
 void
 SlicedEllpackView< Device, Index, RowMajorOrder, SliceSize >::
-allReduction( Fetch& fetch, Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const
+allReduction( Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const
 {
    this->segmentsReduction( 0, this->getSegmentsCount(), fetch, reduction, keeper, zero, args... );
 }
diff --git a/src/TNL/Containers/Segments/details/BiEllpack.h b/src/TNL/Containers/Segments/details/BiEllpack.h
new file mode 100644
index 0000000000000000000000000000000000000000..fe2701f07e0659cb08dd8a8715c3125fbfcc2624
--- /dev/null
+++ b/src/TNL/Containers/Segments/details/BiEllpack.h
@@ -0,0 +1,369 @@
+/***************************************************************************
+                          BiEllpack.h -  description
+                             -------------------
+    begin                : Apr 7, 2020
+    copyright            : (C) 2020 by Tomas Oberhuber
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+#pragma once
+
+#include <type_traits>
+#include <TNL/Containers/Vector.h>
+#include <TNL/Containers/Segments/BiEllpackSegmentView.h>
+#include <TNL/Containers/Segments/details/CheckLambdas.h>
+
+namespace TNL {
+   namespace Containers {
+      namespace Segments {
+         namespace details {
+
+/**
+ * \brief Static helpers for index arithmetic in the BiEllpack segments format.
+ *
+ * Segments are handled in strips of WarpSize segments. Each strip consists of getGroupsCount()
+ * groups with heights WarpSize, WarpSize/2, ... whose offsets are stored in \e groupPointers.
+ * Methods with the suffix \e Direct read the arrays by operator[], the others by getElement().
+ */
+template< typename Index,
+          typename Device,
+          bool RowMajorOrder = std::is_same< Device, Devices::Host >::value,
+          int WarpSize = 32 >
+class BiEllpack
+{
+   public:
+
+      using DeviceType = Device;
+      using IndexType = Index;
+      static constexpr bool getRowMajorOrder() { return RowMajorOrder; }
+      using OffsetsHolder = Containers::Vector< IndexType, DeviceType, IndexType >;
+      using OffsetsHolderView = typename OffsetsHolder::ViewType;
+      using ConstOffsetsHolderView = typename OffsetsHolderView::ConstViewType;
+      using SegmentsSizes = OffsetsHolder;
+      using SegmentViewType = BiEllpackSegmentView< IndexType, RowMajorOrder >;
+
+      static constexpr int getWarpSize() { return WarpSize; }
+
+      /** \brief Returns floor( log2( WarpSize ) ). Integer arithmetic is used because
+       *  std::log2 is not a constexpr function in standard C++. */
+      static constexpr int getLogWarpSize()
+      {
+         int logarithm = 0;
+         for( int size = WarpSize; size > 1; size /= 2 )
+            logarithm++;
+         return logarithm;
+      }
+
+      /** \brief Returns the number of groups in each strip. */
+      static constexpr int getGroupsCount() { return getLogWarpSize() + 1; }
+
+      /**
+       * \brief Returns the number of groups the given segment spans (operator[] access).
+       * The first i with in-strip position of the segment < 2^i gives getGroupsCount() - i groups.
+       */
+      __cuda_callable__
+      static IndexType getActiveGroupsCountDirect( const ConstOffsetsHolderView& rowPermArray, const IndexType segmentIdx )
+      {
+         TNL_ASSERT_GE( segmentIdx, 0, "" );
+         // The upper bound of segmentIdx cannot be asserted, the number of segments is unknown here.
+
+         const IndexType strip = segmentIdx / getWarpSize();
+         const IndexType rowStripPermutation = rowPermArray[ segmentIdx ] - getWarpSize() * strip;
+         const IndexType groupsCount = getGroupsCount();
+         IndexType bisection = 1;
+         for( IndexType i = 0; i < groupsCount; i++ )
+         {
+            if( rowStripPermutation < bisection )
+               return groupsCount - i;
+            bisection *= 2;
+         }
+         TNL_ASSERT_TRUE( false, "segmentIdx was not found" );
+         return -1; // to avoid compiler warning
+      }
+
+      /** \brief Same as getActiveGroupsCountDirect but the array is read with getElement().
+       *  \throws std::logic_error if no group bound exceeds the in-strip position of the segment. */
+      static IndexType getActiveGroupsCount( const ConstOffsetsHolderView& rowPermArray, const IndexType segmentIdx )
+      {
+         TNL_ASSERT_GE( segmentIdx, 0, "" );
+
+         const IndexType strip = segmentIdx / getWarpSize();
+         const IndexType rowStripPermutation = rowPermArray.getElement( segmentIdx ) - getWarpSize() * strip;
+         const IndexType groupsCount = getGroupsCount();
+         IndexType bisection = 1;
+         for( IndexType i = 0; i < groupsCount; i++ )
+         {
+            if( rowStripPermutation < bisection )
+               return groupsCount - i;
+            bisection *= 2;
+         }
+         throw std::logic_error( "segmentIdx was not found" );
+      }
+
+      /** \brief Returns the number of elements in the given group of the given strip (operator[] access). */
+      __cuda_callable__
+      static IndexType getGroupSizeDirect( const ConstOffsetsHolderView& groupPointers, const IndexType strip, const IndexType group )
+      {
+         const IndexType groupOffset = strip * getGroupsCount() + group;
+         return groupPointers[ groupOffset + 1 ] - groupPointers[ groupOffset ];
+      }
+
+      /** \brief Returns the number of elements in the given group of the given strip (getElement() access). */
+      static IndexType getGroupSize( const ConstOffsetsHolderView& groupPointers, const IndexType strip, const IndexType group )
+      {
+         const IndexType groupOffset = strip * getGroupsCount() + group;
+         return groupPointers.getElement( groupOffset + 1 ) - groupPointers.getElement( groupOffset );
+      }
+
+      /** \brief Returns the segment size, i.e. the sum of widths of the groups the segment spans (operator[] access). */
+      __cuda_callable__ static
+      IndexType getSegmentSizeDirect( const OffsetsHolderView& rowPermArray,
+                                      const OffsetsHolderView& groupPointers, const IndexType segmentIdx )
+      {
+         const IndexType strip = segmentIdx / getWarpSize();
+         const IndexType groupsCount = getActiveGroupsCountDirect( rowPermArray, segmentIdx );
+         IndexType groupHeight = getWarpSize();
+         IndexType segmentSize = 0;
+         for( IndexType group = 0; group < groupsCount; group++ )
+         {
+            // group width = group size / group height, the height is halved with each group
+            segmentSize += getGroupSizeDirect( groupPointers, strip, group ) / groupHeight;
+            groupHeight /= 2;
+         }
+         return segmentSize;
+      }
+
+      /** \brief Returns the segment size, i.e. the sum of widths of the groups the segment spans (getElement() access). */
+      static
+      IndexType getSegmentSize( const OffsetsHolderView& rowPermArray,
+                                const OffsetsHolderView& groupPointers, const IndexType segmentIdx )
+      {
+         const IndexType strip = segmentIdx / getWarpSize();
+         const IndexType groupsCount = getActiveGroupsCount( rowPermArray, segmentIdx );
+         IndexType groupHeight = getWarpSize();
+         IndexType segmentSize = 0;
+         for( IndexType group = 0; group < groupsCount; group++ )
+         {
+            segmentSize += getGroupSize( groupPointers, strip, group ) / groupHeight;
+            groupHeight /= 2;
+         }
+         return segmentSize;
+      }
+
+      /**
+       * \brief Maps the local index within the given segment to the global index (operator[] access).
+       * Groups of the strip are walked from the first one, \e localIdx is decreased by the width of each
+       * skipped group. If no group contains \e localIdx, TNL_ASSERT_TRUE is triggered and -1 is returned.
+       */
+      __cuda_callable__ static
+      IndexType getGlobalIndexDirect( const OffsetsHolderView& rowPermArray,
+                                      const OffsetsHolderView& groupPointers,
+                                      const IndexType segmentIdx,
+                                      IndexType localIdx )
+      {
+         const IndexType strip = segmentIdx / getWarpSize();
+         const IndexType groupIdx = strip * getGroupsCount();
+         const IndexType rowStripPerm = rowPermArray[ segmentIdx ] - strip * getWarpSize();
+         const IndexType groupsCount = getActiveGroupsCountDirect( rowPermArray, segmentIdx );
+         IndexType globalIdx = groupPointers[ groupIdx ];
+         IndexType groupHeight = getWarpSize();
+         for( IndexType group = 0; group < groupsCount; group++ )
+         {
+            const IndexType groupSize = getGroupSizeDirect( groupPointers, strip, group );
+            if( groupSize )
+            {
+               const IndexType groupWidth = groupSize / groupHeight;
+               if( localIdx < groupWidth )
+                  return RowMajorOrder ? globalIdx + rowStripPerm * groupWidth + localIdx
+                                       : globalIdx + rowStripPerm + localIdx * groupHeight;
+               // The element lies in one of the following groups, skip this one.
+               localIdx -= groupWidth;
+               globalIdx += groupSize;
+            }
+            groupHeight /= 2;
+         }
+         TNL_ASSERT_TRUE( false, "Segment capacity exceeded, wrong localIdx." );
+         return -1; // to avoid compiler warning
+      }
+
+      /** \brief Same mapping as getGlobalIndexDirect but the arrays are read with getElement(). */
+      static
+      IndexType getGlobalIndex( const ConstOffsetsHolderView& rowPermArray,
+                                const ConstOffsetsHolderView& groupPointers,
+                                const IndexType segmentIdx,
+                                IndexType localIdx )
+      {
+         const IndexType strip = segmentIdx / getWarpSize();
+         const IndexType groupIdx = strip * getGroupsCount();
+         const IndexType rowStripPerm = rowPermArray.getElement( segmentIdx ) - strip * getWarpSize();
+         const IndexType groupsCount = getActiveGroupsCount( rowPermArray, segmentIdx );
+         IndexType globalIdx = groupPointers.getElement( groupIdx );
+         IndexType groupHeight = getWarpSize();
+         for( IndexType group = 0; group < groupsCount; group++ )
+         {
+            const IndexType groupSize = getGroupSize( groupPointers, strip, group );
+            if( groupSize )
+            {
+               const IndexType groupWidth = groupSize / groupHeight;
+               if( localIdx < groupWidth )
+                  return RowMajorOrder ? globalIdx + rowStripPerm * groupWidth + localIdx
+                                       : globalIdx + rowStripPerm + localIdx * groupHeight;
+               localIdx -= groupWidth;
+               globalIdx += groupSize;
+            }
+            groupHeight /= 2;
+         }
+         TNL_ASSERT_TRUE( false, "Segment capacity exceeded, wrong localIdx." );
+         return -1; // to avoid compiler warning
+      }
+
+      /** \brief Creates the segment view from the strip offset, the in-strip position and the group widths (operator[] access). */
+      static __cuda_callable__
+      SegmentViewType getSegmentViewDirect( const OffsetsHolderView& rowPermArray,
+                                            const OffsetsHolderView& groupPointers,
+                                            const IndexType segmentIdx )
+      {
+         using GroupsWidthType = typename SegmentViewType::GroupsWidthType;
+
+         const IndexType strip = segmentIdx / getWarpSize();
+         const IndexType groupIdx = strip * getGroupsCount();
+         const IndexType inStripIdx = rowPermArray[ segmentIdx ] - strip * getWarpSize();
+         const IndexType groupsCount = getActiveGroupsCountDirect( rowPermArray, segmentIdx );
+         IndexType groupHeight = getWarpSize();
+         GroupsWidthType groupsWidth( 0 );
+         TNL_ASSERT_LE( groupsCount, getGroupsCount(), "" );
+         for( IndexType i = 0; i < groupsCount; i++ )
+         {
+            const IndexType groupSize = groupPointers[ groupIdx + i + 1 ] - groupPointers[ groupIdx + i ];
+            groupsWidth[ i ] = groupSize / groupHeight;
+            groupHeight /= 2;
+         }
+         return SegmentViewType( groupPointers[ groupIdx ], inStripIdx, groupsWidth );
+      }
+
+      /** \brief Same as getSegmentViewDirect but every array element, including the strip offset
+       *  handed over to the view, is read with getElement(). */
+      static __cuda_callable__
+      SegmentViewType getSegmentView( const OffsetsHolderView& rowPermArray,
+                                      const OffsetsHolderView& groupPointers,
+                                      const IndexType segmentIdx )
+      {
+         using GroupsWidthType = typename SegmentViewType::GroupsWidthType;
+
+         const IndexType strip = segmentIdx / getWarpSize();
+         const IndexType groupIdx = strip * getGroupsCount();
+         const IndexType inStripIdx = rowPermArray.getElement( segmentIdx ) - strip * getWarpSize();
+         const IndexType groupsCount = getActiveGroupsCount( rowPermArray, segmentIdx );
+         IndexType groupHeight = getWarpSize();
+         GroupsWidthType groupsWidth( 0 );
+         for( IndexType i = 0; i < groupsCount; i++ )
+         {
+            const IndexType groupSize = groupPointers.getElement( groupIdx + i + 1 ) - groupPointers.getElement( groupIdx + i );
+            groupsWidth[ i ] = groupSize / groupHeight;
+            groupHeight /= 2;
+         }
+         return SegmentViewType( groupPointers.getElement( groupIdx ), inStripIdx, groupsWidth );
+      }
+
+      /** \brief Returns the difference of the offsets of two consecutive strips, i.e. the strip length (getElement() access). */
+      static
+      Index getStripLength( const ConstOffsetsHolderView& groupPointers, const IndexType strip )
+      {
+         TNL_ASSERT( strip >= 0, std::cerr << "strip = " << strip );
+
+         return groupPointers.getElement( ( strip + 1 ) * getGroupsCount() ) - groupPointers.getElement( strip * getGroupsCount() );
+      }
+
+      /** \brief Returns the difference of the offsets of two consecutive strips, i.e. the strip length (operator[] access). */
+      static __cuda_callable__
+      Index getStripLengthDirect( const ConstOffsetsHolderView& groupPointers, const IndexType strip )
+      {
+         TNL_ASSERT( strip >= 0, std::cerr << "strip = " << strip );
+
+         return groupPointers[ ( strip + 1 ) * getGroupsCount() ] - groupPointers[ strip * getGroupsCount() ];
+      }
+};
+
+#ifdef HAVE_CUDA
+/**
+ * \brief Selects the BiEllpack reduction routine according to the signature of \e Fetch.
+ *
+ * \tparam Index is the type of \e gridIdx, \e first and \e last passed to \e exec.
+ * \tparam Fetch is the type of the fetch lambda.
+ * \tparam BlockDim is passed on as a template argument of the reduction routine.
+ * \tparam WarpSize is not used by the dispatcher itself.
+ * \tparam HasAllParameters is the result of CheckFetchLambda for \e Index and \e Fetch.
+ * The primary template is empty, \e exec is defined only in the two specializations.
+ */
+template< typename Index,
+          typename Fetch,
+          int BlockDim = 256,
+          int WarpSize = 32,
+          bool HasAllParameters = details::CheckFetchLambda< Index, Fetch >::hasAllParameters() >
+struct BiEllpackSegmentsReductionDispatcher {};
+
+/**
+ * \brief Specialization for \e HasAllParameters equal to \e true.
+ *
+ * \e exec forwards its arguments to segmentsReductionKernelWithAllParameters of the view.
+ */
+template< typename Index, typename Fetch, int BlockDim, int WarpSize >
+struct BiEllpackSegmentsReductionDispatcher< Index, Fetch, BlockDim, WarpSize, true >
+{
+   template< typename View, typename Reduction, typename ResultKeeper, typename Real, typename... Args >
+   __device__
+   static void exec( View biEllpack, Index gridIdx, Index first, Index last,
+                     Fetch fetch, Reduction reduction, ResultKeeper keeper, Real zero, Args... args )
+   {
+      biEllpack.template segmentsReductionKernelWithAllParameters< Fetch, Reduction, ResultKeeper, Real, BlockDim, Args... >(
+         gridIdx, first, last, fetch, reduction, keeper, zero, args... );
+   }
+};
+
+/**
+ * \brief Specialization for \e HasAllParameters equal to \e false.
+ *
+ * \e exec forwards its arguments to segmentsReductionKernel of the view.
+ */
+template< typename Index, typename Fetch, int BlockDim, int WarpSize >
+struct BiEllpackSegmentsReductionDispatcher< Index, Fetch, BlockDim, WarpSize, false >
+{
+   template< typename View, typename Reduction, typename ResultKeeper, typename Real, typename... Args >
+   __device__
+   static void exec( View biEllpack, Index gridIdx, Index first, Index last,
+                     Fetch fetch, Reduction reduction, ResultKeeper keeper, Real zero, Args... args )
+   {
+      biEllpack.template segmentsReductionKernel< Fetch, Reduction, ResultKeeper, Real, BlockDim, Args... >(
+         gridIdx, first, last, fetch, reduction, keeper, zero, args... );
+   }
+};
+
+/**
+ * \brief CUDA kernel which only forwards its arguments to BiEllpackSegmentsReductionDispatcher::exec.
+ *
+ * The dispatcher is instantiated with \e Index, \e Fetch and \e BlockDim, its remaining
+ * template parameters keep their default values, i.e. the variant of the reduction routine
+ * is chosen by CheckFetchLambda from the signature of \e Fetch.
+ *
+ * All parameters are taken by value and handed over to the dispatcher in the same order;
+ * their meaning is defined by the reduction routines of \e View.
+ */
+template< typename View, typename Index, typename Fetch, typename Reduction,
+          typename ResultKeeper, typename Real, int BlockDim, typename... Args >
+__global__
+void BiEllpackSegmentsReductionKernel( View biEllpack,
+                                       Index gridIdx, Index first, Index last,
+                                       Fetch fetch, Reduction reduction, ResultKeeper keeper,
+                                       Real zero, Args... args )
+{
+   using Dispatcher = BiEllpackSegmentsReductionDispatcher< Index, Fetch, BlockDim >;
+   Dispatcher::exec( biEllpack, gridIdx, first, last, fetch, reduction, keeper, zero, args... );
+}
+#endif
+
+         } //namespace details
+      } //namespace Segments
+   } //namespace Containers
+} //namespace TNL
diff --git a/src/TNL/Containers/Segments/details/CSR.h b/src/TNL/Containers/Segments/details/CSR.h
index 38f097669150b7e3f929bdeab3beb1af03ce3e7d..637ebac362caef1eef651cf552f161b9f9aed58a 100644
--- a/src/TNL/Containers/Segments/details/CSR.h
+++ b/src/TNL/Containers/Segments/details/CSR.h
@@ -29,8 +29,11 @@ class CSR
       static void setSegmentsSizes( const SizesHolder& sizes, CSROffsets& offsets )
       {
          offsets.setSize( sizes.getSize() + 1 );
-         auto view = offsets.getView( 0, sizes.getSize() );
-         view = sizes;
+         // GOTCHA: when sizes.getSize() == 0, getView returns a full view with size == 1
+         if( sizes.getSize() > 0 ) {
+            auto view = offsets.getView( 0, sizes.getSize() );
+            view = sizes;
+         }
          offsets.setElement( sizes.getSize(), 0 );
          offsets.template scan< Algorithms::ScanType::Exclusive >();
       }
diff --git a/src/TNL/Containers/Segments/details/CheckLambdas.h b/src/TNL/Containers/Segments/details/CheckLambdas.h
new file mode 100644
index 0000000000000000000000000000000000000000..498e85a7a40028fa527d3ddfe0e99a857d453e92
--- /dev/null
+++ b/src/TNL/Containers/Segments/details/CheckLambdas.h
@@ -0,0 +1,40 @@
+/***************************************************************************
+                          CheckLambdas.h -  description
+                             -------------------
+    begin                : Apr 4, 2020
+    copyright            : (C) 2020 by Tomas Oberhuber
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+#pragma once
+#include <utility>
+
+namespace TNL {
+   namespace Containers {
+      namespace Segments {
+         namespace details {
+
+/**
+ * \brief Detects whether \e Lambda can be called as lambda( Index, Index, Index, bool& ).
+ * Only the validity of the call expression is tested (expression SFINAE), so the result
+ * does not depend on the return type of the lambda being convertible from an integer.
+ */
+template< typename Index, typename Lambda >
+class CheckFetchLambda
+{
+   private:
+      typedef char YesType[1];
+      typedef char NoType[2];
+      template< typename C > static YesType& test( decltype( (void) std::declval< C >()( Index(), Index(), Index(), std::declval< bool& >() ), 0 ) );
+      template< typename C > static NoType& test(...);
+      static constexpr bool value = ( sizeof( test< Lambda >(0) ) == sizeof( YesType ) );
+   public:
+      static constexpr bool hasAllParameters() { return value; }
+};
+
+         } // namespace details
+      } // namespace Segments
+   }  // namespace Containers
+} // namespace TNL
diff --git a/src/TNL/Containers/Segments/details/ChunkedEllpack.h b/src/TNL/Containers/Segments/details/ChunkedEllpack.h
index 95ae00c88f3e62cfb2e11a3ef8ba55a704bf44a9..14e181c7efb07d58a3fbe3986225aec3899092e2 100644
--- a/src/TNL/Containers/Segments/details/ChunkedEllpack.h
+++ b/src/TNL/Containers/Segments/details/ChunkedEllpack.h
@@ -13,6 +13,7 @@
 #include <type_traits>
 #include <TNL/Containers/Vector.h>
 #include <TNL/Containers/Segments/ChunkedEllpackSegmentView.h>
+#include <TNL/Containers/Segments/details/CheckLambdas.h>
 
 namespace TNL {
    namespace Containers {
@@ -223,6 +224,81 @@ class ChunkedEllpack
                                     chunksInSlice );
       }
 };
+
+#ifdef HAVE_CUDA
+/**
+ * \brief Selects the ChunkedEllpack reduction routine according to the signature of \e Fetch.
+ *
+ * \tparam Index is the type of \e gridIdx, \e first and \e last passed to \e exec.
+ * \tparam Fetch is the type of the fetch lambda.
+ * \tparam HasAllParameters is the result of CheckFetchLambda for \e Index and \e Fetch.
+ *
+ * The primary template is empty, \e exec is defined only in the specializations
+ * for \e HasAllParameters equal to \e true and \e false.
+ */
+template< typename Index,
+          typename Fetch,
+          bool HasAllParameters = details::CheckFetchLambda< Index, Fetch >::hasAllParameters() >
+struct ChunkedEllpackSegmentsReductionDispatcher {};
+
+/**
+ * \brief Specialization for \e HasAllParameters equal to \e true.
+ *
+ * \e exec forwards its arguments to segmentsReductionKernelWithAllParameters of the view.
+ */
+template< typename Index, typename Fetch >
+struct ChunkedEllpackSegmentsReductionDispatcher< Index, Fetch, true >
+{
+   template< typename View, typename Reduction, typename ResultKeeper, typename Real, typename... Args >
+   __device__
+   static void exec( View chunkedEllpack, Index gridIdx, Index first, Index last,
+                     Fetch fetch, Reduction reduction, ResultKeeper keeper, Real zero, Args... args )
+   {
+      chunkedEllpack.segmentsReductionKernelWithAllParameters(
+         gridIdx, first, last, fetch, reduction, keeper, zero, args... );
+   }
+};
+
+/**
+ * \brief Specialization for \e HasAllParameters equal to \e false.
+ *
+ * \e exec forwards its arguments to segmentsReductionKernel of the view.
+ */
+template< typename Index, typename Fetch >
+struct ChunkedEllpackSegmentsReductionDispatcher< Index, Fetch, false >
+{
+   template< typename View, typename Reduction, typename ResultKeeper, typename Real, typename... Args >
+   __device__
+   static void exec( View chunkedEllpack, Index gridIdx, Index first, Index last,
+                     Fetch fetch, Reduction reduction, ResultKeeper keeper, Real zero, Args... args )
+   {
+      chunkedEllpack.segmentsReductionKernel(
+         gridIdx, first, last, fetch, reduction, keeper, zero, args... );
+   }
+};
+
+/**
+ * \brief CUDA kernel which only forwards its arguments to ChunkedEllpackSegmentsReductionDispatcher::exec.
+ *
+ * The dispatcher is instantiated with \e Index and \e Fetch only, so the variant of the
+ * reduction routine is chosen by CheckFetchLambda from the signature of \e Fetch.
+ *
+ * All parameters are taken by value and handed over to the dispatcher in the same order;
+ * their meaning is defined by the reduction routines of \e View.
+ */
+template< typename View, typename Index, typename Fetch, typename Reduction,
+          typename ResultKeeper, typename Real, typename... Args >
+__global__
+void ChunkedEllpackSegmentsReductionKernel( View chunkedEllpack,
+                                            Index gridIdx, Index first, Index last,
+                                            Fetch fetch, Reduction reduction, ResultKeeper keeper,
+                                            Real zero, Args... args )
+{
+   using Dispatcher = ChunkedEllpackSegmentsReductionDispatcher< Index, Fetch >;
+   Dispatcher::exec( chunkedEllpack, gridIdx, first, last, fetch, reduction, keeper, zero, args... );
+}
+#endif
+
          } //namespace details
       } //namespace Segments
    } //namespace Containers
diff --git a/src/TNL/Containers/Segments/details/LambdaAdapter.h b/src/TNL/Containers/Segments/details/LambdaAdapter.h
new file mode 100644
index 0000000000000000000000000000000000000000..a87915cedd4ea967c6780d600cca1f02414e8986
--- /dev/null
+++ b/src/TNL/Containers/Segments/details/LambdaAdapter.h
@@ -0,0 +1,56 @@
+/***************************************************************************
+                          LambdaAdapter.h -  description
+                             -------------------
+    begin                : Apr 4, 2020
+    copyright            : (C) 2020 by Tomas Oberhuber
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+#pragma once
+
+#include "CheckLambdas.h"
+
+namespace TNL {
+   namespace Containers {
+      namespace Segments {
+         namespace details {
+
+/**
+ * \brief Calls a fetch lambda with the argument list it accepts; \e AllParameters is deduced by
+ * CheckFetchLambda. The primary template is empty, \e call is defined in the specializations.
+ */
+template< typename Index, typename Lambda, bool AllParameters = CheckFetchLambda< Index, Lambda >::hasAllParameters() >
+struct FetchLambdaAdapter {};
+
+/** \brief Adapter for lambdas called as f( segmentIdx, localIdx, globalIdx, compute ). */
+template< typename Index, typename Lambda >
+struct FetchLambdaAdapter< Index, Lambda, true >
+{
+   typedef decltype( std::declval< Lambda >()( Index(), Index(), Index(), std::declval< bool& >() ) ) ReturnType;
+
+   __cuda_callable__
+   static ReturnType call( Lambda& f, Index segmentIdx, Index localIdx, Index globalIdx, bool& compute )
+   {
+      return f( segmentIdx, localIdx, globalIdx, compute );
+   }
+};
+
+/** \brief Adapter for lambdas called as f( globalIdx, compute ); segmentIdx and localIdx are not passed on. */
+template< typename Index, typename Lambda >
+struct FetchLambdaAdapter< Index, Lambda, false >
+{
+   typedef decltype( std::declval< Lambda >()( Index(), std::declval< bool& >() ) ) ReturnType;
+
+   __cuda_callable__
+   static ReturnType call( Lambda& f, Index segmentIdx, Index localIdx, Index globalIdx, bool& compute )
+   {
+      return f( globalIdx, compute );
+   }
+};
+
+         } // namespace details
+      } // namespace Segments
+   }  // namespace Containers
+} // namespace TNL
diff --git a/src/TNL/File.hpp b/src/TNL/File.hpp
index af112e992a7640070ab880192688b3a0aac8f1d2..289dc92c886c46fe9e9f0de0a834eeddc88f2891 100644
--- a/src/TNL/File.hpp
+++ b/src/TNL/File.hpp
@@ -176,7 +176,7 @@ template< typename Type,
           typename Allocator >
 void File::save( const Type* buffer, std::streamsize elements )
 {
-   static_assert( std::is_same< Type, typename Allocator::value_type >::value,
+   static_assert( std::is_same< std::remove_cv_t< Type >, std::remove_cv_t< typename Allocator::value_type > >::value,
                   "Allocator::value_type must be the same as Type." );
    TNL_ASSERT_GE( elements, 0, "Number of elements to save must be non-negative." );
 
diff --git a/src/TNL/Matrices/Dense.h b/src/TNL/Matrices/Dense.h
deleted file mode 100644
index 6a4795a7e748a26f32536f912f03886e3305bd9a..0000000000000000000000000000000000000000
--- a/src/TNL/Matrices/Dense.h
+++ /dev/null
@@ -1,240 +0,0 @@
-/***************************************************************************
-                          Dense.h  -  description
-                             -------------------
-    begin                : Nov 29, 2013
-    copyright            : (C) 2013 by Tomas Oberhuber
-    email                : tomas.oberhuber@fjfi.cvut.cz
- ***************************************************************************/
-
-/* See Copyright Notice in tnl/Copyright */
-
-#pragma once
-
-#include <TNL/Allocators/Default.h>
-#include <TNL/Devices/Host.h>
-#include <TNL/Matrices/DenseMatrixRowView.h>
-#include <TNL/Matrices/Matrix.h>
-#include <TNL/Matrices/DenseMatrixView.h>
-#include <TNL/Containers/Segments/Ellpack.h>
-
-namespace TNL {
-namespace Matrices {
-
-template< typename Device >
-class DenseDeviceDependentCode;
-
-template< typename Real = double,
-          typename Device = Devices::Host,
-          typename Index = int,
-          bool RowMajorOrder = std::is_same< Device, Devices::Host >::value,
-          typename RealAllocator = typename Allocators::Default< Device >::template Allocator< Real > >
-class Dense : public Matrix< Real, Device, Index >
-{
-   public:
-      using RealType = Real;
-      using DeviceType = Device;
-      using IndexType = Index;
-      using RealAllocatorType = RealAllocator;
-      using BaseType = Matrix< Real, Device, Index, RealAllocator >;
-      using ValuesVectorType = typename BaseType::ValuesVectorType;
-      using ValuesViewType = typename ValuesVectorType::ViewType;
-      using SegmentsType = Containers::Segments::Ellpack< DeviceType, IndexType, typename Allocators::Default< Device >::template Allocator< IndexType >, RowMajorOrder, 1 >;
-      using SegmentViewType = typename SegmentsType::SegmentViewType;
-      using ViewType = DenseMatrixView< Real, Device, Index, RowMajorOrder >;
-      using ConstViewType = DenseMatrixView< typename std::add_const< Real >::type, Device, Index, RowMajorOrder >;
-      using RowView = DenseMatrixRowView< SegmentViewType, ValuesViewType >;
-
-      // TODO: remove this
-      using CompressedRowLengthsVector = typename Matrix< Real, Device, Index >::CompressedRowLengthsVector;
-      using ConstCompressedRowLengthsVectorView = typename Matrix< RealType, DeviceType, IndexType >::ConstCompressedRowLengthsVectorView;
-
-      template< typename _Real = Real,
-                typename _Device = Device,
-                typename _Index = Index >
-      using Self = Dense< _Real, _Device, _Index >;
-
-      Dense();
-
-      Dense( const IndexType rows, const IndexType columns );
-
-      Dense( std::initializer_list< std::initializer_list< RealType > > data );
-
-      ViewType getView();
-
-      ConstViewType getConstView() const;
-
-      static String getSerializationType();
-
-      virtual String getSerializationTypeVirtual() const;
-
-      void setDimensions( const IndexType rows,
-                          const IndexType columns );
-
-      template< typename Matrix >
-      void setLike( const Matrix& matrix );
-
-      /**
-       * \brief This method creates dense matrix from 2D initializer list.
-       * 
-       * The matrix dimensions will be adjusted by the input data.
-       * 
-       * @param data
-       */
-      void setElements( std::initializer_list< std::initializer_list< RealType > > data );
-      
-      /**
-       * This method is only for the compatibility with the sparse matrices.
-       */
-      void setCompressedRowLengths( ConstCompressedRowLengthsVectorView rowLengths );
-
-      template< typename Vector >
-      void getCompressedRowLengths( Vector& rowLengths ) const;
-
-      [[deprecated]]
-      IndexType getRowLength( const IndexType row ) const;
-
-      IndexType getMaxRowLength() const;
-
-      IndexType getNumberOfMatrixElements() const;
-
-      IndexType getNumberOfNonzeroMatrixElements() const;
-
-      void reset();
-
-      __cuda_callable__
-      const RowView getRow( const IndexType& rowIdx ) const;
-
-      __cuda_callable__
-      RowView getRow( const IndexType& rowIdx );
-
-
-      void setValue( const RealType& v );
-
-      __cuda_callable__
-      Real& operator()( const IndexType row,
-                        const IndexType column );
-
-      __cuda_callable__
-      const Real& operator()( const IndexType row,
-                              const IndexType column ) const;
-
-      void setElement( const IndexType row,
-                       const IndexType column,
-                       const RealType& value );
-
-      void addElement( const IndexType row,
-                       const IndexType column,
-                       const RealType& value,
-                       const RealType& thisElementMultiplicator = 1.0 );
-
-      Real getElement( const IndexType row,
-                       const IndexType column ) const;
-
-      template< typename Fetch, typename Reduce, typename Keep, typename FetchReal >
-      void rowsReduction( IndexType first, IndexType last, Fetch& fetch, Reduce& reduce, Keep& keep, const FetchReal& zero ) const;
-
-      template< typename Fetch, typename Reduce, typename Keep, typename FetchReal >
-      void allRowsReduction( Fetch& fetch, Reduce& reduce, Keep& keep, const FetchReal& zero ) const;
-
-      template< typename Function >
-      void forRows( IndexType first, IndexType last, Function& function ) const;
-
-      template< typename Function >
-      void forRows( IndexType first, IndexType last, Function& function );
-
-      template< typename Function >
-      void forAllRows( Function& function ) const;
-
-      template< typename Function >
-      void forAllRows( Function& function );
-
-      template< typename Vector >
-      __cuda_callable__
-      typename Vector::RealType rowVectorProduct( const IndexType row,
-                                                  const Vector& vector ) const;
-
-      template< typename InVector, typename OutVector >
-      void vectorProduct( const InVector& inVector,
-                          OutVector& outVector ) const;
-
-      template< typename Matrix >
-      void addMatrix( const Matrix& matrix,
-                      const RealType& matrixMultiplicator = 1.0,
-                      const RealType& thisMatrixMultiplicator = 1.0 );
-
-      template< typename Matrix1, typename Matrix2, int tileDim = 32 >
-      void getMatrixProduct( const Matrix1& matrix1,
-                          const Matrix2& matrix2,
-                          const RealType& matrix1Multiplicator = 1.0,
-                          const RealType& matrix2Multiplicator = 1.0 );
-
-      template< typename Matrix, int tileDim = 32 >
-      void getTransposition( const Matrix& matrix,
-                             const RealType& matrixMultiplicator = 1.0 );
-
-      template< typename Vector1, typename Vector2 >
-      void performSORIteration( const Vector1& b,
-                                const IndexType row,
-                                Vector2& x,
-                                const RealType& omega = 1.0 ) const;
-
-      /**
-       * \brief Assignment operator for exactly the same type of the dense matrix.
-       * 
-       * @param matrix
-       * @return 
-       */
-      Dense& operator=( const Dense& matrix );
-
-      /**
-       * \brief Assignment operator for other dense matrices.
-       * 
-       * @param matrix
-       * @return 
-       */
-      template< typename RHSReal, typename RHSDevice, typename RHSIndex,
-                 bool RHSRowMajorOrder, typename RHSRealAllocator >
-      Dense& operator=( const Dense< RHSReal, RHSDevice, RHSIndex, RHSRowMajorOrder, RHSRealAllocator >& matrix );
-
-      /**
-       * \brief Assignment operator for other (sparse) types of matrices.
-       * @param matrix
-       * @return 
-       */
-      template< typename RHSMatrix >
-      Dense& operator=( const RHSMatrix& matrix );
-
-      template< typename Real_, typename Device_, typename Index_, typename RealAllocator_ >
-      bool operator==( const Dense< Real_, Device_, Index_, RowMajorOrder >& matrix ) const;
-
-      template< typename Real_, typename Device_, typename Index_, typename RealAllocator_ >
-      bool operator!=( const Dense< Real_, Device_, Index_, RowMajorOrder >& matrix ) const;
-
-      void save( const String& fileName ) const;
-
-      void load( const String& fileName );
-
-      void save( File& file ) const;
-
-      void load( File& file );
-
-      void print( std::ostream& str ) const;
-
-   protected:
-
-      __cuda_callable__
-      IndexType getElementIndex( const IndexType row,
-                                 const IndexType column ) const;
-
-      typedef DenseDeviceDependentCode< DeviceType > DeviceDependentCode;
-      friend class DenseDeviceDependentCode< DeviceType >;
-
-      SegmentsType segments;
-
-      ViewType view;
-};
-
-} // namespace Matrices
-} // namespace TNL
-
-#include <TNL/Matrices/Dense.hpp>
diff --git a/src/TNL/Matrices/DenseMatrix.h b/src/TNL/Matrices/DenseMatrix.h
new file mode 100644
index 0000000000000000000000000000000000000000..aea7a33d6e224877a2bb17e4c8858ecc5b7ecaad
--- /dev/null
+++ b/src/TNL/Matrices/DenseMatrix.h
@@ -0,0 +1,677 @@
+/***************************************************************************
+                          DenseMatrix.h  -  description
+                             -------------------
+    begin                : Nov 29, 2013
+    copyright            : (C) 2013 by Tomas Oberhuber
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+#pragma once
+
+#include <TNL/Allocators/Default.h>
+#include <TNL/Devices/Host.h>
+#include <TNL/Matrices/DenseMatrixRowView.h>
+#include <TNL/Matrices/Matrix.h>
+#include <TNL/Matrices/DenseMatrixView.h>
+#include <TNL/Containers/Segments/Ellpack.h>
+
+namespace TNL {
+namespace Matrices {
+
+/**
+ * \brief Implementation of dense matrix, i.e. matrix storing explicitly all of its elements including zeros.
+ *
+ * \tparam Real is a type of matrix elements.
+ * \tparam Device is a device where the matrix is allocated.
+ * \tparam Index is a type for indexing of the matrix elements.
+ * \tparam RowMajorOrder tells the ordering of matrix elements. If it is \e true the matrix elements
+ *         are stored in row major order. If it is \e false, the matrix elements are stored in column major order. The default is \e true only when \e Device is Devices::Host.
+ * \tparam RealAllocator is allocator for the matrix elements.
+ */
+template< typename Real = double,
+          typename Device = Devices::Host,
+          typename Index = int,
+          bool RowMajorOrder = std::is_same< Device, Devices::Host >::value,
+          typename RealAllocator = typename Allocators::Default< Device >::template Allocator< Real > >
+class DenseMatrix : public Matrix< Real, Device, Index >
+{
+   protected:
+      using BaseType = Matrix< Real, Device, Index, RealAllocator >; // NOTE(review): the class derives from Matrix< Real, Device, Index > (no RealAllocator) - confirm this alias names the actual base for non-default allocators.
+      using ValuesVectorType = typename BaseType::ValuesVectorType;
+      using ValuesViewType = typename ValuesVectorType::ViewType;
+      using SegmentsType = Containers::Segments::Ellpack< Device, Index, typename Allocators::Default< Device >::template Allocator< Index >, RowMajorOrder, 1 >;
+      using SegmentViewType = typename SegmentsType::SegmentViewType;
+
+
+   public:
+
+      /**
+       * \brief The type of matrix elements.
+       */
+      using RealType = Real;
+
+      /**
+       * \brief The device where the matrix is allocated.
+       */
+      using DeviceType = Device;
+
+      /**
+       * \brief The type used for matrix elements indexing.
+       */
+      using IndexType = Index;
+
+      /**
+       * \brief The allocator for matrix elements.
+       */
+      using RealAllocatorType = RealAllocator;
+
+      /**
+       * \brief Type of related matrix view.
+       *
+       * See \ref DenseMatrixView.
+       */
+      using ViewType = DenseMatrixView< Real, Device, Index, RowMajorOrder >;
+
+      /**
+       * \brief Matrix view type for constant instances.
+       *
+       * See \ref DenseMatrixView.
+       */
+      using ConstViewType = DenseMatrixView< typename std::add_const< Real >::type, Device, Index, RowMajorOrder >;
+
+      /**
+       * \brief Type for accessing matrix row.
+       */
+      using RowView = DenseMatrixRowView< SegmentViewType, ValuesViewType >;
+
+      /**
+       * \brief Helper type for getting self type or its modifications.
+       */
+      template< typename _Real = Real,
+                typename _Device = Device,
+                typename _Index = Index,
+                bool _RowMajorOrder = RowMajorOrder,
+                typename _RealAllocator = RealAllocator >
+      using Self = DenseMatrix< _Real, _Device, _Index, _RowMajorOrder, _RealAllocator >;
+
+      /**
+       * \brief Constructor without parameters.
+       */
+      DenseMatrix();
+
+      /**
+       * \brief Constructor with matrix dimensions.
+       *
+       * \param rows is number of matrix rows.
+       * \param columns is number of matrix columns.
+       */
+      DenseMatrix( const IndexType rows, const IndexType columns );
+
+      /**
+       * \brief Constructor with 2D initializer list.
+       *
+       * The number of matrix rows is set to the outer list size and the number
+       * of matrix columns is set to maximum size of inner lists. Missing elements
+       * are filled in with zeros.
+       *
+       * \param data is an initializer list of initializer lists representing
+       * list of matrix rows.
+       *
+       * \par Example
+       * \include Matrices/DenseMatrixExample_Constructor_init_list.cpp
+       * \par Output
+       * \include DenseMatrixExample_Constructor_init_list.out
+       */
+      template< typename Value >
+      DenseMatrix( std::initializer_list< std::initializer_list< Value > > data );
+
+      /**
+       * \brief Returns a modifiable view of the dense matrix.
+       *
+       * See \ref DenseMatrixView.
+       *
+       * \return dense matrix view.
+       */
+      ViewType getView();
+
+      /**
+       * \brief Returns a non-modifiable view of the dense matrix.
+       *
+       * See \ref DenseMatrixView.
+       *
+       * \return dense matrix view.
+       */
+      ConstViewType getConstView() const;
+
+      /**
+       * \brief Returns string with serialization type.
+       *
+       * The string has a form \e `Matrices::DenseMatrix< RealType,  [any_device], IndexType, [any_allocator], true/false >`.
+       *
+       * \return \e String with the serialization type.
+       */
+      static String getSerializationType();
+
+      /**
+       * \brief Returns string with serialization type.
+       *
+       * See \ref DenseMatrix::getSerializationType.
+       *
+       * \return \e String with the serialization type.
+       */
+      virtual String getSerializationTypeVirtual() const;
+
+      /**
+       * \brief Set number of rows and columns of this matrix.
+       *
+       * \param rows is the number of matrix rows.
+       * \param columns is the number of matrix columns.
+       */
+      void setDimensions( const IndexType rows,
+                          const IndexType columns );
+
+      /**
+       * \brief Set the number of matrix rows and columns by the given matrix.
+       *
+       * \tparam Matrix is matrix type. This can be any matrix having methods
+       *  \ref getRows and \ref getColumns.
+       *
+       * \param matrix is the input matrix whose dimensions are to be adopted.
+       */
+      template< typename Matrix >
+      void setLike( const Matrix& matrix );
+
+      /**
+       * \brief This method recreates the dense matrix from 2D initializer list.
+       *
+       * The number of matrix rows is set to the outer list size and the number
+       * of matrix columns is set to maximum size of inner lists. Missing elements
+       * are filled in with zeros.
+       *
+       * \param data is an initializer list of initializer lists representing
+       * list of matrix rows.
+       *
+       * \par Example
+       * \include Matrices/DenseMatrixExample_setElements.cpp
+       * \par Output
+       * \include DenseMatrixExample_setElements.out
+       */
+      template< typename Value >
+      void setElements( std::initializer_list< std::initializer_list< Value > > data );
+
+      /**
+       * \brief This method is only for the compatibility with the sparse matrices.
+       *
+       * This method does nothing. In debug mode it contains assertions checking
+       * that given rowCapacities are compatible with the current matrix dimensions.
+       */
+      template< typename RowCapacitiesVector >
+      void setRowCapacities( const RowCapacitiesVector& rowCapacities );
+
+      /**
+       * \brief Computes number of non-zeros in each row.
+       *
+       * \param rowLengths is a vector into which the number of non-zeros in each row
+       * will be stored.
+       *
+       * \par Example
+       * \include Matrices/DenseMatrixExample_getCompressedRowLengths.cpp
+       * \par Output
+       * \include DenseMatrixExample_getCompressedRowLengths.out
+       */
+      template< typename RowLengthsVector >
+      void getCompressedRowLengths( RowLengthsVector& rowLengths ) const;
+
+      /**
+       * \brief Returns number of all matrix elements.
+       *
+       * This method is here mainly for compatibility with sparse matrices since
+       * the number of all matrix elements is just number of rows times number of
+       * columns.
+       *
+       * \return number of all matrix elements.
+       *
+       * \par Example
+       * \include Matrices/DenseMatrixExample_getElementsCount.cpp
+       * \par Output
+       * \include DenseMatrixExample_getElementsCount.out
+       */
+      IndexType getElementsCount() const;
+
+      /**
+       * \brief Returns number of non-zero matrix elements.
+       *
+       * \return number of all non-zero matrix elements.
+       *
+       * \par Example
+       * \include Matrices/DenseMatrixExample_getElementsCount.cpp
+       * \par Output
+       * \include DenseMatrixExample_getElementsCount.out
+       */
+      IndexType getNonzeroElementsCount() const;
+
+      /**
+       * \brief Resets the matrix to zero dimensions.
+       */
+      void reset();
+
+      /**
+       * \brief Constant getter of simple structure for accessing given matrix row.
+       *
+       * \param rowIdx is matrix row index.
+       *
+       * \return RowView for accessing given matrix row.
+       *
+       * \par Example
+       * \include Matrices/DenseMatrixExample_getConstRow.cpp
+       * \par Output
+       * \include DenseMatrixExample_getConstRow.out
+       *
+       * See \ref DenseMatrixRowView.
+       */
+      __cuda_callable__
+      const RowView getRow( const IndexType& rowIdx ) const;
+
+      /**
+       * \brief Non-constant getter of simple structure for accessing given matrix row.
+       *
+       * \param rowIdx is matrix row index.
+       *
+       * \return RowView for accessing given matrix row.
+       *
+       * \par Example
+       * \include Matrices/DenseMatrixExample_getRow.cpp
+       * \par Output
+       * \include DenseMatrixExample_getRow.out
+       *
+       * See \ref DenseMatrixRowView.
+       */
+      __cuda_callable__
+      RowView getRow( const IndexType& rowIdx );
+
+      /**
+       * \brief Sets all matrix elements to value \e v.
+       *
+       * \param v is value all matrix elements will be set to.
+       */
+      void setValue( const RealType& v );
+
+      /**
+       * \brief Returns non-constant reference to element at row \e row and column \e column.
+       *
+       * Since this method returns reference to the element, it cannot be called across
+       * different address spaces. It means that it can be called only from CPU if the matrix
+       * is allocated on CPU or only from GPU kernels if the matrix is allocated on GPU.
+       *
+       * \param row is a row index of the element.
+       * \param column is a column index of the element.
+       * \return reference to given matrix element.
+       */
+      __cuda_callable__
+      Real& operator()( const IndexType row,
+                        const IndexType column );
+
+      /**
+       * \brief Returns constant reference to element at row \e row and column \e column.
+       *
+       * Since this method returns reference to the element, it cannot be called across
+       * different address spaces. It means that it can be called only from CPU if the matrix
+       * is allocated on CPU or only from GPU kernels if the matrix is allocated on GPU.
+       *
+       * \param row is a row index of the element.
+       * \param column is a column index of the element.
+       * \return constant reference to given matrix element.
+       */
+      __cuda_callable__
+      const Real& operator()( const IndexType row,
+                              const IndexType column ) const;
+
+      /**
+       * \brief Sets element at given \e row and \e column to given \e value.
+       *
+       * This method can be called only from the host system (CPU) no matter
+       * where the matrix is allocated. If the matrix is allocated in GPU device
+       * this method transfers values of each matrix element separately and so the
+       * performance is very low. For higher performance see \ref DenseMatrix::getRow
+       * or \ref DenseMatrix::forRows and \ref DenseMatrix::forAllRows.
+       *
+       * \param row is row index of the element.
+       * \param column is column index of the element.
+       * \param value is the value the element will be set to.
+       *
+       * \par Example
+       * \include Matrices/DenseMatrixExample_setElement.cpp
+       * \par Output
+       * \include DenseMatrixExample_setElement.out
+       */
+      void setElement( const IndexType row,
+                       const IndexType column,
+                       const RealType& value );
+
+      /**
+       * \brief Adds \e value to the element at given \e row and \e column.
+       *
+       * This method can be called only from the host system (CPU) no matter
+       * where the matrix is allocated. If the matrix is allocated in GPU device
+       * this method transfers values of each matrix element separately and so the
+       * performance is very low. For higher performance see \ref DenseMatrix::getRow
+       * or \ref DenseMatrix::forRows and \ref DenseMatrix::forAllRows.
+       *
+       * \param row is row index of the element.
+       * \param column is column index of the element.
+       * \param value is the value to be added to the element.
+       * \param thisElementMultiplicator is multiplicator the original matrix element
+       *   value is multiplied by before addition of given \e value.
+       */
+      void addElement( const IndexType row,
+                       const IndexType column,
+                       const RealType& value,
+                       const RealType& thisElementMultiplicator = 1.0 );
+
+      /**
+       * \brief Returns value of matrix element at position given by its row and column index.
+       *
+       * This method can be called only from the host system (CPU) no matter
+       * where the matrix is allocated. If the matrix is allocated in GPU device
+       * this method transfers values of each matrix element separately and so the
+       * performance is very low. For higher performance see \ref DenseMatrix::getRow
+       * or \ref DenseMatrix::forRows and \ref DenseMatrix::forAllRows.
+       *
+       * \param row is a row index of the matrix element.
+       * \param column is a column index of the matrix element.
+       *
+       * \return value of given matrix element.
+       */
+      Real getElement( const IndexType row,
+                       const IndexType column ) const;
+
+      /**
+       * \brief Method for performing general reduction on matrix rows.
+       *
+       * \tparam Fetch is a type of lambda function for data fetch declared as
+       *          `fetch( IndexType rowIdx, IndexType columnIdx, RealType elementValue ) -> FetchValue`.
+       *          The return type of this lambda can be any non void.
+       * \tparam Reduce is a type of lambda function for reduction declared as
+       *          `reduce( const FetchValue& v1, const FetchValue& v2 ) -> FetchValue`.
+       * \tparam Keep is a type of lambda function for storing results of reduction in each row.
+       *          It is declared as `keep( const IndexType rowIdx, const double& value )`.
+       * \tparam FetchValue is type returned by the Fetch lambda function.
+       *
+       * \param first is an index of the first row the reduction will be performed on.
+       * \param last is an index of the row after the last row the reduction will be performed on.
+       * \param fetch is an instance of lambda function for data fetch.
+       * \param reduce is an instance of lambda function for reduction.
+       * \param keep is an instance of lambda function for storing results.
+       * \param zero is the identity (neutral) element of the given reduction operation.
+       *
+       * \par Example
+       * \include Matrices/DenseMatrixExample_rowsReduction.cpp
+       * \par Output
+       * \include DenseMatrixExample_rowsReduction.out
+       */
+      template< typename Fetch, typename Reduce, typename Keep, typename FetchValue >
+      void rowsReduction( IndexType first, IndexType last, Fetch& fetch, const Reduce& reduce, Keep& keep, const FetchValue& zero ) const;
+
+      /**
+       * \brief Method for performing general reduction on ALL matrix rows.
+       *
+       * \tparam Fetch is a type of lambda function for data fetch declared as
+       *          `fetch( IndexType rowIdx, IndexType columnIdx, RealType elementValue ) -> FetchValue`.
+       *          The return type of this lambda can be any non void.
+       * \tparam Reduce is a type of lambda function for reduction declared as
+       *          `reduce( const FetchValue& v1, const FetchValue& v2 ) -> FetchValue`.
+       * \tparam Keep is a type of lambda function for storing results of reduction in each row.
+       *          It is declared as `keep( const IndexType rowIdx, const double& value )`.
+       * \tparam FetchReal is type returned by the Fetch lambda function (written as FetchValue in the lambda signatures above).
+       *
+       * \param fetch is an instance of lambda function for data fetch.
+       * \param reduce is an instance of lambda function for reduction.
+       * \param keep is an instance of lambda function for storing results.
+       * \param zero is the identity (neutral) element of the given reduction operation.
+       *
+       * \par Example
+       * \include Matrices/DenseMatrixExample_allRowsReduction.cpp
+       * \par Output
+       * \include DenseMatrixExample_allRowsReduction.out
+       */
+      template< typename Fetch, typename Reduce, typename Keep, typename FetchReal >
+      void allRowsReduction( Fetch& fetch, const Reduce& reduce, Keep& keep, const FetchReal& zero ) const;
+
+      /**
+       * \brief Method for iteration over matrix rows from \e first up to (excluding) \e last for constant instances.
+       *
+       * \tparam Function is type of lambda function that will operate on matrix elements.
+       *    It should have a form like
+       *  `function( IndexType rowIdx, IndexType columnIdx, IndexType columnIdx, const RealType& value, bool& compute )`.
+       *  The column index repeats twice only for compatibility with sparse matrices.
+       *  If the 'compute' variable is set to false the iteration over the row can
+       *  be interrupted.
+       *
+       * \param first is index of the first row to be processed.
+       * \param last is index of the row after the last row to be processed.
+       * \param function is an instance of the lambda function to be called in each row.
+       *
+       * \par Example
+       * \include Matrices/DenseMatrixExample_forRows.cpp
+       * \par Output
+       * \include DenseMatrixExample_forRows.out
+       */
+      template< typename Function >
+      void forRows( IndexType first, IndexType last, Function& function ) const;
+
+      /**
+       * \brief Method for iteration over matrix rows from \e first up to (excluding) \e last for non-constant instances.
+       *
+       * \tparam Function is type of lambda function that will operate on matrix elements.
+       *    It should have a form like
+       *  `function( IndexType rowIdx, IndexType columnIdx, IndexType columnIdx, RealType& value, bool& compute )`.
+       *  The column index repeats twice only for compatibility with sparse matrices.
+       *  If the 'compute' variable is set to false the iteration over the row can
+       *  be interrupted.
+       *
+       * \param first is index of the first row to be processed.
+       * \param last is index of the row after the last row to be processed.
+       * \param function is an instance of the lambda function to be called in each row.
+       *
+       * \par Example
+       * \include Matrices/DenseMatrixExample_forRows.cpp
+       * \par Output
+       * \include DenseMatrixExample_forRows.out
+       */
+      template< typename Function >
+      void forRows( IndexType first, IndexType last, Function& function );
+
+      /**
+       * \brief This method calls \e forRows for all matrix rows (constant instances).
+       *
+       * See \ref DenseMatrix::forRows.
+       *
+       * \tparam Function is a type of lambda function that will operate on matrix elements.
+       * \param function is an instance of the lambda function to be called in each row.
+       *
+       * \par Example
+       * \include Matrices/DenseMatrixExample_forAllRows.cpp
+       * \par Output
+       * \include DenseMatrixExample_forAllRows.out
+       */
+      template< typename Function >
+      void forAllRows( Function& function ) const;
+
+      /**
+       * \brief This method calls \e forRows for all matrix rows (non-constant instances).
+       *
+       * See \ref DenseMatrix::forRows.
+       *
+       * \tparam Function is a type of lambda function that will operate on matrix elements.
+       * \param function is an instance of the lambda function to be called in each row.
+       *
+       * \par Example
+       * \include Matrices/DenseMatrixExample_forAllRows.cpp
+       * \par Output
+       * \include DenseMatrixExample_forAllRows.out
+       */
+      template< typename Function >
+      void forAllRows( Function& function );
+
+      /**
+       * \brief This method computes scalar product of given vector and one
+       *  row of the matrix.
+       *
+       * \tparam Vector is type of input vector. It can be \ref Vector,
+       *     \ref VectorView, \ref Array, \ref ArrayView or similar container.
+       * \param row is index of the row used for the scalar product.
+       * \param vector is the input vector.
+       * \return result of the matrix row and vector product.
+       */
+      template< typename Vector >
+      __cuda_callable__
+      typename Vector::RealType rowVectorProduct( const IndexType row,
+                                                  const Vector& vector ) const;
+
+      /**
+       * \brief Computes product of matrix and vector.
+       *
+       * \tparam InVector is type of input vector. It can be \ref Vector,
+       *     \ref VectorView, \ref Array, \ref ArrayView or similar container.
+       * \tparam OutVector is type of output vector. It can be \ref Vector,
+       *     \ref VectorView, \ref Array, \ref ArrayView or similar container.
+       *
+       * \param inVector is input vector.
+       * \param outVector is output vector.
+       */
+      template< typename InVector, typename OutVector >
+      void vectorProduct( const InVector& inVector,
+                          OutVector& outVector ) const;
+
+      template< typename Matrix >
+      void addMatrix( const Matrix& matrix,
+                      const RealType& matrixMultiplicator = 1.0,
+                      const RealType& thisMatrixMultiplicator = 1.0 );
+
+      template< typename Matrix1, typename Matrix2, int tileDim = 32 >
+      void getMatrixProduct( const Matrix1& matrix1,
+                          const Matrix2& matrix2,
+                          const RealType& matrix1Multiplicator = 1.0,
+                          const RealType& matrix2Multiplicator = 1.0 );
+
+      template< typename Matrix, int tileDim = 32 >
+      void getTransposition( const Matrix& matrix,
+                             const RealType& matrixMultiplicator = 1.0 );
+
+      template< typename Vector1, typename Vector2 >
+      void performSORIteration( const Vector1& b,
+                                const IndexType row,
+                                Vector2& x,
+                                const RealType& omega = 1.0 ) const;
+
+      /**
+       * \brief Assignment operator for exactly the same type of the dense matrix.
+       *
+       * \param matrix is the right-hand side matrix.
+       * \return reference to this matrix.
+       */
+      DenseMatrix& operator=( const DenseMatrix& matrix );
+
+      /**
+       * \brief Assignment operator for other dense matrices.
+       *
+       * \param matrix is the right-hand side matrix.
+       * \return reference to this matrix.
+       */
+      template< typename RHSReal, typename RHSDevice, typename RHSIndex,
+                 bool RHSRowMajorOrder, typename RHSRealAllocator >
+      DenseMatrix& operator=( const DenseMatrix< RHSReal, RHSDevice, RHSIndex, RHSRowMajorOrder, RHSRealAllocator >& matrix );
+
+      /**
+       * \brief Assignment operator for other (sparse) types of matrices.
+       *
+       * \param matrix is the right-hand side matrix.
+       * \return reference to this matrix.
+       */
+      template< typename RHSMatrix >
+      DenseMatrix& operator=( const RHSMatrix& matrix );
+
+      /**
+       * \brief Comparison operator with another dense matrix.
+       *
+       * \param matrix is the right-hand side matrix.
+       * \return \e true if the RHS matrix is equal, \e false otherwise.
+       */
+      template< typename Real_, typename Device_, typename Index_, typename RealAllocator_ > // NOTE(review): RealAllocator_ does not appear in the parameter type below, so it cannot be deduced - confirm the intended signature.
+      bool operator==( const DenseMatrix< Real_, Device_, Index_, RowMajorOrder >& matrix ) const;
+
+      /**
+       * \brief Comparison operator with another dense matrix.
+       *
+       * \param matrix is the right-hand side matrix.
+       * \return \e false if the RHS matrix is equal, \e true otherwise.
+       */
+      template< typename Real_, typename Device_, typename Index_, typename RealAllocator_ > // NOTE(review): RealAllocator_ does not appear in the parameter type below, so it cannot be deduced - confirm the intended signature.
+      bool operator!=( const DenseMatrix< Real_, Device_, Index_, RowMajorOrder >& matrix ) const;
+
+      /**
+       * \brief Method for saving the matrix to the file with given filename.
+       *
+       * \param fileName is name of the file.
+       */
+      void save( const String& fileName ) const;
+
+      /**
+       * \brief Method for loading the matrix from the file with given filename.
+       *
+       * \param fileName is name of the file.
+       */
+      void load( const String& fileName );
+
+      /**
+       * \brief Method for saving the matrix to a file.
+       *
+       * \param file is the file the matrix is saved to.
+       */
+      void save( File& file ) const;
+
+      /**
+       * \brief Method for loading the matrix from a file.
+       *
+       * \param file is the file the matrix is loaded from.
+       */
+      void load( File& file );
+
+      /**
+       * \brief Method for printing the matrix to output stream.
+       *
+       * \param str is the output stream.
+       */
+      void print( std::ostream& str ) const;
+
+   protected:
+
+      __cuda_callable__
+      IndexType getElementIndex( const IndexType row,
+                                 const IndexType column ) const;
+
+      SegmentsType segments;
+
+      ViewType view;
+};
+
+/**
+ * \brief Insertion operator for dense matrix and output stream.
+ *
+ * \param str is the output stream.
+ * \param matrix is the dense matrix to be written to the stream.
+ * \return reference to the output stream.
+ */
+template< typename Real,
+          typename Device,
+          typename Index,
+          bool RowMajorOrder,
+          typename RealAllocator >
+std::ostream& operator<< ( std::ostream& str, const DenseMatrix< Real, Device, Index, RowMajorOrder, RealAllocator >& matrix );
+
+} // namespace Matrices
+} // namespace TNL
+
+#include <TNL/Matrices/DenseMatrix.hpp>
diff --git a/src/TNL/Matrices/Dense.hpp b/src/TNL/Matrices/DenseMatrix.hpp
similarity index 86%
rename from src/TNL/Matrices/Dense.hpp
rename to src/TNL/Matrices/DenseMatrix.hpp
index 28f152444aeeedd8625055a59445e82530c7d7fb..bd2ea62126edf56c5083e3888dfb31b73df0317b 100644
--- a/src/TNL/Matrices/Dense.hpp
+++ b/src/TNL/Matrices/DenseMatrix.hpp
@@ -11,7 +11,7 @@
 #pragma once
 
 #include <TNL/Assert.h>
-#include <TNL/Matrices/Dense.h>
+#include <TNL/Matrices/DenseMatrix.h>
 #include <TNL/Exceptions/NotImplementedError.h>
 
 namespace TNL {
@@ -22,7 +22,7 @@ template< typename Real,
           typename Index,
           bool RowMajorOrder,
           typename RealAllocator >
-Dense< Real, Device, Index, RowMajorOrder, RealAllocator >::Dense()
+DenseMatrix< Real, Device, Index, RowMajorOrder, RealAllocator >::DenseMatrix()
 {
 }
 
@@ -31,8 +31,8 @@ template< typename Real,
           typename Index,
           bool RowMajorOrder,
           typename RealAllocator >
-Dense< Real, Device, Index, RowMajorOrder, RealAllocator >::
-Dense( const IndexType rows, const IndexType columns )
+DenseMatrix< Real, Device, Index, RowMajorOrder, RealAllocator >::
+DenseMatrix( const IndexType rows, const IndexType columns )
 {
    this->setDimensions( rows, columns );
 }
@@ -42,8 +42,9 @@ template< typename Real,
           typename Index,
           bool RowMajorOrder,
           typename RealAllocator >
-Dense< Real, Device, Index, RowMajorOrder, RealAllocator >::
-Dense( std::initializer_list< std::initializer_list< RealType > > data )
+   template< typename Value >
+DenseMatrix< Real, Device, Index, RowMajorOrder, RealAllocator >::
+DenseMatrix( std::initializer_list< std::initializer_list< Value > > data )
 {
    this->setElements( data );
 }
@@ -53,9 +54,10 @@ template< typename Real,
           typename Index,
           bool RowMajorOrder,
           typename RealAllocator >
+   template< typename Value >
 void
-Dense< Real, Device, Index, RowMajorOrder, RealAllocator >::
-setElements( std::initializer_list< std::initializer_list< RealType > > data )
+DenseMatrix< Real, Device, Index, RowMajorOrder, RealAllocator >::
+setElements( std::initializer_list< std::initializer_list< Value > > data )
 {
    IndexType rows = data.size();
    IndexType columns = 0;
@@ -64,7 +66,7 @@ setElements( std::initializer_list< std::initializer_list< RealType > > data )
    this->setDimensions( rows, columns );
    if( ! std::is_same< DeviceType, Devices::Host >::value )
    {
-      Dense< RealType, Devices::Host, IndexType > hostDense( rows, columns );
+      DenseMatrix< RealType, Devices::Host, IndexType > hostDense( rows, columns );
       IndexType rowIdx( 0 );
       for( auto row : data )
       {
@@ -94,7 +96,7 @@ template< typename Real,
           bool RowMajorOrder,
           typename RealAllocator >
 auto
-Dense< Real, Device, Index, RowMajorOrder, RealAllocator >::
+DenseMatrix< Real, Device, Index, RowMajorOrder, RealAllocator >::
 getView() -> ViewType
 {
    return ViewType( this->getRows(),
@@ -108,7 +110,7 @@ template< typename Real,
           bool RowMajorOrder,
           typename RealAllocator >
 auto
-Dense< Real, Device, Index, RowMajorOrder, RealAllocator >::
+DenseMatrix< Real, Device, Index, RowMajorOrder, RealAllocator >::
 getConstView() const -> ConstViewType
 {
    return ConstViewType( this->getRows(),
@@ -122,7 +124,7 @@ template< typename Real,
           bool RowMajorOrder,
           typename RealAllocator >
 String
-Dense< Real, Device, Index, RowMajorOrder, RealAllocator >::
+DenseMatrix< Real, Device, Index, RowMajorOrder, RealAllocator >::
 getSerializationType()
 {
    return ViewType::getSerializationType();
@@ -134,7 +136,7 @@ template< typename Real,
           bool RowMajorOrder,
           typename RealAllocator >
 String
-Dense< Real, Device, Index, RowMajorOrder, RealAllocator >::
+DenseMatrix< Real, Device, Index, RowMajorOrder, RealAllocator >::
 getSerializationTypeVirtual() const
 {
    return this->getSerializationType();
@@ -146,7 +148,7 @@ template< typename Real,
           bool RowMajorOrder,
           typename RealAllocator >
 void
-Dense< Real, Device, Index, RowMajorOrder, RealAllocator >::
+DenseMatrix< Real, Device, Index, RowMajorOrder, RealAllocator >::
 setDimensions( const IndexType rows,
                const IndexType columns )
 {
@@ -164,7 +166,7 @@ template< typename Real,
           typename RealAllocator >
    template< typename Matrix_ >
 void
-Dense< Real, Device, Index, RowMajorOrder, RealAllocator >::
+DenseMatrix< Real, Device, Index, RowMajorOrder, RealAllocator >::
 setLike( const Matrix_& matrix )
 {
    this->setDimensions( matrix.getRows(), matrix.getColumns() );
@@ -175,12 +177,13 @@ template< typename Real,
           typename Index,
           bool RowMajorOrder,
           typename RealAllocator >
+   template< typename RowCapacitiesVector >
 void
-Dense< Real, Device, Index, RowMajorOrder, RealAllocator >::
-setCompressedRowLengths( ConstCompressedRowLengthsVectorView rowLengths )
+DenseMatrix< Real, Device, Index, RowMajorOrder, RealAllocator >::
+setRowCapacities( const RowCapacitiesVector& rowCapacities )
 {
-   TNL_ASSERT_EQ( rowLengths.getSize(), this->getRows(), "" );
-   TNL_ASSERT_LE( max( rowLengths ), this->getColumns(), "" );
+   TNL_ASSERT_EQ( rowCapacities.getSize(), this->getRows(), "" );
+   TNL_ASSERT_LE( max( rowCapacities ), this->getColumns(), "" );
 }
 
 template< typename Real,
@@ -188,10 +191,10 @@ template< typename Real,
           typename Index,
           bool RowMajorOrder,
           typename RealAllocator >
-   template< typename Vector >
+   template< typename RowLengthsVector >
 void
-Dense< Real, Device, Index, RowMajorOrder, RealAllocator >::
-getCompressedRowLengths( Vector& rowLengths ) const
+DenseMatrix< Real, Device, Index, RowMajorOrder, RealAllocator >::
+getCompressedRowLengths( RowLengthsVector& rowLengths ) const
 {
    this->view.getCompressedRowLengths( rowLengths );
 }
@@ -201,27 +204,9 @@ template< typename Real,
           typename Index,
           bool RowMajorOrder,
           typename RealAllocator >
-Index Dense< Real, Device, Index, RowMajorOrder, RealAllocator >::getRowLength( const IndexType row ) const
-{
-   return this->getColumns();
-}
-
-template< typename Real,
-          typename Device,
-          typename Index,
-          bool RowMajorOrder,
-          typename RealAllocator >
-Index Dense< Real, Device, Index, RowMajorOrder, RealAllocator >::getMaxRowLength() const
-{
-   return this->getColumns();
-}
-
-template< typename Real,
-          typename Device,
-          typename Index,
-          bool RowMajorOrder,
-          typename RealAllocator >
-Index Dense< Real, Device, Index, RowMajorOrder, RealAllocator >::getNumberOfMatrixElements() const
+Index
+DenseMatrix< Real, Device, Index, RowMajorOrder, RealAllocator >::
+getElementsCount() const
 {
    return this->getRows() * this->getColumns();
 }
@@ -231,9 +216,11 @@ template< typename Real,
           typename Index,
           bool RowMajorOrder,
           typename RealAllocator >
-Index Dense< Real, Device, Index, RowMajorOrder, RealAllocator >::getNumberOfNonzeroMatrixElements() const
+Index
+DenseMatrix< Real, Device, Index, RowMajorOrder, RealAllocator >::
+getNonzeroElementsCount() const
 {
-   return this->view.getNumberOfNonzeroMatrixElements();
+   return this->view.getNonzeroElementsCount();
 }
 
 template< typename Real,
@@ -241,7 +228,9 @@ template< typename Real,
           typename Index,
           bool RowMajorOrder,
           typename RealAllocator >
-void Dense< Real, Device, Index, RowMajorOrder, RealAllocator >::reset()
+void
+DenseMatrix< Real, Device, Index, RowMajorOrder, RealAllocator >::
+reset()
 {
    Matrix< Real, Device, Index >::reset();
 }
@@ -251,7 +240,9 @@ template< typename Real,
           typename Index,
           bool RowMajorOrder,
           typename RealAllocator >
-void Dense< Real, Device, Index, RowMajorOrder, RealAllocator >::setValue( const Real& value )
+void
+DenseMatrix< Real, Device, Index, RowMajorOrder, RealAllocator >::
+setValue( const Real& value )
 {
    this->view.setValue( value );
 }
@@ -262,7 +253,7 @@ template< typename Real,
           bool RowMajorOrder,
           typename RealAllocator >
 __cuda_callable__ auto
-Dense< Real, Device, Index, RowMajorOrder, RealAllocator >::
+DenseMatrix< Real, Device, Index, RowMajorOrder, RealAllocator >::
 getRow( const IndexType& rowIdx ) const -> const RowView
 {
    return this->view.getRow( rowIdx );
@@ -274,7 +265,7 @@ template< typename Real,
           bool RowMajorOrder,
           typename RealAllocator >
 __cuda_callable__ auto
-Dense< Real, Device, Index, RowMajorOrder, RealAllocator >::
+DenseMatrix< Real, Device, Index, RowMajorOrder, RealAllocator >::
 getRow( const IndexType& rowIdx ) -> RowView
 {
    return this->view.getRow( rowIdx );
@@ -286,7 +277,7 @@ template< typename Real,
           bool RowMajorOrder,
           typename RealAllocator >
 __cuda_callable__
-Real& Dense< Real, Device, Index, RowMajorOrder, RealAllocator >::operator()( const IndexType row,
+Real& DenseMatrix< Real, Device, Index, RowMajorOrder, RealAllocator >::operator()( const IndexType row,
                                                 const IndexType column )
 {
    return this->view.operator()( row, column );
@@ -298,7 +289,7 @@ template< typename Real,
           bool RowMajorOrder,
           typename RealAllocator >
 __cuda_callable__
-const Real& Dense< Real, Device, Index, RowMajorOrder, RealAllocator >::operator()( const IndexType row,
+const Real& DenseMatrix< Real, Device, Index, RowMajorOrder, RealAllocator >::operator()( const IndexType row,
                                                       const IndexType column ) const
 {
    return this->view.operator()( row, column );
@@ -310,7 +301,7 @@ template< typename Real,
           bool RowMajorOrder,
           typename RealAllocator >
 void
-Dense< Real, Device, Index, RowMajorOrder, RealAllocator >::
+DenseMatrix< Real, Device, Index, RowMajorOrder, RealAllocator >::
 setElement( const IndexType row,
             const IndexType column,
             const RealType& value )
@@ -324,7 +315,7 @@ template< typename Real,
           bool RowMajorOrder,
           typename RealAllocator >
 void
-Dense< Real, Device, Index, RowMajorOrder, RealAllocator >::
+DenseMatrix< Real, Device, Index, RowMajorOrder, RealAllocator >::
 addElement( const IndexType row,
             const IndexType column,
             const RealType& value,
@@ -339,7 +330,7 @@ template< typename Real,
           bool RowMajorOrder,
           typename RealAllocator >
 Real
-Dense< Real, Device, Index, RowMajorOrder, RealAllocator >::
+DenseMatrix< Real, Device, Index, RowMajorOrder, RealAllocator >::
 getElement( const IndexType row,
             const IndexType column ) const
 {
@@ -353,8 +344,8 @@ template< typename Real,
           typename RealAllocator >
    template< typename Fetch, typename Reduce, typename Keep, typename FetchValue >
 void
-Dense< Real, Device, Index, RowMajorOrder, RealAllocator >::
-rowsReduction( IndexType first, IndexType last, Fetch& fetch, Reduce& reduce, Keep& keep, const FetchValue& zero ) const
+DenseMatrix< Real, Device, Index, RowMajorOrder, RealAllocator >::
+rowsReduction( IndexType first, IndexType last, Fetch& fetch, const Reduce& reduce, Keep& keep, const FetchValue& zero ) const
 {
    this->view.rowsReduction( first, last, fetch, reduce, keep, zero );
 }
@@ -366,8 +357,8 @@ template< typename Real,
           typename RealAllocator >
    template< typename Fetch, typename Reduce, typename Keep, typename FetchReal >
 void
-Dense< Real, Device, Index, RowMajorOrder, RealAllocator >::
-allRowsReduction( Fetch& fetch, Reduce& reduce, Keep& keep, const FetchReal& zero ) const
+DenseMatrix< Real, Device, Index, RowMajorOrder, RealAllocator >::
+allRowsReduction( Fetch& fetch, const Reduce& reduce, Keep& keep, const FetchReal& zero ) const
 {
    this->rowsReduction( 0, this->getRows(), fetch, reduce, keep, zero );
 }
@@ -379,7 +370,7 @@ template< typename Real,
           typename RealAllocator >
    template< typename Function >
 void
-Dense< Real, Device, Index, RowMajorOrder, RealAllocator >::
+DenseMatrix< Real, Device, Index, RowMajorOrder, RealAllocator >::
 forRows( IndexType first, IndexType last, Function& function ) const
 {
    this->view.forRows( first, last, function );
@@ -392,7 +383,7 @@ template< typename Real,
           typename RealAllocator >
    template< typename Function >
 void
-Dense< Real, Device, Index, RowMajorOrder, RealAllocator >::
+DenseMatrix< Real, Device, Index, RowMajorOrder, RealAllocator >::
 forRows( IndexType first, IndexType last, Function& function )
 {
    this->view.forRows( first, last, function );
@@ -405,7 +396,7 @@ template< typename Real,
           typename RealAllocator >
    template< typename Function >
 void
-Dense< Real, Device, Index, RowMajorOrder, RealAllocator >::
+DenseMatrix< Real, Device, Index, RowMajorOrder, RealAllocator >::
 forAllRows( Function& function ) const
 {
    this->forRows( 0, this->getRows(), function );
@@ -418,7 +409,7 @@ template< typename Real,
           typename RealAllocator >
    template< typename Function >
 void
-Dense< Real, Device, Index, RowMajorOrder, RealAllocator >::
+DenseMatrix< Real, Device, Index, RowMajorOrder, RealAllocator >::
 forAllRows( Function& function )
 {
    this->forRows( 0, this->getRows(), function );
@@ -431,8 +422,9 @@ template< typename Real,
           typename RealAllocator >
    template< typename Vector >
 __cuda_callable__
-typename Vector::RealType Dense< Real, Device, Index, RowMajorOrder, RealAllocator >::rowVectorProduct( const IndexType row,
-                                                                                   const Vector& vector ) const
+typename Vector::RealType 
+DenseMatrix< Real, Device, Index, RowMajorOrder, RealAllocator >::
+rowVectorProduct( const IndexType row, const Vector& vector ) const
 {
    return this->view.rowVectorProduct( row, vector );
 }
@@ -445,7 +437,7 @@ template< typename Real,
    template< typename InVector,
              typename OutVector >
 void
-Dense< Real, Device, Index, RowMajorOrder, RealAllocator >::
+DenseMatrix< Real, Device, Index, RowMajorOrder, RealAllocator >::
 vectorProduct( const InVector& inVector, OutVector& outVector ) const
 {
    this->view.vectorProduct( inVector, outVector );
@@ -457,9 +449,11 @@ template< typename Real,
           bool RowMajorOrder,
           typename RealAllocator >
    template< typename Matrix >
-void Dense< Real, Device, Index, RowMajorOrder, RealAllocator >::addMatrix( const Matrix& matrix,
-                                              const RealType& matrixMultiplicator,
-                                              const RealType& thisMatrixMultiplicator )
+void
+DenseMatrix< Real, Device, Index, RowMajorOrder, RealAllocator >::
+addMatrix( const Matrix& matrix,
+           const RealType& matrixMultiplicator,
+           const RealType& thisMatrixMultiplicator )
 {
    TNL_ASSERT( this->getColumns() == matrix.getColumns() &&
               this->getRows() == matrix.getRows(),
@@ -483,7 +477,7 @@ template< typename Real,
           typename Matrix2,
           int tileDim,
           int tileRowBlockSize >
-__global__ void DenseMatrixProductKernel( Dense< Real, Devices::Cuda, Index >* resultMatrix,
+__global__ void DenseMatrixProductKernel( DenseMatrix< Real, Devices::Cuda, Index >* resultMatrix,
                                                    const Matrix1* matrixA,
                                                    const Matrix2* matrixB,
                                                    const Real matrixAMultiplicator,
@@ -581,7 +575,7 @@ template< typename Real,
           bool RowMajorOrder,
           typename RealAllocator >
    template< typename Matrix1, typename Matrix2, int tileDim >
-void Dense< Real, Device, Index, RowMajorOrder, RealAllocator >::getMatrixProduct( const Matrix1& matrix1,
+void DenseMatrix< Real, Device, Index, RowMajorOrder, RealAllocator >::getMatrixProduct( const Matrix1& matrix1,
                                                               const Matrix2& matrix2,
                                                               const RealType& matrix1Multiplicator,
                                                               const RealType& matrix2Multiplicator )
@@ -638,7 +632,7 @@ void Dense< Real, Device, Index, RowMajorOrder, RealAllocator >::getMatrixProduc
                cudaGridSize.x = columnTiles % Cuda::getMaxGridSize();
             if( gridIdx_y == rowGrids - 1 )
                cudaGridSize.y = rowTiles % Cuda::getMaxGridSize();
-            Dense* this_kernel = Cuda::passToDevice( *this );
+            DenseMatrix* this_kernel = Cuda::passToDevice( *this );
             Matrix1* matrix1_kernel = Cuda::passToDevice( matrix1 );
             Matrix2* matrix2_kernel = Cuda::passToDevice( matrix2 );
             DenseMatrixProductKernel< Real,
@@ -673,7 +667,7 @@ template< typename Real,
           typename RealAllocator,
           int tileDim,
           int tileRowBlockSize >
-__global__ void DenseTranspositionAlignedKernel( Dense< Real, Devices::Cuda, Index >* resultMatrix,
+__global__ void DenseTranspositionAlignedKernel( DenseMatrix< Real, Devices::Cuda, Index >* resultMatrix,
                                                           const Matrix* inputMatrix,
                                                           const Real matrixMultiplicator,
                                                           const Index gridIdx_x,
@@ -744,7 +738,7 @@ template< typename Real,
           typename Matrix,
           int tileDim,
           int tileRowBlockSize >
-__global__ void DenseTranspositionNonAlignedKernel( Dense< Real, Devices::Cuda, Index >* resultMatrix,
+__global__ void DenseTranspositionNonAlignedKernel( DenseMatrix< Real, Devices::Cuda, Index >* resultMatrix,
                                                              const Matrix* inputMatrix,
                                                              const Real matrixMultiplicator,
                                                              const Index gridIdx_x,
@@ -825,7 +819,7 @@ template< typename Real,
           bool RowMajorOrder,
           typename RealAllocator >
    template< typename Matrix, int tileDim >
-void Dense< Real, Device, Index, RowMajorOrder, RealAllocator >::getTransposition( const Matrix& matrix,
+void DenseMatrix< Real, Device, Index, RowMajorOrder, RealAllocator >::getTransposition( const Matrix& matrix,
                                                               const RealType& matrixMultiplicator )
 {
    TNL_ASSERT( this->getColumns() == matrix.getRows() &&
@@ -860,7 +854,7 @@ void Dense< Real, Device, Index, RowMajorOrder, RealAllocator >::getTranspositio
       const IndexType columnGrids = roundUpDivision( columnTiles, Cuda::getMaxGridSize() );
       const IndexType sharedMemorySize = tileDim*tileDim + tileDim*tileDim/Cuda::getNumberOfSharedMemoryBanks();
 
-      Dense* this_device = Cuda::passToDevice( *this );
+      DenseMatrix* this_device = Cuda::passToDevice( *this );
       Matrix* matrix_device = Cuda::passToDevice( matrix );
 
       for( IndexType gridIdx_x = 0; gridIdx_x < columnGrids; gridIdx_x++ )
@@ -918,7 +912,7 @@ template< typename Real,
           bool RowMajorOrder,
           typename RealAllocator >
    template< typename Vector1, typename Vector2 >
-void Dense< Real, Device, Index, RowMajorOrder, RealAllocator >::performSORIteration( const Vector1& b,
+void DenseMatrix< Real, Device, Index, RowMajorOrder, RealAllocator >::performSORIteration( const Vector1& b,
                                                         const IndexType row,
                                                         Vector2& x,
                                                         const RealType& omega ) const
@@ -939,9 +933,9 @@ template< typename Real,
           typename Index,
           bool RowMajorOrder,
           typename RealAllocator >
-Dense< Real, Device, Index, RowMajorOrder, RealAllocator >&
-Dense< Real, Device, Index, RowMajorOrder, RealAllocator >::
-operator=( const Dense< Real, Device, Index, RowMajorOrder, RealAllocator >& matrix )
+DenseMatrix< Real, Device, Index, RowMajorOrder, RealAllocator >&
+DenseMatrix< Real, Device, Index, RowMajorOrder, RealAllocator >::
+operator=( const DenseMatrix< Real, Device, Index, RowMajorOrder, RealAllocator >& matrix )
 {
    setLike( matrix );
    this->values = matrix.values;
@@ -955,11 +949,11 @@ template< typename Real,
           typename RealAllocator >
    template< typename RHSReal, typename RHSDevice, typename RHSIndex,
              bool RHSRowMajorOrder, typename RHSRealAllocator >
-Dense< Real, Device, Index, RowMajorOrder, RealAllocator >&
-Dense< Real, Device, Index, RowMajorOrder, RealAllocator >::
-operator=( const Dense< RHSReal, RHSDevice, RHSIndex, RHSRowMajorOrder, RHSRealAllocator >& matrix )
+DenseMatrix< Real, Device, Index, RowMajorOrder, RealAllocator >&
+DenseMatrix< Real, Device, Index, RowMajorOrder, RealAllocator >::
+operator=( const DenseMatrix< RHSReal, RHSDevice, RHSIndex, RHSRowMajorOrder, RHSRealAllocator >& matrix )
 {
-   using RHSMatrix = Dense< RHSReal, RHSDevice, RHSIndex, RHSRowMajorOrder, RHSRealAllocator >;
+   using RHSMatrix = DenseMatrix< RHSReal, RHSDevice, RHSIndex, RHSRowMajorOrder, RHSRealAllocator >;
    using RHSIndexType = typename RHSMatrix::IndexType;
    using RHSRealType = typename RHSMatrix::RealType;
    using RHSDeviceType = typename RHSMatrix::DeviceType;
@@ -1027,8 +1021,8 @@ template< typename Real,
           bool RowMajorOrder,
           typename RealAllocator >
    template< typename RHSMatrix >
-Dense< Real, Device, Index, RowMajorOrder, RealAllocator >&
-Dense< Real, Device, Index, RowMajorOrder, RealAllocator >::
+DenseMatrix< Real, Device, Index, RowMajorOrder, RealAllocator >&
+DenseMatrix< Real, Device, Index, RowMajorOrder, RealAllocator >::
 operator=( const RHSMatrix& matrix )
 {
    using RHSIndexType = typename RHSMatrix::IndexType;
@@ -1118,8 +1112,8 @@ template< typename Real,
           typename RealAllocator >
    template< typename Real_, typename Device_, typename Index_, typename RealAllocator_ >
 bool
-Dense< Real, Device, Index, RowMajorOrder, RealAllocator >::
-operator==( const Dense< Real_, Device_, Index_, RowMajorOrder >& matrix ) const
+DenseMatrix< Real, Device, Index, RowMajorOrder, RealAllocator >::
+operator==( const DenseMatrix< Real_, Device_, Index_, RowMajorOrder >& matrix ) const
 {
    return( this->getRows() == matrix.getRows() &&
            this->getColumns() == matrix.getColumns() &&
@@ -1133,8 +1127,8 @@ template< typename Real,
           typename RealAllocator >
    template< typename Real_, typename Device_, typename Index_, typename RealAllocator_ >
 bool
-Dense< Real, Device, Index, RowMajorOrder, RealAllocator >::
-operator!=( const Dense< Real_, Device_, Index_, RowMajorOrder >& matrix ) const
+DenseMatrix< Real, Device, Index, RowMajorOrder, RealAllocator >::
+operator!=( const DenseMatrix< Real_, Device_, Index_, RowMajorOrder >& matrix ) const
 {
    return ! ( *this == matrix );
 }
@@ -1144,7 +1138,7 @@ template< typename Real,
           typename Index,
           bool RowMajorOrder,
           typename RealAllocator >
-void Dense< Real, Device, Index, RowMajorOrder, RealAllocator >::save( const String& fileName ) const
+void DenseMatrix< Real, Device, Index, RowMajorOrder, RealAllocator >::save( const String& fileName ) const
 {
    this->view.save( fileName );
 }
@@ -1154,7 +1148,7 @@ template< typename Real,
           typename Index,
           bool RowMajorOrder,
           typename RealAllocator >
-void Dense< Real, Device, Index, RowMajorOrder, RealAllocator >::load( const String& fileName )
+void DenseMatrix< Real, Device, Index, RowMajorOrder, RealAllocator >::load( const String& fileName )
 {
    Object::load( fileName );
 }
@@ -1164,7 +1158,7 @@ template< typename Real,
           typename Index,
           bool RowMajorOrder,
           typename RealAllocator >
-void Dense< Real, Device, Index, RowMajorOrder, RealAllocator >::save( File& file ) const
+void DenseMatrix< Real, Device, Index, RowMajorOrder, RealAllocator >::save( File& file ) const
 {
    this->view.save( file );
 }
@@ -1174,7 +1168,7 @@ template< typename Real,
           typename Index,
           bool RowMajorOrder,
           typename RealAllocator >
-void Dense< Real, Device, Index, RowMajorOrder, RealAllocator >::load( File& file )
+void DenseMatrix< Real, Device, Index, RowMajorOrder, RealAllocator >::load( File& file )
 {
    Matrix< Real, Device, Index >::load( file );
    this->segments.load( file );
@@ -1186,7 +1180,7 @@ template< typename Real,
           typename Index,
           bool RowMajorOrder,
           typename RealAllocator >
-void Dense< Real, Device, Index, RowMajorOrder, RealAllocator >::print( std::ostream& str ) const
+void DenseMatrix< Real, Device, Index, RowMajorOrder, RealAllocator >::print( std::ostream& str ) const
 {
    this->view.print( str );
 }
@@ -1198,11 +1192,22 @@ template< typename Real,
           typename RealAllocator >
 __cuda_callable__
 Index
-Dense< Real, Device, Index, RowMajorOrder, RealAllocator >::
+DenseMatrix< Real, Device, Index, RowMajorOrder, RealAllocator >::
 getElementIndex( const IndexType row, const IndexType column ) const
 {
    return this->segments.getGlobalIndex( row, column );
 }
 
+template< typename Real,
+          typename Device,
+          typename Index,
+          bool RowMajorOrder,
+          typename RealAllocator >
+std::ostream& operator<< ( std::ostream& str, const DenseMatrix< Real, Device, Index, RowMajorOrder, RealAllocator >& matrix )
+{ 
+   matrix.print( str );
+   return str;
+}
+
 } // namespace Matrices
 } // namespace TNL
diff --git a/src/TNL/Matrices/DenseMatrixRowView.h b/src/TNL/Matrices/DenseMatrixRowView.h
index 84c6b141cd7f7cdf25be8e550e573680b4cce902..78fecd0f7dc06572c62cc089d0e24fd180baf916 100644
--- a/src/TNL/Matrices/DenseMatrixRowView.h
+++ b/src/TNL/Matrices/DenseMatrixRowView.h
@@ -13,30 +13,89 @@
 namespace TNL {
    namespace Matrices {
 
+/**
+ * \brief RowView is a simple structure for accessing rows of a dense matrix.
+ * 
+ * \tparam SegmentView is a segment view of segments representing the matrix format.
+ * \tparam ValuesView is a vector view storing the matrix elements values.
+ * 
+ * See \ref DenseMatrix and \ref DenseMatrixView.
+ * 
+ * \par Example
+ * \include Matrices/DenseMatrixExample_getRow.cpp
+ * \par Output
+ * \include DenseMatrixExample_getRow.out
+ */
 template< typename SegmentView,
           typename ValuesView >
 class DenseMatrixRowView
 {
    public:
 
+      /**
+       * \brief The type of matrix elements.
+       */
       using RealType = typename ValuesView::RealType;
+
+      /**
+       * \brief The type used for matrix elements indexing.
+       */
+      using IndexType = typename SegmentView::IndexType;
+
+      /**
+       * \brief Type representing matrix row format.
+       */
       using SegmentViewType = SegmentView;
-      using IndexType = typename SegmentViewType::IndexType;
+
+      /**
+       * \brief Type of container view used for storing matrix elements values.
+       */
       using ValuesViewType = ValuesView;
 
+      /**
+       * \brief Constructor with \e segmentView and \e values
+       * 
+       * \param segmentView instance of SegmentViewType representing matrix row.
+       * \param values is a container view for storing the matrix elements values.
+       */
       __cuda_callable__
       DenseMatrixRowView( const SegmentViewType& segmentView,
                           const ValuesViewType& values );
 
+      /**
+       * \brief Returns size of the matrix row, i.e. number of matrix elements in this row.
+       * 
+       * \return Size of the matrix row.
+       */
       __cuda_callable__
       IndexType getSize() const;
 
+      /**
+       * \brief Returns a constant reference to the element with the given column index.
+       * 
+       * \param column is the column index of the matrix element.
+       * 
+       * \return constant reference to the matrix element.
+       */
       __cuda_callable__
-      const RealType& getValue( const IndexType column ) const;
+      const RealType& getElement( const IndexType column ) const;
 
+      /**
+       * \brief Returns a non-constant reference to the element with the given column index.
+       * 
+       * \param column is a column index of the matrix element.
+       * 
+       * \return non-constant reference to the matrix element.
+       */
       __cuda_callable__
-      RealType& getValue( const IndexType column );
+      RealType& getElement( const IndexType column );
 
+      /**
+       * \brief Sets the value of the matrix element with the given column index.
+       * 
+       * \param column is a column index of the matrix element.
+       * \param value is a value the matrix element will be set to.
+       */
       __cuda_callable__
       void setElement( const IndexType column,
                        const RealType& value );
diff --git a/src/TNL/Matrices/DenseMatrixRowView.hpp b/src/TNL/Matrices/DenseMatrixRowView.hpp
index 1962a4d9a8eabe80f28b2e21d1f0506792949225..9ca725396d2c4a0524f9b268cbce900d8e660daf 100644
--- a/src/TNL/Matrices/DenseMatrixRowView.hpp
+++ b/src/TNL/Matrices/DenseMatrixRowView.hpp
@@ -38,7 +38,7 @@ template< typename SegmentView,
           typename ValuesView >
 __cuda_callable__ auto
 DenseMatrixRowView< SegmentView, ValuesView >::
-getValue( const IndexType column ) const -> const RealType&
+getElement( const IndexType column ) const -> const RealType&
 {
    TNL_ASSERT_LT( column, this->getSize(), "Column index exceeds matrix row size." );
    return values[ segmentView.getGlobalIndex( column ) ];
@@ -48,7 +48,7 @@ template< typename SegmentView,
           typename ValuesView >
 __cuda_callable__ auto
 DenseMatrixRowView< SegmentView, ValuesView >::
-getValue( const IndexType column ) -> RealType&
+getElement( const IndexType column ) -> RealType&
 {
    TNL_ASSERT_LT( column, this->getSize(), "Column index exceeds matrix row size." );
    return values[ segmentView.getGlobalIndex( column ) ];
diff --git a/src/TNL/Matrices/DenseMatrixView.h b/src/TNL/Matrices/DenseMatrixView.h
index a7e1a09a78f336b94ae9f62ab84d2846d2e24602..8ae12f64ec3c6cfe4852d98fd82eb8555e878714 100644
--- a/src/TNL/Matrices/DenseMatrixView.h
+++ b/src/TNL/Matrices/DenseMatrixView.h
@@ -87,9 +87,9 @@ class DenseMatrixView : public MatrixView< Real, Device, Index >
 
       IndexType getMaxRowLength() const;
 
-      IndexType getNumberOfMatrixElements() const;
+      IndexType getElementsCount() const;
 
-      IndexType getNumberOfNonzeroMatrixElements() const;
+      IndexType getNonzeroElementsCount() const;
 
       void reset();
 
@@ -123,10 +123,10 @@ class DenseMatrixView : public MatrixView< Real, Device, Index >
                        const IndexType column ) const;
 
       template< typename Fetch, typename Reduce, typename Keep, typename FetchReal >
-      void rowsReduction( IndexType first, IndexType last, Fetch& fetch, Reduce& reduce, Keep& keep, const FetchReal& zero ) const;
+      void rowsReduction( IndexType first, IndexType last, Fetch& fetch, const Reduce& reduce, Keep& keep, const FetchReal& zero ) const;
 
       template< typename Fetch, typename Reduce, typename Keep, typename FetchReal >
-      void allRowsReduction( Fetch& fetch, Reduce& reduce, Keep& keep, const FetchReal& zero ) const;
+      void allRowsReduction( Fetch& fetch, const Reduce& reduce, Keep& keep, const FetchReal& zero ) const;
 
       template< typename Function >
       void forRows( IndexType first, IndexType last, Function& function ) const;
diff --git a/src/TNL/Matrices/DenseMatrixView.hpp b/src/TNL/Matrices/DenseMatrixView.hpp
index ddd9c93281b70a10d25e05768d409f41303e4774..917fb596b6d9512ff14c9a7829657a36473d12ed 100644
--- a/src/TNL/Matrices/DenseMatrixView.hpp
+++ b/src/TNL/Matrices/DenseMatrixView.hpp
@@ -10,8 +10,10 @@
 
 #pragma once
 
+#include <iomanip>
+#include <functional>
 #include <TNL/Assert.h>
-#include <TNL/Matrices/Dense.h>
+#include <TNL/Matrices/DenseMatrix.h>
 #include <TNL/Exceptions/NotImplementedError.h>
 
 namespace TNL {
@@ -80,7 +82,7 @@ String
 DenseMatrixView< Real, Device, Index, RowMajorOrder >::
 getSerializationType()
 {
-   return String( "Matrices::Dense< " ) +
+   return String( "Matrices::DenseMatrix< " ) +
           TNL::getSerializationType< RealType >() + ", [any_device], " +
           TNL::getSerializationType< IndexType >() + ", " +
           ( RowMajorOrder ? "true" : "false" ) + ", [any_allocator] >";
@@ -112,20 +114,19 @@ getCompressedRowLengths( Vector& rowLengths ) const
    auto fetch = [] __cuda_callable__ ( IndexType row, IndexType column, const RealType& value ) -> IndexType {
       return ( value != 0.0 );
    };
-   auto reduce = [] __cuda_callable__ ( IndexType& aux, const IndexType a ) {
-      aux += a;
-   };
    auto keep = [=] __cuda_callable__ ( const IndexType rowIdx, const IndexType value ) mutable {
       rowLengths_view[ rowIdx ] = value;
    };
-   this->allRowsReduction( fetch, reduce, keep, 0 );
+   this->allRowsReduction( fetch, std::plus<>{}, keep, 0 );
 }
 
 template< typename Real,
           typename Device,
           typename Index,
           bool RowMajorOrder >
-Index DenseMatrixView< Real, Device, Index, RowMajorOrder >::getRowLength( const IndexType row ) const
+Index
+DenseMatrixView< Real, Device, Index, RowMajorOrder >::
+getRowLength( const IndexType row ) const
 {
    return this->getColumns();
 }
@@ -134,7 +135,9 @@ template< typename Real,
           typename Device,
           typename Index,
           bool RowMajorOrder >
-Index DenseMatrixView< Real, Device, Index, RowMajorOrder >::getMaxRowLength() const
+Index
+DenseMatrixView< Real, Device, Index, RowMajorOrder >::
+getMaxRowLength() const
 {
    return this->getColumns();
 }
@@ -143,7 +146,9 @@ template< typename Real,
           typename Device,
           typename Index,
           bool RowMajorOrder >
-Index DenseMatrixView< Real, Device, Index, RowMajorOrder >::getNumberOfMatrixElements() const
+Index
+DenseMatrixView< Real, Device, Index, RowMajorOrder >::
+getElementsCount() const
 {
    return this->getRows() * this->getColumns();
 }
@@ -152,7 +157,9 @@ template< typename Real,
           typename Device,
           typename Index,
           bool RowMajorOrder >
-Index DenseMatrixView< Real, Device, Index, RowMajorOrder >::getNumberOfNonzeroMatrixElements() const
+Index
+DenseMatrixView< Real, Device, Index, RowMajorOrder >::
+getNonzeroElementsCount() const
 {
    const auto values_view = this->values.getConstView();
    auto fetch = [=] __cuda_callable__ ( const IndexType i ) -> IndexType {
@@ -165,7 +172,9 @@ template< typename Real,
           typename Device,
           typename Index,
           bool RowMajorOrder >
-void DenseMatrixView< Real, Device, Index, RowMajorOrder >::reset()
+void
+DenseMatrixView< Real, Device, Index, RowMajorOrder >::
+reset()
 {
    Matrix< Real, Device, Index >::reset();
 }
@@ -174,7 +183,9 @@ template< typename Real,
           typename Device,
           typename Index,
           bool RowMajorOrder >
-void DenseMatrixView< Real, Device, Index, RowMajorOrder >::setValue( const Real& value )
+void
+DenseMatrixView< Real, Device, Index, RowMajorOrder >::
+setValue( const Real& value )
 {
    this->values = value;
 }
@@ -188,7 +199,7 @@ DenseMatrixView< Real, Device, Index, RowMajorOrder >::
 getRow( const IndexType& rowIdx ) const -> const RowView
 {
    TNL_ASSERT_LT( rowIdx, this->getRows(), "Row index is larger than number of matrix rows." );
-   return RowView( this->segments.getSegmentView( rowIdx ), this->values.getView() );
+   return RowView( this->segments.getSegmentView( rowIdx ), this->values.getConstView() );
 }
 
 template< typename Real,
@@ -287,7 +298,7 @@ template< typename Real,
    template< typename Fetch, typename Reduce, typename Keep, typename FetchValue >
 void
 DenseMatrixView< Real, Device, Index, RowMajorOrder >::
-rowsReduction( IndexType first, IndexType last, Fetch& fetch, Reduce& reduce, Keep& keep, const FetchValue& zero ) const
+rowsReduction( IndexType first, IndexType last, Fetch& fetch, const Reduce& reduce, Keep& keep, const FetchValue& zero ) const
 {
    const auto values_view = this->values.getConstView();
    auto fetch_ = [=] __cuda_callable__ ( IndexType rowIdx, IndexType columnIdx, IndexType globalIdx, bool& compute ) mutable -> decltype( fetch( IndexType(), IndexType(), RealType() ) ) {
@@ -304,7 +315,7 @@ template< typename Real,
    template< typename Fetch, typename Reduce, typename Keep, typename FetchReal >
 void
 DenseMatrixView< Real, Device, Index, RowMajorOrder >::
-allRowsReduction( Fetch& fetch, Reduce& reduce, Keep& keep, const FetchReal& zero ) const
+allRowsReduction( Fetch& fetch, const Reduce& reduce, Keep& keep, const FetchReal& zero ) const
 {
    this->rowsReduction( 0, this->getRows(), fetch, reduce, keep, zero );
 }
@@ -403,13 +414,10 @@ vectorProduct( const InVector& inVector, OutVector& outVector ) const
    auto fetch = [=] __cuda_callable__ ( IndexType row, IndexType column, IndexType offset, bool& compute ) -> RealType {
       return valuesView[ offset ] * inVectorView[ column ];
    };
-   auto reduction = [] __cuda_callable__ ( RealType& sum, const RealType& value ) {
-      sum += value;
-   };
    auto keeper = [=] __cuda_callable__ ( IndexType row, const RealType& value ) mutable {
       outVectorView[ row ] = value;
    };
-   this->segments.segmentsReduction( 0, this->getRows(), fetch, reduction, keeper, ( RealType ) 0.0 );
+   this->segments.segmentsReduction( 0, this->getRows(), fetch, std::plus<>{}, keeper, ( RealType ) 0.0 );
 }
 
 template< typename Real,
@@ -682,7 +690,11 @@ void DenseMatrixView< Real, Device, Index, RowMajorOrder >::print( std::ostream&
    {
       str <<"Row: " << row << " -> ";
       for( IndexType column = 0; column < this->getColumns(); column++ )
-         str << " Col:" << column << "->" << this->getElement( row, column ) << "\t";
+      {
+         std::stringstream str_;  // build one "column:value" cell separately so it can be padded as a unit
+         str_ << std::setw( 4 ) << std::right << column << ":" << std::setw( 4 ) << std::left << this->getElement( row, column );
+         str << std::setw( 10 ) << str_.str();  // emit the finished cell with a minimum width of 10 characters
+      }
       str << std::endl;
    }
 }
diff --git a/src/TNL/Matrices/DistributedMatrix.h b/src/TNL/Matrices/DistributedMatrix.h
index dde7051c063b4b806131ebdcb1e2412667c72414..98cade7ce77859fa2c1a293354ae258d12f676b3 100644
--- a/src/TNL/Matrices/DistributedMatrix.h
+++ b/src/TNL/Matrices/DistributedMatrix.h
@@ -14,7 +14,6 @@
 
 #include <type_traits>
 
-#include <TNL/Matrices/Legacy/SparseRow.h>
 #include <TNL/Communicators/MpiCommunicator.h>
 #include <TNL/Containers/Subrange.h>
 #include <TNL/Containers/DistributedVector.h>
@@ -56,8 +55,8 @@ public:
 
    using CompressedRowLengthsVector = Containers::DistributedVector< IndexType, DeviceType, IndexType, CommunicatorType >;
 
-   using MatrixRow = Matrices::Legacy::SparseRow< RealType, IndexType >;
-   using ConstMatrixRow = Matrices::Legacy::SparseRow< std::add_const_t< RealType >, std::add_const_t< IndexType > >;
+   using MatrixRow = typename Matrix::RowView;
+   using ConstMatrixRow = typename Matrix::ConstRowView;
 
    template< typename _Real = RealType,
              typename _Device = DeviceType,
@@ -105,19 +104,15 @@ public:
 
    void setCompressedRowLengths( const CompressedRowLengthsVector& rowLengths );
 
-   void getCompressedRowLengths( CompressedRowLengthsVector& rowLengths ) const;
+   template< typename Vector >
+   void getCompressedRowLengths( Vector& rowLengths ) const;
 
-   IndexType getRowLength( IndexType row ) const;
+   IndexType getRowCapacity( IndexType row ) const;
 
-   bool setElement( IndexType row,
+   void setElement( IndexType row,
                     IndexType column,
                     RealType value );
 
-   __cuda_callable__
-   bool setElementFast( IndexType row,
-                        IndexType column,
-                        RealType value );
-
    RealType getElement( IndexType row,
                         IndexType column ) const;
 
@@ -125,17 +120,6 @@ public:
    RealType getElementFast( IndexType row,
                             IndexType column ) const;
 
-   __cuda_callable__
-   bool setRowFast( IndexType row,
-                    const IndexType* columnIndexes,
-                    const RealType* values,
-                    IndexType elements );
-
-   __cuda_callable__
-   void getRowFast( IndexType row,
-                    IndexType* columns,
-                    RealType* values ) const;
-
    __cuda_callable__
    MatrixRow getRow( IndexType row );
 
diff --git a/src/TNL/Matrices/DistributedMatrix_impl.h b/src/TNL/Matrices/DistributedMatrix_impl.h
index c1a13a713391f4231b41191fbedb5aa1cb4050c7..38b7f3af0aec7168cafaee600ebfc332c96925f6 100644
--- a/src/TNL/Matrices/DistributedMatrix_impl.h
+++ b/src/TNL/Matrices/DistributedMatrix_impl.h
@@ -166,13 +166,15 @@ setCompressedRowLengths( const CompressedRowLengthsVector& rowLengths )
 
 template< typename Matrix,
           typename Communicator >
+   template< typename Vector >  // Vector must offer setDistribution() and getLocalView(), both used below
 void
 DistributedMatrix< Matrix, Communicator >::
-getCompressedRowLengths( CompressedRowLengthsVector& rowLengths ) const
+getCompressedRowLengths( Vector& rowLengths ) const  // does nothing when the communication group is NullGroup
 {
    if( getCommunicationGroup() != CommunicatorType::NullGroup ) {
       rowLengths.setDistribution( getLocalRowRange(), getRows(), getCommunicationGroup() );
-      localMatrix.getCompressedRowLengths( rowLengths.getLocalView() );
+      auto localRowLengths = rowLengths.getLocalView();  // named local: presumably needed because the callee takes a non-const reference — confirm
+      localMatrix.getCompressedRowLengths( localRowLengths );  // the local matrix fills in this rank's part
    }
 }
 
@@ -180,35 +182,22 @@ template< typename Matrix,
           typename Communicator >
 typename Matrix::IndexType
 DistributedMatrix< Matrix, Communicator >::
-getRowLength( IndexType row ) const
+getRowCapacity( IndexType row ) const
 {
    const IndexType localRow = localRowRange.getLocalIndex( row );
-   return localMatrix.getRowLength( localRow );
+   return localMatrix.getRowCapacity( localRow );
 }
 
 template< typename Matrix,
           typename Communicator >
-bool
+void
 DistributedMatrix< Matrix, Communicator >::
 setElement( IndexType row,
             IndexType column,
             RealType value )
 {
    const IndexType localRow = localRowRange.getLocalIndex( row );
-   return localMatrix.setElement( localRow, column, value );
-}
-
-template< typename Matrix,
-          typename Communicator >
-__cuda_callable__
-bool
-DistributedMatrix< Matrix, Communicator >::
-setElementFast( IndexType row,
-                IndexType column,
-                RealType value )
-{
-   const IndexType localRow = localRowRange.getLocalIndex( row );
-   return localMatrix.setElementFast( localRow, column, value );
+   localMatrix.setElement( localRow, column, value );
 }
 
 template< typename Matrix,
@@ -234,33 +223,6 @@ getElementFast( IndexType row,
    return localMatrix.getElementFast( localRow, column );
 }
 
-template< typename Matrix,
-          typename Communicator >
-__cuda_callable__
-bool
-DistributedMatrix< Matrix, Communicator >::
-setRowFast( IndexType row,
-            const IndexType* columnIndexes,
-            const RealType* values,
-            IndexType elements )
-{
-   const IndexType localRow = localRowRange.getLocalIndex( row );
-   return localMatrix.setRowFast( localRow, columnIndexes, values, elements );
-}
-
-template< typename Matrix,
-          typename Communicator >
-__cuda_callable__
-void
-DistributedMatrix< Matrix, Communicator >::
-getRowFast( IndexType row,
-            IndexType* columns,
-            RealType* values ) const
-{
-   const IndexType localRow = localRowRange.getLocalIndex( row );
-   return localMatrix.getRowFast( localRow, columns, values );
-}
-
 template< typename Matrix,
           typename Communicator >
 __cuda_callable__
diff --git a/src/TNL/Matrices/DistributedSpMV.h b/src/TNL/Matrices/DistributedSpMV.h
index 01e9c286ffdadca8138c1020cc2e8a8da598ef00..55527834c8730089aad8a45979039ce31f7b22c4 100644
--- a/src/TNL/Matrices/DistributedSpMV.h
+++ b/src/TNL/Matrices/DistributedSpMV.h
@@ -20,7 +20,7 @@
 #include <utility>  // std::pair
 #include <limits>   // std::numeric_limits
 #include <TNL/Allocators/Host.h>
-#include <TNL/Matrices/Dense.h>
+#include <TNL/Matrices/DenseMatrix.h>
 #include <TNL/Containers/Vector.h>
 #include <TNL/Containers/VectorView.h>
 #include <TNL/Matrices/ThreePartVector.h>
@@ -85,8 +85,8 @@ public:
          const auto row = localMatrix->getRow( i );
          bool comm_left = false;
          bool comm_right = false;
-         for( IndexType c = 0; c < row.getLength(); c++ ) {
-            const IndexType j = row.getElementColumn( c );
+         for( IndexType c = 0; c < row.getSize(); c++ ) {
+            const IndexType j = row.getColumnIndex( c );
             if( j < columns ) {
                const int owner = Partitioner::getOwner( j, columns, nproc );
                // atomic assignment
@@ -120,7 +120,7 @@ public:
 
       // copy the buffer into all rows of the preCommPattern* matrices
       // (in-place copy does not work with some OpenMPI configurations)
-      Matrices::Dense< IndexType, Devices::Host, int > preCommPatternStarts, preCommPatternEnds;
+      Matrices::DenseMatrix< IndexType, Devices::Host, int > preCommPatternStarts, preCommPatternEnds;
       preCommPatternStarts.setLike( commPatternStarts );
       preCommPatternEnds.setLike( commPatternEnds );
       for( int j = 0; j < nproc; j++ )
@@ -190,40 +190,21 @@ public:
 
          // perform matrix-vector multiplication
          auto outVectorView = outVector.getLocalView();
-         const Pointers::DevicePointer< const MatrixType > localMatrixPointer( localMatrix );
-         auto kernel = [=] __cuda_callable__ ( IndexType i, const MatrixType* localMatrix ) mutable
-         {
-            outVectorView[ i ] = localMatrix->rowVectorProduct( i, globalBufferView );
-         };
-         Algorithms::ParallelFor< DeviceType >::exec( (IndexType) 0, localMatrix.getRows(), kernel,
-                                                      &localMatrixPointer.template getData< DeviceType >() );
+         localMatrix.vectorProduct( globalBuffer, outVectorView );
       }
       // optimization for banded matrices
       else {
          auto outVectorView = outVector.getLocalView();
-         const Pointers::DevicePointer< const MatrixType > localMatrixPointer( localMatrix );
-         const auto inView = inVector.getConstView();
 
          // matrix-vector multiplication using local-only rows
-         auto kernel1 = [=] __cuda_callable__ ( IndexType i, const MatrixType* localMatrix ) mutable
-         {
-            outVectorView[ i ] = localMatrix->rowVectorProduct( i, inView );
-         };
-         Algorithms::ParallelFor< DeviceType >::exec( localOnlySpan.first, localOnlySpan.second, kernel1,
-                                                      &localMatrixPointer.template getData< DeviceType >() );
+         localMatrix.vectorProduct( inVector, outVectorView, 1.0, 0.0, localOnlySpan.first, localOnlySpan.second );
 
          // wait for all communications to finish
          CommunicatorType::WaitAll( &commRequests[0], commRequests.size() );
 
          // finish the multiplication by adding the non-local entries
-         auto kernel2 = [=] __cuda_callable__ ( IndexType i, const MatrixType* localMatrix ) mutable
-         {
-            outVectorView[ i ] = localMatrix->rowVectorProduct( i, globalBufferView );
-         };
-         Algorithms::ParallelFor< DeviceType >::exec( (IndexType) 0, localOnlySpan.first, kernel2,
-                                                      &localMatrixPointer.template getData< DeviceType >() );
-         Algorithms::ParallelFor< DeviceType >::exec( localOnlySpan.second, localMatrix.getRows(), kernel2,
-                                                      &localMatrixPointer.template getData< DeviceType >() );
+         localMatrix.vectorProduct( globalBufferView, outVectorView, 1.0, 0.0, 0, localOnlySpan.first );  // rows before the local-only span; NOTE(review): if localOnlySpan.first == 0 this passes lastRow == 0, the value SparseMatrix::vectorProduct declares as its default — confirm it is treated as an empty range
+         localMatrix.vectorProduct( globalBufferView, outVectorView, 1.0, 0.0, localOnlySpan.second, localMatrix.getRows() );  // rows after the local-only span
       }
    }
 
@@ -237,7 +218,7 @@ public:
 
 protected:
    // communication pattern
-   Matrices::Dense< IndexType, Devices::Host, int > commPatternStarts, commPatternEnds;
+   Matrices::DenseMatrix< IndexType, Devices::Host, int, true, Allocators::Host< IndexType > > commPatternStarts, commPatternEnds;
 
    // span of rows with only block-diagonal entries
    std::pair< IndexType, IndexType > localOnlySpan;
diff --git a/src/TNL/Matrices/Legacy/BiEllpack_impl.h b/src/TNL/Matrices/Legacy/BiEllpack_impl.h
index 1bb393bb939aed770f4a3878ab3fca895920243f..c83c9e0fbbad8aaaedd17850ced7793ad4d85330 100644
--- a/src/TNL/Matrices/Legacy/BiEllpack_impl.h
+++ b/src/TNL/Matrices/Legacy/BiEllpack_impl.h
@@ -1070,6 +1070,7 @@ public:
 						if( matrix.rowPermArray.getElement( k ) == j + 1 )
 							permIndex2 = k;
 					}
+               std::cerr << "permIndex2 = " << permIndex2 << std::endl;  // NOTE(review): looks like leftover debug output printed on every iteration — confirm it is intended before merging
 					if( rowLengths.getElement( permIndex1 ) < rowLengths.getElement( permIndex2 ) )
 					{
 						Index temp = matrix.rowPermArray.getElement( permIndex1 );
diff --git a/src/TNL/Matrices/Legacy/SparseRow.h b/src/TNL/Matrices/Legacy/SparseRow.h
index d0008c93fa03f7e1f0ebaa385e1b981c88f8c3b9..eb7a461fba5d59763bfad608dbef3ea3327aa5d1 100644
--- a/src/TNL/Matrices/Legacy/SparseRow.h
+++ b/src/TNL/Matrices/Legacy/SparseRow.h
@@ -51,12 +51,30 @@ class SparseRow
       __cuda_callable__
       const Index& getElementColumn( const Index& elementIndex ) const;
 
+      __cuda_callable__
+      const Index& getColumnIndex( const Index& elementIndex ) const  // forwards to getElementColumn() under a second accessor name
+      {
+         return getElementColumn( elementIndex );
+      }
+
+      
       __cuda_callable__
       const Real& getElementValue( const Index& elementIndex ) const;
 
+      __cuda_callable__
+      const Real& getValue( const Index& elementIndex ) const  // forwards to getElementValue() under a second accessor name
+      {
+         return getElementValue( elementIndex );
+      }
+
+
       __cuda_callable__
       Index getLength() const;
 
+      __cuda_callable__
+      Index getSize() const { return length; }  // returns the stored `length` member
+
+
       __cuda_callable__
       Index getNonZeroElementsCount() const;
 
diff --git a/src/TNL/Matrices/Matrix.h b/src/TNL/Matrices/Matrix.h
index 129a54cbe0cf47499fa5faa5dab45ad09b50834e..ba21721686b58bddb3b7af316223639319d9750d 100644
--- a/src/TNL/Matrices/Matrix.h
+++ b/src/TNL/Matrices/Matrix.h
@@ -47,8 +47,8 @@ public:
            const IndexType columns,
            const RealAllocatorType& allocator = RealAllocatorType() );
 
-   void setDimensions( const IndexType rows,
-                       const IndexType columns );
+   virtual void setDimensions( const IndexType rows,
+                               const IndexType columns );
 
    template< typename Matrix_ >
    void setLike( const Matrix_& matrix );
diff --git a/src/TNL/Matrices/MatrixInfo.h b/src/TNL/Matrices/MatrixInfo.h
index ed999c9f2c458be9a746e43dbaf9a503e69feee9..e91e8a40444223f2831fa3a1231309a9d855cce9 100644
--- a/src/TNL/Matrices/MatrixInfo.h
+++ b/src/TNL/Matrices/MatrixInfo.h
@@ -11,7 +11,7 @@
 #pragma once
 
 #include <TNL/String.h>
-#include <TNL/Matrices/Dense.h>
+#include <TNL/Matrices/DenseMatrix.h>
 #include <TNL/Matrices/DenseMatrixView.h>
 #include <TNL/Matrices/SparseMatrix.h>
 #include <TNL/Matrices/SparseMatrixView.h>
@@ -48,8 +48,8 @@ template< typename Real,
           typename Index,
           bool RowMajorOrder,
           typename RealAllocator >
-struct MatrixInfo< Dense< Real, Device, Index, RowMajorOrder, RealAllocator > >
-: public MatrixInfo< typename Dense< Real, Device, Index, RowMajorOrder, RealAllocator >::ViewType >
+struct MatrixInfo< DenseMatrix< Real, Device, Index, RowMajorOrder, RealAllocator > >
+: public MatrixInfo< typename DenseMatrix< Real, Device, Index, RowMajorOrder, RealAllocator >::ViewType >
 {
 };
 
diff --git a/src/TNL/Matrices/MatrixView.h b/src/TNL/Matrices/MatrixView.h
index 89551018186e279ced32005971ccfe44d7bda918..76a3948a98792388120097cc7e20190ba58c95e5 100644
--- a/src/TNL/Matrices/MatrixView.h
+++ b/src/TNL/Matrices/MatrixView.h
@@ -83,8 +83,6 @@ public:
 
    virtual void save( File& file ) const;
 
-   virtual void load( File& file );
-
    virtual void print( std::ostream& str ) const;
 
 
diff --git a/src/TNL/Matrices/MatrixView.hpp b/src/TNL/Matrices/MatrixView.hpp
index dfac8f3afc671db39382fff6cc1916e73f3fc4b0..b2b181e4c4671607728bfb9f37935a23fe258a30 100644
--- a/src/TNL/Matrices/MatrixView.hpp
+++ b/src/TNL/Matrices/MatrixView.hpp
@@ -155,17 +155,6 @@ void MatrixView< Real, Device, Index >::save( File& file ) const
    file << this->values;
 }
 
-template< typename Real,
-          typename Device,
-          typename Index >
-void MatrixView< Real, Device, Index >::load( File& file )
-{
-   Object::load( file );
-   file.load( &this->rows );
-   file.load( &this->columns );
-   file >> this->values;
-}
-
 template< typename Real,
           typename Device,
           typename Index >
diff --git a/src/TNL/Matrices/SparseMatrix.h b/src/TNL/Matrices/SparseMatrix.h
index 7dc554ae407ab5968738ca9485f7dafde7ebf0af..ab5caacfc39b7233ae057405ff0787ca13a7a13b 100644
--- a/src/TNL/Matrices/SparseMatrix.h
+++ b/src/TNL/Matrices/SparseMatrix.h
@@ -17,7 +17,7 @@
 #include <TNL/Containers/Segments/CSR.h>
 #include <TNL/Matrices/SparseMatrixRowView.h>
 #include <TNL/Matrices/SparseMatrixView.h>
-#include <TNL/Matrices/Dense.h>
+#include <TNL/Matrices/DenseMatrix.h>
 
 namespace TNL {
 namespace Matrices {
@@ -59,11 +59,23 @@ class SparseMatrix : public Matrix< Real, Device, Index, RealAllocator >
       using ConstRowsCapacitiesView = typename RowsCapacitiesView::ConstViewType;
       using ValuesVectorType = typename Matrix< Real, Device, Index, RealAllocator >::ValuesVectorType;
       using ValuesViewType = typename ValuesVectorType::ViewType;
+      using ConstValuesViewType = typename ValuesViewType::ConstViewType;
       using ColumnsIndexesVectorType = Containers::Vector< IndexType, DeviceType, IndexType, IndexAllocatorType >;
       using ColumnsIndexesViewType = typename ColumnsIndexesVectorType::ViewType;
+      using ConstColumnsIndexesViewType = typename ColumnsIndexesViewType::ConstViewType;
       using ViewType = SparseMatrixView< Real, Device, Index, MatrixType, SegmentsViewTemplate >;
       using ConstViewType = SparseMatrixView< typename std::add_const< Real >::type, Device, Index, MatrixType, SegmentsViewTemplate >;
       using RowView = SparseMatrixRowView< SegmentViewType, ValuesViewType, ColumnsIndexesViewType, isBinary() >;
+      using ConstRowView = typename RowView::ConstViewType;
+
+      template< typename _Real = Real,
+                typename _Device = Device,
+                typename _Index = Index,
+                typename _MatrixType = MatrixType,
+                template< typename, typename, typename > class _Segments = Segments,
+                typename _RealAllocator = typename Allocators::Default< _Device >::template Allocator< _Real >,
+                typename _IndexAllocator = typename Allocators::Default< _Device >::template Allocator< _Index > >
+      using Self = SparseMatrix< _Real, _Device, _Index, _MatrixType, _Segments, _RealAllocator, _IndexAllocator >;
 
       // TODO: remove this - it is here only for compatibility with original matrix implementation
       typedef Containers::Vector< IndexType, DeviceType, IndexType > CompressedRowLengthsVector;
@@ -73,9 +85,9 @@ class SparseMatrix : public Matrix< Real, Device, Index, RealAllocator >
       SparseMatrix( const RealAllocatorType& realAllocator = RealAllocatorType(),
                     const IndexAllocatorType& indexAllocator = IndexAllocatorType() );
 
-      SparseMatrix( const SparseMatrix& m );
+      SparseMatrix( const SparseMatrix& m ) : BaseType( m ), columnIndexes( m.columnIndexes ), segments( m.segments ), view( this->getView() ) {}  // rebuild the cached view from this object's own copied storage; a defaulted copy would take m.view as-is
 
-      SparseMatrix( const SparseMatrix&& m );
+      SparseMatrix( SparseMatrix&& m ) = default;
 
       SparseMatrix( const IndexType rows,
                     const IndexType columns,
@@ -99,6 +111,9 @@ class SparseMatrix : public Matrix< Real, Device, Index, RealAllocator >
                              const IndexType columns,
                              const std::map< std::pair< MapIndex, MapIndex > , MapValue >& map );
 
+      virtual void setDimensions( const IndexType rows,
+                                  const IndexType columns ) override;
+
       ViewType getView() const; // TODO: remove const
 
       ConstViewType getConstView() const;
@@ -108,11 +123,12 @@ class SparseMatrix : public Matrix< Real, Device, Index, RealAllocator >
       virtual String getSerializationTypeVirtual() const;
 
       template< typename RowsCapacitiesVector >
-      void setCompressedRowLengths( const RowsCapacitiesVector& rowCapacities );
+      void setRowCapacities( const RowsCapacitiesVector& rowCapacities );
 
       // TODO: Remove this when possible
-      void setCompressedRowLengths( ConstCompressedRowLengthsVectorView rowLengths ) {
-         this->setCompressedRowLengths( rowLengths );
+      template< typename RowsCapacitiesVector >
+      void setCompressedRowLengths( const RowsCapacitiesVector& rowLengths ) {
+         this->setRowCapacities( rowLengths );
       };
 
       void setElements( const std::initializer_list< std::tuple< IndexType, IndexType, RealType > >& data );
@@ -124,8 +140,7 @@ class SparseMatrix : public Matrix< Real, Device, Index, RealAllocator >
       template< typename Vector >
       void getCompressedRowLengths( Vector& rowLengths ) const;
 
-      [[deprecated]]
-      virtual IndexType getRowLength( const IndexType row ) const { return 0;};
+      IndexType getRowCapacity( const IndexType row ) const;
 
       template< typename Matrix >
       void setLike( const Matrix& matrix );
@@ -135,27 +150,30 @@ class SparseMatrix : public Matrix< Real, Device, Index, RealAllocator >
       void reset();
 
       __cuda_callable__
-      const RowView getRow( const IndexType& rowIdx ) const;
+      const ConstRowView getRow( const IndexType& rowIdx ) const;
 
       __cuda_callable__
       RowView getRow( const IndexType& rowIdx );
 
+      __cuda_callable__
       void setElement( const IndexType row,
                        const IndexType column,
                        const RealType& value );
 
+      __cuda_callable__
       void addElement( const IndexType row,
                        const IndexType column,
                        const RealType& value,
                        const RealType& thisElementMultiplicator );
 
+      __cuda_callable__
       RealType getElement( const IndexType row,
                            const IndexType column ) const;
 
-      template< typename Vector >
+      /*template< typename Vector >
       __cuda_callable__
       typename Vector::RealType rowVectorProduct( const IndexType row,
-                                                  const Vector& vector ) const;
+                                                  const Vector& vector ) const;*/
 
       /***
        * \brief This method computes outVector = matrixMultiplicator * ( *this ) * inVector + inVectorAddition * inVector
@@ -165,7 +183,9 @@ class SparseMatrix : public Matrix< Real, Device, Index, RealAllocator >
       void vectorProduct( const InVector& inVector,
                           OutVector& outVector,
                           const RealType& matrixMultiplicator = 1.0,
-                          const RealType& outVectorMultiplicator = 0.0 ) const;
+                          const RealType& outVectorMultiplicator = 0.0,
+                          const IndexType firstRow = 0,  // first row of the processed range
+                          const IndexType lastRow = 0 ) const;  // end of the processed range; NOTE(review): presumably exclusive, with the default 0 meaning "up to the last row" — confirm in the implementation
 
       /*template< typename Real2, typename Index2 >
       void addMatrix( const SparseMatrix< Real2, Segments, Device, Index2 >& matrix,
@@ -178,10 +198,10 @@ class SparseMatrix : public Matrix< Real, Device, Index, RealAllocator >
        */
 
       template< typename Fetch, typename Reduce, typename Keep, typename FetchReal >
-      void rowsReduction( IndexType first, IndexType last, Fetch& fetch, Reduce& reduce, Keep& keep, const FetchReal& zero ) const;
+      void rowsReduction( IndexType first, IndexType last, Fetch& fetch, const Reduce& reduce, Keep& keep, const FetchReal& zero ) const;
 
       template< typename Fetch, typename Reduce, typename Keep, typename FetchReal >
-      void allRowsReduction( Fetch& fetch, Reduce& reduce, Keep& keep, const FetchReal& zero ) const;
+      void allRowsReduction( Fetch& fetch, const Reduce& reduce, Keep& keep, const FetchReal& zero ) const;
 
       template< typename Function >
       void forRows( IndexType first, IndexType last, Function& function ) const;
@@ -212,7 +232,7 @@ class SparseMatrix : public Matrix< Real, Device, Index, RealAllocator >
        * \brief Assignment of dense matrix
        */
       template< typename Real_, typename Device_, typename Index_, bool RowMajorOrder, typename RealAllocator_ >
-      SparseMatrix& operator=( const Dense< Real_, Device_, Index_, RowMajorOrder, RealAllocator_ >& matrix );
+      SparseMatrix& operator=( const DenseMatrix< Real_, Device_, Index_, RowMajorOrder, RealAllocator_ >& matrix );
 
 
       /**
@@ -223,6 +243,12 @@ class SparseMatrix : public Matrix< Real, Device, Index, RealAllocator >
       template< typename RHSMatrix >
       SparseMatrix& operator=( const RHSMatrix& matrix );
 
+      template< typename Matrix >
+      bool operator==( const Matrix& m ) const;
+
+      template< typename Matrix >
+      bool operator!=( const Matrix& m ) const;
+
       void save( File& file ) const;
 
       void load( File& file );
@@ -252,7 +278,7 @@ class SparseMatrix : public Matrix< Real, Device, Index, RealAllocator >
       ViewType view;
 };
 
-} // namespace Matrices
+} // namespace Matrices
 } // namespace TNL
 
 #include <TNL/Matrices/SparseMatrix.hpp>
diff --git a/src/TNL/Matrices/SparseMatrix.hpp b/src/TNL/Matrices/SparseMatrix.hpp
index e143c014f8d085a69cd4506e071139cacd8a2090..7ce144e34dcd829319683718fc3db79e2ee2930e 100644
--- a/src/TNL/Matrices/SparseMatrix.hpp
+++ b/src/TNL/Matrices/SparseMatrix.hpp
@@ -28,33 +28,7 @@ template< typename Real,
 SparseMatrix< Real, Device, Index, MatrixType, Segments, RealAllocator, IndexAllocator >::
 SparseMatrix( const RealAllocatorType& realAllocator,
               const IndexAllocatorType& indexAllocator )
-   : BaseType( realAllocator ), columnIndexes( indexAllocator )
-{
-}
-
-template< typename Real,
-          typename Device,
-          typename Index,
-          typename MatrixType,
-          template< typename, typename, typename > class Segments,
-          typename RealAllocator,
-          typename IndexAllocator >
-SparseMatrix< Real, Device, Index, MatrixType, Segments, RealAllocator, IndexAllocator >::
-SparseMatrix( const SparseMatrix& m )
-   : Matrix< Real, Device, Index, RealAllocator >( m ), columnIndexes( m.columnIndexes )
-{
-}
-
-template< typename Real,
-          typename Device,
-          typename Index,
-          typename MatrixType,
-          template< typename, typename, typename > class Segments,
-          typename RealAllocator,
-          typename IndexAllocator >
-SparseMatrix< Real, Device, Index, MatrixType, Segments, RealAllocator, IndexAllocator >::
-SparseMatrix( const SparseMatrix&& m )
-   : Matrix< Real, Device, Index, RealAllocator >( std::move( m ) ), columnIndexes( std::move( m.columnIndexes ) )
+: BaseType( realAllocator ), columnIndexes( indexAllocator ), view( this->getView() )
 {
 }
 
@@ -70,7 +44,9 @@ SparseMatrix( const IndexType rows,
               const IndexType columns,
               const RealAllocatorType& realAllocator,
               const IndexAllocatorType& indexAllocator )
-: BaseType( rows, columns, realAllocator ), columnIndexes( indexAllocator )
+: BaseType( rows, columns, realAllocator ), columnIndexes( indexAllocator ),
+  segments( Containers::Vector< IndexType, DeviceType, IndexType >( rows, 0 ) ),
+  view( this->getView() )
 {
 }
 
@@ -127,6 +103,23 @@ SparseMatrix( const IndexType rows,
    this->setElements( map );
 }
 
+template< typename Real,
+          typename Device,
+          typename Index,
+          typename MatrixType,
+          template< typename, typename, typename > class Segments,
+          typename RealAllocator,
+          typename IndexAllocator >
+void
+SparseMatrix< Real, Device, Index, MatrixType, Segments, RealAllocator, IndexAllocator >::
+setDimensions( const IndexType rows,  // override of the base-class setDimensions(): also resizes segments and refreshes the view
+               const IndexType columns )
+{
+   BaseType::setDimensions( rows, columns );  // let the base class record the new dimensions first
+   segments.setSegmentsSizes( Containers::Vector< IndexType, DeviceType, IndexType >( rows, 0 ) );  // presumably one zero-sized segment per row — confirm the Vector( size, value ) constructor
+   this->view = this->getView();  // re-create the cached view after the resize
+}
+
 template< typename Real,
           typename Device,
           typename Index,
@@ -204,7 +197,7 @@ template< typename Real,
    template< typename RowsCapacitiesVector >
 void
 SparseMatrix< Real, Device, Index, MatrixType, Segments, RealAllocator, IndexAllocator >::
-setCompressedRowLengths( const RowsCapacitiesVector& rowsCapacities )
+setRowCapacities( const RowsCapacitiesVector& rowsCapacities )
 {
    TNL_ASSERT_EQ( rowsCapacities.getSize(), this->getRows(), "Number of matrix rows does not fit with rowLengths vector size." );
    using RowsCapacitiesVectorDevice = typename RowsCapacitiesVector::DeviceType;
@@ -312,6 +305,20 @@ getCompressedRowLengths( Vector& rowLengths ) const
    this->view.getCompressedRowLengths( rowLengths );
 }
 
+template< typename Real,
+          typename Device,
+          typename Index,
+          typename MatrixType,
+          template< typename, typename, typename > class Segments,
+          typename RealAllocator,
+          typename IndexAllocator >
+Index
+SparseMatrix< Real, Device, Index, MatrixType, Segments, RealAllocator, IndexAllocator >::
+getRowCapacity( const IndexType row ) const
+{
+   return this->view.getRowCapacity( row );  // answered by the cached view
+}
+
 template< typename Real,
           typename Device,
           typename Index,
@@ -325,6 +332,9 @@ SparseMatrix< Real, Device, Index, MatrixType, Segments, RealAllocator, IndexAll
 setLike( const Matrix_& matrix )
 {
    BaseType::setLike( matrix );
+   this->segments.setSegmentsSizes( Containers::Vector< IndexType, DeviceType, IndexType >( matrix.getRows(), 0 ) );  // size segments from the source matrix's row count
+   this->view = this->getView();  // re-create the cached view after the resize
+   TNL_ASSERT_EQ( this->getRows(), segments.getSegmentsCount(), "mismatched segments count" );
 }
 
 template< typename Real,
@@ -353,6 +363,9 @@ SparseMatrix< Real, Device, Index, MatrixType, Segments, RealAllocator, IndexAll
 reset()
 {
    BaseType::reset();
+   this->segments.reset();  // reset the segments together with the base-class state
+   this->view = this->getView();  // re-create the cached view after the reset
+   TNL_ASSERT_EQ( this->getRows(), segments.getSegmentsCount(), "mismatched segments count" );  // row count and segment count must agree after the reset
 }
 
 template< typename Real,
@@ -364,7 +377,7 @@ template< typename Real,
           typename IndexAllocator >
 __cuda_callable__ auto
 SparseMatrix< Real, Device, Index, MatrixType, Segments, RealAllocator, IndexAllocator >::
-getRow( const IndexType& rowIdx ) const -> const RowView
+getRow( const IndexType& rowIdx ) const -> const ConstRowView
 {
    return this->view.getRow( rowIdx );
 }
@@ -390,7 +403,7 @@ template< typename Real,
           template< typename, typename, typename > class Segments,
           typename RealAllocator,
           typename IndexAllocator >
-void
+__cuda_callable__ void
 SparseMatrix< Real, Device, Index, MatrixType, Segments, RealAllocator, IndexAllocator >::
 setElement( const IndexType row,
             const IndexType column,
@@ -406,7 +419,7 @@ template< typename Real,
           template< typename, typename, typename > class Segments,
           typename RealAllocator,
           typename IndexAllocator >
-void
+__cuda_callable__ void
 SparseMatrix< Real, Device, Index, MatrixType, Segments, RealAllocator, IndexAllocator >::
 addElement( const IndexType row,
             const IndexType column,
@@ -423,6 +436,7 @@ template< typename Real,
           template< typename, typename, typename > class Segments,
           typename RealAllocator,
           typename IndexAllocator >
+__cuda_callable__
 Real
 SparseMatrix< Real, Device, Index, MatrixType, Segments, RealAllocator, IndexAllocator >::
 getElement( const IndexType row,
@@ -431,7 +445,7 @@ getElement( const IndexType row,
    return this->view.getElement( row, column );
 }
 
-template< typename Real,
+/*template< typename Real,
           typename Device,
           typename Index,
           typename MatrixType,
@@ -445,8 +459,8 @@ SparseMatrix< Real, Device, Index, MatrixType, Segments, RealAllocator, IndexAll
 rowVectorProduct( const IndexType row,
                   const Vector& vector ) const
 {
-   this->view.rowVectorProduct( row, vector );
-}
+   return this->view.rowVectorProduct( row, vector );
+}*/
 
 template< typename Real,
           typename Device,
@@ -462,31 +476,11 @@ SparseMatrix< Real, Device, Index, MatrixType, Segments, RealAllocator, IndexAll
 vectorProduct( const InVector& inVector,
                OutVector& outVector,
                const RealType& matrixMultiplicator,
-               const RealType& outVectorMultiplicator ) const
+               const RealType& outVectorMultiplicator,
+               const IndexType firstRow,
+               const IndexType lastRow ) const
 {
-   this->view.vectorProduct( inVector, outVector, matrixMultiplicator, outVectorMultiplicator );
-   /*TNL_ASSERT_EQ( this->getColumns(), inVector.getSize(), "Matrix columns do not fit with input vector." );
-   TNL_ASSERT_EQ( this->getRows(), outVector.getSize(), "Matrix rows do not fit with output vector." );
-
-   const auto inVectorView = inVector.getConstView();
-   auto outVectorView = outVector.getView();
-   const auto valuesView = this->values.getConstView();
-   const auto columnIndexesView = this->columnIndexes.getConstView();
-   const IndexType paddingIndex = this->getPaddingIndex();
-   auto fetch = [=] __cuda_callable__ ( IndexType row, IndexType localIdx, IndexType globalIdx, bool& compute ) -> RealType {
-      const IndexType column = columnIndexesView[ globalIdx ];
-      compute = ( column != paddingIndex );
-      if( ! compute )
-         return 0.0;
-      return valuesView[ globalIdx ] * inVectorView[ column ];
-   };
-   auto reduction = [] __cuda_callable__ ( RealType& sum, const RealType& value ) {
-      sum += value;
-   };
-   auto keeper = [=] __cuda_callable__ ( IndexType row, const RealType& value ) mutable {
-      outVectorView[ row ] = value;
-   };
-   this->segments.segmentsReduction( 0, this->getRows(), fetch, reduction, keeper, ( RealType ) 0.0 );*/
+   this->view.vectorProduct( inVector, outVector, matrixMultiplicator, outVectorMultiplicator, firstRow, lastRow );
 }
 
 template< typename Real,
@@ -499,19 +493,9 @@ template< typename Real,
    template< typename Fetch, typename Reduce, typename Keep, typename FetchValue >
 void
 SparseMatrix< Real, Device, Index, MatrixType, Segments, RealAllocator, IndexAllocator >::
-rowsReduction( IndexType first, IndexType last, Fetch& fetch, Reduce& reduce, Keep& keep, const FetchValue& zero ) const
+rowsReduction( IndexType first, IndexType last, Fetch& fetch, const Reduce& reduce, Keep& keep, const FetchValue& zero ) const
 {
    this->view.rowsReduction( first, last, fetch, reduce, keep, zero );
-   /*const auto columns_view = this->columnIndexes.getConstView();
-   const auto values_view = this->values.getConstView();
-   const IndexType paddingIndex_ = this->getPaddingIndex();
-   auto fetch_ = [=] __cuda_callable__ ( IndexType rowIdx, IndexType localIdx, IndexType globalIdx, bool& compute ) mutable -> decltype( fetch( IndexType(), IndexType(), IndexType(), RealType() ) ) {
-      IndexType columnIdx = columns_view[ globalIdx ];
-      if( columnIdx != paddingIndex_ )
-         return fetch( rowIdx, columnIdx, globalIdx, values_view[ globalIdx ] );
-      return zero;
-   };
-   this->segments.segmentsReduction( first, last, fetch_, reduce, keep, zero );*/
 }
 
 template< typename Real,
@@ -524,7 +508,7 @@ template< typename Real,
    template< typename Fetch, typename Reduce, typename Keep, typename FetchReal >
 void
 SparseMatrix< Real, Device, Index, MatrixType, Segments, RealAllocator, IndexAllocator >::
-allRowsReduction( Fetch& fetch, Reduce& reduce, Keep& keep, const FetchReal& zero ) const
+allRowsReduction( Fetch& fetch, const Reduce& reduce, Keep& keep, const FetchReal& zero ) const
 {
    this->rowsReduction( 0, this->getRows(), fetch, reduce, keep, zero );
 }
@@ -542,15 +526,6 @@ SparseMatrix< Real, Device, Index, MatrixType, Segments, RealAllocator, IndexAll
 forRows( IndexType first, IndexType last, Function& function ) const
 {
    this->view.forRows( first, last, function );
-   /*const auto columns_view = this->columnIndexes.getConstView();
-   const auto values_view = this->values.getConstView();
-   const IndexType paddingIndex_ = this->getPaddingIndex();
-   auto f = [=] __cuda_callable__ ( IndexType rowIdx, IndexType localIdx, IndexType globalIdx ) mutable -> bool {
-      function( rowIdx, localIdx, columns_view[ globalIdx ], values_view[ globalIdx ] );
-      return true;
-   };
-   this->segments.forSegments( first, last, f );
-    */
 }
 
 template< typename Real,
@@ -566,14 +541,6 @@ SparseMatrix< Real, Device, Index, MatrixType, Segments, RealAllocator, IndexAll
 forRows( IndexType first, IndexType last, Function& function )
 {
    this->view.forRows( first, last, function );
-   /*auto columns_view = this->columnIndexes.getView();
-   auto values_view = this->values.getView();
-   const IndexType paddingIndex_ = this->getPaddingIndex();
-   auto f = [=] __cuda_callable__ ( IndexType rowIdx, IndexType localIdx, IndexType globalIdx ) mutable -> bool {
-      function( rowIdx, localIdx, columns_view[ globalIdx ], values_view[ globalIdx ] );
-      return true;
-   };
-   this->segments.forSegments( first, last, f );*/
 }
 
 template< typename Real,
@@ -684,9 +651,9 @@ template< typename Real,
    template< typename Real_, typename Device_, typename Index_, bool RowMajorOrder, typename RealAllocator_ >
 SparseMatrix< Real, Device, Index, MatrixType, Segments, RealAllocator, IndexAllocator >&
 SparseMatrix< Real, Device, Index, MatrixType, Segments, RealAllocator, IndexAllocator >::
-operator=( const Dense< Real_, Device_, Index_, RowMajorOrder, RealAllocator_ >& matrix )
+operator=( const DenseMatrix< Real_, Device_, Index_, RowMajorOrder, RealAllocator_ >& matrix )
 {
-   using RHSMatrix = Dense< Real_, Device_, Index_, RowMajorOrder, RealAllocator_ >;
+   using RHSMatrix = DenseMatrix< Real_, Device_, Index_, RowMajorOrder, RealAllocator_ >;
    using RHSIndexType = typename RHSMatrix::IndexType;
    using RHSRealType = typename RHSMatrix::RealType;
    using RHSDeviceType = typename RHSMatrix::DeviceType;
@@ -913,6 +880,36 @@ operator=( const RHSMatrix& matrix )
    return *this;
 }
 
+template< typename Real,
+          typename Device,
+          typename Index,
+          typename MatrixType,
+          template< typename, typename, typename > class Segments,
+          typename RealAllocator,
+          typename IndexAllocator >
+   template< typename Matrix >
+bool
+SparseMatrix< Real, Device, Index, MatrixType, Segments, RealAllocator, IndexAllocator >::
+operator==( const Matrix& m ) const
+{
+   return this->view == m;  // the comparison itself is implemented by the matrix view
+}
+
+template< typename Real,
+          typename Device,
+          typename Index,
+          typename MatrixType,
+          template< typename, typename, typename > class Segments,
+          typename RealAllocator,
+          typename IndexAllocator >
+   template< typename Matrix >
+bool
+SparseMatrix< Real, Device, Index, MatrixType, Segments, RealAllocator, IndexAllocator >::
+operator!=( const Matrix& m ) const
+{
+   return this->view != m;  // the comparison itself is implemented by the matrix view
+}
+
 template< typename Real,
           typename Device,
           typename Index,
diff --git a/src/TNL/Matrices/SparseMatrixRowView.h b/src/TNL/Matrices/SparseMatrixRowView.h
index 8906ab5ae9fd1457ee6690597898a001bdab7c18..c859655ef5ba00fa5ed759e01b6a126dd4fd2324 100644
--- a/src/TNL/Matrices/SparseMatrixRowView.h
+++ b/src/TNL/Matrices/SparseMatrixRowView.h
@@ -10,8 +10,12 @@
 
 #pragma once
 
+#include <ostream>
+
+#include <TNL/Cuda/CudaCallable.h>
+
 namespace TNL {
-   namespace Matrices {
+namespace Matrices {
 
 template< typename SegmentView,
           typename ValuesView,
@@ -26,6 +30,9 @@ class SparseMatrixRowView
       using IndexType = typename SegmentViewType::IndexType;
       using ValuesViewType = ValuesView;
       using ColumnsIndexesViewType = ColumnsIndexesView;
+      using ConstValuesViewType = typename ValuesViewType::ConstViewType;
+      using ConstColumnsIndexesViewType = typename ColumnsIndexesViewType::ConstViewType;
+      using ConstViewType = SparseMatrixRowView< SegmentView, ConstValuesViewType, ConstColumnsIndexesViewType, isBinary_ >;
 
       static constexpr bool isBinary() { return isBinary_; };
 
@@ -49,10 +56,22 @@ class SparseMatrixRowView
       __cuda_callable__
       RealType& getValue( const IndexType localIdx );
 
+      __cuda_callable__
+      void setValue( const IndexType localIdx,
+                     const RealType& value );
+
       __cuda_callable__
       void setElement( const IndexType localIdx,
                        const IndexType column,
                        const RealType& value );
+
+      template< typename _SegmentView,
+                typename _ValuesView,
+                typename _ColumnsIndexesView,
+                bool _isBinary >
+      __cuda_callable__
+      bool operator==( const SparseMatrixRowView< _SegmentView, _ValuesView, _ColumnsIndexesView, _isBinary >& other ) const;
+
    protected:
 
       SegmentViewType segmentView;
@@ -61,7 +80,14 @@ class SparseMatrixRowView
 
       ColumnsIndexesViewType columnIndexes;
 };
-   } // namespace Matrices
+
+template< typename SegmentView,
+          typename ValuesView,
+          typename ColumnsIndexesView,
+          bool isBinary_ >
+std::ostream& operator<<( std::ostream& str, const SparseMatrixRowView< SegmentView, ValuesView, ColumnsIndexesView, isBinary_ >& row );
+
+} // namespace Matrices
 } // namespace TNL
 
 #include <TNL/Matrices/SparseMatrixRowView.hpp>
diff --git a/src/TNL/Matrices/SparseMatrixRowView.hpp b/src/TNL/Matrices/SparseMatrixRowView.hpp
index 67d0845d4af23dd57065b516b212a278fbd0fd5d..545e395fc309cc193fe6f0ed58bff92f7218a6de 100644
--- a/src/TNL/Matrices/SparseMatrixRowView.hpp
+++ b/src/TNL/Matrices/SparseMatrixRowView.hpp
@@ -11,9 +11,10 @@
 #pragma once
 
 #include <TNL/Matrices/SparseMatrixRowView.h>
+#include <TNL/Assert.h>
 
 namespace TNL {
-   namespace Matrices {
+namespace Matrices {
 
 template< typename SegmentView,
           typename ValuesView,
@@ -89,6 +90,22 @@ getValue( const IndexType localIdx ) -> RealType&
    return values[ segmentView.getGlobalIndex( localIdx ) ];
 }
 
+template< typename SegmentView,
+          typename ValuesView,
+          typename ColumnsIndexesView,
+          bool isBinary_ >
+__cuda_callable__ void
+SparseMatrixRowView< SegmentView, ValuesView, ColumnsIndexesView, isBinary_ >::
+setValue( const IndexType localIdx,
+          const RealType& value )
+{
+   // Overwrites the value stored at position localIdx of this row; the column index is left as it is.
+   TNL_ASSERT_LT( localIdx, this->getSize(), "Local index exceeds matrix row capacity." );
+   // For binary rows nothing is written into the values array.
+   if( ! isBinary() )
+      values[ segmentView.getGlobalIndex( localIdx ) ] = value;
+}
+
 template< typename SegmentView,
           typename ValuesView,
           typename ColumnsIndexesView,
@@ -106,6 +123,49 @@ setElement( const IndexType localIdx,
       values[ globalIdx ] = value;
 }
 
+template< typename SegmentView,
+          typename ValuesView,
+          typename ColumnsIndexesView,
+          bool isBinary_ >
+   template< typename _SegmentView,
+             typename _ValuesView,
+             typename _ColumnsIndexesView,
+             bool _isBinary >
+__cuda_callable__
+bool
+SparseMatrixRowView< SegmentView, ValuesView, ColumnsIndexesView, isBinary_ >::
+operator==( const SparseMatrixRowView< _SegmentView, _ValuesView, _ColumnsIndexesView, _isBinary >& other ) const
+{
+   // Walk the positions present in both rows; a differing column index or value decides the result.
+   IndexType common = 0;
+   for( ; common < getSize() && common < other.getSize(); ++common ) {
+      if( getColumnIndex( common ) != other.getColumnIndex( common )
+          || getValue( common ) != other.getValue( common ) )
+         return false;
+   }
+   // Beyond the common part, the longer row may only hold entries with a negative column index.
+   // TODO: use ... != getPaddingIndex()
+   for( IndexType k = common; k < getSize(); k++ )
+      if( getColumnIndex( k ) >= 0 )
+         return false;
+   // TODO: use ... != getPaddingIndex()
+   for( IndexType k = common; k < other.getSize(); k++ )
+      if( other.getColumnIndex( k ) >= 0 )
+         return false;
+   return true;
+}
+
+template< typename SegmentView,
+          typename ValuesView,
+          typename ColumnsIndexesView,
+          bool isBinary_ >
+std::ostream& operator<<( std::ostream& str, const SparseMatrixRowView< SegmentView, ValuesView, ColumnsIndexesView, isBinary_ >& row )
+{
+   using RowType = SparseMatrixRowView< SegmentView, ValuesView, ColumnsIndexesView, isBinary_ >;  // prints " [ column ] = value, " per entry
+   for( std::remove_const_t< typename RowType::IndexType > localIdx = 0; localIdx < row.getSize(); ++localIdx )
+      str << " [ " << row.getColumnIndex( localIdx ) << " ] = " << row.getValue( localIdx ) << ", ";
+   return str;
+}
 
-   } // namespace Matrices
+} // namespace Matrices
 } // namespace TNL
diff --git a/src/TNL/Matrices/SparseMatrixView.h b/src/TNL/Matrices/SparseMatrixView.h
index 4fa65b70a09e7834aacac1ac80d74ee08c9e4ece..4dc3413b8f68ab07250e9bbca455057c72fa5cab 100644
--- a/src/TNL/Matrices/SparseMatrixView.h
+++ b/src/TNL/Matrices/SparseMatrixView.h
@@ -41,10 +41,13 @@ class SparseMatrixView : public MatrixView< Real, Device, Index >
       using RowsCapacitiesView = Containers::VectorView< IndexType, DeviceType, IndexType >;
       using ConstRowsCapacitiesView = typename RowsCapacitiesView::ConstViewType;
       using ValuesViewType = typename BaseType::ValuesView;
+      using ConstValuesViewType = typename ValuesViewType::ConstViewType;
       using ColumnsIndexesViewType = Containers::VectorView< IndexType, DeviceType, IndexType >;
+      using ConstColumnsIndexesViewType = typename ColumnsIndexesViewType::ConstViewType;
       using ViewType = SparseMatrixView< typename std::remove_const< Real >::type, Device, Index, MatrixType, SegmentsViewTemplate >;
       using ConstViewType = SparseMatrixView< typename std::add_const< Real >::type, Device, Index, MatrixType, SegmentsViewTemplate >;
       using RowView = SparseMatrixRowView< SegmentViewType, ValuesViewType, ColumnsIndexesViewType, isBinary() >;
+      using ConstRowView = typename RowView::ConstViewType;
 
       // TODO: remove this - it is here only for compatibility with original matrix implementation
       typedef Containers::Vector< IndexType, DeviceType, IndexType > CompressedRowLengthsVector;
@@ -80,35 +83,37 @@ class SparseMatrixView : public MatrixView< Real, Device, Index >
       template< typename Vector >
       void getCompressedRowLengths( Vector& rowLengths ) const;
 
-      [[deprecated]]
-      IndexType getRowLength( const IndexType row ) const;
+      IndexType getRowCapacity( const IndexType row ) const;
 
       IndexType getNumberOfNonzeroMatrixElements() const;
 
       void reset();
 
       __cuda_callable__
-      const RowView getRow( const IndexType& rowIdx ) const;
+      ConstRowView getRow( const IndexType& rowIdx ) const;
 
       __cuda_callable__
       RowView getRow( const IndexType& rowIdx );
 
+      __cuda_callable__
       void setElement( const IndexType row,
                        const IndexType column,
                        const RealType& value );
 
+      __cuda_callable__
       void addElement( IndexType row,
                        IndexType column,
                        const RealType& value,
                        const RealType& thisElementMultiplicator = 1.0 );
 
+      __cuda_callable__
       RealType getElement( IndexType row,
                            IndexType column ) const;
 
-      template< typename Vector >
+      /*template< typename Vector >
       __cuda_callable__
       typename Vector::RealType rowVectorProduct( const IndexType row,
-                                                  const Vector& vector ) const;
+                                                  const Vector& vector ) const;*/
 
       /***
        * \brief This method computes outVector = matrixMultiplicator * ( *this ) * inVector + inVectorAddition * inVector
@@ -118,13 +123,15 @@ class SparseMatrixView : public MatrixView< Real, Device, Index >
       void vectorProduct( const InVector& inVector,
                           OutVector& outVector,
                           const RealType matrixMultiplicator = 1.0,
-                          const RealType outVectorMultiplicator = 0.0 ) const;
+                          const RealType outVectorMultiplicator = 0.0,
+                          const IndexType firstRow = 0,
+                          IndexType lastRow = 0 ) const;
 
       template< typename Fetch, typename Reduce, typename Keep, typename FetchReal >
-      void rowsReduction( IndexType first, IndexType last, Fetch& fetch, Reduce& reduce, Keep& keep, const FetchReal& zero ) const;
+      void rowsReduction( IndexType first, IndexType last, Fetch& fetch, const Reduce& reduce, Keep& keep, const FetchReal& zero ) const;
 
       template< typename Fetch, typename Reduce, typename Keep, typename FetchReal >
-      void allRowsReduction( Fetch& fetch, Reduce& reduce, Keep& keep, const FetchReal& zero ) const;
+      void allRowsReduction( Fetch& fetch, const Reduce& reduce, Keep& keep, const FetchReal& zero ) const;
 
       template< typename Function >
       void forRows( IndexType first, IndexType last, Function& function ) const;
@@ -146,6 +153,12 @@ class SparseMatrixView : public MatrixView< Real, Device, Index >
 
       SparseMatrixView& operator=( const SparseMatrixView& matrix );
 
+      template< typename Matrix >
+      bool operator==( const Matrix& m ) const;
+
+      template< typename Matrix >
+      bool operator!=( const Matrix& m ) const;
+
       void save( File& file ) const;
 
       void save( const String& fileName ) const;
@@ -160,9 +173,25 @@ class SparseMatrixView : public MatrixView< Real, Device, Index >
       ColumnsIndexesViewType columnIndexes;
 
       SegmentsViewType segments;
+
+   private:
+      // TODO: these helpers should probably be moved into a detail namespace
+      template< typename VectorOrView,
+                std::enable_if_t< HasSetSizeMethod< VectorOrView >::value, bool > = true >  // overload chosen when HasSetSizeMethod< VectorOrView >::value is true
+      static void set_size_if_resizable( VectorOrView& v, IndexType size )
+      {
+         v.setSize( size );  // resize v to the requested size
+      }
+
+      template< typename VectorOrView,
+                std::enable_if_t< ! HasSetSizeMethod< VectorOrView >::value, bool > = true >  // overload chosen when HasSetSizeMethod< VectorOrView >::value is false
+      static void set_size_if_resizable( VectorOrView& v, IndexType size )
+      {
+         TNL_ASSERT_EQ( v.getSize(), size, "view has wrong size" );  // no resize here: only assert that the size already matches
+      }
 };
 
-}  // namespace Conatiners
+} // namespace Matrices
 } // namespace TNL
 
 #include <TNL/Matrices/SparseMatrixView.hpp>
diff --git a/src/TNL/Matrices/SparseMatrixView.hpp b/src/TNL/Matrices/SparseMatrixView.hpp
index 2bae61f985c312279d60de9f809b71ea3a19629f..26de19dcbd47192394c9b0fa132d62b46d0b01f4 100644
--- a/src/TNL/Matrices/SparseMatrixView.hpp
+++ b/src/TNL/Matrices/SparseMatrixView.hpp
@@ -14,6 +14,7 @@
 #include <TNL/Matrices/SparseMatrixView.h>
 #include <TNL/Algorithms/Reduction.h>
 #include <TNL/Algorithms/AtomicOperations.h>
+#include <TNL/Matrices/details/SparseMatrix.h>
 
 namespace TNL {
 namespace Matrices {
@@ -41,7 +42,7 @@ SparseMatrixView( const IndexType rows,
                   const ValuesViewType& values,
                   const ColumnsIndexesViewType& columnIndexes,
                   const SegmentsViewType& segments )
- : MatrixView< Real, Device, Index >( rows, columns, values ), columnIndexes( columnIndexes ), segments( segments )
+: MatrixView< Real, Device, Index >( rows, columns, values ), columnIndexes( columnIndexes ), segments( segments )
 {
 }
 
@@ -116,19 +117,16 @@ void
 SparseMatrixView< Real, Device, Index, MatrixType, SegmentsView >::
 getCompressedRowLengths( Vector& rowLengths ) const
 {
-   rowLengths.setSize( this->getRows() );
+   details::set_size_if_resizable( rowLengths, this->getRows() );
    rowLengths = 0;
    auto rowLengths_view = rowLengths.getView();
    auto fetch = [] __cuda_callable__ ( IndexType row, IndexType column, IndexType globalIdx, const RealType& value ) -> IndexType {
       return ( value != 0.0 );
    };
-   auto reduce = [] __cuda_callable__ ( IndexType& aux, const IndexType a ) {
-      aux += a;
-   };
    auto keep = [=] __cuda_callable__ ( const IndexType rowIdx, const IndexType value ) mutable {
       rowLengths_view[ rowIdx ] = value;
    };
-   this->allRowsReduction( fetch, reduce, keep, 0 );
+   this->allRowsReduction( fetch, std::plus<>{}, keep, 0 );
 }
 
 template< typename Real,
@@ -138,9 +136,9 @@ template< typename Real,
           template< typename, typename > class SegmentsView >
 Index
 SparseMatrixView< Real, Device, Index, MatrixType, SegmentsView >::
-getRowLength( const IndexType row ) const
+getRowCapacity( const IndexType row ) const
 {
-   return 0;
+   return this->segments.getSegmentSize( row );
 }
 
 template< typename Real,
@@ -175,13 +173,13 @@ getNumberOfNonzeroMatrixElements() const
             return 0.0;
          return 1 + ( column != row && column < rows && row < columns ); // the addition is for non-diagonal elements
       };
-      auto reduction = [] __cuda_callable__ ( IndexType& sum, const IndexType& value ) {
-         sum += value;
-      };
+      //auto reduction = [] __cuda_callable__ ( IndexType& sum, const IndexType& value ) {
+      //   sum += value;
+      //};
       auto keeper = [=] __cuda_callable__ ( IndexType row, const IndexType& value ) mutable {
          row_sums_view[ row ] = value;
       };
-      this->segments.segmentsReduction( 0, this->getRows(), fetch, reduction, keeper, ( IndexType ) 0 );
+      this->segments.segmentsReduction( 0, this->getRows(), fetch, std::plus<>{}, keeper, ( IndexType ) 0 );
       return sum( row_sums );
    }
 }
@@ -193,10 +191,10 @@ template< typename Real,
           template< typename, typename > class SegmentsView >
 __cuda_callable__ auto
 SparseMatrixView< Real, Device, Index, MatrixType, SegmentsView >::
-getRow( const IndexType& rowIdx ) const -> const RowView
+getRow( const IndexType& rowIdx ) const -> ConstRowView
 {
    TNL_ASSERT_LT( rowIdx, this->getRows(), "Row index is larger than number of matrix rows." );
-   return RowView( this->segments.getSegmentView( rowIdx ), this->values.getView(), this->columnIndexes.getView() );
+   return ConstRowView( this->segments.getSegmentView( rowIdx ), this->values, this->columnIndexes );
 }
 
 template< typename Real,
@@ -209,7 +207,7 @@ SparseMatrixView< Real, Device, Index, MatrixType, SegmentsView >::
 getRow( const IndexType& rowIdx ) -> RowView
 {
    TNL_ASSERT_LT( rowIdx, this->getRows(), "Row index is larger than number of matrix rows." );
-   return RowView( this->segments.getSegmentView( rowIdx ), this->values.getView(), this->columnIndexes.getView() );
+   return RowView( this->segments.getSegmentView( rowIdx ), this->values, this->columnIndexes );
 }
 
 template< typename Real,
@@ -217,7 +215,7 @@ template< typename Real,
           typename Index,
           typename MatrixType,
           template< typename, typename > class SegmentsView >
-void
+__cuda_callable__ void
 SparseMatrixView< Real, Device, Index, MatrixType, SegmentsView >::
 setElement( const IndexType row,
             const IndexType column,
@@ -231,7 +229,7 @@ template< typename Real,
           typename Index,
           typename MatrixType,
           template< typename, typename > class SegmentsView >
-void
+__cuda_callable__ void
 SparseMatrixView< Real, Device, Index, MatrixType, SegmentsView >::
 addElement( IndexType row,
             IndexType column,
@@ -270,9 +268,14 @@ addElement( IndexType row,
    }
    if( i == rowSize )
    {
+#ifndef __CUDA_ARCH__
       std::stringstream msg;
       msg << "The capacity of the sparse matrix row number "  << row << " was exceeded.";
       throw std::logic_error( msg.str() );
+#else
+      TNL_ASSERT_TRUE( false, "");
+      return;
+#endif
    }
    if( col == this->getPaddingIndex() )
    {
@@ -308,6 +311,7 @@ template< typename Real,
           typename Index,
           typename MatrixType,
           template< typename, typename > class SegmentsView >
+__cuda_callable__ 
 Real
 SparseMatrixView< Real, Device, Index, MatrixType, SegmentsView >::
 getElement( IndexType row,
@@ -342,7 +346,7 @@ getElement( IndexType row,
    return 0.0;
 }
 
-template< typename Real,
+/*template< typename Real,
           typename Device,
           typename Index,
           typename MatrixType,
@@ -354,8 +358,9 @@ SparseMatrixView< Real, Device, Index, MatrixType, SegmentsView >::
 rowVectorProduct( const IndexType row,
                   const Vector& vector ) const
 {
-
-}
+   TNL_ASSERT_TRUE( false, "TODO: rowVectorProduct is not implemented yet.");
+   return 0;
+}*/
 
 template< typename Real,
           typename Device,
@@ -369,7 +374,9 @@ SparseMatrixView< Real, Device, Index, MatrixType, SegmentsView >::
 vectorProduct( const InVector& inVector,
                OutVector& outVector,
                const RealType matrixMultiplicator,
-               const RealType outVectorMultiplicator ) const
+               const RealType outVectorMultiplicator,
+               const IndexType firstRow,
+               IndexType lastRow ) const
 {
    TNL_ASSERT_EQ( this->getColumns(), inVector.getSize(), "Matrix columns do not fit with input vector." );
    TNL_ASSERT_EQ( this->getRows(), outVector.getSize(), "Matrix rows do not fit with output vector." );
@@ -381,7 +388,7 @@ vectorProduct( const InVector& inVector,
    const IndexType paddingIndex = this->getPaddingIndex();
    if( isSymmetric() )
       outVector *= outVectorMultiplicator;
-   auto fetch = [=] __cuda_callable__ ( IndexType row, IndexType localIdx, IndexType globalIdx, bool& compute ) mutable -> RealType {
+   auto symmetricFetch = [=] __cuda_callable__ ( IndexType row, IndexType localIdx, IndexType globalIdx, bool& compute ) mutable -> RealType {
       const IndexType column = columnIndexesView[ globalIdx ];
       compute = ( column != paddingIndex );
       if( ! compute )
@@ -397,9 +404,16 @@ vectorProduct( const InVector& inVector,
          return inVectorView[ column ];
       return valuesView[ globalIdx ] * inVectorView[ column ];
    };
-   auto reduction = [] __cuda_callable__ ( RealType& sum, const RealType& value ) {
-      sum += value;
+   auto fetch = [=] __cuda_callable__ ( IndexType globalIdx, bool& compute ) mutable -> RealType {
+      const IndexType column = columnIndexesView[ globalIdx ];
+      compute = ( column != paddingIndex );
+      if( ! compute )
+         return 0.0;
+      if( isBinary() )
+         return inVectorView[ column ];
+      return valuesView[ globalIdx ] * inVectorView[ column ];
    };
+
    auto keeper = [=] __cuda_callable__ ( IndexType row, const RealType& value ) mutable {
       if( isSymmetric() )
          outVectorView[ row ] += matrixMultiplicator * value;
@@ -411,28 +425,12 @@ vectorProduct( const InVector& inVector,
             outVectorView[ row ] = outVectorMultiplicator * outVectorView[ row ] + matrixMultiplicator * value;
       }
    };
-   this->segments.segmentsReduction( 0, this->getRows(), fetch, reduction, keeper, ( RealType ) 0.0 );
-
-   /*const auto inVectorView = inVector.getConstView();
-   auto outVectorView = outVector.getView();
-   const auto valuesView = this->values.getConstView();
-   const auto columnIndexesView = this->columnIndexes.getConstView();
-   const IndexType paddingIndex = this->getPaddingIndex();
-   auto fetch = [=] __cuda_callable__ ( IndexType row, IndexType offset, bool& compute ) -> RealType {
-      const IndexType column = columnIndexesView[ offset ];
-      compute = ( column != paddingIndex );
-      if( ! compute )
-         return 0.0;
-      return valuesView[ offset ] * inVectorView[ column ];
-   };
-   auto reduction = [] __cuda_callable__ ( RealType& sum, const RealType& value ) {
-      sum += value;
-   };
-   auto keeper = [=] __cuda_callable__ ( IndexType row, const RealType& value ) mutable {
-      outVectorView[ row ] = value;
-   };
-   this->segments.segmentsReduction( 0, this->getRows(), fetch, reduction, keeper, ( RealType ) 0.0 );
-   */
+   if( lastRow == 0 )
+      lastRow = this->getRows();
+   if( isSymmetric() )
+      this->segments.segmentsReduction( firstRow, lastRow, symmetricFetch, std::plus<>{}, keeper, ( RealType ) 0.0 );
+   else
+      this->segments.segmentsReduction( firstRow, lastRow, fetch, std::plus<>{}, keeper, ( RealType ) 0.0 );
 }
 
 template< typename Real,
@@ -443,7 +441,7 @@ template< typename Real,
    template< typename Fetch, typename Reduce, typename Keep, typename FetchValue >
 void
 SparseMatrixView< Real, Device, Index, MatrixType, SegmentsView >::
-rowsReduction( IndexType first, IndexType last, Fetch& fetch, Reduce& reduce, Keep& keep, const FetchValue& zero ) const
+rowsReduction( IndexType first, IndexType last, Fetch& fetch, const Reduce& reduce, Keep& keep, const FetchValue& zero ) const
 {
    const auto columns_view = this->columnIndexes.getConstView();
    const auto values_view = this->values.getConstView();
@@ -470,7 +468,7 @@ template< typename Real,
    template< typename Fetch, typename Reduce, typename Keep, typename FetchReal >
 void
 SparseMatrixView< Real, Device, Index, MatrixType, SegmentsView >::
-allRowsReduction( Fetch& fetch, Reduce& reduce, Keep& keep, const FetchReal& zero ) const
+allRowsReduction( Fetch& fetch, const Reduce& reduce, Keep& keep, const FetchReal& zero ) const
 {
    this->rowsReduction( 0, this->getRows(), fetch, reduce, keep, zero );
 }
@@ -611,6 +609,40 @@ operator=( const SparseMatrixView< Real, Device, Index, MatrixType, SegmentsView
    return *this;
 }
 
+template< typename Real,
+          typename Device,
+          typename Index,
+          typename MatrixType,
+          template< typename, typename > class SegmentsView >
+   template< typename Matrix >
+bool
+SparseMatrixView< Real, Device, Index, MatrixType, SegmentsView >::
+operator==( const Matrix& m ) const
+{
+   // Row-by-row comparison; matrices of different dimensions are never equal (this also keeps getRow( i ) in range for m).
+   if( this->getRows() != m.getRows() || this->getColumns() != m.getColumns() )
+      return false;
+   const auto& view1 = *this;  // named alias so that the [=] lambda below captures a copy of the view
+   const auto view2 = m.getView();  // FIXME: getConstView does not work here
+   auto fetch = [=] __cuda_callable__ ( const IndexType i ) -> bool {
+      return view1.getRow( i ) == view2.getRow( i );
+   };
+   return Algorithms::Reduction< DeviceType >::reduce( this->getRows(), std::logical_and<>{}, fetch, true );
+}
+
+template< typename Real,
+          typename Device,
+          typename Index,
+          typename MatrixType,
+          template< typename, typename > class SegmentsView >
+   template< typename Matrix >
+bool
+SparseMatrixView< Real, Device, Index, MatrixType, SegmentsView >::
+operator!=( const Matrix& m ) const
+{
+   return ! ( *this == m );  // inequality is the negation of operator==
+}
+
 template< typename Real,
           typename Device,
           typename Index,
diff --git a/src/TNL/Matrices/SparseOperations_impl.h b/src/TNL/Matrices/SparseOperations_impl.h
index ff507c3268ce059108bd217e207c9c6487cb30c5..97f86c4eec8e60b5f7cf95cfd8996e549885826a 100644
--- a/src/TNL/Matrices/SparseOperations_impl.h
+++ b/src/TNL/Matrices/SparseOperations_impl.h
@@ -36,11 +36,10 @@ SparseMatrixSetRowLengthsVectorKernel( Vector* rowLengths,
    const IndexType gridSize = blockDim.x * gridDim.x;
 
    while( rowIdx < rows ) {
-      const auto max_length = matrix->getRowLengthFast( rowIdx );
       const auto row = matrix->getRow( rowIdx );
       IndexType length = 0;
-      for( IndexType c_j = 0; c_j < max_length; c_j++ )
-         if( row.getElementColumn( c_j ) < cols )
+      for( IndexType c_j = 0; c_j < row.getSize(); c_j++ )
+         if( row.getColumnIndex( c_j ) < cols )
             length++;
          else
             break;
@@ -66,7 +65,7 @@ SparseMatrixCopyKernel( Matrix1* A,
       const auto rowB = B->getRow( rowIdx );
       auto rowA = A->getRow( rowIdx );
       for( IndexType c = 0; c < length; c++ )
-         rowA.setElement( c, rowB.getElementColumn( c ), rowB.getElementValue( c ) );
+         rowA.setElement( c, rowB.getColumnIndex( c ), rowB.getValue( c ) );
       rowIdx += gridSize;
    }
 }
@@ -102,11 +101,10 @@ copySparseMatrix_impl( Matrix1& A, const Matrix2& B )
 #pragma omp parallel for if( Devices::Host::isOMPEnabled() )
 #endif
       for( IndexType i = 0; i < rows; i++ ) {
-         const auto max_length = B.getRowLength( i );
          const auto row = B.getRow( i );
          IndexType length = 0;
-         for( IndexType c_j = 0; c_j < max_length; c_j++ )
-            if( row.getElementColumn( c_j ) < cols )
+         for( IndexType c_j = 0; c_j < row.getSize(); c_j++ )
+            if( row.getColumnIndex( c_j ) < cols )
                length++;
             else
                break;
@@ -122,7 +120,7 @@ copySparseMatrix_impl( Matrix1& A, const Matrix2& B )
          const auto rowB = B.getRow( i );
          auto rowA = A.getRow( i );
          for( IndexType c = 0; c < length; c++ )
-            rowA.setElement( c, rowB.getElementColumn( c ), rowB.getElementValue( c ) );
+            rowA.setElement( c, rowB.getColumnIndex( c ), rowB.getValue( c ) );
       }
    }
 
@@ -228,11 +226,10 @@ copyAdjacencyStructure( const Matrix& A, AdjacencyMatrix& B,
    rowLengths.setSize( N );
    rowLengths.setValue( 0 );
    for( IndexType i = 0; i < A.getRows(); i++ ) {
-      const int maxLength = A.getRowLength( i );
       const auto row = A.getRow( i );
       IndexType length = 0;
-      for( int c_j = 0; c_j < maxLength; c_j++ ) {
-         const IndexType j = row.getElementColumn( c_j );
+      for( int c_j = 0; c_j < row.getSize(); c_j++ ) {
+         const IndexType j = row.getColumnIndex( c_j );
          if( j >= A.getColumns() )
             break;
          length++;
@@ -248,10 +245,9 @@ copyAdjacencyStructure( const Matrix& A, AdjacencyMatrix& B,
 
    // set non-zeros
    for( IndexType i = 0; i < A.getRows(); i++ ) {
-      const int maxLength = A.getRowLength( i );
       const auto row = A.getRow( i );
-      for( int c_j = 0; c_j < maxLength; c_j++ ) {
-         const IndexType j = row.getElementColumn( c_j );
+      for( int c_j = 0; c_j < row.getSize(); c_j++ ) {
+         const IndexType j = row.getColumnIndex( c_j );
          if( j >= A.getColumns() )
             break;
          if( ! ignore_diagonal || i != j )
@@ -282,11 +278,10 @@ reorderSparseMatrix( const Matrix1& matrix1, Matrix2& matrix2, const Permutation
    typename Matrix2::CompressedRowLengthsVector rowLengths;
    rowLengths.setSize( matrix1.getRows() );
    for( IndexType i = 0; i < matrix1.getRows(); i++ ) {
-      const IndexType maxLength = matrix1.getRowLength( perm[ i ] );
       const auto row = matrix1.getRow( perm[ i ] );
       IndexType length = 0;
-      for( IndexType j = 0; j < maxLength; j++ )
-         if( row.getElementColumn( j ) < matrix1.getColumns() )
+      for( IndexType j = 0; j < row.getSize(); j++ )
+         if( row.getColumnIndex( j ) < matrix1.getColumns() )
             length++;
       rowLengths[ i ] = length;
    }
@@ -303,8 +298,8 @@ reorderSparseMatrix( const Matrix1& matrix1, Matrix2& matrix2, const Permutation
       typename Matrix2::IndexType columns[ rowLength ];
       typename Matrix2::RealType values[ rowLength ];
       for( IndexType j = 0; j < rowLength; j++ ) {
-         columns[ j ] = iperm[ row1.getElementColumn( j ) ];
-         values[ j ] = row1.getElementValue( j );
+         columns[ j ] = iperm[ row1.getColumnIndex( j ) ];
+         values[ j ] = row1.getValue( j );
       }
 
       // sort
@@ -319,14 +314,10 @@ reorderSparseMatrix( const Matrix1& matrix1, Matrix2& matrix2, const Permutation
       };
       std::sort( indices, indices + rowLength, comparator );
 
-      typename Matrix2::IndexType sortedColumns[ rowLength ];
-      typename Matrix2::RealType sortedValues[ rowLength ];
-      for( IndexType j = 0; j < rowLength; j++ ) {
-         sortedColumns[ j ] = columns[ indices[ j ] ];
-         sortedValues[ j ] = values[ indices[ j ] ];
-      }
-
-      matrix2.setRow( i, sortedColumns, sortedValues, rowLength );
+      // set the row
+      auto row2 = matrix2.getRow( i );
+      for( IndexType j = 0; j < rowLength; j++ )
+         row2.setElement( j, columns[ indices[ j ] ], values[ indices[ j ] ] );
    }
 }
 
diff --git a/src/TNL/Matrices/ThreePartVector.h b/src/TNL/Matrices/ThreePartVector.h
index f28f544f5bac6eeceb61d01ef49852fd1b36b6af..01caaae52f4312d8f4674e9661cdded2bd7f12d0 100644
--- a/src/TNL/Matrices/ThreePartVector.h
+++ b/src/TNL/Matrices/ThreePartVector.h
@@ -24,6 +24,7 @@ template< typename Real,
           typename Index = int >
 class ThreePartVectorView
 {
+   using ConstReal = std::add_const_t< Real >;
 public:
    using RealType = Real;
    using DeviceType = Device;
@@ -53,6 +54,16 @@ public:
       right.reset();
    }
 
+   IndexType getSize() const
+   {
+      return left.getSize() + middle.getSize() + right.getSize();
+   }
+
+   ThreePartVectorView< ConstReal, Device, Index > getConstView() const
+   {
+      return {left.getConstView(), middle, right.getConstView()};
+   }
+
 //   __cuda_callable__
 //   Real& operator[]( Index i )
 //   {
@@ -127,7 +138,12 @@ public:
       right.reset();
    }
 
-   ThreePartVectorView< ConstReal, Device, Index > getConstView()
+   IndexType getSize() const
+   {
+      return left.getSize() + middle.getSize() + right.getSize();
+   }
+
+   ThreePartVectorView< ConstReal, Device, Index > getConstView() const
    {
       return {left.getConstView(), middle, right.getConstView()};
    }
diff --git a/src/TNL/Matrices/details/SparseMatrix.h b/src/TNL/Matrices/details/SparseMatrix.h
new file mode 100644
index 0000000000000000000000000000000000000000..9eeac76142f26533b96f2eff56fc6e9d9a8d5c75
--- /dev/null
+++ b/src/TNL/Matrices/details/SparseMatrix.h
@@ -0,0 +1,40 @@
+/***************************************************************************
+                          SparseMatrix.h  -  description
+                             -------------------
+    begin                : Jan 5, 2020
+    copyright            : (C) 2020 by Tomas Oberhuber
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+#pragma once
+
+#include <TNL/Containers/ArrayView.h>
+#include <TNL/Containers/VectorView.h>
+#include <TNL/Containers/DistributedArray.h>
+#include <TNL/Containers/DistributedVector.h>
+
+namespace TNL {
+   namespace Matrices {
+      namespace details {
+
+// Resizes v to size; this overload is selected when HasSetSizeMethod< VectorOrView >::value is true.
+template< typename VectorOrView,
+          std::enable_if_t< HasSetSizeMethod< VectorOrView >::value, bool > = true >
+static void set_size_if_resizable( VectorOrView& v, typename VectorOrView::IndexType size )
+{
+   v.setSize( size );
+}
+
+// Selected when HasSetSizeMethod< VectorOrView >::value is false: v is not resized, only its size is asserted.
+template< typename VectorOrView,
+          std::enable_if_t< ! HasSetSizeMethod< VectorOrView >::value, bool > = true >
+static void set_size_if_resizable( VectorOrView& v, typename VectorOrView::IndexType size )
+{
+   TNL_ASSERT_EQ( v.getSize(), size, "view has wrong size" );
+}
+
+      } //namespace details
+   } //namespace Matrices
+} //namespace TNL
diff --git a/src/TNL/Operators/DirichletBoundaryConditions.h b/src/TNL/Operators/DirichletBoundaryConditions.h
index 31389407261bf598bee4de13fe58639e3ba33fda..ddc9b08ff5141b71ce49a6e952d9981a38783b68 100644
--- a/src/TNL/Operators/DirichletBoundaryConditions.h
+++ b/src/TNL/Operators/DirichletBoundaryConditions.h
@@ -105,7 +105,7 @@ class DirichletBoundaryConditions
                               Matrix& matrix,
                               Vector& b ) const
       {
-         typename Matrix::MatrixRow matrixRow = matrix.getRow( entity.getIndex() );
+         auto matrixRow = matrix.getRow( entity.getIndex() );
          const IndexType& index = entity.getIndex();
          matrixRow.setElement( 0, index, 1.0 );
          b[ index ] = Functions::FunctionAdapter< MeshType, Function >::getValue( this->function, entity, time );
diff --git a/src/TNL/Operators/NeumannBoundaryConditions.h b/src/TNL/Operators/NeumannBoundaryConditions.h
index a46545cd176a7b6f4daecd61ed953c7b4fd22409..3b6d48fa18c186abd83bba63100576e34c6f1369 100644
--- a/src/TNL/Operators/NeumannBoundaryConditions.h
+++ b/src/TNL/Operators/NeumannBoundaryConditions.h
@@ -155,7 +155,7 @@ class NeumannBoundaryConditions< Meshes::Grid< 1, MeshReal, Device, MeshIndex >,
       {
          const auto& neighborEntities = entity.getNeighborEntities();
          const IndexType& index = entity.getIndex();
-         typename Matrix::MatrixRow matrixRow = matrix.getRow( index );
+         auto matrixRow = matrix.getRow( index );
          if( entity.getCoordinates().x() == 0 )
          {
             matrixRow.setElement( 0, index, 1.0 );
@@ -261,7 +261,7 @@ class NeumannBoundaryConditions< Meshes::Grid< 2, MeshReal, Device, MeshIndex >,
       {
          const auto& neighborEntities = entity.getNeighborEntities();
          const IndexType& index = entity.getIndex();
-         typename Matrix::MatrixRow matrixRow = matrix.getRow( index );
+         auto matrixRow = matrix.getRow( index );
          if( entity.getCoordinates().x() == 0 )
          {
             matrixRow.setElement( 0, index,                                                1.0 );
@@ -390,7 +390,7 @@ class NeumannBoundaryConditions< Meshes::Grid< 3, MeshReal, Device, MeshIndex >,
       {
          const auto& neighborEntities = entity.getNeighborEntities();
          const IndexType& index = entity.getIndex();
-         typename Matrix::MatrixRow matrixRow = matrix.getRow( index );
+         auto matrixRow = matrix.getRow( index );
          if( entity.getCoordinates().x() == 0 )
          {
             matrixRow.setElement( 0, index,                                                   1.0 );
diff --git a/src/TNL/Operators/diffusion/LinearDiffusion_impl.h b/src/TNL/Operators/diffusion/LinearDiffusion_impl.h
index 51bdf8a62372f82acf85b941f9de580b6d69c6a2..bbdfb4db18f4b5ade6d88d8801bcc8bdc9822582 100644
--- a/src/TNL/Operators/diffusion/LinearDiffusion_impl.h
+++ b/src/TNL/Operators/diffusion/LinearDiffusion_impl.h
@@ -87,7 +87,7 @@ setMatrixElements( const PreimageFunction& u,
    static_assert( PreimageFunction::getEntitiesDimension() == 1, "Wrong preimage function" );
    const typename MeshEntity::template NeighborEntities< 1 >& neighborEntities = entity.getNeighborEntities();
    const IndexType& index = entity.getIndex();
-   typename Matrix::MatrixRow matrixRow = matrix.getRow( index );
+   auto matrixRow = matrix.getRow( index );
    const RealType lambdaX = tau * entity.getMesh().template getSpaceStepsProducts< -2 >();
    matrixRow.setElement( 0, neighborEntities.template getEntityIndex< -1 >(),      - lambdaX );
    matrixRow.setElement( 1, index,                                              2.0 * lambdaX );
@@ -162,7 +162,7 @@ setMatrixElements( const PreimageFunction& u,
    static_assert( MeshEntity::getEntityDimension() == 2, "Wrong mesh entity dimensions." );
    static_assert( PreimageFunction::getEntitiesDimension() == 2, "Wrong preimage function" );
    const IndexType& index = entity.getIndex();
-   typename Matrix::MatrixRow matrixRow = matrix.getRow( index );
+   auto matrixRow = matrix.getRow( index );
    const RealType lambdaX = tau * entity.getMesh().template getSpaceStepsProducts< -2, 0 >();
    const RealType lambdaY = tau * entity.getMesh().template getSpaceStepsProducts< 0, -2 >();
    const typename MeshEntity::template NeighborEntities< 2 >& neighborEntities = entity.getNeighborEntities();
@@ -244,7 +244,7 @@ setMatrixElements( const PreimageFunction& u,
    static_assert( PreimageFunction::getEntitiesDimension() == 3, "Wrong preimage function" );
    const typename MeshEntity::template NeighborEntities< 3 >& neighborEntities = entity.getNeighborEntities();
    const IndexType& index = entity.getIndex();
-   typename Matrix::MatrixRow matrixRow = matrix.getRow( index );
+   auto matrixRow = matrix.getRow( index );
    const RealType lambdaX = tau * entity.getMesh().template getSpaceStepsProducts< -2, 0, 0 >();
    const RealType lambdaY = tau * entity.getMesh().template getSpaceStepsProducts< 0, -2, 0 >();
    const RealType lambdaZ = tau * entity.getMesh().template getSpaceStepsProducts< 0, 0, -2 >();
diff --git a/src/TNL/Pointers/SharedPointerCuda.h b/src/TNL/Pointers/SharedPointerCuda.h
index f4f73ec39f30ae0455cabdf5fd72886d20fb9b8a..975904a7374c863405cac8062ace9f3e097adbb8 100644
--- a/src/TNL/Pointers/SharedPointerCuda.h
+++ b/src/TNL/Pointers/SharedPointerCuda.h
@@ -90,6 +90,34 @@ class SharedPointer< Object, Devices::Cuda > : public SmartPointer
          this->allocate( args... );
       }
 
+      /**
+       * \brief Constructor with initializer list.
+       *
+       * \tparam Value is type of the initializer list elements.
+       * \param list is the instance of the initializer list.
+       */
+      template< typename Value >
+      explicit  SharedPointer( std::initializer_list< Value > list )
+      : pd( nullptr ),
+        cuda_pointer( nullptr )
+      {
+         this->allocate( list );
+      }
+
+      /**
+       * \brief Constructor with nested initializer lists.
+       *
+       * \tparam Value is type of the nested initializer list elements.
+       * \param list is the instance of the nested initializer list.
+       */
+      template< typename Value >
+      explicit  SharedPointer( std::initializer_list< std::initializer_list< Value > > list )
+      : pd( nullptr ),
+        cuda_pointer( nullptr )
+      {
+         this->allocate( list );
+      }
+
       /**
        * \brief Copy constructor.
        *
diff --git a/src/TNL/Pointers/SharedPointerHost.h b/src/TNL/Pointers/SharedPointerHost.h
index ea8654d16ee8c443b1073f9890d6081c497367ab..2ef8d7abd45f73c97e1ccbae600fc90d887eafbf 100644
--- a/src/TNL/Pointers/SharedPointerHost.h
+++ b/src/TNL/Pointers/SharedPointerHost.h
@@ -73,7 +73,7 @@ class SharedPointer< Object, Devices::Host > : public SmartPointer
        * \brief Constructor with parameters of the Object constructor.
        *
        * \tparam Args is variadic template type of arguments of the Object constructor.
-       * \tparam args are arguments passed to the Object constructor.
+       * \param args are arguments passed to the Object constructor.
        */
       template< typename... Args >
       explicit  SharedPointer( Args... args )
@@ -85,6 +85,38 @@ class SharedPointer< Object, Devices::Host > : public SmartPointer
          this->allocate( args... );
       }
 
+      /**
+       * \brief Constructor with initializer list.
+       *
+       * \tparam Value is type of the initializer list elements.
+       * \param list is the instance of the initializer list.
+       */
+      template< typename Value >
+      explicit  SharedPointer( std::initializer_list< Value > list )
+      : pd( nullptr )
+      {
+#ifdef TNL_DEBUG_SHARED_POINTERS
+         std::cerr << "Creating shared pointer to " << getType< ObjectType >() << std::endl;
+#endif
+         this->allocate( list );
+      }
+
+      /**
+       * \brief Constructor with nested initializer lists.
+       *
+       * \tparam Value is type of the nested initializer list elements.
+       * \param list is the instance of the nested initializer list.
+       */
+      template< typename Value >
+      explicit  SharedPointer( std::initializer_list< std::initializer_list< Value > > list )
+      : pd( nullptr )
+      {
+#ifdef TNL_DEBUG_SHARED_POINTERS
+         std::cerr << "Creating shared pointer to " << getType< ObjectType >() << std::endl;
+#endif
+         this->allocate( list );
+      }
+
       /**
        * \brief Copy constructor.
        *
diff --git a/src/TNL/Pointers/UniquePointer.h b/src/TNL/Pointers/UniquePointer.h
index 66bc4a33c3869ef0e076ad1f60e08a92a4735135..8683dbcbd79360e5fcb6a9af501611256ff0a8bb 100644
--- a/src/TNL/Pointers/UniquePointer.h
+++ b/src/TNL/Pointers/UniquePointer.h
@@ -96,6 +96,30 @@ class UniquePointer< Object, Devices::Host > : public SmartPointer
          this->pointer = new Object( args... );
       }
 
+      /**
+       * \brief Constructor with initializer list.
+       *
+       * \tparam Value is type of the initializer list elements.
+       * \param list is the instance of the initializer list.
+       */
+      template< typename Value >
+      explicit  UniquePointer( std::initializer_list< Value > list )
+      {
+         this->pointer = new Object( list );
+      }
+
+      /**
+       * \brief Constructor with nested initializer lists.
+       *
+       * \tparam Value is type of the nested initializer list elements.
+       * \param list is the instance of the nested initializer list.
+       */
+      template< typename Value >
+      explicit  UniquePointer( std::initializer_list< std::initializer_list< Value > > list )
+      {
+         this->pointer = new Object( list );
+      }
+
       /**
        * \brief Arrow operator for accessing the object owned by constant smart pointer.
        *
@@ -300,6 +324,34 @@ class UniquePointer< Object, Devices::Cuda > : public SmartPointer
          this->allocate( args... );
       }
 
+      /**
+       * \brief Constructor with initializer list.
+       *
+       * \tparam Value is type of the initializer list elements.
+       * \param list is the instance of the initializer list.
+       */
+      template< typename Value >
+      explicit  UniquePointer( std::initializer_list< Value > list )
+      : pd( nullptr ),
+        cuda_pointer( nullptr )
+      {
+         this->allocate( list );
+      }
+
+      /**
+       * \brief Constructor with nested initializer lists.
+       *
+       * \tparam Value is type of the nested initializer list elements.
+       * \param list is the instance of the nested initializer list..
+       */
+      template< typename Value >
+      explicit  UniquePointer( std::initializer_list< std::initializer_list< Value > > list )
+      : pd( nullptr ),
+        cuda_pointer( nullptr )
+      {
+         this->allocate( list );
+      }
+
       /**
        * \brief Arrow operator for accessing the object owned by constant smart pointer.
        *
diff --git a/src/TNL/Problems/HeatEquationProblem.h b/src/TNL/Problems/HeatEquationProblem.h
index 4b2a0d430108bf0ae8f0f0411b855047db071f46..76e46738015b565a412753ad58a9043f69a4bb58 100644
--- a/src/TNL/Problems/HeatEquationProblem.h
+++ b/src/TNL/Problems/HeatEquationProblem.h
@@ -18,7 +18,6 @@
 
 #include <TNL/Problems/PDEProblem.h>
 #include <TNL/Operators/diffusion/LinearDiffusion.h>
-#include <TNL/Matrices/Legacy/Ellpack.h>
 #include <TNL/Functions/MeshFunction.h>
 #include <TNL/Timer.h>
 #include <TNL/Solvers/PDE/ExplicitUpdater.h>
@@ -50,7 +49,6 @@ class HeatEquationProblem : public PDEProblem< Mesh,
       typedef Functions::MeshFunction< Mesh > MeshFunctionType;
       typedef Pointers::SharedPointer< MeshFunctionType, DeviceType > MeshFunctionPointer;
       typedef PDEProblem< Mesh, Communicator, RealType, DeviceType, IndexType > BaseType;
-      typedef Matrices::Legacy::SlicedEllpack< RealType, DeviceType, IndexType > MatrixType;
       typedef Pointers::SharedPointer<  DifferentialOperator > DifferentialOperatorPointer;
       typedef Pointers::SharedPointer<  BoundaryCondition > BoundaryConditionPointer;
       typedef Pointers::SharedPointer<  RightHandSide, DeviceType > RightHandSidePointer;
@@ -59,6 +57,7 @@ class HeatEquationProblem : public PDEProblem< Mesh,
       using typename BaseType::MeshPointer;
       using typename BaseType::DofVectorType;
       using typename BaseType::DofVectorPointer;
+      using typename BaseType::MatrixType;
 
       typedef Communicator CommunicatorType;
 
diff --git a/src/TNL/Problems/HeatEquationProblem_impl.h b/src/TNL/Problems/HeatEquationProblem_impl.h
index 98cd6d5e4f0f74b797fb88b8b88c83079aee76ee..fbb7e7d01084385b0ee111d3aa090163550c0ffc 100644
--- a/src/TNL/Problems/HeatEquationProblem_impl.h
+++ b/src/TNL/Problems/HeatEquationProblem_impl.h
@@ -18,7 +18,6 @@
 
 #include <TNL/FileName.h>
 #include <TNL/Matrices/MatrixSetter.h>
-#include <TNL/Matrices/Legacy/MultidiagonalMatrixSetter.h>
 #include <TNL/Logger.h>
 #include <TNL/Solvers/PDE/BoundaryConditionsSetter.h>
 
@@ -192,7 +191,6 @@ setupLinearSystem( MatrixPointer& matrixPointer )
    matrixPointer->setDimensions( dofs, dofs );
    matrixPointer->setCompressedRowLengths( *rowLengthsPointer );
    return true;
-   //return MultidiagonalMatrixSetter< Mesh >::setupMatrix( mesh, matrix );
 }
 
 template< typename Mesh,
diff --git a/src/TNL/Problems/PDEProblem.h b/src/TNL/Problems/PDEProblem.h
index e73bf633c2b3b6556677776db778f236fe6ae209..179255b93c99da025d542bcb7c75bed1ad197fb4 100644
--- a/src/TNL/Problems/PDEProblem.h
+++ b/src/TNL/Problems/PDEProblem.h
@@ -13,7 +13,8 @@
 #include <TNL/Problems/Problem.h>
 #include <TNL/Problems/CommonData.h>
 #include <TNL/Pointers/SharedPointer.h>
-#include <TNL/Matrices/Legacy/SlicedEllpack.h>
+#include <TNL/Matrices/SparseMatrix.h>
+#include <TNL/Containers/Segments/SlicedEllpack.h>
 #include <TNL/Solvers/PDE/TimeDependentPDESolver.h>
 
 namespace TNL {
@@ -39,7 +40,14 @@ class PDEProblem : public Problem< Real, Device, Index >
       using SubdomainOverlapsType = typename DistributedMeshType::SubdomainOverlapsType;
       using DofVectorType = Containers::Vector< RealType, DeviceType, IndexType>;
       using DofVectorPointer = Pointers::SharedPointer< DofVectorType, DeviceType >;
-      using MatrixType = Matrices::Legacy::SlicedEllpack< RealType, DeviceType, IndexType >;
+      // Segments template handed to SparseMatrix below; forwards its three parameters to SlicedEllpack.
+      template< typename Device_, typename Index_, typename IndexAllocator_ >
+      using SegmentsType = Containers::Segments::SlicedEllpack< Device_, Index_, IndexAllocator_ >;
+      using MatrixType = TNL::Matrices::SparseMatrix< Real,
+                                                      Device,
+                                                      Index,
+                                                      TNL::Matrices::GeneralMatrix,
+                                                      SegmentsType >;
       using CommunicatorType = Communicator;
       using CommonDataType = CommonData;
       using CommonDataPointer = Pointers::SharedPointer< CommonDataType, DeviceType >;
diff --git a/src/TNL/Solvers/Linear/LinearResidueGetter_impl.h b/src/TNL/Solvers/Linear/LinearResidueGetter_impl.h
index 21fc726aa4d38db53f86085f3b76d125714ee4f4..7165cdde769331fb196ded3005f5bfd43f9c0198 100644
--- a/src/TNL/Solvers/Linear/LinearResidueGetter_impl.h
+++ b/src/TNL/Solvers/Linear/LinearResidueGetter_impl.h
@@ -27,18 +27,22 @@ getResidue( const Matrix& matrix,
             typename Matrix::RealType bNorm )
 {
    using RealType = typename Matrix::RealType;
+   using DeviceType = typename Matrix::DeviceType;
    using IndexType = typename Matrix::IndexType;
 
    const IndexType size = matrix.getRows();
    RealType res( 0.0 );
    if( bNorm == 0.0 )
       bNorm = lpNorm( b, 2.0 );
-   for( IndexType i = 0; i < size; i ++ )
+   Containers::Vector< RealType, DeviceType, IndexType > v( b.getSize() );
+   matrix.vectorProduct( x, v );
+   return l2Norm( v - b ) / bNorm;  // relative residue: keep the ||b|| scaling of the loop this replaces
+   /*for( IndexType i = 0; i < size; i ++ )
    {
       RealType err = abs( matrix.rowVectorProduct( i, x ) - b[ i ] );
       res += err * err;
    }
-   return std::sqrt( res ) / bNorm;
+   return std::sqrt( res ) / bNorm;*/
 }
 
 } // namespace Linear
diff --git a/src/TNL/Solvers/Linear/Preconditioners/Diagonal_impl.h b/src/TNL/Solvers/Linear/Preconditioners/Diagonal_impl.h
index ae404321d3a5ea2e0539526f2771837584ee5a47..104768ef74cd537494104f77a4af8c1f2fb6c311 100644
--- a/src/TNL/Solvers/Linear/Preconditioners/Diagonal_impl.h
+++ b/src/TNL/Solvers/Linear/Preconditioners/Diagonal_impl.h
@@ -32,7 +32,8 @@ update( const MatrixPointer& matrixPointer )
    diagonal.setSize( matrixPointer->getRows() );
 
    VectorViewType diag_view( diagonal );
-   const auto kernel_matrix = matrixPointer->getView(); //.template getData< DeviceType >();
+
+   const auto kernel_matrix = matrixPointer->getView();
 
    // TODO: Rewrite this with SparseMatrix::forAllRows
    auto kernel = [=] __cuda_callable__ ( IndexType i ) mutable
@@ -75,7 +76,7 @@ update( const MatrixPointer& matrixPointer )
    auto kernel = [=] __cuda_callable__ ( IndexType i ) mutable
    {
       const IndexType gi = kernel_matrix->getLocalRowRange().getGlobalIndex( i );
-      diag_view[ i ] = kernel_matrix->getLocalMatrix().getElementFast( i, gi );
+      diag_view[ i ] = kernel_matrix->getLocalMatrix().getElement( i, gi );
    };
 
    Algorithms::ParallelFor< DeviceType >::exec( (IndexType) 0, diagonal.getSize(), kernel );
diff --git a/src/TNL/Solvers/Linear/Preconditioners/ILU0.h b/src/TNL/Solvers/Linear/Preconditioners/ILU0.h
index 8e9b49cd0a75d079b8e1107c883f47469d488b19..07f0aea4e83be59ea118af9209634468f8b4f62c 100644
--- a/src/TNL/Solvers/Linear/Preconditioners/ILU0.h
+++ b/src/TNL/Solvers/Linear/Preconditioners/ILU0.h
@@ -15,11 +15,12 @@
 #include "Preconditioner.h"
 
 #include <TNL/Containers/Vector.h>
-#include <TNL/Matrices/Legacy/CSR.h>
+#include <TNL/Matrices/SparseMatrix.h>
 #include <TNL/Pointers/UniquePointer.h>
 #include <TNL/Exceptions/NotImplementedError.h>
 
 #if defined(HAVE_CUDA) && defined(HAVE_CUSPARSE)
+#include <TNL/Matrices/Legacy/CSR.h>
 #include <cusparse.h>
 #endif
 
@@ -76,7 +77,7 @@ public:
 
 protected:
    // The factors L and U are stored separately and the rows of U are reversed.
-   Matrices::Legacy::CSR< RealType, DeviceType, IndexType > L, U;
+   Matrices::SparseMatrix< RealType, DeviceType, IndexType, Matrices::GeneralMatrix, Containers::Segments::CSR > L, U;
 
    // Specialized methods to distinguish between normal and distributed matrices
    // in the implementation.
diff --git a/src/TNL/Solvers/Linear/Preconditioners/ILU0_impl.h b/src/TNL/Solvers/Linear/Preconditioners/ILU0_impl.h
index 5ae255304f89eebb7a97fe2bfeac7ebc82b9c765..acf2ce129df7d853806ab3de6dee2338c88860b6 100644
--- a/src/TNL/Solvers/Linear/Preconditioners/ILU0_impl.h
+++ b/src/TNL/Solvers/Linear/Preconditioners/ILU0_impl.h
@@ -43,11 +43,10 @@ update( const MatrixPointer& matrixPointer )
    typename decltype(U)::CompressedRowLengthsVector U_rowLengths( N );
    for( IndexType i = 0; i < N; i++ ) {
       const auto row = localMatrix.getRow( i );
-      const auto max_length = row.getLength();
       IndexType L_entries = 0;
       IndexType U_entries = 0;
-      for( IndexType j = 0; j < max_length; j++ ) {
-         const auto column = row.getElementColumn( j );
+      for( IndexType j = 0; j < row.getSize(); j++ ) {
+         const auto column = row.getColumnIndex( j );
          if( column < minColumn )
             continue;
          if( column < i + minColumn )
@@ -67,10 +66,14 @@ update( const MatrixPointer& matrixPointer )
    // The factors L and U are stored separately and the rows of U are reversed.
    for( IndexType i = 0; i < N; i++ ) {
       // copy all non-zero entries from A into L and U
-      const auto max_length = localMatrix.getRowLength( i );
+      const auto row = localMatrix.getRow( i );
+      const auto max_length = row.getSize();
       IndexType all_columns[ max_length ];
       RealType all_values[ max_length ];
-      localMatrix.getRowFast( i, all_columns, all_values );
+      for( IndexType j = 0; j < max_length; j++ ) {
+         all_columns[ j ] = row.getColumnIndex( j );
+         all_values[ j ] = row.getValue( j );
+      }
 
       // skip non-local elements
       IndexType* columns = all_columns;
@@ -80,39 +83,46 @@ update( const MatrixPointer& matrixPointer )
          values++;
       }
 
-      // update column column indices
+      // update column indices
       if( minColumn > 0 )
          for( IndexType c_j = 0; c_j < max_length; c_j++ )
             all_columns[ c_j ] -= minColumn;
 
       const auto L_entries = L_rowLengths[ i ];
       const auto U_entries = U_rowLengths[ N - 1 - i ];
-      L.setRow( i, columns, values, L_entries );
-      U.setRow( N - 1 - i, &columns[ L_entries ], &values[ L_entries ], U_entries );
+//      L.setRow( i, columns, values, L_entries );
+//      U.setRow( N - 1 - i, &columns[ L_entries ], &values[ L_entries ], U_entries );
+
+      // copy values into U
+      auto U_i = U.getRow( N - 1 - i );
+      for( IndexType c_j = 0; c_j < U_entries; c_j++ )
+         U_i.setElement( c_j, columns[ L_entries + c_j ], values[ L_entries + c_j ] );
 
       // this condition is to avoid segfaults on empty L.getRow( i )
       if( L_entries > 0 ) {
-         const auto L_i = L.getRow( i );
-         const auto U_i = U.getRow( N - 1 - i );
+         // copy values into L
+         auto L_i = L.getRow( i );
+         for( IndexType c_j = 0; c_j < L_entries; c_j++ )
+            L_i.setElement( c_j, columns[ c_j ], values[ c_j ] );
 
          // loop for k = 0, ..., i - 1; but only over the non-zero entries
          for( IndexType c_k = 0; c_k < L_entries; c_k++ ) {
-            const auto k = L_i.getElementColumn( c_k );
+            const auto k = L_i.getColumnIndex( c_k );
 
-            auto L_ik = L.getElementFast( i, k ) / U.getElementFast( N - 1 - k, k );
-            L.setElement( i, k, L_ik );
+            auto L_ik = L_i.getValue( c_k ) / U.getElement( N - 1 - k, k );
+            L_i.setValue( c_k, L_ik );
 
             // loop for j = k+1, ..., N-1; but only over the non-zero entries
             // and split into two loops over L and U separately
             for( IndexType c_j = c_k + 1; c_j < L_entries; c_j++ ) {
-               const auto j = L_i.getElementColumn( c_j );
-               const auto L_ij = L.getElementFast( i, j ) - L_ik * U.getElementFast( N - 1 - k, j );
-               L.setElement( i, j, L_ij );
+               const auto j = L_i.getColumnIndex( c_j );
+               const auto L_ij = L_i.getValue( c_j ) - L_ik * U.getElement( N - 1 - k, j );
+               L_i.setValue( c_j, L_ij );
             }
             for( IndexType c_j = 0; c_j < U_entries; c_j++ ) {
-               const auto j = U_i.getElementColumn( c_j );
-               const auto U_ij = U.getElementFast( N - 1 - i, j ) - L_ik * U.getElementFast( N - 1 - k, j );
-               U.setElement( N - 1 - i, j, U_ij );
+               const auto j = U_i.getColumnIndex( c_j );
+               const auto U_ij = U_i.getValue( c_j ) - L_ik * U.getElement( N - 1 - k, j );
+               U_i.setValue( c_j, U_ij );
             }
          }
       }
@@ -293,11 +303,10 @@ allocate_LU()
    auto kernel_copy_row_lengths = [=] __cuda_callable__ ( IndexType i ) mutable
    {
       const auto row = kernel_A->getRow( i );
-      const int max_length = row.getLength();
       int L_entries = 0;
       int U_entries = 0;
-      for( int c_j = 0; c_j < max_length; c_j++ ) {
-         const IndexType j = row.getElementColumn( c_j );
+      for( int c_j = 0; c_j < row.getSize(); c_j++ ) {
+         const IndexType j = row.getColumnIndex( c_j );
          if( j < i )
             L_entries++;
          else if( j < N )
@@ -338,13 +347,12 @@ copy_triangular_factors()
    auto kernel_copy_values = [=] __cuda_callable__ ( IndexType i ) mutable
    {
       const auto row = kernel_A->getRow( i );
-      const int max_length = row.getLength();
-      for( int c_j = 0; c_j < max_length; c_j++ ) {
-         const IndexType j = row.getElementColumn( c_j );
+      for( int c_j = 0; c_j < row.getSize(); c_j++ ) {
+         const IndexType j = row.getColumnIndex( c_j );
          if( j < i )
-            kernel_L->setElementFast( i, j, row.getElementValue( c_j ) );
+            kernel_L->setElement( i, j, row.getValue( c_j ) );
          else if( j < N )
-            kernel_U->setElementFast( i, j, row.getElementValue( c_j ) );
+            kernel_U->setElement( i, j, row.getValue( c_j ) );
          else
             break;
       }
diff --git a/src/TNL/Solvers/Linear/Preconditioners/ILUT.h b/src/TNL/Solvers/Linear/Preconditioners/ILUT.h
index 99ac7fe521c85dc4daa8b34dc76faa69408aa294..6edf6e37625c77334e24990b11f8e1b0931f2546 100644
--- a/src/TNL/Solvers/Linear/Preconditioners/ILUT.h
+++ b/src/TNL/Solvers/Linear/Preconditioners/ILUT.h
@@ -15,7 +15,7 @@
 #include "Preconditioner.h"
 
 #include <TNL/Containers/Vector.h>
-#include <TNL/Matrices/Legacy/CSR.h>
+#include <TNL/Matrices/SparseMatrix.h>
 
 namespace TNL {
 namespace Solvers {
@@ -66,7 +66,7 @@ protected:
    Real tau = 1e-4;
 
    // The factors L and U are stored separately and the rows of U are reversed.
-   Matrices::Legacy::CSR< RealType, DeviceType, IndexType > L, U;
+   Matrices::SparseMatrix< RealType, DeviceType, IndexType, Matrices::GeneralMatrix, Containers::Segments::CSR > L, U;
 
    // Specialized methods to distinguish between normal and distributed matrices
    // in the implementation.
diff --git a/src/TNL/Solvers/Linear/Preconditioners/ILUT_impl.h b/src/TNL/Solvers/Linear/Preconditioners/ILUT_impl.h
index 858f037febbd7f26ccfbc39d3c4de8e2fdbeecdb..29b173b250b5a0eb47801d3f5e8b36b908dd6cac 100644
--- a/src/TNL/Solvers/Linear/Preconditioners/ILUT_impl.h
+++ b/src/TNL/Solvers/Linear/Preconditioners/ILUT_impl.h
@@ -61,11 +61,10 @@ update( const MatrixPointer& matrixPointer )
    typename decltype(U)::CompressedRowLengthsVector U_rowLengths( N );
    for( IndexType i = 0; i < N; i++ ) {
       const auto row = localMatrix.getRow( i );
-      const auto max_length = localMatrix.getRowLength( i );
       IndexType L_entries = 0;
       IndexType U_entries = 0;
-      for( IndexType j = 0; j < max_length; j++ ) {
-         const auto column = row.getElementColumn( j );
+      for( IndexType j = 0; j < row.getSize(); j++ ) {
+         const auto column = row.getColumnIndex( j );
          if( column < minColumn )
             continue;
          if( column < i + minColumn )
@@ -103,7 +102,6 @@ update( const MatrixPointer& matrixPointer )
    // Incomplete LU factorization with threshold
    // (see Saad - Iterative methods for sparse linear systems, section 10.4)
    for( IndexType i = 0; i < N; i++ ) {
-      const auto max_length = localMatrix.getRowLength( i );
       const auto A_i = localMatrix.getRow( i );
 
       RealType A_i_norm = 0.0;
@@ -113,16 +111,16 @@ update( const MatrixPointer& matrixPointer )
 
       // copy A_i into the full vector w
 //      timer_copy_into_w.start();
-      for( IndexType c_j = 0; c_j < max_length; c_j++ ) {
-         auto j = A_i.getElementColumn( c_j );
+      for( IndexType c_j = 0; c_j < A_i.getSize(); c_j++ ) {
+         auto j = A_i.getColumnIndex( c_j );
          if( minColumn > 0 ) {
             // skip non-local elements
             if( j < minColumn ) continue;
             j -= minColumn;
          }
          // handle ellpack dummy entries
-         if( j >= N ) break;
-         w[ j ] = A_i.getElementValue( c_j );
+         if( j == localMatrix.getPaddingIndex() ) break;
+         w[ j ] = A_i.getValue( c_j );
 
          // running computation of norm
          A_i_norm += w[ j ] * w[ j ];
@@ -141,7 +139,7 @@ update( const MatrixPointer& matrixPointer )
          if( k >= i )
             break;
 
-         RealType w_k = w[ k ] / localMatrix.getElementFast( k, k + minColumn );
+         RealType w_k = w[ k ] / localMatrix.getElement( k, k + minColumn );
 
          // apply dropping rule to w_k
          if( std::abs( w_k ) < tau_i )
@@ -154,11 +152,11 @@ update( const MatrixPointer& matrixPointer )
             const auto U_k = U.getRow( N - 1 - k );
             // loop for j = 0, ..., N-1; but only over the non-zero entries
             for( Index c_j = 0; c_j < U_rowLengths[ N - 1 - k ]; c_j++ ) {
-               const auto j = U_k.getElementColumn( c_j );
+               const auto j = U_k.getColumnIndex( c_j );
 
                // skip dropped entries
-               if( j >= N ) break;
-               w[ j ] -= w_k * U_k.getElementValue( c_j );
+               if( j == localMatrix.getPaddingIndex() ) break;
+               w[ j ] -= w_k * U_k.getValue( c_j );
 
                // add non-zero to the w_k_set
                w_k_set.insert( j );
diff --git a/src/TNL/Solvers/Linear/Preconditioners/TriangularSolve.h b/src/TNL/Solvers/Linear/Preconditioners/TriangularSolve.h
index 71f51a6ebf2531ab000bcdff687e736c67a0457e..f47eba8bf1c51f6d3c8bccc1085b5d1c544bc075 100644
--- a/src/TNL/Solvers/Linear/Preconditioners/TriangularSolve.h
+++ b/src/TNL/Solvers/Linear/Preconditioners/TriangularSolve.h
@@ -43,7 +43,7 @@ void triangularSolveLower( const Matrix& L, Vector1& x, const Vector2& b )
    for( IndexType i = 0; i < N; i++ ) {
       RealType x_i = b[ i ];
 
-      const auto L_entries = L.getRowLength( i );
+      const auto L_entries = L.getRowCapacity( i );
 
       // this condition is to avoid segfaults on empty L.getRow( i )
       if( L_entries > 0 ) {
@@ -51,11 +51,11 @@ void triangularSolveLower( const Matrix& L, Vector1& x, const Vector2& b )
 
          // loop for j = 0, ..., i - 1; but only over the non-zero entries
          for( IndexType c_j = 0; c_j < L_entries; c_j++ ) {
-            const auto j = L_i.getElementColumn( c_j );
+            const auto j = L_i.getColumnIndex( c_j );
             // skip padding zeros
             if( fullStorage == false && j >= N )
                break;
-            x_i -= L_i.getElementValue( c_j ) * x[ j ];
+            x_i -= L_i.getValue( c_j ) * x[ j ];
          }
       }
 
@@ -93,18 +93,18 @@ void triangularSolveUpper( const Matrix& U, Vector1& x, const Vector2& b )
 
       const IndexType U_idx = (reversedRows) ? N - 1 - i : i;
 
-      const auto U_entries = U.getRowLength( U_idx );
+      const auto U_entries = U.getRowCapacity( U_idx );
       const auto U_i = U.getRow( U_idx );
 
-      const auto U_ii = U_i.getElementValue( 0 );
+      const auto U_ii = U_i.getValue( 0 );
 
       // loop for j = i+1, ..., N-1; but only over the non-zero entries
       for( IndexType c_j = 1; c_j < U_entries ; c_j++ ) {
-         const auto j = U_i.getElementColumn( c_j );
+         const auto j = U_i.getColumnIndex( c_j );
          // skip padding zeros
          if( fullStorage == false && j >= N )
             break;
-         x_i -= U_i.getElementValue( c_j ) * x[ j ];
+         x_i -= U_i.getValue( c_j ) * x[ j ];
       }
 
       x[ i ] = x_i / U_ii;
diff --git a/src/TNL/Solvers/PDE/BackwardTimeDiscretisation.h b/src/TNL/Solvers/PDE/BackwardTimeDiscretisation.h
index 7172e08e865954fe7d38a943fcf402ebe55f638c..2050fb0a7489b0f76e585e957766e0e5af31a7b6 100644
--- a/src/TNL/Solvers/PDE/BackwardTimeDiscretisation.h
+++ b/src/TNL/Solvers/PDE/BackwardTimeDiscretisation.h
@@ -32,7 +32,7 @@ class BackwardTimeDiscretisation
                                                                const RealType& rhs )
         {
             b += u + tau * rhs;
-            matrix.addElementFast( index, index, 1.0, 1.0 );
+            matrix.addElement( index, index, 1.0, 1.0 );
         }
 };
 
diff --git a/src/TNL/Solvers/PDE/LinearSystemAssembler.h b/src/TNL/Solvers/PDE/LinearSystemAssembler.h
index b74cb2660fc17f5d2eab5682b8e60954ecef0cc4..abc80f9b77a07e6e4a70336202a700db3ef02a8d 100644
--- a/src/TNL/Solvers/PDE/LinearSystemAssembler.h
+++ b/src/TNL/Solvers/PDE/LinearSystemAssembler.h
@@ -114,8 +114,8 @@ class LinearSystemAssembler
                                            typename MeshFunction::IndexType > >::value != true,
       "Error: I am getting Vector instead of MeshFunction or similar object. You might forget to bind DofVector into MeshFunction in you method getExplicitUpdate."  );
 
-      const IndexType maxRowLength = matrixPointer.template getData< Devices::Host >().getMaxRowLength();
-      TNL_ASSERT_GT( maxRowLength, 0, "maximum row length must be positive" );
+      //const IndexType maxRowLength = matrixPointer.template getData< Devices::Host >().getMaxRowLength();
+      //TNL_ASSERT_GT( maxRowLength, 0, "maximum row length must be positive" );
       this->userData.time = time;
       this->userData.tau = tau;
       this->userData.u = &uPointer.template getData< DeviceType >();
diff --git a/src/TNL/Solvers/SolverConfig_impl.h b/src/TNL/Solvers/SolverConfig_impl.h
index 5642995e6b87c081d2cc891ffa9d97c50af707fa..3c21a7b239efb3d9c5bfdd5b19ce02e5e19d6a00 100644
--- a/src/TNL/Solvers/SolverConfig_impl.h
+++ b/src/TNL/Solvers/SolverConfig_impl.h
@@ -16,7 +16,7 @@
 #include <TNL/Solvers/PDE/ExplicitTimeStepper.h>
 #include <TNL/Solvers/PDE/TimeDependentPDESolver.h>
 #include <TNL/Solvers/LinearSolverTypeResolver.h>
-#include <TNL/Matrices/Legacy/CSR.h>
+#include <TNL/Matrices/SparseMatrix.h>
 
 namespace TNL {
 namespace Solvers {
@@ -139,7 +139,7 @@ bool SolverConfig< ConfigTag, ProblemConfig >::configSetup( Config::ConfigDescri
    if( ConfigTagTimeDiscretisation< ConfigTag, SemiImplicitTimeDiscretisationTag >::enabled )
    {
       config.addDelimiter( " === Semi-implicit solvers parameters === " );
-      typedef Matrices::Legacy::CSR< double, Devices::Host, int > MatrixType;
+      using MatrixType = Matrices::SparseMatrix< double >;
       Linear::CG< MatrixType >::configSetup( config );
       Linear::BICGStab< MatrixType >::configSetup( config );
       Linear::BICGStabL< MatrixType >::configSetup( config );
@@ -157,7 +157,6 @@ bool SolverConfig< ConfigTag, ProblemConfig >::configSetup( Config::ConfigDescri
    config.addEntry< String >( "log-file", "Log file for the computation.", "log.txt" );
    config.addEntry< int >( "log-width", "Number of columns of the log table.", 80 );
    return true;
-
 }
 
 } // namespace Solvers
diff --git a/src/UnitTests/Containers/ArrayTest.h b/src/UnitTests/Containers/ArrayTest.h
index 4f6fd7c92c8ea1198e6c9e521c26951df5060e15..c7d9a37405fcb3b8a5078c82890ef9e2f9ee455c 100644
--- a/src/UnitTests/Containers/ArrayTest.h
+++ b/src/UnitTests/Containers/ArrayTest.h
@@ -16,6 +16,9 @@
 #include <TNL/Containers/Array.h>
 #include <TNL/Containers/Vector.h>
 #include <TNL/Pointers/DevicePointer.h>
+#include <TNL/Pointers/SharedPointer.h>
+#include <TNL/Pointers/SmartPointersRegister.h>
+#include <TNL/Algorithms/ParallelFor.h>
 
 #include "gtest/gtest.h"
 
@@ -340,6 +343,37 @@ TYPED_TEST( ArrayTest, elementwiseAccess )
    testArrayElementwiseAccess( ArrayType() );
 }
 
+// Fills a[ i ] = i and b[ i ] = a[ i ] inside a ParallelFor lambda, then checks both arrays.
+template< typename ArrayType >
+void test_setElement()
+{
+   Pointers::SharedPointer< ArrayType > a( 10, 0 ), b( 10, 0 );
+   auto fill = [=] __cuda_callable__ ( int idx ) mutable {
+      a->setElement( idx, idx );
+      b->setElement( idx, a->getElement( idx ) );
+   };
+   Pointers::synchronizeSmartPointersOnDevice< typename ArrayType::DeviceType >();
+   Algorithms::ParallelFor< typename ArrayType::DeviceType >::exec( 0, 10, fill );
+   for( int idx = 0; idx < 10; idx++ ) {
+      EXPECT_EQ( a->getElement( idx ), idx );
+      EXPECT_EQ( b->getElement( idx ), idx );
+   }
+}
+
+// setElement/getElement round trip called directly from the test body, then via test_setElement.
+TYPED_TEST( ArrayTest, setElement )
+{
+   using ArrayType = typename TestFixture::ArrayType;
+   ArrayType array( 10 );
+   for( int idx = 0; idx < 10; idx++ )
+      array.setElement( idx, idx );
+
+   for( int idx = 0; idx < 10; idx++ )
+      EXPECT_EQ( array.getElement( idx ), idx );
+
+   test_setElement< ArrayType >();
+}
+
 TYPED_TEST( ArrayTest, containsValue )
 {
    using ArrayType = typename TestFixture::ArrayType;
diff --git a/src/UnitTests/Containers/ArrayViewTest.h b/src/UnitTests/Containers/ArrayViewTest.h
index e5a9d5a2091781669d81391f89e9097c4f0b36b5..b6f152c54604775d100c6c96cf6ddc591a259e45 100644
--- a/src/UnitTests/Containers/ArrayViewTest.h
+++ b/src/UnitTests/Containers/ArrayViewTest.h
@@ -287,6 +287,40 @@ void ArrayViewEvaluateTest( ArrayType& u )
    }
 }
 
+// Fills two arrays through their views inside a ParallelFor lambda, then checks the arrays.
+template< typename ArrayType >
+void test_setElement()
+{
+   ArrayType a( 10, 0 ), b( 10, 0 );
+   auto view_a = a.getView();
+   auto view_b = b.getView();
+   auto fill = [=] __cuda_callable__ ( int idx ) mutable {
+      view_a.setElement( idx, idx );
+      view_b.setElement( idx, view_a.getElement( idx ) );
+   };
+   Algorithms::ParallelFor< typename ArrayType::DeviceType >::exec( 0, 10, fill );
+   for( int idx = 0; idx < 10; idx++ ) {
+      EXPECT_EQ( a.getElement( idx ), idx );
+      EXPECT_EQ( b.getElement( idx ), idx );
+   }
+}
+
+// setElement/getElement round trip through a view called from the test body, then via test_setElement.
+TYPED_TEST( ArrayViewTest, setElement )
+{
+   using ArrayType = typename TestFixture::ArrayType;
+   ArrayType array( 10 );
+   auto view = array.getView();
+   for( int idx = 0; idx < 10; idx++ )
+      view.setElement( idx, idx );
+
+   for( int idx = 0; idx < 10; idx++ )
+      EXPECT_EQ( view.getElement( idx ), idx );
+
+   test_setElement< ArrayType >();
+}
+
+
 TYPED_TEST( ArrayViewTest, evaluate )
 {
    using ArrayType = typename TestFixture::ArrayType;
diff --git a/src/UnitTests/Containers/Segments/SegmentsTest.hpp b/src/UnitTests/Containers/Segments/SegmentsTest.hpp
index 6d4692dbe76785970d75bc7763216ad98f9b7be4..b520df21aa15630bd7e6ad75da1bbc2808808ad0 100644
--- a/src/UnitTests/Containers/Segments/SegmentsTest.hpp
+++ b/src/UnitTests/Containers/Segments/SegmentsTest.hpp
@@ -128,7 +128,7 @@ void test_AllReduction_MaximumInSegments()
    TNL::Containers::Vector< IndexType, DeviceType, IndexType > v( segments.getStorageSize() );
 
    auto view = v.getView();
-   auto init = [=] __cuda_callable__ ( const IndexType segmentIdx, const IndexType localIdx, const IndexType globalIdx ) mutable -> bool {
+   auto init = [=] __cuda_callable__ ( const IndexType segmentIdx, const IndexType localIdx, const IndexType globalIdx, bool& compute ) mutable -> bool {
       view[ globalIdx ] =  segmentIdx * 5 + localIdx + 1;
       return true;
    };
@@ -141,8 +141,8 @@ void test_AllReduction_MaximumInSegments()
    auto fetch = [=] __cuda_callable__ ( IndexType segmentIdx, IndexType localIdx, IndexType globalIdx, bool& compute ) -> IndexType {
       return v_view[ globalIdx ];
    };
-   auto reduce = [] __cuda_callable__ ( IndexType& a, const IndexType b ) {
-      a = TNL::max( a, b );
+   auto reduce = [] __cuda_callable__ ( IndexType& a, const IndexType b ) -> IndexType {
+      return TNL::max( a, b );
    };
    auto keep = [=] __cuda_callable__ ( const IndexType i, const IndexType a ) mutable {
       result_view[ i ] = a;
diff --git a/src/UnitTests/Matrices/BinarySparseMatrixCopyTest.h b/src/UnitTests/Matrices/BinarySparseMatrixCopyTest.h
index b901acbbd93dd7a7416645e70441d1382bd381a3..d7a3a429dc00dd194aef5aab2f05f289b9c9215e 100644
--- a/src/UnitTests/Matrices/BinarySparseMatrixCopyTest.h
+++ b/src/UnitTests/Matrices/BinarySparseMatrixCopyTest.h
@@ -14,7 +14,7 @@
 
 #include <TNL/Matrices/SparseMatrix.h>
 #include <TNL/Matrices/MatrixType.h>
-#include <TNL/Matrices/Dense.h>
+#include <TNL/Matrices/DenseMatrix.h>
 #include <TNL/Matrices/Tridiagonal.h>
 #include <TNL/Matrices/Multidiagonal.h>
 #include <TNL/Containers/Segments/CSR.h>
@@ -539,8 +539,8 @@ void denseMatrixAssignment()
    using DeviceType = typename Matrix::DeviceType;
    using IndexType = typename Matrix::IndexType;
 
-   using DenseHost = TNL::Matrices::Dense< RealType, TNL::Devices::Host, IndexType >;
-   using DenseCuda = TNL::Matrices::Dense< RealType, TNL::Devices::Cuda, IndexType >;
+   using DenseHost = TNL::Matrices::DenseMatrix< RealType, TNL::Devices::Host, IndexType >;
+   using DenseCuda = TNL::Matrices::DenseMatrix< RealType, TNL::Devices::Cuda, IndexType >;
 
    const IndexType rows( 10 ), columns( 10 );
    DenseHost hostMatrix( rows, columns );
diff --git a/src/UnitTests/Matrices/BinarySparseMatrixTest.hpp b/src/UnitTests/Matrices/BinarySparseMatrixTest.hpp
index 276c432ff349321beb642c177cb3ef6cd282059a..87d5e139bfc662cdd487e8fcfa2ce3c64a4b10c2 100644
--- a/src/UnitTests/Matrices/BinarySparseMatrixTest.hpp
+++ b/src/UnitTests/Matrices/BinarySparseMatrixTest.hpp
@@ -951,13 +951,10 @@ void test_RowsReduction()
    auto fetch = [] __cuda_callable__ ( IndexType row, IndexType column, IndexType globalIdx, const RealType& value ) -> IndexType {
       return ( value != 0.0 );
    };
-   auto reduce = [] __cuda_callable__ ( IndexType& aux, const IndexType a ) {
-      aux += a;
-   };
    auto keep = [=] __cuda_callable__ ( const IndexType rowIdx, const IndexType value ) mutable {
       rowLengths_view[ rowIdx ] = value;
    };
-   m.allRowsReduction( fetch, reduce, keep, 0 );
+   m.allRowsReduction( fetch, std::plus<>{}, keep, 0 );
    EXPECT_EQ( rowsCapacities, rowLengths );
    m.getCompressedRowLengths( rowLengths );
    EXPECT_EQ( rowsCapacities, rowLengths );
@@ -969,13 +966,10 @@ void test_RowsReduction()
    auto max_fetch = [] __cuda_callable__ ( IndexType row, IndexType column, IndexType globalIdx, const RealType& value ) -> IndexType {
       return abs( value );
    };
-   auto max_reduce = [] __cuda_callable__ ( IndexType& aux, const IndexType a ) {
-      aux += a;
-   };
    auto max_keep = [=] __cuda_callable__ ( const IndexType rowIdx, const IndexType value ) mutable {
       rowSums_view[ rowIdx ] = value;
    };
-   m.allRowsReduction( max_fetch, max_reduce, max_keep, 0 );
+   m.allRowsReduction( max_fetch, std::plus<>{}, max_keep, 0 );
    const RealType maxNorm = TNL::max( rowSums );
    EXPECT_EQ( maxNorm, 8 ) ; // 29+30+31+32+33+34+35+36
 }
diff --git a/src/UnitTests/Matrices/CMakeLists.txt b/src/UnitTests/Matrices/CMakeLists.txt
index f2ffd0c4bab1af47284ee98a09ff1f1002c5647b..eb8e2e1d5076f07963156a982bc2c2241c1bc683 100644
--- a/src/UnitTests/Matrices/CMakeLists.txt
+++ b/src/UnitTests/Matrices/CMakeLists.txt
@@ -25,6 +25,9 @@ IF( BUILD_CUDA )
    CUDA_ADD_EXECUTABLE( SparseMatrixTest_ChunkedEllpack SparseMatrixTest_ChunkedEllpack.cu OPTIONS ${CXX_TESTS_FLAGS} )
    TARGET_LINK_LIBRARIES( SparseMatrixTest_ChunkedEllpack ${GTEST_BOTH_LIBRARIES} )
 
+   CUDA_ADD_EXECUTABLE( SparseMatrixTest_BiEllpack SparseMatrixTest_BiEllpack.cu OPTIONS ${CXX_TESTS_FLAGS} )
+   TARGET_LINK_LIBRARIES( SparseMatrixTest_BiEllpack ${GTEST_BOTH_LIBRARIES} )
+
    CUDA_ADD_EXECUTABLE( SparseMatrixCopyTest SparseMatrixCopyTest.cu OPTIONS ${CXX_TESTS_FLAGS} )
    TARGET_LINK_LIBRARIES( SparseMatrixCopyTest ${GTEST_BOTH_LIBRARIES} )
 
@@ -79,6 +82,10 @@ ELSE(  BUILD_CUDA )
    TARGET_COMPILE_OPTIONS( SparseMatrixTest_ChunkedEllpack PRIVATE ${CXX_TESTS_FLAGS} )
    TARGET_LINK_LIBRARIES( SparseMatrixTest_ChunkedEllpack ${GTEST_BOTH_LIBRARIES} )
 
+   ADD_EXECUTABLE( SparseMatrixTest_BiEllpack SparseMatrixTest_BiEllpack.cpp )
+   TARGET_COMPILE_OPTIONS( SparseMatrixTest_BiEllpack PRIVATE ${CXX_TESTS_FLAGS} )
+   TARGET_LINK_LIBRARIES( SparseMatrixTest_BiEllpack ${GTEST_BOTH_LIBRARIES} )
+
    ADD_EXECUTABLE( SparseMatrixCopyTest SparseMatrixCopyTest.cpp )
    TARGET_COMPILE_OPTIONS( SparseMatrixCopyTest PRIVATE ${CXX_TESTS_FLAGS} )
    TARGET_LINK_LIBRARIES( SparseMatrixCopyTest ${GTEST_BOTH_LIBRARIES} )
@@ -117,6 +124,7 @@ ADD_TEST( SparseMatrixTest_CSR ${EXECUTABLE_OUTPUT_PATH}/SparseMatrixTest_CSR${C
 ADD_TEST( SparseMatrixTest_Ellpack ${EXECUTABLE_OUTPUT_PATH}/SparseMatrixTest_Ellpack${CMAKE_EXECUTABLE_SUFFIX} )
 ADD_TEST( SparseMatrixTest_SlicedEllpack ${EXECUTABLE_OUTPUT_PATH}/SparseMatrixTest_SlicedEllpack${CMAKE_EXECUTABLE_SUFFIX} )
 ADD_TEST( SparseMatrixTest_ChunkedEllpack ${EXECUTABLE_OUTPUT_PATH}/SparseMatrixTest_ChunkedEllpack${CMAKE_EXECUTABLE_SUFFIX} )
+ADD_TEST( SparseMatrixTest_BiEllpack ${EXECUTABLE_OUTPUT_PATH}/SparseMatrixTest_BiEllpack${CMAKE_EXECUTABLE_SUFFIX} )
 ADD_TEST( SparseMatrixCopyTest ${EXECUTABLE_OUTPUT_PATH}/SparseMatrixCopyTest${CMAKE_EXECUTABLE_SUFFIX} )
 ADD_TEST( BinarySparseMatrixTest_CSR ${EXECUTABLE_OUTPUT_PATH}/BinarySparseMatrixTest_CSR${CMAKE_EXECUTABLE_SUFFIX} )
 ADD_TEST( BinarySparseMatrixTest_Ellpack ${EXECUTABLE_OUTPUT_PATH}/BinarySparseMatrixTest_Ellpack${CMAKE_EXECUTABLE_SUFFIX} )
diff --git a/src/UnitTests/Matrices/DenseMatrixCopyTest.h b/src/UnitTests/Matrices/DenseMatrixCopyTest.h
index 3ef31f1075beb311374ad0a45e4a4aff7d2641eb..d041de1dbb19b6fa177932b40d0af5b6125eb6b3 100644
--- a/src/UnitTests/Matrices/DenseMatrixCopyTest.h
+++ b/src/UnitTests/Matrices/DenseMatrixCopyTest.h
@@ -14,7 +14,7 @@
 
 #include <TNL/Matrices/SparseMatrix.h>
 #include <TNL/Matrices/MatrixType.h>
-#include <TNL/Matrices/Dense.h>
+#include <TNL/Matrices/DenseMatrix.h>
 #include <TNL/Matrices/Tridiagonal.h>
 #include <TNL/Matrices/Multidiagonal.h>
 #include <TNL/Containers/Segments/CSR.h>
@@ -33,10 +33,10 @@ using E_host   = TNL::Matrices::SparseMatrix< int, TNL::Devices::Host, int, TNL:
 using E_cuda   = TNL::Matrices::SparseMatrix< int, TNL::Devices::Cuda, int, TNL::Matrices::GeneralMatrix, EllpackSegments >;
 using SE_host  = TNL::Matrices::SparseMatrix< int, TNL::Devices::Host, int, TNL::Matrices::GeneralMatrix, SlicedEllpackSegments >;
 using SE_cuda  = TNL::Matrices::SparseMatrix< int, TNL::Devices::Cuda, int, TNL::Matrices::GeneralMatrix, SlicedEllpackSegments >;
-using Dense_host               = TNL::Matrices::Dense< int, TNL::Devices::Host, int, false >;
-using Dense_host_RowMajorOrder = TNL::Matrices::Dense< int, TNL::Devices::Host, int, true >;
-using Dense_cuda               = TNL::Matrices::Dense< int, TNL::Devices::Cuda, int, false >;
-using Dense_cuda_RowMajorOrder = TNL::Matrices::Dense< int, TNL::Devices::Cuda, int, true >;
+using Dense_host               = TNL::Matrices::DenseMatrix< int, TNL::Devices::Host, int, false >;
+using Dense_host_RowMajorOrder = TNL::Matrices::DenseMatrix< int, TNL::Devices::Host, int, true >;
+using Dense_cuda               = TNL::Matrices::DenseMatrix< int, TNL::Devices::Cuda, int, false >;
+using Dense_cuda_RowMajorOrder = TNL::Matrices::DenseMatrix< int, TNL::Devices::Cuda, int, true >;
 
 
 #ifdef HAVE_GTEST
@@ -72,7 +72,7 @@ void setupUnevenRowSizeMatrix( Matrix& m )
    rowLengths.setElement( 6,  1 );
    rowLengths.setElement( 7,  1 );
    rowLengths.setElement( 9,  1 );
-   m.setCompressedRowLengths( rowLengths );
+   m.setRowCapacities( rowLengths );
 
     int value = 1;
     for( int i = 0; i < cols - 4; i++ )  // 0th row
@@ -202,7 +202,7 @@ void setupAntiTriDiagMatrix( Matrix& m )
    rowLengths.setValue( 3 );
    rowLengths.setElement( 0, 4);
    rowLengths.setElement( 1,  4 );
-   m.setCompressedRowLengths( rowLengths );
+   m.setRowCapacities( rowLengths );
 
    int value = 1;
    for( int i = 0; i < rows; i++ )
@@ -289,7 +289,7 @@ void setupTriDiagMatrix( Matrix& m )
    rowLengths.setValue( 3 );
    rowLengths.setElement( 0 , 4 );
    rowLengths.setElement( 1,  4 );
-   m.setCompressedRowLengths( rowLengths );
+   m.setRowCapacities( rowLengths );
 
 
    int value = 1;
@@ -501,8 +501,8 @@ void denseMatrixAssignment()
    using DeviceType = typename Matrix::DeviceType;
    using IndexType = typename Matrix::IndexType;
 
-   using DenseHost = TNL::Matrices::Dense< RealType, TNL::Devices::Host, IndexType >;
-   using DenseCuda = TNL::Matrices::Dense< RealType, TNL::Devices::Cuda, IndexType >;
+   using DenseHost = TNL::Matrices::DenseMatrix< RealType, TNL::Devices::Host, IndexType >;
+   using DenseCuda = TNL::Matrices::DenseMatrix< RealType, TNL::Devices::Cuda, IndexType >;
 
    const IndexType rows( 10 ), columns( 10 );
    DenseHost hostMatrix( rows, columns );
diff --git a/src/UnitTests/Matrices/DenseMatrixTest.h b/src/UnitTests/Matrices/DenseMatrixTest.h
index 8791b51fa6d8eb14a79f032a0cac0d1d91c653fd..391043f0f3129740afde07adb179e37ac9cab7e3 100644
--- a/src/UnitTests/Matrices/DenseMatrixTest.h
+++ b/src/UnitTests/Matrices/DenseMatrixTest.h
@@ -10,7 +10,7 @@
 
 #include <TNL/Devices/Host.h>
 #include <TNL/Matrices/Matrix.h>
-#include <TNL/Matrices/Dense.h>
+#include <TNL/Matrices/DenseMatrix.h>
 #include <TNL/Containers/Array.h>
 
 #include <TNL/Containers/Vector.h>
@@ -18,11 +18,11 @@
 #include <TNL/Math.h>
 #include <iostream>
 
-using Dense_host_float = TNL::Matrices::Dense< float, TNL::Devices::Host, int >;
-using Dense_host_int = TNL::Matrices::Dense< int, TNL::Devices::Host, int >;
+using Dense_host_float = TNL::Matrices::DenseMatrix< float, TNL::Devices::Host, int >;
+using Dense_host_int = TNL::Matrices::DenseMatrix< int, TNL::Devices::Host, int >;
 
-using Dense_cuda_float = TNL::Matrices::Dense< float, TNL::Devices::Cuda, int >;
-using Dense_cuda_int = TNL::Matrices::Dense< int, TNL::Devices::Cuda, int >;
+using Dense_cuda_float = TNL::Matrices::DenseMatrix< float, TNL::Devices::Cuda, int >;
+using Dense_cuda_int = TNL::Matrices::DenseMatrix< int, TNL::Devices::Cuda, int >;
 
 static const char* TEST_FILE_NAME = "test_DenseMatrixTest.tnl";
 
@@ -33,14 +33,14 @@ static const char* TEST_FILE_NAME = "test_DenseMatrixTest.tnl";
 
 void test_GetSerializationType()
 {
-   EXPECT_EQ( ( TNL::Matrices::Dense< float, TNL::Devices::Host, int, true >::getSerializationType() ), TNL::String( "Matrices::Dense< float, [any_device], int, true, [any_allocator] >" ) );
-   EXPECT_EQ( ( TNL::Matrices::Dense< int,   TNL::Devices::Host, int, true >::getSerializationType() ), TNL::String( "Matrices::Dense< int, [any_device], int, true, [any_allocator] >" ) );
-   EXPECT_EQ( ( TNL::Matrices::Dense< float, TNL::Devices::Cuda, int, true >::getSerializationType() ), TNL::String( "Matrices::Dense< float, [any_device], int, true, [any_allocator] >" ) );
-   EXPECT_EQ( ( TNL::Matrices::Dense< int,   TNL::Devices::Cuda, int, true >::getSerializationType() ), TNL::String( "Matrices::Dense< int, [any_device], int, true, [any_allocator] >" ) );
-   EXPECT_EQ( ( TNL::Matrices::Dense< float, TNL::Devices::Host, int, false >::getSerializationType() ), TNL::String( "Matrices::Dense< float, [any_device], int, false, [any_allocator] >" ) );
-   EXPECT_EQ( ( TNL::Matrices::Dense< int,   TNL::Devices::Host, int, false >::getSerializationType() ), TNL::String( "Matrices::Dense< int, [any_device], int, false, [any_allocator] >" ) );
-   EXPECT_EQ( ( TNL::Matrices::Dense< float, TNL::Devices::Cuda, int, false >::getSerializationType() ), TNL::String( "Matrices::Dense< float, [any_device], int, false, [any_allocator] >" ) );
-   EXPECT_EQ( ( TNL::Matrices::Dense< int,   TNL::Devices::Cuda, int, false >::getSerializationType() ), TNL::String( "Matrices::Dense< int, [any_device], int, false, [any_allocator] >" ) );
+   EXPECT_EQ( ( TNL::Matrices::DenseMatrix< float, TNL::Devices::Host, int, true >::getSerializationType() ), TNL::String( "Matrices::DenseMatrix< float, [any_device], int, true, [any_allocator] >" ) );
+   EXPECT_EQ( ( TNL::Matrices::DenseMatrix< int,   TNL::Devices::Host, int, true >::getSerializationType() ), TNL::String( "Matrices::DenseMatrix< int, [any_device], int, true, [any_allocator] >" ) );
+   EXPECT_EQ( ( TNL::Matrices::DenseMatrix< float, TNL::Devices::Cuda, int, true >::getSerializationType() ), TNL::String( "Matrices::DenseMatrix< float, [any_device], int, true, [any_allocator] >" ) );
+   EXPECT_EQ( ( TNL::Matrices::DenseMatrix< int,   TNL::Devices::Cuda, int, true >::getSerializationType() ), TNL::String( "Matrices::DenseMatrix< int, [any_device], int, true, [any_allocator] >" ) );
+   EXPECT_EQ( ( TNL::Matrices::DenseMatrix< float, TNL::Devices::Host, int, false >::getSerializationType() ), TNL::String( "Matrices::DenseMatrix< float, [any_device], int, false, [any_allocator] >" ) );
+   EXPECT_EQ( ( TNL::Matrices::DenseMatrix< int,   TNL::Devices::Host, int, false >::getSerializationType() ), TNL::String( "Matrices::DenseMatrix< int, [any_device], int, false, [any_allocator] >" ) );
+   EXPECT_EQ( ( TNL::Matrices::DenseMatrix< float, TNL::Devices::Cuda, int, false >::getSerializationType() ), TNL::String( "Matrices::DenseMatrix< float, [any_device], int, false, [any_allocator] >" ) );
+   EXPECT_EQ( ( TNL::Matrices::DenseMatrix< int,   TNL::Devices::Cuda, int, false >::getSerializationType() ), TNL::String( "Matrices::DenseMatrix< int, [any_device], int, false, [any_allocator] >" ) );
 }
 
 template< typename Matrix >
@@ -163,31 +163,7 @@ void test_GetCompressedRowLengths()
 }
 
 template< typename Matrix >
-void test_GetRowLength()
-{
-    using RealType = typename Matrix::RealType;
-    using DeviceType = typename Matrix::DeviceType;
-    using IndexType = typename Matrix::IndexType;
-
-    const IndexType rows = 8;
-    const IndexType cols = 7;
-
-    Matrix m;
-    m.reset();
-    m.setDimensions( rows, cols );
-
-    EXPECT_EQ( m.getRowLength( 0 ), 7 );
-    EXPECT_EQ( m.getRowLength( 1 ), 7 );
-    EXPECT_EQ( m.getRowLength( 2 ), 7 );
-    EXPECT_EQ( m.getRowLength( 3 ), 7 );
-    EXPECT_EQ( m.getRowLength( 4 ), 7 );
-    EXPECT_EQ( m.getRowLength( 5 ), 7 );
-    EXPECT_EQ( m.getRowLength( 6 ), 7 );
-    EXPECT_EQ( m.getRowLength( 7 ), 7 );
-}
-
-template< typename Matrix >
-void test_GetNumberOfMatrixElements()
+void test_GetElementsCount()
 {
     using RealType = typename Matrix::RealType;
     using DeviceType = typename Matrix::DeviceType;
@@ -200,11 +176,11 @@ void test_GetNumberOfMatrixElements()
     m.reset();
     m.setDimensions( rows, cols );
 
-    EXPECT_EQ( m.getNumberOfMatrixElements(), 42 );
+    EXPECT_EQ( m.getElementsCount(), 42 );
 }
 
 template< typename Matrix >
-void test_GetNumberOfNonzeroMatrixElements()
+void test_GetNonzeroElementsCount()
 {
     using RealType = typename Matrix::RealType;
     using DeviceType = typename Matrix::DeviceType;
@@ -236,7 +212,7 @@ void test_GetNumberOfNonzeroMatrixElements()
     m.setElement( 0, 0, 0); // Set the first element of the diagonal to 0.
     m.setElement( 6, 5, 0); // Set the last element of the diagonal to 0.
 
-    EXPECT_EQ( m.getNumberOfNonzeroMatrixElements(), 40 );
+    EXPECT_EQ( m.getNonzeroElementsCount(), 40 );
 }
 
 template< typename Matrix >
@@ -730,7 +706,7 @@ void test_AddRow()
       auto row = matrix_view.getRow( rowIdx );
       for( IndexType i = 0; i < 5; i++ )
       {
-         RealType& val = row.getValue( i );
+         RealType& val = row.getElement( i );
          val = rowIdx * val + values[ rowIdx ][ i ];
       }
    };
@@ -1191,8 +1167,8 @@ void test_AssignmentOperator()
    using DeviceType = typename Matrix::DeviceType;
    using IndexType = typename Matrix::IndexType;
 
-   using DenseHost = TNL::Matrices::Dense< RealType, TNL::Devices::Host, IndexType >;
-   using DenseCuda = TNL::Matrices::Dense< RealType, TNL::Devices::Cuda, IndexType >;
+   using DenseHost = TNL::Matrices::DenseMatrix< RealType, TNL::Devices::Host, IndexType >;
+   using DenseCuda = TNL::Matrices::DenseMatrix< RealType, TNL::Devices::Cuda, IndexType >;
 
    const IndexType rows( 10 ), columns( 10 );
    DenseHost hostMatrix( rows, columns );
@@ -1305,53 +1281,6 @@ void test_SaveAndLoad()
     EXPECT_EQ( savedMatrix.getElement( 3, 3 ), 16 );
 }
 
-template< typename Matrix >
-void test_Print()
-{
-    using RealType = typename Matrix::RealType;
-    using DeviceType = typename Matrix::DeviceType;
-    using IndexType = typename Matrix::IndexType;
-/*
- * Sets up the following 5x4 sparse matrix:
- *
- *    /  1  2  3  4 \
- *    |  5  6  7  8 |
- *    |  9 10 11 12 |
- *    | 13 14 15 16 |
- *    \ 17 18 19 20 /
- */
-    const IndexType rows = 5;
-    const IndexType cols = 4;
-
-    Matrix m;
-    m.reset();
-    m.setDimensions( rows, cols );
-
-    RealType value = 1;
-    for( IndexType i = 0; i < rows; i++)
-        for( IndexType j = 0; j < cols; j++)
-            m.setElement( i, j, value++ );
-
-    #include <sstream>
-    std::stringstream printed;
-    std::stringstream couted;
-
-    //change the underlying buffer and save the old buffer
-    auto old_buf = std::cout.rdbuf(printed.rdbuf());
-
-    m.print( std::cout ); //all the std::cout goes to ss
-
-    std::cout.rdbuf(old_buf); //reset
-
-    couted << "Row: 0 ->  Col:0->1	 Col:1->2	 Col:2->3	 Col:3->4\t\n"
-              "Row: 1 ->  Col:0->5	 Col:1->6	 Col:2->7	 Col:3->8\t\n"
-              "Row: 2 ->  Col:0->9	 Col:1->10	 Col:2->11	 Col:3->12\t\n"
-              "Row: 3 ->  Col:0->13	 Col:1->14	 Col:2->15	 Col:3->16\t\n"
-              "Row: 4 ->  Col:0->17	 Col:1->18	 Col:2->19	 Col:3->20\t\n";
-
-    EXPECT_EQ( printed.str(), couted.str() );
-}
-
 // test fixture for typed tests
 template< typename Matrix >
 class MatrixTest : public ::testing::Test
@@ -1363,31 +1292,31 @@ protected:
 // types for which MatrixTest is instantiated
 using MatrixTypes = ::testing::Types
 <
-    TNL::Matrices::Dense< int,    TNL::Devices::Host, short >,
-    TNL::Matrices::Dense< long,   TNL::Devices::Host, short >,
-    TNL::Matrices::Dense< float,  TNL::Devices::Host, short >,
-    TNL::Matrices::Dense< double, TNL::Devices::Host, short >,
-    TNL::Matrices::Dense< int,    TNL::Devices::Host, int >,
-    TNL::Matrices::Dense< long,   TNL::Devices::Host, int >,
-    TNL::Matrices::Dense< float,  TNL::Devices::Host, int >,
-    TNL::Matrices::Dense< double, TNL::Devices::Host, int >,
-    TNL::Matrices::Dense< int,    TNL::Devices::Host, long >,
-    TNL::Matrices::Dense< long,   TNL::Devices::Host, long >,
-    TNL::Matrices::Dense< float,  TNL::Devices::Host, long >,
-    TNL::Matrices::Dense< double, TNL::Devices::Host, long >
+    TNL::Matrices::DenseMatrix< int,    TNL::Devices::Host, short >,
+    TNL::Matrices::DenseMatrix< long,   TNL::Devices::Host, short >,
+    TNL::Matrices::DenseMatrix< float,  TNL::Devices::Host, short >,
+    TNL::Matrices::DenseMatrix< double, TNL::Devices::Host, short >,
+    TNL::Matrices::DenseMatrix< int,    TNL::Devices::Host, int >,
+    TNL::Matrices::DenseMatrix< long,   TNL::Devices::Host, int >,
+    TNL::Matrices::DenseMatrix< float,  TNL::Devices::Host, int >,
+    TNL::Matrices::DenseMatrix< double, TNL::Devices::Host, int >,
+    TNL::Matrices::DenseMatrix< int,    TNL::Devices::Host, long >,
+    TNL::Matrices::DenseMatrix< long,   TNL::Devices::Host, long >,
+    TNL::Matrices::DenseMatrix< float,  TNL::Devices::Host, long >,
+    TNL::Matrices::DenseMatrix< double, TNL::Devices::Host, long >
 #ifdef HAVE_CUDA
-    ,TNL::Matrices::Dense< int,    TNL::Devices::Cuda, short >,
-    TNL::Matrices::Dense< long,   TNL::Devices::Cuda, short >,
-    TNL::Matrices::Dense< float,  TNL::Devices::Cuda, short >,
-    TNL::Matrices::Dense< double, TNL::Devices::Cuda, short >,
-    TNL::Matrices::Dense< int,    TNL::Devices::Cuda, int >,
-    TNL::Matrices::Dense< long,   TNL::Devices::Cuda, int >,
-    TNL::Matrices::Dense< float,  TNL::Devices::Cuda, int >,
-    TNL::Matrices::Dense< double, TNL::Devices::Cuda, int >,
-    TNL::Matrices::Dense< int,    TNL::Devices::Cuda, long >,
-    TNL::Matrices::Dense< long,   TNL::Devices::Cuda, long >,
-    TNL::Matrices::Dense< float,  TNL::Devices::Cuda, long >,
-    TNL::Matrices::Dense< double, TNL::Devices::Cuda, long >
+    ,TNL::Matrices::DenseMatrix< int,    TNL::Devices::Cuda, short >,
+    TNL::Matrices::DenseMatrix< long,   TNL::Devices::Cuda, short >,
+    TNL::Matrices::DenseMatrix< float,  TNL::Devices::Cuda, short >,
+    TNL::Matrices::DenseMatrix< double, TNL::Devices::Cuda, short >,
+    TNL::Matrices::DenseMatrix< int,    TNL::Devices::Cuda, int >,
+    TNL::Matrices::DenseMatrix< long,   TNL::Devices::Cuda, int >,
+    TNL::Matrices::DenseMatrix< float,  TNL::Devices::Cuda, int >,
+    TNL::Matrices::DenseMatrix< double, TNL::Devices::Cuda, int >,
+    TNL::Matrices::DenseMatrix< int,    TNL::Devices::Cuda, long >,
+    TNL::Matrices::DenseMatrix< long,   TNL::Devices::Cuda, long >,
+    TNL::Matrices::DenseMatrix< float,  TNL::Devices::Cuda, long >,
+    TNL::Matrices::DenseMatrix< double, TNL::Devices::Cuda, long >
 #endif
 >;
 
@@ -1419,25 +1348,18 @@ TYPED_TEST( MatrixTest, setElementsTest )
     test_SetElements< MatrixType >();
 }
 
-TYPED_TEST( MatrixTest, getRowLengthTest )
+TYPED_TEST( MatrixTest, getElementsCountTest )
 {
     using MatrixType = typename TestFixture::MatrixType;
 
-    test_GetRowLength< MatrixType >();
+    test_GetElementsCount< MatrixType >();
 }
 
-TYPED_TEST( MatrixTest, getNumberOfMatrixElementsTest )
+TYPED_TEST( MatrixTest, getNonzeroElementsCountTest )
 {
     using MatrixType = typename TestFixture::MatrixType;
 
-    test_GetNumberOfMatrixElements< MatrixType >();
-}
-
-TYPED_TEST( MatrixTest, getNumberOfNonzeroMatrixElementsTest )
-{
-    using MatrixType = typename TestFixture::MatrixType;
-
-    test_GetNumberOfNonzeroMatrixElements< MatrixType >();
+    test_GetNonzeroElementsCount< MatrixType >();
 }
 
 TYPED_TEST( MatrixTest, resetTest )
@@ -1510,13 +1432,6 @@ TYPED_TEST( MatrixTest, saveAndLoadTest )
     test_SaveAndLoad< MatrixType >();
 }
 
-TYPED_TEST( MatrixTest, printTest )
-{
-    using MatrixType = typename TestFixture::MatrixType;
-
-    test_Print< MatrixType >();
-}
-
 //// test_getType is not general enough yet. DO NOT TEST IT YET.
 
 //TEST( DenseMatrixTest, Dense_GetTypeTest_Host )
diff --git a/src/UnitTests/Matrices/DistributedMatrixTest.h b/src/UnitTests/Matrices/DistributedMatrixTest.h
index ca3f2a100280e16036fca29a717b0ab5db02dfd3..d030777a696ce36a6808d2f13cc556802688cb59 100644
--- a/src/UnitTests/Matrices/DistributedMatrixTest.h
+++ b/src/UnitTests/Matrices/DistributedMatrixTest.h
@@ -13,7 +13,7 @@
 #include <TNL/Communicators/NoDistrCommunicator.h>
 #include <TNL/Matrices/DistributedMatrix.h>
 #include <TNL/Containers/Partitioner.h>
-#include <TNL/Matrices/Legacy/CSR.h>
+#include <TNL/Matrices/SparseMatrix.h>
 
 using namespace TNL;
 
@@ -57,9 +57,7 @@ void setMatrix( Matrix& matrix, const RowLengths& rowLengths )
  * - Number of processes is not limited.
  * - Global size is hardcoded as 97 to force non-uniform distribution.
  * - Communication group is hardcoded as AllGroup -- it may be changed as needed.
- * - Matrix format is hardcoded as CSR -- it should be possible to change it to
- *   any other format which does not include padding zeros in the getRowLength()
- *   result.
+ * - Matrix format is hardcoded as CSR.
  */
 template< typename DistributedMatrix >
 class DistributedMatrixTest
@@ -103,12 +101,12 @@ protected:
 
 // types for which DistributedMatrixTest is instantiated
 using DistributedMatrixTypes = ::testing::Types<
-   Matrices::DistributedMatrix< Matrices::Legacy::CSR< double, Devices::Host, int >, Communicators::MpiCommunicator >,
-   Matrices::DistributedMatrix< Matrices::Legacy::CSR< double, Devices::Host, int >, Communicators::NoDistrCommunicator >
+   Matrices::DistributedMatrix< Matrices::SparseMatrix< double, Devices::Host, int >, Communicators::MpiCommunicator >,
+   Matrices::DistributedMatrix< Matrices::SparseMatrix< double, Devices::Host, int >, Communicators::NoDistrCommunicator >
 #ifdef HAVE_CUDA
    ,
-   Matrices::DistributedMatrix< Matrices::Legacy::CSR< double, Devices::Cuda, int >, Communicators::MpiCommunicator >,
-   Matrices::DistributedMatrix< Matrices::Legacy::CSR< double, Devices::Cuda, int >, Communicators::NoDistrCommunicator >
+   Matrices::DistributedMatrix< Matrices::SparseMatrix< double, Devices::Cuda, int >, Communicators::MpiCommunicator >,
+   Matrices::DistributedMatrix< Matrices::SparseMatrix< double, Devices::Cuda, int >, Communicators::NoDistrCommunicator >
 #endif
 >;
 
@@ -154,14 +152,14 @@ TYPED_TEST( DistributedMatrixTest, setCompressedRowLengths )
 {
    for( int i = 0; i < this->matrix.getLocalMatrix().getRows(); i++ ) {
       const auto gi = this->matrix.getLocalRowRange().getGlobalIndex( i );
-      EXPECT_EQ( this->matrix.getRowLength( gi ), 0 );
-      EXPECT_EQ( this->matrix.getLocalMatrix().getRowLength( i ), 0 );
+      EXPECT_EQ( this->matrix.getRowCapacity( gi ), 0 );
+      EXPECT_EQ( this->matrix.getLocalMatrix().getRowCapacity( i ), 0 );
    }
    this->matrix.setCompressedRowLengths( this->rowLengths );
    for( int i = 0; i < this->matrix.getLocalMatrix().getRows(); i++ ) {
       const auto gi = this->matrix.getLocalRowRange().getGlobalIndex( i );
-      EXPECT_EQ( this->matrix.getRowLength( gi ), gi + 1 );
-      EXPECT_EQ( this->matrix.getLocalMatrix().getRowLength( i ), gi + 1 );
+      EXPECT_EQ( this->matrix.getRowCapacity( gi ), gi + 1 );
+      EXPECT_EQ( this->matrix.getLocalMatrix().getRowCapacity( i ), gi + 1 );
    }
 }
 
@@ -171,7 +169,13 @@ TYPED_TEST( DistributedMatrixTest, getCompressedRowLengths )
 
    this->matrix.setCompressedRowLengths( this->rowLengths );
    RowLengthsVector output;
-   this->matrix.getCompressedRowLengths( output ); // TODO: replace this with getRowCapacities
+   this->matrix.getCompressedRowLengths( output );
+   // compressed row lengths are all zero: capacities are allocated, but no elements have been set yet
+   EXPECT_EQ( output, 0 );
+   for( int i = 0; i < this->matrix.getLocalMatrix().getRows(); i++ ) {
+      const auto gi = this->matrix.getLocalRowRange().getGlobalIndex( i );
+      output.setElement( gi, this->matrix.getRowCapacity( gi ) );
+   }
    EXPECT_EQ( output, this->rowLengths );
 }
 
@@ -203,6 +207,7 @@ TYPED_TEST( DistributedMatrixTest, setGetElement )
 
 // TODO: getRow (const and non-const)
 
+
 TYPED_TEST( DistributedMatrixTest, vectorProduct_globalInput )
 {
    using GlobalVector = typename TestFixture::GlobalVector;
diff --git a/src/UnitTests/Matrices/SparseMatrixCopyTest.h b/src/UnitTests/Matrices/SparseMatrixCopyTest.h
index 829c30677b2c7e3a0209ed72c01a991ffec56d1c..6f8a142a673f6e206d6fa589145706c55aed3ec5 100644
--- a/src/UnitTests/Matrices/SparseMatrixCopyTest.h
+++ b/src/UnitTests/Matrices/SparseMatrixCopyTest.h
@@ -14,7 +14,7 @@
 
 #include <TNL/Matrices/SparseMatrix.h>
 #include <TNL/Matrices/MatrixType.h>
-#include <TNL/Matrices/Dense.h>
+#include <TNL/Matrices/DenseMatrix.h>
 #include <TNL/Matrices/Tridiagonal.h>
 #include <TNL/Matrices/Multidiagonal.h>
 #include <TNL/Containers/Segments/CSR.h>
@@ -542,8 +542,8 @@ void denseMatrixAssignment()
    using DeviceType = typename Matrix::DeviceType;
    using IndexType = typename Matrix::IndexType;
 
-   using DenseHost = TNL::Matrices::Dense< RealType, TNL::Devices::Host, IndexType >;
-   using DenseCuda = TNL::Matrices::Dense< RealType, TNL::Devices::Cuda, IndexType >;
+   using DenseHost = TNL::Matrices::DenseMatrix< RealType, TNL::Devices::Host, IndexType >;
+   using DenseCuda = TNL::Matrices::DenseMatrix< RealType, TNL::Devices::Cuda, IndexType >;
 
    const IndexType rows( 10 ), columns( 10 );
    DenseHost hostMatrix( rows, columns );
diff --git a/src/UnitTests/Matrices/SparseMatrixTest.h b/src/UnitTests/Matrices/SparseMatrixTest.h
index 12cdbeef3fca46946193ff95f7a9f8ab455e0d19..a00e696871b4393672cc6c2475fb961f08187b8d 100644
--- a/src/UnitTests/Matrices/SparseMatrixTest.h
+++ b/src/UnitTests/Matrices/SparseMatrixTest.h
@@ -115,5 +115,4 @@ TYPED_TEST( MatrixTest, printTest )
 
     test_Print< MatrixType >();
 }
-
 #endif
\ No newline at end of file
diff --git a/src/UnitTests/Matrices/SparseMatrixTest.hpp b/src/UnitTests/Matrices/SparseMatrixTest.hpp
index 8080d45e54553f53a2f02b502d3dee9b7a680426..6d7c6436058cec31efa1fe4e64cc4e27238a67a5 100644
--- a/src/UnitTests/Matrices/SparseMatrixTest.hpp
+++ b/src/UnitTests/Matrices/SparseMatrixTest.hpp
@@ -10,6 +10,7 @@
 
 #pragma once
 
+#include <functional>
 #include <TNL/Containers/Vector.h>
 #include <TNL/Containers/VectorView.h>
 #include <TNL/Math.h>
@@ -79,6 +80,16 @@ void test_Constructors()
       EXPECT_EQ( m2.getRow( 3 ).getValue( 0 ), 1 );   // 3rd row
       EXPECT_EQ( m2.getRow( 3 ).getValue( 1 ), 1 );
       EXPECT_EQ( m2.getRow( 4 ).getValue( 0 ), 1 );   // 4th row
+
+      const Matrix& mm = m2;
+      EXPECT_EQ( mm.getRow( 0 ).getValue( 0 ), 1 );   // 0th row
+      EXPECT_EQ( mm.getRow( 1 ).getValue( 0 ), 1 );   // 1st row
+      EXPECT_EQ( mm.getRow( 1 ).getValue( 1 ), 1 );
+      EXPECT_EQ( mm.getRow( 2 ).getValue( 0 ), 1 );   // 2nd row
+      EXPECT_EQ( mm.getRow( 2 ).getValue( 1 ), 1 );
+      EXPECT_EQ( mm.getRow( 3 ).getValue( 0 ), 1 );   // 3rd row
+      EXPECT_EQ( mm.getRow( 3 ).getValue( 1 ), 1 );
+      EXPECT_EQ( mm.getRow( 4 ).getValue( 0 ), 1 );   // 4th row
    }
 
    m2.getCompressedRowLengths( v1 );
@@ -95,7 +106,7 @@ void test_Constructors()
     *    \  0  0  0 12  0 /
     */
 
-   Matrix m3( 6, 5, {
+   const Matrix m3( 6, 5, {
       { 0, 0,  1.0 }, { 0, 1, 2.0 }, { 0, 2, 3.0 },
       { 1, 1,  4.0 }, { 1, 2, 5.0 }, { 1, 3, 6.0 },
       { 2, 2,  7.0 }, { 2, 3, 8.0 }, { 2, 4, 9.0 },
@@ -140,6 +151,27 @@ void test_Constructors()
    EXPECT_EQ( m3.getElement( 5, 3 ), 12 );
    EXPECT_EQ( m3.getElement( 5, 4 ),  0 );
 
+   if( std::is_same< DeviceType, TNL::Devices::Host >::value )
+   {
+      EXPECT_EQ( m3.getRow( 0 ).getValue( 0 ),  1 );
+      EXPECT_EQ( m3.getRow( 0 ).getValue( 1 ),  2 );
+      EXPECT_EQ( m3.getRow( 0 ).getValue( 2 ),  3 );
+
+      EXPECT_EQ( m3.getRow( 1 ).getValue( 0 ),  4 );
+      EXPECT_EQ( m3.getRow( 1 ).getValue( 1 ),  5 );
+      EXPECT_EQ( m3.getRow( 1 ).getValue( 2 ),  6 );
+
+      EXPECT_EQ( m3.getRow( 2 ).getValue( 0 ),  7 );
+      EXPECT_EQ( m3.getRow( 2 ).getValue( 1 ),  8 );
+      EXPECT_EQ( m3.getRow( 2 ).getValue( 2 ),  9 );
+
+      EXPECT_EQ( m3.getRow( 3 ).getValue( 0 ), 10 );
+
+      EXPECT_EQ( m3.getRow( 4 ).getValue( 0 ), 11 );
+
+      EXPECT_EQ( m3.getRow( 5 ).getValue( 0 ), 12 );
+   }
+
    std::map< std::pair< int, int >, float > map;
    map[ { 0, 0 } ] = 1.0;
    map[ { 0, 1 } ] = 2.0;
@@ -374,6 +406,118 @@ void test_GetRow()
    using DeviceType = typename Matrix::DeviceType;
    using IndexType = typename Matrix::IndexType;
 
+   Matrix m2( {1, 2, 2, 2, 1 }, 5 );
+   typename Matrix::RowsCapacitiesType v1, v2{ 1, 2, 2, 2, 1 };
+   m2.setElement( 0, 0, 1 );   // 0th row
+   m2.setElement( 1, 0, 1 );   // 1st row
+   m2.setElement( 1, 1, 1 );
+   m2.setElement( 2, 1, 1 );   // 2nd row
+   m2.setElement( 2, 2, 1 );
+   m2.setElement( 3, 2, 1 );   // 3rd row
+   m2.setElement( 3, 3, 1 );
+   m2.setElement( 4, 4, 1 );   // 4th row
+
+   EXPECT_EQ( m2.getElement( 0, 0 ), 1 );   // 0th row
+   EXPECT_EQ( m2.getElement( 1, 0 ), 1 );   // 1st row
+   EXPECT_EQ( m2.getElement( 1, 1 ), 1 );
+   EXPECT_EQ( m2.getElement( 2, 1 ), 1 );   // 2nd row
+   EXPECT_EQ( m2.getElement( 2, 2 ), 1 );
+   EXPECT_EQ( m2.getElement( 3, 2 ), 1 );   // 3rd row
+   EXPECT_EQ( m2.getElement( 3, 3 ), 1 );
+   EXPECT_EQ( m2.getElement( 4, 4 ), 1 );   // 4th row
+
+   if( std::is_same< DeviceType, TNL::Devices::Host >::value )
+   {
+      EXPECT_EQ( m2.getRow( 0 ).getValue( 0 ), 1 );   // 0th row
+      EXPECT_EQ( m2.getRow( 1 ).getValue( 0 ), 1 );   // 1st row
+      EXPECT_EQ( m2.getRow( 1 ).getValue( 1 ), 1 );
+      EXPECT_EQ( m2.getRow( 2 ).getValue( 0 ), 1 );   // 2nd row
+      EXPECT_EQ( m2.getRow( 2 ).getValue( 1 ), 1 );
+      EXPECT_EQ( m2.getRow( 3 ).getValue( 0 ), 1 );   // 3rd row
+      EXPECT_EQ( m2.getRow( 3 ).getValue( 1 ), 1 );
+      EXPECT_EQ( m2.getRow( 4 ).getValue( 0 ), 1 );   // 4th row
+   }
+
+   m2.getCompressedRowLengths( v1 );
+   EXPECT_EQ( v1, v2 );
+
+   /*
+    * Sets up the following 6x5 sparse matrix:
+    *
+    *    /  1  2  3  0  0 \
+    *    |  0  4  5  6  0 |
+    *    |  0  0  7  8  9 |
+    *    | 10  0  0  0  0 |
+    *    |  0 11  0  0  0 |
+    *    \  0  0  0 12  0 /
+    */
+
+   const Matrix m3( 6, 5, {
+      { 0, 0,  1.0 }, { 0, 1, 2.0 }, { 0, 2, 3.0 },
+      { 1, 1,  4.0 }, { 1, 2, 5.0 }, { 1, 3, 6.0 },
+      { 2, 2,  7.0 }, { 2, 3, 8.0 }, { 2, 4, 9.0 },
+      { 3, 0, 10.0 },
+      { 4, 1, 11.0 },
+      { 5, 3, 12.0 } } );
+
+   // Check the set elements
+   EXPECT_EQ( m3.getElement( 0, 0 ),  1 );
+   EXPECT_EQ( m3.getElement( 0, 1 ),  2 );
+   EXPECT_EQ( m3.getElement( 0, 2 ),  3 );
+   EXPECT_EQ( m3.getElement( 0, 3 ),  0 );
+   EXPECT_EQ( m3.getElement( 0, 4 ),  0 );
+
+   EXPECT_EQ( m3.getElement( 1, 0 ),  0 );
+   EXPECT_EQ( m3.getElement( 1, 1 ),  4 );
+   EXPECT_EQ( m3.getElement( 1, 2 ),  5 );
+   EXPECT_EQ( m3.getElement( 1, 3 ),  6 );
+   EXPECT_EQ( m3.getElement( 1, 4 ),  0 );
+
+   EXPECT_EQ( m3.getElement( 2, 0 ),  0 );
+   EXPECT_EQ( m3.getElement( 2, 1 ),  0 );
+   EXPECT_EQ( m3.getElement( 2, 2 ),  7 );
+   EXPECT_EQ( m3.getElement( 2, 3 ),  8 );
+   EXPECT_EQ( m3.getElement( 2, 4 ),  9 );
+
+   EXPECT_EQ( m3.getElement( 3, 0 ), 10 );
+   EXPECT_EQ( m3.getElement( 3, 1 ),  0 );
+   EXPECT_EQ( m3.getElement( 3, 2 ),  0 );
+   EXPECT_EQ( m3.getElement( 3, 3 ),  0 );
+   EXPECT_EQ( m3.getElement( 3, 4 ),  0 );
+
+   EXPECT_EQ( m3.getElement( 4, 0 ),  0 );
+   EXPECT_EQ( m3.getElement( 4, 1 ), 11 );
+   EXPECT_EQ( m3.getElement( 4, 2 ),  0 );
+   EXPECT_EQ( m3.getElement( 4, 3 ),  0 );
+   EXPECT_EQ( m3.getElement( 4, 4 ),  0 );
+
+   EXPECT_EQ( m3.getElement( 5, 0 ),  0 );
+   EXPECT_EQ( m3.getElement( 5, 1 ),  0 );
+   EXPECT_EQ( m3.getElement( 5, 2 ),  0 );
+   EXPECT_EQ( m3.getElement( 5, 3 ), 12 );
+   EXPECT_EQ( m3.getElement( 5, 4 ),  0 );
+
+   if( std::is_same< DeviceType, TNL::Devices::Host >::value )
+   {
+      EXPECT_EQ( m3.getRow( 0 ).getValue( 0 ),  1 );
+      EXPECT_EQ( m3.getRow( 0 ).getValue( 1 ),  2 );
+      EXPECT_EQ( m3.getRow( 0 ).getValue( 2 ),  3 );
+
+      EXPECT_EQ( m3.getRow( 1 ).getValue( 0 ),  4 );
+      EXPECT_EQ( m3.getRow( 1 ).getValue( 1 ),  5 );
+      EXPECT_EQ( m3.getRow( 1 ).getValue( 2 ),  6 );
+
+      EXPECT_EQ( m3.getRow( 2 ).getValue( 0 ),  7 );
+      EXPECT_EQ( m3.getRow( 2 ).getValue( 1 ),  8 );
+      EXPECT_EQ( m3.getRow( 2 ).getValue( 2 ),  9 );
+
+      EXPECT_EQ( m3.getRow( 3 ).getValue( 0 ), 10 );
+
+      EXPECT_EQ( m3.getRow( 4 ).getValue( 0 ), 11 );
+
+      EXPECT_EQ( m3.getRow( 5 ).getValue( 0 ), 12 );
+   }
+
    /*
     * Sets up the following 10x10 sparse matrix:
     *
@@ -1233,13 +1377,10 @@ void test_RowsReduction()
    auto fetch = [] __cuda_callable__ ( IndexType row, IndexType column, IndexType globalIdx, const RealType& value ) -> IndexType {
       return ( value != 0.0 );
    };
-   auto reduce = [] __cuda_callable__ ( IndexType& aux, const IndexType a ) {
-      aux += a;
-   };
    auto keep = [=] __cuda_callable__ ( const IndexType rowIdx, const IndexType value ) mutable {
       rowLengths_view[ rowIdx ] = value;
    };
-   m.allRowsReduction( fetch, reduce, keep, 0 );
+   m.allRowsReduction( fetch, std::plus<>{}, keep, 0 );
    EXPECT_EQ( rowsCapacities, rowLengths );
    m.getCompressedRowLengths( rowLengths );
    EXPECT_EQ( rowsCapacities, rowLengths );
@@ -1251,13 +1392,10 @@ void test_RowsReduction()
    auto max_fetch = [] __cuda_callable__ ( IndexType row, IndexType column, IndexType globalIdx, const RealType& value ) -> IndexType {
       return abs( value );
    };
-   auto max_reduce = [] __cuda_callable__ ( IndexType& aux, const IndexType a ) {
-      aux += a;
-   };
    auto max_keep = [=] __cuda_callable__ ( const IndexType rowIdx, const IndexType value ) mutable {
       rowSums_view[ rowIdx ] = value;
    };
-   m.allRowsReduction( max_fetch, max_reduce, max_keep, 0 );
+   m.allRowsReduction( max_fetch, std::plus<>{}, max_keep, 0 );
    const RealType maxNorm = TNL::max( rowSums );
    EXPECT_EQ( maxNorm, 260 ) ; // 29+30+31+32+33+34+35+36
 }
diff --git a/src/UnitTests/Matrices/SparseMatrixTest_BiEllpack.cpp b/src/UnitTests/Matrices/SparseMatrixTest_BiEllpack.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..ba7f3cf8db2a49c7df8a950557d637fb5c777bbb
--- /dev/null
+++ b/src/UnitTests/Matrices/SparseMatrixTest_BiEllpack.cpp
@@ -0,0 +1,11 @@
+/***************************************************************************
+                          SparseMatrixTest_BiEllpack.cpp -  description
+                             -------------------
+    begin                : Apr 7, 2020
+    copyright            : (C) 2020 by Tomas Oberhuber et al.
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+#include "SparseMatrixTest_BiEllpack.h"
diff --git a/src/UnitTests/Matrices/SparseMatrixTest_BiEllpack.cu b/src/UnitTests/Matrices/SparseMatrixTest_BiEllpack.cu
new file mode 100644
index 0000000000000000000000000000000000000000..1121477b5752e14eaaf2dd3de725559de344f7fb
--- /dev/null
+++ b/src/UnitTests/Matrices/SparseMatrixTest_BiEllpack.cu
@@ -0,0 +1,11 @@
+/***************************************************************************
+                          SparseMatrixTest_BiEllpack.cu -  description
+                             -------------------
+    begin                : Apr 7, 2020
+    copyright            : (C) 2020 by Tomas Oberhuber et al.
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+#include "SparseMatrixTest_BiEllpack.h"
diff --git a/src/UnitTests/Matrices/SparseMatrixTest_BiEllpack.h b/src/UnitTests/Matrices/SparseMatrixTest_BiEllpack.h
new file mode 100644
index 0000000000000000000000000000000000000000..03cc3646bf4176387c282bf4ed8d456a7e930172
--- /dev/null
+++ b/src/UnitTests/Matrices/SparseMatrixTest_BiEllpack.h
@@ -0,0 +1,58 @@
+/***************************************************************************
+                          SparseMatrixTest_BiEllpack.h -  description
+                             -------------------
+    begin                : Apr 7, 2020
+    copyright            : (C) 2020 by Tomas Oberhuber et al.
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+#include <iostream>
+#include <TNL/Containers/Segments/BiEllpack.h>
+#include <TNL/Matrices/SparseMatrix.h>
+
+#ifdef HAVE_GTEST
+#include <gtest/gtest.h>
+
+const char* saveAndLoadFileName = "test_SparseMatrixTest_BiEllpack_segments";
+
+////
+// Row-major format is used for the host system
+template< typename Device, typename Index, typename IndexAllocator >
+using RowMajorBiEllpack = TNL::Containers::Segments::BiEllpack< Device, Index, IndexAllocator, true >;
+
+////
+// Column-major format is used for GPUs
+template< typename Device, typename Index, typename IndexAllocator >
+using ColumnMajorBiEllpack = TNL::Containers::Segments::BiEllpack< Device, Index, IndexAllocator, false >;
+
+// Types for which MatrixTest is instantiated; the last entry of the list must NOT end with a comma.
+using MatrixTypes = ::testing::Types
+<
+    TNL::Matrices::SparseMatrix< int,     TNL::Devices::Host, int,   TNL::Matrices::GeneralMatrix, ColumnMajorBiEllpack >,
+    TNL::Matrices::SparseMatrix< int,     TNL::Devices::Host, int,   TNL::Matrices::GeneralMatrix, RowMajorBiEllpack >,
+    TNL::Matrices::SparseMatrix< long,    TNL::Devices::Host, int,   TNL::Matrices::GeneralMatrix, RowMajorBiEllpack >,
+    TNL::Matrices::SparseMatrix< float,   TNL::Devices::Host, int,   TNL::Matrices::GeneralMatrix, RowMajorBiEllpack >,
+    TNL::Matrices::SparseMatrix< double,  TNL::Devices::Host, int,   TNL::Matrices::GeneralMatrix, RowMajorBiEllpack >,
+    TNL::Matrices::SparseMatrix< int,     TNL::Devices::Host, long,  TNL::Matrices::GeneralMatrix, RowMajorBiEllpack >,
+    TNL::Matrices::SparseMatrix< long,    TNL::Devices::Host, long,  TNL::Matrices::GeneralMatrix, RowMajorBiEllpack >,
+    TNL::Matrices::SparseMatrix< float,   TNL::Devices::Host, long,  TNL::Matrices::GeneralMatrix, RowMajorBiEllpack >,
+    TNL::Matrices::SparseMatrix< double,  TNL::Devices::Host, long,  TNL::Matrices::GeneralMatrix, RowMajorBiEllpack >
+#ifdef HAVE_CUDA
+   ,TNL::Matrices::SparseMatrix< int,     TNL::Devices::Cuda, int,   TNL::Matrices::GeneralMatrix, ColumnMajorBiEllpack >,
+    TNL::Matrices::SparseMatrix< long,    TNL::Devices::Cuda, int,   TNL::Matrices::GeneralMatrix, ColumnMajorBiEllpack >,
+    TNL::Matrices::SparseMatrix< float,   TNL::Devices::Cuda, int,   TNL::Matrices::GeneralMatrix, ColumnMajorBiEllpack >,
+    TNL::Matrices::SparseMatrix< double,  TNL::Devices::Cuda, int,   TNL::Matrices::GeneralMatrix, ColumnMajorBiEllpack >,
+    TNL::Matrices::SparseMatrix< int,     TNL::Devices::Cuda, long,  TNL::Matrices::GeneralMatrix, ColumnMajorBiEllpack >,
+    TNL::Matrices::SparseMatrix< long,    TNL::Devices::Cuda, long,  TNL::Matrices::GeneralMatrix, ColumnMajorBiEllpack >,
+    TNL::Matrices::SparseMatrix< float,   TNL::Devices::Cuda, long,  TNL::Matrices::GeneralMatrix, ColumnMajorBiEllpack >,
+    TNL::Matrices::SparseMatrix< double,  TNL::Devices::Cuda, long,  TNL::Matrices::GeneralMatrix, ColumnMajorBiEllpack >,
+    TNL::Matrices::SparseMatrix< int,     TNL::Devices::Cuda, int,   TNL::Matrices::GeneralMatrix, RowMajorBiEllpack >
+#endif
+>;
+
+#endif
+
+#include "SparseMatrixTest.h"
+#include "../main.h"
diff --git a/src/UnitTests/Matrices/SymmetricSparseMatrixTest.hpp b/src/UnitTests/Matrices/SymmetricSparseMatrixTest.hpp
index 58a4f4fce0e4fdafcaf2c095e6d35875c51d286b..4e28842ba066ea5f794d8a279dacda09fbad1a85 100644
--- a/src/UnitTests/Matrices/SymmetricSparseMatrixTest.hpp
+++ b/src/UnitTests/Matrices/SymmetricSparseMatrixTest.hpp
@@ -903,13 +903,10 @@ void test_RowsReduction()
          TNL::Algorithms::AtomicOperations< DeviceType >::add( rowLengths_view[ column ], ( IndexType ) 1 );
       return ( value != 0.0 );
    };
-   auto reduce = [] __cuda_callable__ ( IndexType& aux, const IndexType a ) {
-      aux += a;
-   };
    auto keep = [=] __cuda_callable__ ( const IndexType rowIdx, const IndexType value ) mutable {
       rowLengths_view[ rowIdx ] += value;
    };
-   m_5.allRowsReduction( fetch, reduce, keep, 0 );
+   m_5.allRowsReduction( fetch, std::plus<>{}, keep, 0 );
 
    EXPECT_EQ( rowLengths_true, rowLengths );
    m_5.getCompressedRowLengths( rowLengths );