diff --git a/CMakeLists.txt b/CMakeLists.txt
index ecc55634635c3104ed6bf65d6b30bce20277de2c..18c358ee6b213c3c3836bd2aaecbd67a3b991f7f 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -309,30 +309,34 @@ if( ${WITH_GMP} )
    endif()
 endif()
 
-#if( BUILD_MPI )
-#   FIND_PATH( PETSC_INCLUDE_DIR petsc.h
-#     /usr/include/petsc
-#     ${PETSC_DIR}/${PETSC_ARCH}/include
-#     ${PETSC_DIR}/include
-#     DOC "PETSC headers."
-#   )
-#   if( ${PETSC_INCLUDE_DIR} STREQUAL "PETSC_INCLUDE_DIR-NOTFOUND" )
-#      message( "PETSC not found." )
-#   else()
-#      message( "PETSC headers found -- ${PETSC_INCLUDE_DIR}" )
-#      FIND_LIBRARY(PETSC_LIBRARY petsc
-#                  ${PETSC_INCLUDE_DIR}/../lib
-#                  /usr/local/lib
-#                  /usr/lib)
-#      if( PETSC_LIBRARY )
-#         #string( REPLACE ";" " " MPI_LIBRARIES "${MPI_CXX_LIBRARIES}" )
-#         #set( PETSC_LIBRARY "${MPI_LIBRARIES} ${PETSC_LIBRARY}")
-#         message( "PETSC library found -- ${PETSC_LIBRARY}")
-#         list( GET MPI_CXX_INCLUDE_PATH 0 MPI_CXX_PATH )
-#         set(PETSC_CXX_FLAGS "-DHAVE_PETSC -I${PETSC_INCLUDE_DIR} -DHAVE_MPI -I${MPI_CXX_PATH}")
-#      endif()
-#   endif()
-#endif()
+####
+# Test for PETSc (only when MPI support is being built).
+# On success PETSC_CXX_FLAGS and PETSC_LINKER_FLAGS are set; otherwise they are left untouched.
+if( BUILD_MPI )
+   find_path( PETSC_INCLUDE_DIR petsc.h
+      PATHS
+         /usr/include/petsc
+         ${PETSC_DIR}/${PETSC_ARCH}/include
+         ${PETSC_DIR}/include
+      DOC "PETSC headers."
+   )
+   # if() treats an unset variable and any *-NOTFOUND value as false, so no string comparison is needed.
+   if( NOT PETSC_INCLUDE_DIR )
+      message( "PETSC not found." )
+   else()
+      message( "PETSC headers found -- ${PETSC_INCLUDE_DIR}" )
+      find_library( PETSC_LIBRARY petsc
+         PATHS ${PETSC_INCLUDE_DIR}/../lib /usr/local/lib /usr/lib )
+      if( PETSC_LIBRARY )
+         message( "PETSC library found -- ${PETSC_LIBRARY}")
+         # Only PETSc's own define and include path are added; MPI flags are not included.
+         set(PETSC_CXX_FLAGS -DHAVE_PETSC -I${PETSC_INCLUDE_DIR})
+         set(PETSC_LINKER_FLAGS ${PETSC_LIBRARY})
+      else()
+         message( "PETSC library not found." )
+      endif()
+   endif()
+endif()
 
 # configure build paths
 set( CMAKE_RUNTIME_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}/bin )
@@ -413,6 +417,9 @@ message( "   CMAKE_SHARED_LINKER_FLAGS_RELEASE = ${CMAKE_SHARED_LINKER_FLAGS_REL
 message( "   CUDA_NVCC_FLAGS = ${CUDA_NVCC_FLAGS}" )
 message( "   CUDA_SAMPLES_FLAGS = ${CUDA_SAMPLES_FLAGS}" )
 message( "   GMP_LIBRARIES = ${GMP_LIBRARIES}" )
+message( "   PETSC_CXX_FLAGS = ${PETSC_CXX_FLAGS}" )
+message( "   PETSC_LINKER_FLAGS = ${PETSC_LINKER_FLAGS}" )
+
 if( MPI_CXX_FOUND AND ${WITH_MPI} )
    message( "   MPI_CXX_COMPILE_OPTIONS = ${MPI_CXX_COMPILE_OPTIONS}" )
    message( "   MPI_CXX_COMPILE_DEFINITIONS = ${MPI_CXX_COMPILE_DEFINITIONS}" )
diff --git a/Documentation/Examples/Algorithms/CMakeLists.txt b/Documentation/Examples/Algorithms/CMakeLists.txt
index 982b9c06f4901ec57240ca5affe6c5cdc81512e4..a2fc4569da1de701087a321256d0f2ce836c1dd1 100644
--- a/Documentation/Examples/Algorithms/CMakeLists.txt
+++ b/Documentation/Examples/Algorithms/CMakeLists.txt
@@ -1,54 +1,33 @@
-IF( BUILD_CUDA )
-   CUDA_ADD_EXECUTABLE( SortingExampleCuda SortingExample.cu)
-   ADD_CUSTOM_COMMAND( COMMAND SortingExampleCuda > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/SortingExample.out OUTPUT SortingExample.out )
-
-   CUDA_ADD_EXECUTABLE( SortingExample2Cuda SortingExample2.cu)
-   ADD_CUSTOM_COMMAND( COMMAND SortingExample2Cuda > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/SortingExample2.out OUTPUT SortingExample2.out )
-
-   CUDA_ADD_EXECUTABLE( SortingExample3Cuda SortingExample3.cu)
-   ADD_CUSTOM_COMMAND( COMMAND SortingExample3Cuda > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/SortingExample3.out OUTPUT SortingExample3.out )
-
-   CUDA_ADD_EXECUTABLE(ParallelForExampleCuda ParallelForExample.cu)
-   ADD_CUSTOM_COMMAND( COMMAND ParallelForExampleCuda > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/ParallelForExample.out OUTPUT ParallelForExample.out )
+# Examples of segments live in their own subdirectory.
+add_subdirectory( Segments )
+
+# Examples built from ${target}.cu when BUILD_CUDA is on and from ${target}.cpp otherwise.
+set( COMMON_EXAMPLES
+   SortingExample SortingExample2 SortingExample3
+   ParallelForExample
+   SequentialForExample
+)
 
-   CUDA_ADD_EXECUTABLE(reduceArrayExampleCuda reduceArrayExample.cu)
-   ADD_CUSTOM_COMMAND( COMMAND reduceArrayExampleCuda > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/reduceArrayExample.out OUTPUT reduceArrayExample.out )
+set( HOST_EXAMPLES
+   staticForExample
+   unrolledForExample
+)
+# HOST_EXAMPLES are always built for the host, even when BUILD_CUDA is on; COMMON_EXAMPLES switch to CUDA.
+foreach( target IN ITEMS ${COMMON_EXAMPLES} ${HOST_EXAMPLES} )
+   list( FIND COMMON_EXAMPLES ${target} common_index )
+   if( BUILD_CUDA AND NOT common_index EQUAL -1 )
+      cuda_add_executable( ${target}-cuda ${target}.cu OPTIONS )
+      add_custom_command( COMMAND ${target}-cuda > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/${target}.out OUTPUT ${target}.out )
+   else()
+      add_executable( ${target} ${target}.cpp )
+      add_custom_command( COMMAND ${target} > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/${target}.out OUTPUT ${target}.out )
+   endif()
+   list( APPEND CUDA_OUTPUTS ${target}.out )   # consumed by RunAlgorithmsExamples-cuda
+   list( APPEND HOST_OUTPUTS ${target}.out )   # consumed by RunAlgorithmsExamples
+endforeach()
 
-   CUDA_ADD_EXECUTABLE(reduceWithArgumentArrayExampleCuda reduceWithArgumentArrayExample.cu)
-   ADD_CUSTOM_COMMAND( COMMAND reduceWithArgumentArrayExampleCuda > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/reduceWithArgumentArrayExample.out OUTPUT reduceWithArgumentArrayExample.out )
+IF( BUILD_CUDA )
+   ADD_CUSTOM_TARGET( RunAlgorithmsExamples-cuda ALL DEPENDS ${CUDA_OUTPUTS} )
 ELSE()
-   ADD_EXECUTABLE( SortingExample SortingExample.cpp)
-   ADD_CUSTOM_COMMAND( COMMAND SortingExample > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/SortingExample.out OUTPUT SortingExample.out )
-
-   ADD_EXECUTABLE( SortingExample2 SortingExample2.cpp)
-   ADD_CUSTOM_COMMAND( COMMAND SortingExample2 > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/SortingExample2.out OUTPUT SortingExample2.out )
-
-   ADD_EXECUTABLE( SortingExample3 SortingExample3.cpp)
-   ADD_CUSTOM_COMMAND( COMMAND SortingExample3 > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/SortingExample3.out OUTPUT SortingExample3.out )
-
-   ADD_EXECUTABLE(ParallelForExample ParallelForExample.cpp)
-   ADD_CUSTOM_COMMAND( COMMAND ParallelForExample > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/ParallelForExample.out OUTPUT ParallelForExample.out )
-
-   ADD_EXECUTABLE(reduceArrayExample reduceArrayExample.cpp)
-   ADD_CUSTOM_COMMAND( COMMAND reduceArrayExample > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/reduceArrayExample.out OUTPUT reduceArrayExample.out )
-
-   ADD_EXECUTABLE(reduceWithArgumentArrayExample reduceWithArgumentArrayExample.cpp)
-   ADD_CUSTOM_COMMAND( COMMAND reduceWithArgumentArrayExample > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/reduceWithArgumentArrayExample.out OUTPUT reduceWithArgumentArrayExample.out )
-ENDIF()
-
-ADD_EXECUTABLE(staticForExample staticForExample.cpp)
-ADD_CUSTOM_COMMAND( COMMAND staticForExample > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/staticForExample.out OUTPUT staticForExample.out )
-
-ADD_EXECUTABLE(unrolledForExample unrolledForExample.cpp)
-ADD_CUSTOM_COMMAND( COMMAND unrolledForExample > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/unrolledForExample.out OUTPUT unrolledForExample.out )
-
-ADD_CUSTOM_TARGET( RunAlgorithmsExamples ALL DEPENDS
-   SortingExample.out
-   SortingExample2.out
-   SortingExample3.out
-   ParallelForExample.out
-   reduceArrayExample.out
-   reduceWithArgumentArrayExample.out
-   unrolledForExample.out
-   staticForExample.out
-)
+   ADD_CUSTOM_TARGET( RunAlgorithmsExamples ALL DEPENDS ${HOST_OUTPUTS} )
+ENDIF()
\ No newline at end of file
diff --git a/Documentation/Examples/Algorithms/Segments/CMakeLists.txt b/Documentation/Examples/Algorithms/Segments/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..ccf157446d4bc759b64f2c1eefe1f6c66e50c695
--- /dev/null
+++ b/Documentation/Examples/Algorithms/Segments/CMakeLists.txt
@@ -0,0 +1,39 @@
+set( COMMON_EXAMPLES   # <name>.cu is compiled when BUILD_CUDA is on, <name>.cpp otherwise
+   SegmentsExample_General
+   SegmentsPrintingExample-1
+   SegmentsPrintingExample-2
+   SegmentsExample_CSR_constructor_1
+   SegmentsExample_CSR_constructor_2
+   SegmentsExample_CSR_getSerializationType
+   SegmentsExample_CSR_getSegmentsType
+   SegmentsExample_CSR_setSegmentsSizes
+   SegmentsExample_CSR_getSegmentView
+   SegmentsExample_CSR_forElements
+   SegmentsExample_CSR_forSegments
+   SegmentsExample_CSR_sequentialForSegments
+   SegmentsExample_CSR_reduceSegments
+   SegmentsExample_forElements
+   SegmentsExample_forSegments-1
+   SegmentsExample_forSegments-2
+   SegmentsExample_reduceSegments
+)
+
+# Build every example for exactly one device and collect the snippet outputs
+# that the aggregate target depends on.
+if( BUILD_CUDA )
+   foreach( target IN ITEMS ${COMMON_EXAMPLES} )
+      cuda_add_executable( ${target}-cuda ${target}.cu OPTIONS )
+      add_custom_command( COMMAND ${target}-cuda > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/${target}.out OUTPUT ${target}.out )
+      list( APPEND CUDA_OUTPUTS ${target}.out )
+   endforeach()
+   # ALL makes the examples run as part of the default build.
+   add_custom_target( RunSegmentsExamples-cuda ALL DEPENDS ${CUDA_OUTPUTS} )
+else()
+   foreach( target IN ITEMS ${COMMON_EXAMPLES} )
+      add_executable( ${target} ${target}.cpp )
+      add_custom_command( COMMAND ${target} > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/${target}.out OUTPUT ${target}.out )
+      list( APPEND HOST_OUTPUTS ${target}.out )
+   endforeach()
+   # Same aggregate target as above, for the host-only build.
+   add_custom_target( RunSegmentsExamples ALL DEPENDS ${HOST_OUTPUTS} )
+endif()
diff --git a/Documentation/Examples/Algorithms/Segments/SegmentsExample_CSR_constructor_1.cpp b/Documentation/Examples/Algorithms/Segments/SegmentsExample_CSR_constructor_1.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..0ceb7a6bd496932c7cb79bca153378f59d3911f0
--- /dev/null
+++ b/Documentation/Examples/Algorithms/Segments/SegmentsExample_CSR_constructor_1.cpp
@@ -0,0 +1,51 @@
+#include <iostream>
+#include <functional>
+#include <TNL/Containers/Vector.h>
+#include <TNL/Algorithms/Segments/CSR.h>
+#include <TNL/Devices/Host.h>
+#include <TNL/Devices/Cuda.h>
+
+template< typename Device >
+void SegmentsExample()   // Builds CSR segments on Device, fills the data attached to them and prints it.
+{
+   using SegmentsType = typename TNL::Algorithms::Segments::CSR< Device, int >;
+
+   /***
+    * Create segments from a vector holding the size of each segment.
+    */
+   TNL::Containers::Vector< int, Device > segmentsSizes{ 1, 2, 3, 4, 5 };
+   SegmentsType segments( segmentsSizes );
+   std::cout << "Segments sizes are: " << segments << std::endl;
+
+   /***
+    * Allocate an array with getStorageSize() entries, constructed with the value 0.0.
+    */
+   TNL::Containers::Array< double, Device > data( segments.getStorageSize(), 0.0 );
+
+   /***
+    * Store the segment index in each element with localIdx <= segmentIdx; the lambda holds a by-value copy of the view.
+    */
+   auto data_view = data.getView();
+   segments.forAllElements( [=] __cuda_callable__ ( int segmentIdx, int localIdx, int globalIdx ) mutable {
+      if( localIdx <= segmentIdx )
+         data_view[ globalIdx ] = segmentIdx;
+   } );
+
+   /***
+    * Print the data managed by the segments; fetch returns the value stored at a global index.
+    */
+   auto fetch = [=] __cuda_callable__ ( int globalIdx ) -> double { return data_view[ globalIdx ]; };
+   printSegments( segments, fetch, std::cout );
+}
+
+int main( int argc, char* argv[] )   // Runs the example on the host and, when compiled in, on the GPU; arguments are unused.
+{
+   std::cout << "Example of CSR segments on host: " << std::endl;
+   SegmentsExample< TNL::Devices::Host >();
+   // The CUDA run below is compiled in only when HAVE_CUDA is defined.
+#ifdef HAVE_CUDA
+   std::cout << "Example of CSR segments on CUDA GPU: " << std::endl;
+   SegmentsExample< TNL::Devices::Cuda >();
+#endif
+   return EXIT_SUCCESS;
+}
diff --git a/Documentation/Examples/Algorithms/Segments/SegmentsExample_CSR_constructor_1.cu b/Documentation/Examples/Algorithms/Segments/SegmentsExample_CSR_constructor_1.cu
new file mode 120000
index 0000000000000000000000000000000000000000..9daf42acefa262936fd0d476cbd4912a93fe63b1
--- /dev/null
+++ b/Documentation/Examples/Algorithms/Segments/SegmentsExample_CSR_constructor_1.cu
@@ -0,0 +1 @@
+SegmentsExample_CSR_constructor_1.cpp
\ No newline at end of file
diff --git a/Documentation/Examples/Algorithms/Segments/SegmentsExample_CSR_constructor_2.cpp b/Documentation/Examples/Algorithms/Segments/SegmentsExample_CSR_constructor_2.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..9493758b4954d8056a5398c3134b46c072608ea9
--- /dev/null
+++ b/Documentation/Examples/Algorithms/Segments/SegmentsExample_CSR_constructor_2.cpp
@@ -0,0 +1,50 @@
+#include <iostream>
+#include <functional>
+#include <TNL/Containers/Vector.h>
+#include <TNL/Algorithms/Segments/CSR.h>
+#include <TNL/Devices/Host.h>
+#include <TNL/Devices/Cuda.h>
+
+template< typename Device >
+void SegmentsExample()   // Builds CSR segments from an initializer list, fills the attached data and prints it.
+{
+   using SegmentsType = typename TNL::Algorithms::Segments::CSR< Device, int >;
+
+   /***
+    * Create segments directly from an initializer list of segment sizes.
+    */
+   SegmentsType segments{ 1, 2, 3, 4, 5 };
+   std::cout << "Segments sizes are: " << segments << std::endl;
+
+   /***
+    * Allocate an array with getStorageSize() entries, constructed with the value 0.0.
+    */
+   TNL::Containers::Array< double, Device > data( segments.getStorageSize(), 0.0 );
+
+   /***
+    * Store the segment index in each element with localIdx <= segmentIdx; the lambda holds a by-value copy of the view.
+    */
+   auto data_view = data.getView();
+   segments.forAllElements( [=] __cuda_callable__ ( int segmentIdx, int localIdx, int globalIdx ) mutable {
+      if( localIdx <= segmentIdx )
+         data_view[ globalIdx ] = segmentIdx;
+   } );
+
+   /***
+    * Print the data managed by the segments; fetch returns the value stored at a global index.
+    */
+   auto fetch = [=] __cuda_callable__ ( int globalIdx ) -> double { return data_view[ globalIdx ]; };
+   printSegments( segments, fetch, std::cout );
+}
+
+int main( int argc, char* argv[] )   // Runs the example on the host and, when compiled in, on the GPU; arguments are unused.
+{
+   std::cout << "Example of CSR segments on host: " << std::endl;
+   SegmentsExample< TNL::Devices::Host >();
+   // The CUDA run below is compiled in only when HAVE_CUDA is defined.
+#ifdef HAVE_CUDA
+   std::cout << "Example of CSR segments on CUDA GPU: " << std::endl;
+   SegmentsExample< TNL::Devices::Cuda >();
+#endif
+   return EXIT_SUCCESS;
+}
diff --git a/Documentation/Examples/Algorithms/Segments/SegmentsExample_CSR_constructor_2.cu b/Documentation/Examples/Algorithms/Segments/SegmentsExample_CSR_constructor_2.cu
new file mode 120000
index 0000000000000000000000000000000000000000..9286174a1ee2b0797c99f97a699acb186281caef
--- /dev/null
+++ b/Documentation/Examples/Algorithms/Segments/SegmentsExample_CSR_constructor_2.cu
@@ -0,0 +1 @@
+SegmentsExample_CSR_constructor_2.cpp
\ No newline at end of file
diff --git a/Documentation/Examples/Algorithms/Segments/SegmentsExample_CSR_forElements.cpp b/Documentation/Examples/Algorithms/Segments/SegmentsExample_CSR_forElements.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..37267a889fd87a5880e24de6d0d0ed1a83d6402b
--- /dev/null
+++ b/Documentation/Examples/Algorithms/Segments/SegmentsExample_CSR_forElements.cpp
@@ -0,0 +1,49 @@
+#include <iostream>
+#include <TNL/Containers/Vector.h>
+#include <TNL/Algorithms/Segments/CSR.h>
+#include <TNL/Devices/Host.h>
+#include <TNL/Devices/Cuda.h>
+
+template< typename Device >
+void SegmentsExample()   // Fills the data of CSR segments with forElements over an explicit segment range and prints it.
+{
+   using SegmentsType = typename TNL::Algorithms::Segments::CSR< Device, int >;
+
+   /***
+    * Create segments with given segments sizes; size is the number of segments listed.
+    */
+   const int size( 5 );
+   SegmentsType segments{ 1, 2, 3, 4, 5 };
+
+   /***
+    * Allocate an array with getStorageSize() entries, constructed with the value 0.0.
+    */
+   TNL::Containers::Array< double, Device > data( segments.getStorageSize(), 0.0 );
+
+   /***
+    * For segments in the range passed as ( 0, size ), store the segment index in each element with localIdx <= segmentIdx.
+    */
+   auto data_view = data.getView();
+   segments.forElements( 0, size, [=] __cuda_callable__ ( int segmentIdx, int localIdx, int globalIdx ) mutable {
+      if( localIdx <= segmentIdx )
+         data_view[ globalIdx ] = segmentIdx;
+   } );
+
+   /***
+    * Print the data managed by the segments; fetch returns the value stored at a global index.
+    */
+   auto fetch = [=] __cuda_callable__ ( int globalIdx ) -> double { return data_view[ globalIdx ]; };
+   printSegments( segments, fetch, std::cout );
+}
+
+int main( int argc, char* argv[] )   // Runs the example on the host and, when compiled in, on the GPU; arguments are unused.
+{
+   std::cout << "Example of CSR segments on host: " << std::endl;
+   SegmentsExample< TNL::Devices::Host >();
+   // The CUDA run below is compiled in only when HAVE_CUDA is defined.
+#ifdef HAVE_CUDA
+   std::cout << "Example of CSR segments on CUDA GPU: " << std::endl;
+   SegmentsExample< TNL::Devices::Cuda >();
+#endif
+   return EXIT_SUCCESS;
+}
diff --git a/Documentation/Examples/Algorithms/Segments/SegmentsExample_CSR_forElements.cu b/Documentation/Examples/Algorithms/Segments/SegmentsExample_CSR_forElements.cu
new file mode 120000
index 0000000000000000000000000000000000000000..59a419856a1162f6c75d916b1f7492ea1dee30f0
--- /dev/null
+++ b/Documentation/Examples/Algorithms/Segments/SegmentsExample_CSR_forElements.cu
@@ -0,0 +1 @@
+SegmentsExample_CSR_forElements.cpp
\ No newline at end of file
diff --git a/Documentation/Examples/Algorithms/Segments/SegmentsExample_CSR_forSegments.cpp b/Documentation/Examples/Algorithms/Segments/SegmentsExample_CSR_forSegments.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..3bf7cc50bd46dd6dc2bc72d78d9bc47700fa2589
--- /dev/null
+++ b/Documentation/Examples/Algorithms/Segments/SegmentsExample_CSR_forSegments.cpp
@@ -0,0 +1,52 @@
+#include <iostream>
+#include <functional>
+#include <TNL/Containers/Vector.h>
+#include <TNL/Algorithms/Segments/CSR.h>
+#include <TNL/Devices/Host.h>
+#include <TNL/Devices/Cuda.h>
+
+template< typename Device >
+void SegmentsExample()   // Fills the data of CSR segments by iterating segment views with forSegments and prints it.
+{
+   using SegmentsType = typename TNL::Algorithms::Segments::CSR< Device, int >;
+   using SegmentViewType = typename SegmentsType::SegmentViewType;
+
+   /***
+    * Create segments with given segments sizes; size is the number of segments listed.
+    */
+   const int size( 5 );
+   SegmentsType segments{ 1, 2, 3, 4, 5 };
+
+   /***
+    * Allocate an array with getStorageSize() entries, constructed with the value 0.0.
+    */
+   TNL::Containers::Array< double, Device > data( segments.getStorageSize(), 0.0 );
+
+   /***
+    * Walk the elements of each segment view and store segmentIndex + localIndex where localIndex <= segmentIndex.
+    */
+   auto data_view = data.getView();
+   segments.forSegments( 0, size, [=] __cuda_callable__ ( const SegmentViewType& segment ) mutable {
+      for( auto element : segment )
+         if( element.localIndex() <= element.segmentIndex() )
+            data_view[ element.globalIndex() ] = element.segmentIndex() + element.localIndex();
+   } );
+
+   /***
+    * Print the data managed by the segments; fetch returns the value stored at a global index.
+    */
+   auto fetch = [=] __cuda_callable__ ( int globalIdx ) -> double { return data_view[ globalIdx ]; };
+   printSegments( segments, fetch, std::cout );
+}
+
+int main( int argc, char* argv[] )   // Runs the example on the host and, when compiled in, on the GPU; arguments are unused.
+{
+   std::cout << "Example of CSR segments on host: " << std::endl;
+   SegmentsExample< TNL::Devices::Host >();
+   // The CUDA run below is compiled in only when HAVE_CUDA is defined.
+#ifdef HAVE_CUDA
+   std::cout << "Example of CSR segments on CUDA GPU: " << std::endl;
+   SegmentsExample< TNL::Devices::Cuda >();
+#endif
+   return EXIT_SUCCESS;
+}
diff --git a/Documentation/Examples/Algorithms/Segments/SegmentsExample_CSR_forSegments.cu b/Documentation/Examples/Algorithms/Segments/SegmentsExample_CSR_forSegments.cu
new file mode 120000
index 0000000000000000000000000000000000000000..07825a022346bb3100dd6c7261a32a7f8335e4a1
--- /dev/null
+++ b/Documentation/Examples/Algorithms/Segments/SegmentsExample_CSR_forSegments.cu
@@ -0,0 +1 @@
+SegmentsExample_CSR_forSegments.cpp
\ No newline at end of file
diff --git a/Documentation/Examples/Algorithms/Segments/SegmentsExample_CSR_getSegmentView.cpp b/Documentation/Examples/Algorithms/Segments/SegmentsExample_CSR_getSegmentView.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..e9ef92da5e0965d45848f3d92746b81f0ac0d21d
--- /dev/null
+++ b/Documentation/Examples/Algorithms/Segments/SegmentsExample_CSR_getSegmentView.cpp
@@ -0,0 +1,47 @@
+#include <iostream>
+#include <functional>
+#include <TNL/Containers/Vector.h>
+#include <TNL/Algorithms/Segments/CSR.h>
+#include <TNL/Algorithms/SequentialFor.h>
+#include <TNL/Devices/Host.h>
+#include <TNL/Devices/Cuda.h>
+
+// Prints how local element indexes map to global ones in each CSR segment, using a view of the segments.
+template< typename Device >
+void SegmentsExample()
+{
+   using SegmentsType = typename TNL::Algorithms::Segments::CSR< Device, int >;
+
+   /***
+    * Create segments with given segments sizes and take a view that the lambda below captures by value.
+    */
+   const int size( 5 );
+   SegmentsType segments{ 1, 2, 3, 4, 5 };
+   auto view = segments.getView();
+
+   /***
+    * Print the elements mapping using segment view.
+    */
+   std::cout << "Mapping of local indexes to global indexes:" << std::endl;
+
+   auto f = [=] __cuda_callable__ ( int segmentIdx ) {
+      printf( "Segment idx. %d: ", segmentIdx );                 // printf works even in GPU kernels
+      auto segment = view.getSegmentView( segmentIdx );
+      for( auto element : segment )
+         printf( "%d -> %d \t", element.localIndex(), element.globalIndex() );
+      printf( "\n" );
+   };
+   TNL::Algorithms::SequentialFor< Device >::exec( 0, size, f );
+}
+
+int main( int argc, char* argv[] )   // Runs the example on the host and, when compiled in, on the GPU; arguments are unused.
+{
+   std::cout << "Example of CSR segments on host: " << std::endl;
+   SegmentsExample< TNL::Devices::Host >();
+   // The CUDA run below is compiled in only when HAVE_CUDA is defined.
+#ifdef HAVE_CUDA
+   std::cout << "Example of CSR segments on CUDA GPU: " << std::endl;
+   SegmentsExample< TNL::Devices::Cuda >();
+#endif
+   return EXIT_SUCCESS;
+}
diff --git a/Documentation/Examples/Algorithms/Segments/SegmentsExample_CSR_getSegmentView.cu b/Documentation/Examples/Algorithms/Segments/SegmentsExample_CSR_getSegmentView.cu
new file mode 120000
index 0000000000000000000000000000000000000000..fd9d2382214c293e70080914e07450ec6d3b86c6
--- /dev/null
+++ b/Documentation/Examples/Algorithms/Segments/SegmentsExample_CSR_getSegmentView.cu
@@ -0,0 +1 @@
+SegmentsExample_CSR_getSegmentView.cpp
\ No newline at end of file
diff --git a/Documentation/Examples/Algorithms/Segments/SegmentsExample_CSR_getSegmentsType.cpp b/Documentation/Examples/Algorithms/Segments/SegmentsExample_CSR_getSegmentsType.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..fea90117304d1465c2da781a8d0ce0202edf427a
--- /dev/null
+++ b/Documentation/Examples/Algorithms/Segments/SegmentsExample_CSR_getSegmentsType.cpp
@@ -0,0 +1,29 @@
+#include <iostream>
+#include <functional>
+#include <TNL/Algorithms/Segments/CSR.h>
+#include <TNL/Devices/Host.h>
+#include <TNL/Devices/Cuda.h>
+
+template< typename Device >
+void SegmentsExample()   // Prints the value returned by getSegmentsType() for CSR segments on Device.
+{
+   using SegmentsType = typename TNL::Algorithms::Segments::CSR< Device, int >;
+
+   /***
+    * Default-construct the segments and print the segments type.
+    */
+   SegmentsType segments;
+   std::cout << "The segments type is: " << segments.getSegmentsType() << std::endl;
+}
+
+int main( int argc, char* argv[] )   // Runs the example on the host and, when compiled in, on the GPU; arguments are unused.
+{
+   std::cout << "Example of CSR segments on host: " << std::endl;
+   SegmentsExample< TNL::Devices::Host >();
+   // The CUDA run below is compiled in only when HAVE_CUDA is defined.
+#ifdef HAVE_CUDA
+   std::cout << "Example of CSR segments on CUDA GPU: " << std::endl;
+   SegmentsExample< TNL::Devices::Cuda >();
+#endif
+   return EXIT_SUCCESS;
+}
diff --git a/Documentation/Examples/Algorithms/Segments/SegmentsExample_CSR_getSegmentsType.cu b/Documentation/Examples/Algorithms/Segments/SegmentsExample_CSR_getSegmentsType.cu
new file mode 120000
index 0000000000000000000000000000000000000000..fcb8d7eb740337eb6ddede8c7cb8eff123d17765
--- /dev/null
+++ b/Documentation/Examples/Algorithms/Segments/SegmentsExample_CSR_getSegmentsType.cu
@@ -0,0 +1 @@
+SegmentsExample_CSR_getSegmentsType.cpp
\ No newline at end of file
diff --git a/Documentation/Examples/Algorithms/Segments/SegmentsExample_CSR_getSerializationType.cpp b/Documentation/Examples/Algorithms/Segments/SegmentsExample_CSR_getSerializationType.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..a52a18e5069f0c9d75ea841bcd687520760fc7cc
--- /dev/null
+++ b/Documentation/Examples/Algorithms/Segments/SegmentsExample_CSR_getSerializationType.cpp
@@ -0,0 +1,29 @@
+#include <iostream>
+#include <functional>
+#include <TNL/Algorithms/Segments/CSR.h>
+#include <TNL/Devices/Host.h>
+#include <TNL/Devices/Cuda.h>
+
+template< typename Device >
+void SegmentsExample()   // Prints the value returned by getSerializationType() for CSR segments on Device.
+{
+   using SegmentsType = typename TNL::Algorithms::Segments::CSR< Device, int >;
+
+   /***
+    * Default-construct the segments and print the serialization type.
+    */
+   SegmentsType segments;
+   std::cout << "The serialization type is: " << segments.getSerializationType() << std::endl;
+}
+
+int main( int argc, char* argv[] )   // Runs the example on the host and, when compiled in, on the GPU; arguments are unused.
+{
+   std::cout << "Example of CSR segments on host: " << std::endl;
+   SegmentsExample< TNL::Devices::Host >();
+   // The CUDA run below is compiled in only when HAVE_CUDA is defined.
+#ifdef HAVE_CUDA
+   std::cout << "Example of CSR segments on CUDA GPU: " << std::endl;
+   SegmentsExample< TNL::Devices::Cuda >();
+#endif
+   return EXIT_SUCCESS;
+}
diff --git a/Documentation/Examples/Algorithms/Segments/SegmentsExample_CSR_getSerializationType.cu b/Documentation/Examples/Algorithms/Segments/SegmentsExample_CSR_getSerializationType.cu
new file mode 120000
index 0000000000000000000000000000000000000000..31c65453cf012b2a16e40c5010f7c2d899d15a18
--- /dev/null
+++ b/Documentation/Examples/Algorithms/Segments/SegmentsExample_CSR_getSerializationType.cu
@@ -0,0 +1 @@
+SegmentsExample_CSR_getSerializationType.cpp
\ No newline at end of file
diff --git a/Documentation/Examples/Algorithms/Segments/SegmentsExample_CSR_reduceSegments.cpp b/Documentation/Examples/Algorithms/Segments/SegmentsExample_CSR_reduceSegments.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..f784177af3985fc1587a0e6838222116e9134fe1
--- /dev/null
+++ b/Documentation/Examples/Algorithms/Segments/SegmentsExample_CSR_reduceSegments.cpp
@@ -0,0 +1,69 @@
+#include <iostream>
+#include <functional>
+#include <TNL/Containers/Vector.h>
+#include <TNL/Algorithms/Segments/CSR.h>
+#include <TNL/Devices/Host.h>
+#include <TNL/Devices/Cuda.h>
+
+template< typename Device >
+void SegmentsExample()   // Sums the data of each CSR segment with reduceAllSegments, using both fetch lambda forms.
+{
+   using SegmentsType = typename TNL::Algorithms::Segments::CSR< Device, int >;
+
+   /***
+    * Create segments with given segments sizes; size is the number of segments listed.
+    */
+   const int size( 5 );
+   SegmentsType segments{ 1, 2, 3, 4, 5 };
+
+   /***
+    * Allocate an array with getStorageSize() entries, constructed with the value 0.0.
+    */
+   TNL::Containers::Array< double, Device > data( segments.getStorageSize(), 0.0 );
+
+   /***
+    * Store the segment index in each element with localIdx <= segmentIdx.
+    */
+   auto data_view = data.getView();
+   segments.forElements( 0, size, [=] __cuda_callable__ ( int segmentIdx, int localIdx, int globalIdx ) mutable {
+      if( localIdx <= segmentIdx )
+         data_view[ globalIdx ] = segmentIdx;
+   } );
+
+   /***
+    * Compute sums of elements in each segment; sums holds one result per segment.
+    */
+   TNL::Containers::Vector< double, Device > sums( size );
+   auto sums_view = sums.getView();
+   auto fetch_full = [=] __cuda_callable__ ( int segmentIdx, int localIdx, int globalIdx, bool& compute ) -> double {
+      if( localIdx <= segmentIdx )
+         return data_view[ globalIdx ];
+      else
+      {
+         compute = false;   // the full form may clear this flag for elements it does not want to contribute
+         return 0.0;
+      }
+   };
+   auto fetch_brief = [=] __cuda_callable__ ( int globalIdx, bool& compute ) -> double {
+      return data_view[ globalIdx ];
+   };
+
+   // keep stores one reduced value per segment, so its first argument indexes sums by segment.
+   auto keep = [=] __cuda_callable__ ( int segmentIdx, const double& value ) mutable {
+      sums_view[ segmentIdx ] = value; };
+   segments.reduceAllSegments( fetch_full, std::plus<>{}, keep, 0.0 );
+   std::cout << "The sums with full fetch form are: " << sums << std::endl;
+   segments.reduceAllSegments( fetch_brief, std::plus<>{}, keep, 0.0 );
+   std::cout << "The sums with brief fetch form are: " << sums << std::endl;
+}
+
+int main( int argc, char* argv[] )   // Runs the example on the host and, when compiled in, on the GPU; arguments are unused.
+{
+   std::cout << "Example of CSR segments on host: " << std::endl;
+   SegmentsExample< TNL::Devices::Host >();
+   // The CUDA run below is compiled in only when HAVE_CUDA is defined.
+#ifdef HAVE_CUDA
+   std::cout << "Example of CSR segments on CUDA GPU: " << std::endl;
+   SegmentsExample< TNL::Devices::Cuda >();
+#endif
+   return EXIT_SUCCESS;
+}
diff --git a/Documentation/Examples/Algorithms/Segments/SegmentsExample_CSR_reduceSegments.cu b/Documentation/Examples/Algorithms/Segments/SegmentsExample_CSR_reduceSegments.cu
new file mode 120000
index 0000000000000000000000000000000000000000..c133b0c2df3ed29adab10546a5b8435508d4abd0
--- /dev/null
+++ b/Documentation/Examples/Algorithms/Segments/SegmentsExample_CSR_reduceSegments.cu
@@ -0,0 +1 @@
+SegmentsExample_CSR_reduceSegments.cpp
\ No newline at end of file
diff --git a/Documentation/Examples/Algorithms/Segments/SegmentsExample_CSR_sequentialForAllSegments.cpp b/Documentation/Examples/Algorithms/Segments/SegmentsExample_CSR_sequentialForAllSegments.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..433ae6a61567b3666a888a7559124502a5711334
--- /dev/null
+++ b/Documentation/Examples/Algorithms/Segments/SegmentsExample_CSR_sequentialForAllSegments.cpp
@@ -0,0 +1,42 @@
+#include <iostream>
+#include <functional>
+#include <TNL/Containers/Vector.h>
+#include <TNL/Algorithms/Segments/CSR.h>
+#include <TNL/Devices/Host.h>
+#include <TNL/Devices/Cuda.h>
+
+template< typename Device >
+void SegmentsExample()   // Prints the local-to-global index mapping of every CSR segment via sequentialForAllSegments.
+{
+   using SegmentsType = typename TNL::Algorithms::Segments::CSR< Device, int >;
+   using SegmentViewType = typename SegmentsType::SegmentViewType;
+
+   /***
+    * Create segments with given segments sizes.
+    */
+   SegmentsType segments{ 1, 2, 3, 4, 5 };
+   std::cout << "Segments sizes are: " << segments << std::endl;
+
+   /***
+    * Print the elements mapping using segment view.
+    */
+   std::cout << "Elements mapping:" << std::endl;
+   segments.sequentialForAllSegments( [] __cuda_callable__ ( const SegmentViewType& segment ) {
+      printf( "Segment idx. %d: \n", segment.getSegmentIndex() );                 // printf works even in GPU kernels
+      for( auto element : segment )
+         printf( "%d -> %d  ", element.localIndex(), element.globalIndex() );
+      printf( "\n" );   // end the line so the next segment header starts on its own line
+   } );
+}
+
+int main( int argc, char* argv[] )   // Runs the example on the host and, when compiled in, on the GPU; arguments are unused.
+{
+   std::cout << "Example of CSR segments on host: " << std::endl;
+   SegmentsExample< TNL::Devices::Host >();
+   // The CUDA run below is compiled in only when HAVE_CUDA is defined.
+#ifdef HAVE_CUDA
+   std::cout << "Example of CSR segments on CUDA GPU: " << std::endl;
+   SegmentsExample< TNL::Devices::Cuda >();
+#endif
+   return EXIT_SUCCESS;
+}
diff --git a/Documentation/Examples/Algorithms/Segments/SegmentsExample_CSR_sequentialForSegments.cpp b/Documentation/Examples/Algorithms/Segments/SegmentsExample_CSR_sequentialForSegments.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..76affa43b33283601ef44550975af51d5a45aa26
--- /dev/null
+++ b/Documentation/Examples/Algorithms/Segments/SegmentsExample_CSR_sequentialForSegments.cpp
@@ -0,0 +1,45 @@
+#include <iostream>
+#include <functional>
+#include <TNL/Containers/Vector.h>
+#include <TNL/Algorithms/Segments/CSR.h>
+#include <TNL/Algorithms/SequentialFor.h>
+#include <TNL/Devices/Host.h>
+#include <TNL/Devices/Cuda.h>
+
+template< typename Device >
+void SegmentsExample()   // Prints the local-to-global index mapping of CSR segments via sequentialForSegments.
+{
+   using SegmentsType = typename TNL::Algorithms::Segments::CSR< Device, int >;
+   using SegmentView = typename SegmentsType::SegmentViewType;
+
+   /***
+    * Create segments with given segments sizes; size is the number of segments listed.
+    */
+   const int size( 5 );
+   SegmentsType segments{ 1, 2, 3, 4, 5 };
+
+   /***
+    * Print the elements mapping using segment view.
+    */
+   std::cout << "Mapping of local indexes to global indexes:" << std::endl;
+
+   auto f = [=] __cuda_callable__ ( const SegmentView& segment ) {
+      printf( "Segment idx. %d: ", segment.getSegmentIndex() );                 // printf works even in GPU kernels
+      for( auto element : segment )
+         printf( "%d -> %d \t", element.localIndex(), element.globalIndex() );
+      printf( "\n" );
+   };
+   segments.sequentialForSegments( 0, size, f );
+}
+
+int main( int argc, char* argv[] )   // Runs the example on the host and, when compiled in, on the GPU; arguments are unused.
+{
+   std::cout << "Example of CSR segments on host: " << std::endl;
+   SegmentsExample< TNL::Devices::Host >();
+   // The CUDA run below is compiled in only when HAVE_CUDA is defined.
+#ifdef HAVE_CUDA
+   std::cout << "Example of CSR segments on CUDA GPU: " << std::endl;
+   SegmentsExample< TNL::Devices::Cuda >();
+#endif
+   return EXIT_SUCCESS;
+}
diff --git a/Documentation/Examples/Algorithms/Segments/SegmentsExample_CSR_sequentialForSegments.cu b/Documentation/Examples/Algorithms/Segments/SegmentsExample_CSR_sequentialForSegments.cu
new file mode 120000
index 0000000000000000000000000000000000000000..06e162fd7f3752949bad9b90a5147d012ee5ebe1
--- /dev/null
+++ b/Documentation/Examples/Algorithms/Segments/SegmentsExample_CSR_sequentialForSegments.cu
@@ -0,0 +1 @@
+SegmentsExample_CSR_sequentialForSegments.cpp
\ No newline at end of file
diff --git a/Documentation/Examples/Algorithms/Segments/SegmentsExample_CSR_setSegmentsSizes.cpp b/Documentation/Examples/Algorithms/Segments/SegmentsExample_CSR_setSegmentsSizes.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..59a9e1bfad95a33706cd082e7a153c7c1de8284a
--- /dev/null
+++ b/Documentation/Examples/Algorithms/Segments/SegmentsExample_CSR_setSegmentsSizes.cpp
@@ -0,0 +1,32 @@
+#include <iostream>
+#include <functional>
+#include <TNL/Containers/Vector.h>
+#include <TNL/Algorithms/Segments/CSR.h>
+#include <TNL/Devices/Host.h>
+#include <TNL/Devices/Cuda.h>
+
+template< typename Device >
+void SegmentsExample()
+{
+   using SegmentsType = typename TNL::Algorithms::Segments::CSR< Device, int >;
+
+   /***
+    * Default-construct the segments and set their sizes from a vector afterwards.
+    */
+   TNL::Containers::Vector< int, Device > sizes{ 1, 2, 3, 4, 5 };
+   SegmentsType segments;
+   segments.setSegmentsSizes( sizes );
+   std::cout << "Segments sizes are: " << segments << std::endl;
+}
+
+int main( int argc, char* argv[] )
+{
+   std::cout << "Example of CSR segments on host: " << std::endl;
+   SegmentsExample< TNL::Devices::Host >();
+   // The CUDA run below is compiled only when HAVE_CUDA is defined.
+#ifdef HAVE_CUDA
+   std::cout << "Example of CSR segments on CUDA GPU: " << std::endl;
+   SegmentsExample< TNL::Devices::Cuda >();
+#endif
+   return EXIT_SUCCESS;
+}
diff --git a/Documentation/Examples/Algorithms/Segments/SegmentsExample_CSR_setSegmentsSizes.cu b/Documentation/Examples/Algorithms/Segments/SegmentsExample_CSR_setSegmentsSizes.cu
new file mode 120000
index 0000000000000000000000000000000000000000..f56df02ad1b0444f3a952d1287de91bbc096b51e
--- /dev/null
+++ b/Documentation/Examples/Algorithms/Segments/SegmentsExample_CSR_setSegmentsSizes.cu
@@ -0,0 +1 @@
+SegmentsExample_CSR_setSegmentsSizes.cpp
\ No newline at end of file
diff --git a/Documentation/Examples/Algorithms/Segments/SegmentsExample_General.cpp b/Documentation/Examples/Algorithms/Segments/SegmentsExample_General.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..ade0263fbc590e96c3a27309c7c97737de5d708f
--- /dev/null
+++ b/Documentation/Examples/Algorithms/Segments/SegmentsExample_General.cpp
@@ -0,0 +1,78 @@
+#include <iostream>
+#include <functional>
+#include <TNL/Containers/Vector.h>
+#include <TNL/Algorithms/Segments/CSR.h>
+#include <TNL/Algorithms/Segments/Ellpack.h>
+#include <TNL/Devices/Host.h>
+#include <TNL/Devices/Cuda.h>
+
+template< typename Segments >
+void SegmentsExample()
+{
+   using DeviceType = typename Segments::DeviceType;
+   using IndexType = typename Segments::IndexType;
+
+   /***
+    * Build segments with sizes 1, 2, 3, 4 and 5 and print them.
+    */
+   Segments segments{ 1, 2, 3, 4, 5 };
+   std::cout << "Segments sizes are: " << segments << std::endl;
+
+   /***
+    * Allocate storage for all elements of the segments, filled with 0.0.
+    */
+   TNL::Containers::Array< double, DeviceType > values( segments.getStorageSize(), 0.0 );
+
+   /***
+    * In every segment, store the segment index into elements with localIdx <= segmentIdx.
+    */
+   auto valuesView = values.getView();
+   segments.forAllElements( [=] __cuda_callable__ ( IndexType segmentIdx, IndexType localIdx, IndexType globalIdx ) mutable {
+      if( localIdx <= segmentIdx )
+         valuesView[ globalIdx ] = segmentIdx;
+   } );
+
+   /***
+    * Print the data managed by the segments.
+    */
+   auto readValue = [=] __cuda_callable__ ( IndexType globalIdx ) -> double { return valuesView[ globalIdx ]; };
+   printSegments( segments, readValue, std::cout );
+
+   /***
+    * Reduce each segment with addition and keep one sum per segment.
+    */
+   TNL::Containers::Vector< double, DeviceType, IndexType > segmentSums( segments.getSegmentsCount() );
+   auto segmentSumsView = segmentSums.getView();
+   auto fetchForSum = [=] __cuda_callable__ ( IndexType segmentIdx, IndexType localIdx, IndexType globalIdx, bool& compute ) -> double {
+      return valuesView[ globalIdx ];
+   };
+   auto storeSum = [=] __cuda_callable__ ( const IndexType& segmentIdx, const double& value ) mutable {
+      segmentSumsView[ segmentIdx ] = value;
+   };
+   segments.reduceAllSegments( fetchForSum, std::plus<>{}, storeSum, 0.0 );
+   std::cout << "The sums are: " << segmentSums << std::endl;
+}
+
+int main( int argc, char* argv[] )
+{
+   using HostCSR = TNL::Algorithms::Segments::CSR< TNL::Devices::Host, int >;
+   using HostEllpack = TNL::Algorithms::Segments::Ellpack< TNL::Devices::Host, int >;
+   using CudaCSR = TNL::Algorithms::Segments::CSR< TNL::Devices::Cuda, int >;
+   using CudaEllpack = TNL::Algorithms::Segments::Ellpack< TNL::Devices::Cuda, int >;
+
+   // Run the same templated example for each segments type, host variants first.
+   std::cout << "Example of CSR segments on host: " << std::endl;
+   SegmentsExample< HostCSR >();
+
+   std::cout << "Example of Ellpack segments on host: " << std::endl;
+   SegmentsExample< HostEllpack >();
+   // The CUDA runs below are compiled only when HAVE_CUDA is defined.
+#ifdef HAVE_CUDA
+   std::cout << "Example of CSR segments on CUDA GPU: " << std::endl;
+   SegmentsExample< CudaCSR >();
+
+   std::cout << "Example of Ellpack segments on CUDA GPU: " << std::endl;
+   SegmentsExample< CudaEllpack >();
+#endif
+   return EXIT_SUCCESS;
+}
diff --git a/Documentation/Examples/Algorithms/Segments/SegmentsExample_General.cu b/Documentation/Examples/Algorithms/Segments/SegmentsExample_General.cu
new file mode 120000
index 0000000000000000000000000000000000000000..64abaf44de42411ff746c0bf6e4a151aecc8fef9
--- /dev/null
+++ b/Documentation/Examples/Algorithms/Segments/SegmentsExample_General.cu
@@ -0,0 +1 @@
+SegmentsExample_General.cpp
\ No newline at end of file
diff --git a/Documentation/Examples/Algorithms/Segments/SegmentsExample_forElements.cpp b/Documentation/Examples/Algorithms/Segments/SegmentsExample_forElements.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..7d7eac76c1285720c740add0f75711a33f958b8f
--- /dev/null
+++ b/Documentation/Examples/Algorithms/Segments/SegmentsExample_forElements.cpp
@@ -0,0 +1,73 @@
+#include <iostream>
+#include <TNL/Containers/Vector.h>
+#include <TNL/Algorithms/Segments/CSR.h>
+#include <TNL/Algorithms/Segments/Ellpack.h>
+#include <TNL/Devices/Host.h>
+#include <TNL/Devices/Cuda.h>
+
+template< typename Segments >
+void SegmentsExample()
+{
+   using Device = typename Segments::DeviceType;
+
+   /***
+    * Build segments with sizes 1, 2, 3, 4 and 5.
+    */
+   Segments segments{ 1, 2, 3, 4, 5 };
+
+   /***
+    * Allocate storage for all elements of the segments, filled with 0.0.
+    */
+   TNL::Containers::Array< double, Device > values( segments.getStorageSize(), 0.0 );
+
+   /***
+    * First pass: write the segment index into every visited element, with no check.
+    */
+   auto valuesView = values.getView();
+   segments.forAllElements( [=] __cuda_callable__ ( int segmentIdx, int localIdx, int globalIdx ) mutable {
+      valuesView[ globalIdx ] = segmentIdx;
+   } );
+
+   /***
+    * Print the raw array and then the data managed by the segments.
+    */
+   std::cout << "Data setup with no check ... " << std::endl;
+   std::cout << "Array: " << values << std::endl;
+   auto readValue = [=] __cuda_callable__ ( int globalIdx ) -> double { return valuesView[ globalIdx ]; };
+   printSegments( segments, readValue, std::cout );
+
+   /***
+    * Second pass: reset the array and write only where localIdx <= segmentIdx.
+    */
+   values = 0.0;
+   segments.forAllElements( [=] __cuda_callable__ ( int segmentIdx, int localIdx, int globalIdx ) mutable {
+      if( localIdx <= segmentIdx )
+         valuesView[ globalIdx ] = segmentIdx;
+   } );
+
+   /***
+    * Print the raw array and the data managed by the segments again.
+    */
+   std::cout << "Data setup with check for padding elements..." << std::endl;
+   std::cout << "Array: " << values << std::endl;
+   printSegments( segments, readValue, std::cout );
+}
+
+int main( int argc, char* argv[] )
+{
+   std::cout << "Example of CSR segments on host: " << std::endl;
+   SegmentsExample< TNL::Algorithms::Segments::CSR< TNL::Devices::Host, int > >();
+
+   std::cout << "Example of Ellpack segments on host: " << std::endl;
+   SegmentsExample< TNL::Algorithms::Segments::Ellpack< TNL::Devices::Host, int > >();
+
+   // The CUDA runs below are compiled only when HAVE_CUDA is defined.
+#ifdef HAVE_CUDA
+   std::cout << "Example of CSR segments on CUDA GPU: " << std::endl;
+   SegmentsExample< TNL::Algorithms::Segments::CSR< TNL::Devices::Cuda, int > >();
+
+   std::cout << "Example of Ellpack segments on CUDA GPU: " << std::endl;
+   SegmentsExample< TNL::Algorithms::Segments::Ellpack< TNL::Devices::Cuda, int > >();
+#endif
+   return EXIT_SUCCESS;
+}
diff --git a/Documentation/Examples/Algorithms/Segments/SegmentsExample_forElements.cu b/Documentation/Examples/Algorithms/Segments/SegmentsExample_forElements.cu
new file mode 120000
index 0000000000000000000000000000000000000000..5f881d30f679253f0fd8ec4fbf150be9f3a0f40f
--- /dev/null
+++ b/Documentation/Examples/Algorithms/Segments/SegmentsExample_forElements.cu
@@ -0,0 +1 @@
+SegmentsExample_forElements.cpp
\ No newline at end of file
diff --git a/Documentation/Examples/Algorithms/Segments/SegmentsExample_forSegments-1.cpp b/Documentation/Examples/Algorithms/Segments/SegmentsExample_forSegments-1.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..d8be1f04c737c4c2e8da55dc2104741d445e5c96
--- /dev/null
+++ b/Documentation/Examples/Algorithms/Segments/SegmentsExample_forSegments-1.cpp
@@ -0,0 +1,62 @@
+#include <iostream>
+#include <TNL/Containers/Vector.h>
+#include <TNL/Algorithms/Segments/CSR.h>
+#include <TNL/Algorithms/Segments/Ellpack.h>
+#include <TNL/Devices/Host.h>
+#include <TNL/Devices/Cuda.h>
+
+template< typename Segments >
+void SegmentsExample()
+{
+   using Device = typename Segments::DeviceType;
+
+   /***
+    * Build segments with sizes 1, 2, 3, 4 and 5.
+    */
+   Segments segments{ 1, 2, 3, 4, 5 };
+
+   /***
+    * Allocate storage for all elements of the segments, filled with 0.0.
+    */
+   TNL::Containers::Array< double, Device > values( segments.getStorageSize(), 0.0 );
+
+   /***
+    * Walk each segment and store the running sum of ( localIndex + 1 ) into its elements.
+    */
+   auto valuesView = values.getView();
+   using SegmentViewType = typename Segments::SegmentViewType;
+   segments.forAllSegments( [=] __cuda_callable__ ( const SegmentViewType& segment ) mutable {
+      double runningSum = 0.0;
+      for( auto element : segment )
+         if( element.localIndex() <= element.segmentIndex() )
+         {
+            runningSum += element.localIndex() + 1;
+            valuesView[ element.globalIndex() ] = runningSum;
+         }
+   } );
+
+   /***
+    * Print the data managed by the segments.
+    */
+   auto readValue = [=] __cuda_callable__ ( int globalIdx ) -> double { return valuesView[ globalIdx ]; };
+   printSegments( segments, readValue, std::cout );
+}
+
+int main( int argc, char* argv[] )
+{
+   std::cout << "Example of CSR segments on host: " << std::endl;
+   SegmentsExample< TNL::Algorithms::Segments::CSR< TNL::Devices::Host, int > >();
+
+   std::cout << "Example of Ellpack segments on host: " << std::endl;
+   SegmentsExample< TNL::Algorithms::Segments::Ellpack< TNL::Devices::Host, int > >();
+
+   // The CUDA runs below are compiled only when HAVE_CUDA is defined.
+#ifdef HAVE_CUDA
+   std::cout << "Example of CSR segments on CUDA GPU: " << std::endl;
+   SegmentsExample< TNL::Algorithms::Segments::CSR< TNL::Devices::Cuda, int > >();
+
+   std::cout << "Example of Ellpack segments on CUDA GPU: " << std::endl;
+   SegmentsExample< TNL::Algorithms::Segments::Ellpack< TNL::Devices::Cuda, int > >();
+#endif
+   return EXIT_SUCCESS;
+}
diff --git a/Documentation/Examples/Algorithms/Segments/SegmentsExample_forSegments-1.cu b/Documentation/Examples/Algorithms/Segments/SegmentsExample_forSegments-1.cu
new file mode 120000
index 0000000000000000000000000000000000000000..3df81cf9d7ee5641a2ff886c0f35dc6dfe82f4c6
--- /dev/null
+++ b/Documentation/Examples/Algorithms/Segments/SegmentsExample_forSegments-1.cu
@@ -0,0 +1 @@
+SegmentsExample_forSegments-1.cpp
\ No newline at end of file
diff --git a/Documentation/Examples/Algorithms/Segments/SegmentsExample_forSegments-2.cpp b/Documentation/Examples/Algorithms/Segments/SegmentsExample_forSegments-2.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..a5d7d0caa943c658b9ffd86c36e24003f6c6d331
--- /dev/null
+++ b/Documentation/Examples/Algorithms/Segments/SegmentsExample_forSegments-2.cpp
@@ -0,0 +1,71 @@
+#include <iostream>
+#include <functional>
+#include <TNL/Containers/Vector.h>
+#include <TNL/Algorithms/Segments/CSR.h>
+#include <TNL/Devices/Host.h>
+#include <TNL/Devices/Cuda.h>
+
+template< typename Device >
+void SegmentsExample()
+{
+   using SegmentsType = typename TNL::Algorithms::Segments::CSR< Device, int >;
+
+   /***
+    * Create CSR segments with sizes 1, 2, 3, 4 and 5.
+    */
+   SegmentsType segments{ 1, 2, 3, 4, 5 };
+
+   /***
+    * Allocate array for the elements of the segments, filled with 0.0.
+    */
+   TNL::Containers::Array< double, Device > data( segments.getStorageSize(), 0.0 );
+
+   /***
+    * Set each element to its local index within the segment plus one.
+    */
+   auto data_view = data.getView();
+   segments.forAllElements( [=] __cuda_callable__ ( int segmentIdx, int localIdx, int globalIdx ) mutable {
+      data_view[ globalIdx ] = localIdx + 1;
+   } );
+
+   /***
+    * Print the data by the segments.
+    */
+   std::cout << "Values of elements after initial setup: " << std::endl;
+   auto fetch = [=] __cuda_callable__ ( int globalIdx ) -> double { return data_view[ globalIdx ]; };
+   printSegments( segments, fetch, std::cout );
+
+   /***
+    * Divide elements in each segment by the sum of all elements in the segment.
+    */
+   using SegmentViewType = typename SegmentsType::SegmentViewType;
+   segments.forAllSegments( [=] __cuda_callable__ ( const SegmentViewType& segment ) mutable {
+      // First pass over the segment: accumulate the sum ...
+      double sum = 0.0;
+      for( auto element : segment )
+         if( element.localIndex() <= element.segmentIndex() )
+            sum += data_view[ element.globalIndex() ];
+      // ... second pass: divide the same elements by that sum.
+      for( auto element : segment )
+         if( element.localIndex() <= element.segmentIndex() )
+            data_view[ element.globalIndex() ] /= sum;
+   } );
+
+   /***
+    * Print the data managed by the segments.
+    */
+   std::cout << "Value of elements after dividing by sum in each segment:" << std::endl;
+   printSegments( segments, fetch, std::cout );
+}
+
+int main( int argc, char* argv[] )
+{
+   std::cout << "Example of CSR segments on host: " << std::endl;
+   SegmentsExample< TNL::Devices::Host >();
+   // The CUDA run below is compiled only when HAVE_CUDA is defined.
+#ifdef HAVE_CUDA
+   std::cout << "Example of CSR segments on CUDA GPU: " << std::endl;
+   SegmentsExample< TNL::Devices::Cuda >();
+#endif
+   return EXIT_SUCCESS;
+}
diff --git a/Documentation/Examples/Algorithms/Segments/SegmentsExample_forSegments-2.cu b/Documentation/Examples/Algorithms/Segments/SegmentsExample_forSegments-2.cu
new file mode 120000
index 0000000000000000000000000000000000000000..6dde7c8916d516bedc70425d9fa40ef70e4a6513
--- /dev/null
+++ b/Documentation/Examples/Algorithms/Segments/SegmentsExample_forSegments-2.cu
@@ -0,0 +1 @@
+SegmentsExample_forSegments-2.cpp
\ No newline at end of file
diff --git a/Documentation/Examples/Algorithms/Segments/SegmentsExample_reduceSegments.cpp b/Documentation/Examples/Algorithms/Segments/SegmentsExample_reduceSegments.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..c9a7476c724ec7cc36f9b7af4c3159bcc5e5af6e
--- /dev/null
+++ b/Documentation/Examples/Algorithms/Segments/SegmentsExample_reduceSegments.cpp
@@ -0,0 +1,83 @@
+#include <iostream>
+#include <functional>
+#include <TNL/Containers/Vector.h>
+#include <TNL/Algorithms/Segments/CSR.h>
+#include <TNL/Algorithms/Segments/Ellpack.h>
+#include <TNL/Devices/Host.h>
+#include <TNL/Devices/Cuda.h>
+
+template< typename Segments >
+void SegmentsExample()
+{
+   using Device = typename Segments::DeviceType;
+
+   /***
+    * Create segments with sizes 1, 2, 3, 4 and 5.
+    */
+   Segments segments{ 1, 2, 3, 4, 5 };
+
+   /***
+    * Allocate array for the elements of the segments, filled with 0.0.
+    */
+   TNL::Containers::Array< double, Device > data( segments.getStorageSize(), 0.0 );
+
+   /***
+    * In every segment, store the segment index into elements with localIdx <= segmentIdx.
+    */
+   auto data_view = data.getView();
+   segments.forAllElements( [=] __cuda_callable__ ( int segmentIdx, int localIdx, int globalIdx ) mutable {
+      if( localIdx <= segmentIdx )
+         data_view[ globalIdx ] = segmentIdx;
+   } );
+
+   /***
+    * Print the data by the segments.
+    */
+   std::cout << "Values of elements after initial setup: " << std::endl;
+   auto fetch = [=] __cuda_callable__ ( int globalIdx ) -> double { return data_view[ globalIdx ]; };
+   printSegments( segments, fetch, std::cout );
+
+   /***
+    * Compute sums of elements in each segment, once with each form of the fetch lambda.
+    */
+   TNL::Containers::Vector< double, Device > sums( segments.getSegmentsCount() );   // one sum per segment
+   auto sums_view = sums.getView();
+   auto fetch_full = [=] __cuda_callable__ ( int segmentIdx, int localIdx, int globalIdx, bool& compute ) -> double {
+      if( localIdx <= segmentIdx )
+         return data_view[ globalIdx ];
+      else
+      {
+         compute = false;
+         return 0.0;
+      }
+   };
+   auto fetch_brief = [=] __cuda_callable__ ( int globalIdx, bool& compute ) -> double {
+      return data_view[ globalIdx ];
+   };
+
+   auto keep = [=] __cuda_callable__ ( int segmentIdx, const double& value ) mutable {
+      sums_view[ segmentIdx ] = value;   // the result of each reduction is stored at the index of its segment
+   };
+   segments.reduceAllSegments( fetch_full, std::plus<>{}, keep, 0.0 );
+   std::cout << "The sums with full fetch form are: " << sums << std::endl;
+   segments.reduceAllSegments( fetch_brief, std::plus<>{}, keep, 0.0 );
+   std::cout << "The sums with brief fetch form are: " << sums << std::endl << std::endl;
+}
+
+int main( int argc, char* argv[] )
+{
+   std::cout << "Example of CSR segments on host: " << std::endl;
+   SegmentsExample< TNL::Algorithms::Segments::CSR< TNL::Devices::Host, int > >();
+
+   std::cout << "Example of Ellpack segments on host: " << std::endl;
+   SegmentsExample< TNL::Algorithms::Segments::Ellpack< TNL::Devices::Host, int > >();
+   // The CUDA runs below are compiled only when HAVE_CUDA is defined.
+#ifdef HAVE_CUDA
+   std::cout << "Example of CSR segments on CUDA GPU: " << std::endl;
+   SegmentsExample< TNL::Algorithms::Segments::CSR< TNL::Devices::Cuda, int > >();
+
+   std::cout << "Example of Ellpack segments on CUDA GPU: " << std::endl;
+   SegmentsExample< TNL::Algorithms::Segments::Ellpack< TNL::Devices::Cuda, int > >();
+#endif
+   return EXIT_SUCCESS;
+}
diff --git a/Documentation/Examples/Algorithms/Segments/SegmentsExample_reduceSegments.cu b/Documentation/Examples/Algorithms/Segments/SegmentsExample_reduceSegments.cu
new file mode 120000
index 0000000000000000000000000000000000000000..ce5db1005a820ec4190decbf69a5a7fb8840e8fa
--- /dev/null
+++ b/Documentation/Examples/Algorithms/Segments/SegmentsExample_reduceSegments.cu
@@ -0,0 +1 @@
+SegmentsExample_reduceSegments.cpp
\ No newline at end of file
diff --git a/Documentation/Examples/Algorithms/Segments/SegmentsPrintingExample-1.cpp b/Documentation/Examples/Algorithms/Segments/SegmentsPrintingExample-1.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..62b30d7ef5efdec3860ed3fd2a8f89697cb4faf2
--- /dev/null
+++ b/Documentation/Examples/Algorithms/Segments/SegmentsPrintingExample-1.cpp
@@ -0,0 +1,48 @@
+#include <iostream>
+#include <TNL/Containers/Vector.h>
+#include <TNL/Algorithms/Segments/CSR.h>
+#include <TNL/Algorithms/Segments/Ellpack.h>
+#include <TNL/Algorithms/Segments/ChunkedEllpack.h>
+#include <TNL/Algorithms/Segments/BiEllpack.h>
+#include <TNL/Devices/Host.h>
+#include <TNL/Devices/Cuda.h>
+
+template< typename Segments >
+void SegmentsExample()
+{
+   /***
+    * Create segments with sizes 1, 2, 3, 4 and 5 and stream them to standard output.
+    */
+   Segments segments{ 1, 2, 3, 4, 5 };
+   std::cout << "Segments sizes are: " << segments << std::endl << std::endl;
+}
+
+int main( int argc, char* argv[] )
+{
+   std::cout << "Example of CSR segments on host: " << std::endl;
+   SegmentsExample< TNL::Algorithms::Segments::CSR< TNL::Devices::Host, int > >();
+
+   std::cout << "Example of Ellpack segments on host: " << std::endl;
+   SegmentsExample< TNL::Algorithms::Segments::Ellpack< TNL::Devices::Host, int > >();
+
+   std::cout << "Example of ChunkedEllpack segments on host: " << std::endl;
+   SegmentsExample< TNL::Algorithms::Segments::ChunkedEllpack< TNL::Devices::Host, int > >();
+
+   std::cout << "Example of BiEllpack segments on host: " << std::endl;
+   SegmentsExample< TNL::Algorithms::Segments::BiEllpack< TNL::Devices::Host, int > >();
+   // The CUDA runs below are compiled only when HAVE_CUDA is defined.
+#ifdef HAVE_CUDA
+   std::cout << "Example of CSR segments on CUDA GPU: " << std::endl;
+   SegmentsExample< TNL::Algorithms::Segments::CSR< TNL::Devices::Cuda, int > >();
+
+   std::cout << "Example of Ellpack segments on CUDA GPU: " << std::endl;
+   SegmentsExample< TNL::Algorithms::Segments::Ellpack< TNL::Devices::Cuda, int > >();
+
+   std::cout << "Example of ChunkedEllpack segments on CUDA GPU: " << std::endl;
+   SegmentsExample< TNL::Algorithms::Segments::ChunkedEllpack< TNL::Devices::Cuda, int > >();
+
+   std::cout << "Example of BiEllpack segments on CUDA GPU: " << std::endl;
+   SegmentsExample< TNL::Algorithms::Segments::BiEllpack< TNL::Devices::Cuda, int > >();
+#endif
+   return EXIT_SUCCESS;
+}
diff --git a/Documentation/Examples/Algorithms/Segments/SegmentsPrintingExample-1.cu b/Documentation/Examples/Algorithms/Segments/SegmentsPrintingExample-1.cu
new file mode 120000
index 0000000000000000000000000000000000000000..42cd3852fcf53d8d8fe347f3e5f6b4d47ea9d86c
--- /dev/null
+++ b/Documentation/Examples/Algorithms/Segments/SegmentsPrintingExample-1.cu
@@ -0,0 +1 @@
+SegmentsPrintingExample-1.cpp
\ No newline at end of file
diff --git a/Documentation/Examples/Algorithms/Segments/SegmentsPrintingExample-2.cpp b/Documentation/Examples/Algorithms/Segments/SegmentsPrintingExample-2.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..8f25b8bad929cae6863356196ba107a17f29154e
--- /dev/null
+++ b/Documentation/Examples/Algorithms/Segments/SegmentsPrintingExample-2.cpp
@@ -0,0 +1,66 @@
+#include <iostream>
+#include <TNL/Containers/Vector.h>
+#include <TNL/Algorithms/Segments/CSR.h>
+#include <TNL/Algorithms/Segments/Ellpack.h>
+#include <TNL/Algorithms/Segments/ChunkedEllpack.h>
+#include <TNL/Algorithms/Segments/BiEllpack.h>
+#include <TNL/Devices/Host.h>
+#include <TNL/Devices/Cuda.h>
+
+template< typename Segments >
+void SegmentsExample()
+{
+   using Device = typename Segments::DeviceType;
+
+   /***
+    * Build the segments from a vector holding the sizes 1, 2, 3, 4 and 5.
+    */
+   TNL::Containers::Vector< int, Device > segmentsSizes{ 1, 2, 3, 4, 5 };
+   Segments segments( segmentsSizes );
+   std::cout << "Segments sizes are: " << segments << std::endl;
+
+   /***
+    * Allocate storage for the segments and set every element to its own index.
+    */
+   TNL::Containers::Array< double, Device > values( segments.getStorageSize(), 0.0 );
+   values.forAllElements( [=] __cuda_callable__ ( int idx, double& value ) {
+      value = idx;
+   } );
+
+   /***
+    * Print the data managed by the segments.
+    */
+   auto valuesView = values.getView();
+   auto readValue = [=] __cuda_callable__ ( int globalIdx ) -> double { return valuesView[ globalIdx ]; };
+   printSegments( segments, readValue, std::cout ) << std::endl;
+}
+
+int main( int argc, char* argv[] )
+{
+   std::cout << "Example of CSR segments on host: " << std::endl;
+   SegmentsExample< TNL::Algorithms::Segments::CSR< TNL::Devices::Host, int > >();
+
+   std::cout << "Example of Ellpack segments on host: " << std::endl;
+   SegmentsExample< TNL::Algorithms::Segments::Ellpack< TNL::Devices::Host, int > >();
+
+   std::cout << "Example of ChunkedEllpack segments on host: " << std::endl;
+   SegmentsExample< TNL::Algorithms::Segments::ChunkedEllpack< TNL::Devices::Host, int > >();
+
+   std::cout << "Example of BiEllpack segments on host: " << std::endl;
+   SegmentsExample< TNL::Algorithms::Segments::BiEllpack< TNL::Devices::Host, int > >();
+   // The CUDA runs below are compiled only when HAVE_CUDA is defined.
+#ifdef HAVE_CUDA
+   std::cout << "Example of CSR segments on CUDA GPU: " << std::endl;
+   SegmentsExample< TNL::Algorithms::Segments::CSR< TNL::Devices::Cuda, int > >();
+
+   std::cout << "Example of Ellpack segments on CUDA GPU: " << std::endl;
+   SegmentsExample< TNL::Algorithms::Segments::Ellpack< TNL::Devices::Cuda, int > >();
+
+   std::cout << "Example of ChunkedEllpack segments on CUDA GPU: " << std::endl;
+   SegmentsExample< TNL::Algorithms::Segments::ChunkedEllpack< TNL::Devices::Cuda, int > >();
+
+   std::cout << "Example of BiEllpack segments on CUDA GPU: " << std::endl;
+   SegmentsExample< TNL::Algorithms::Segments::BiEllpack< TNL::Devices::Cuda, int > >();
+#endif
+   return EXIT_SUCCESS;
+}
diff --git a/Documentation/Examples/Algorithms/Segments/SegmentsPrintingExample-2.cu b/Documentation/Examples/Algorithms/Segments/SegmentsPrintingExample-2.cu
new file mode 120000
index 0000000000000000000000000000000000000000..2f3149802f11c622ca908556b3d21412361879cc
--- /dev/null
+++ b/Documentation/Examples/Algorithms/Segments/SegmentsPrintingExample-2.cu
@@ -0,0 +1 @@
+SegmentsPrintingExample-2.cpp
\ No newline at end of file
diff --git a/Documentation/Examples/Algorithms/SequentialForExample.cpp b/Documentation/Examples/Algorithms/SequentialForExample.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..4bf83a64ada9bd47cd63ad74cfee2b8cb57ac816
--- /dev/null
+++ b/Documentation/Examples/Algorithms/SequentialForExample.cpp
@@ -0,0 +1,38 @@
+#include <iostream>
+#include <cstdlib>
+#include <TNL/Containers/Vector.h>
+#include <TNL/Algorithms/ParallelFor.h>
+#include <TNL/Algorithms/SequentialFor.h>
+
+using namespace TNL;
+using namespace TNL::Containers;
+
+template< typename Device >
+void printVector()
+{
+   const int vectorSize = 60;
+   TNL::Containers::Vector< float, Device > vec( vectorSize, 1.0 );
+   auto vecView = vec.getView();
+   auto printEveryFifth = [=] __cuda_callable__  ( int i ) mutable {
+      if( i % 5 == 0 )
+         printf( "v[ %d ] = %f \n", i, vecView[ i ] );  // printf is used for compatibility with GPU kernels
+   };
+   std::cout << "Printing vector using parallel for: " << std::endl;
+   Algorithms::ParallelFor< Device >::exec( 0, vec.getSize(), printEveryFifth );
+
+   std::cout << "Printing vector using sequential for: " << std::endl;
+   Algorithms::SequentialFor< Device >::exec( 0, vec.getSize(), printEveryFifth );
+}
+
+int main( int argc, char* argv[] )
+{
+   std::cout << "Example on the host:" << std::endl;
+   printVector< TNL::Devices::Host >();
+   // The CUDA run below is compiled only when HAVE_CUDA is defined.
+#ifdef HAVE_CUDA
+   std::cout << "Example on CUDA GPU:" << std::endl;
+   printVector< TNL::Devices::Cuda >();
+#endif
+   return EXIT_SUCCESS;
+}
+
diff --git a/Documentation/Examples/Algorithms/SequentialForExample.cu b/Documentation/Examples/Algorithms/SequentialForExample.cu
new file mode 120000
index 0000000000000000000000000000000000000000..ac78b379b6e3ed91150ba91d3c7c465a41f70501
--- /dev/null
+++ b/Documentation/Examples/Algorithms/SequentialForExample.cu
@@ -0,0 +1 @@
+SequentialForExample.cpp
\ No newline at end of file
diff --git a/Documentation/Examples/CMakeLists.txt b/Documentation/Examples/CMakeLists.txt
index 29ba5a5dfc33b4fb6bbc88edd7f467e136f602ae..7aa7364299a853bc261ebd6fba0b225204218121 100644
--- a/Documentation/Examples/CMakeLists.txt
+++ b/Documentation/Examples/CMakeLists.txt
@@ -3,80 +3,44 @@ ADD_SUBDIRECTORY( Containers )
 ADD_SUBDIRECTORY( Pointers )
 ADD_SUBDIRECTORY( Matrices )
 
-ADD_EXECUTABLE( FileExample FileExample.cpp )
-ADD_CUSTOM_COMMAND( COMMAND FileExample > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/FileExample.out OUTPUT FileExample.out )
-
-IF( BUILD_CUDA )
-   CUDA_ADD_EXECUTABLE(FileExampleCuda FileExampleCuda.cu)
-   ADD_CUSTOM_COMMAND( COMMAND FileExampleCuda > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/FileExampleCuda.out OUTPUT FileExampleCuda.out )
-ENDIF()
-
-ADD_EXECUTABLE( FileExampleSaveAndLoad FileExampleSaveAndLoad.cpp )
-ADD_CUSTOM_COMMAND( COMMAND FileExampleSaveAndLoad > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/FileExampleSaveAndLoad.out OUTPUT FileExampleSaveAndLoad.out )
-
-ADD_EXECUTABLE( FileNameExample FileNameExample.cpp )
-ADD_CUSTOM_COMMAND( COMMAND FileNameExample > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/FileNameExample.out OUTPUT FileNameExample.out )
-
-ADD_EXECUTABLE( FileNameExampleDistributedSystemNodeCoordinates FileNameExampleDistributedSystemNodeCoordinates.cpp )
-ADD_CUSTOM_COMMAND( COMMAND FileNameExampleDistributedSystemNodeCoordinates > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/FileNameExampleDistributedSystemNodeCoordinates.out OUTPUT FileNameExampleDistributedSystemNodeCoordinates.out )
-
-
-ADD_EXECUTABLE( FileNameExampleDistributedSystemNodeId FileNameExampleDistributedSystemNodeId.cpp )
-ADD_CUSTOM_COMMAND( COMMAND FileNameExampleDistributedSystemNodeId > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/FileNameExampleDistributedSystemNodeId.out OUTPUT FileNameExampleDistributedSystemNodeId.out )
-
-ADD_EXECUTABLE( ObjectExample_getType ObjectExample_getType.cpp )
-ADD_CUSTOM_COMMAND( COMMAND ObjectExample_getType > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/ObjectExample_getType.out OUTPUT ObjectExample_getType.out )
-
-ADD_EXECUTABLE( ParameterContainerExample ParameterContainerExample.cpp )
-ADD_EXECUTABLE( ConfigDescriptionExample ConfigDescriptionExample.cpp )
-ADD_EXECUTABLE( LoggerExample LoggerExample.cpp )
-ADD_EXECUTABLE( MathExample MathExample.cpp )
-
-ADD_EXECUTABLE( ParseObjectTypeExample ParseObjectTypeExample.cpp )
-ADD_CUSTOM_COMMAND( COMMAND ParseObjectTypeExample > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/ParseObjectTypeExample.out OUTPUT ParseObjectTypeExample.out )
-
-ADD_EXECUTABLE( StringExample StringExample.cpp )
-ADD_CUSTOM_COMMAND( COMMAND StringExample > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/StringExample.out OUTPUT StringExample.out )
-
-ADD_EXECUTABLE( StringExampleGetAllocatedSize StringExampleGetAllocatedSize.cpp )
-ADD_CUSTOM_COMMAND( COMMAND StringExampleGetAllocatedSize > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/StringExampleGetAllocatedSize.out OUTPUT StringExampleGetAllocatedSize.out )
-
-ADD_EXECUTABLE( StringExampleReplace StringExampleReplace.cpp )
-ADD_CUSTOM_COMMAND( COMMAND StringExampleReplace > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/StringExampleReplace.out OUTPUT StringExampleReplace.out )
-
-ADD_EXECUTABLE( StringExampleSetSize StringExampleSetSize.cpp )
-ADD_CUSTOM_COMMAND( COMMAND StringExampleSetSize > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/StringExampleSetSize.out OUTPUT StringExampleSetSize.out )
-
-ADD_EXECUTABLE( StringExampleSplit StringExampleSplit.cpp )
-ADD_CUSTOM_COMMAND( COMMAND StringExampleSplit > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/StringExampleSplit.out OUTPUT StringExampleSplit.out )
-
-ADD_EXECUTABLE( StringExampleStrip StringExampleStrip.cpp )
-ADD_CUSTOM_COMMAND( COMMAND StringExampleStrip > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/StringExampleStrip.out OUTPUT StringExampleStrip.out )
-
-ADD_EXECUTABLE( TimerExample TimerExample.cpp )
-ADD_CUSTOM_COMMAND( COMMAND TimerExample > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/TimerExample.out OUTPUT TimerExample.out )
-
-ADD_EXECUTABLE( TimerExampleLogger TimerExampleLogger.cpp )
-ADD_CUSTOM_COMMAND( COMMAND TimerExampleLogger > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/TimerExampleLogger.out OUTPUT TimerExampleLogger.out )
-
-
-ADD_CUSTOM_TARGET( RunExamples ALL DEPENDS
-   FileExample.out
-   FileExampleSaveAndLoad.out
-   FileNameExample.out
-   FileNameExampleDistributedSystemNodeCoordinates.out
-   FileNameExampleDistributedSystemNodeId.out
-   ObjectExample_getType.out
-   ParseObjectTypeExample.out
-   StringExample.out
-   StringExampleGetAllocatedSize.out
-   StringExampleReplace.out
-   StringExampleSplit.out
-   StringExampleStrip.out
-   TimerExample.out
-   TimerExampleLogger.out )
+set( COMMON_EXAMPLES )   # built from ${target}.cu when BUILD_CUDA is set, from ${target}.cpp otherwise
+# Examples built only when BUILD_CUDA is set, from ${target}.cu.
+set( CUDA_EXAMPLES
+   FileExampleCuda
+)
+# Examples built with add_executable from ${target}.cpp.
+set( HOST_EXAMPLES
+   FileExample
+   FileExampleSaveAndLoad
+   FileNameExample
+   FileNameExampleDistributedSystemNodeCoordinates
+   FileNameExampleDistributedSystemNodeId
+   ObjectExample_getType
+   ParseObjectTypeExample
+   StringExample
+   StringExampleGetAllocatedSize
+   StringExampleReplace
+   StringExampleSplit
+   StringExampleStrip
+   TimerExample
+   TimerExampleLogger )
 
 if( BUILD_CUDA )
-   ADD_CUSTOM_TARGET( RunExamples-cuda ALL DEPENDS
-      FileExampleCuda.out )
-ENDIF()
+   foreach( target IN ITEMS ${COMMON_EXAMPLES} ${CUDA_EXAMPLES} )
+      cuda_add_executable( ${target}-cuda ${target}.cu OPTIONS )
+      add_custom_command( COMMAND ${target}-cuda > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/${target}.out OUTPUT ${target}.out )
+      list( APPEND CUDA_OUTPUTS ${target}.out )
+   endforeach()
+   add_custom_target( RunExamples-cuda ALL DEPENDS ${CUDA_OUTPUTS} )
+else()
+   # Without CUDA the common examples are built from their .cpp sources on the host.
+   set( HOST_EXAMPLES ${COMMON_EXAMPLES} ${HOST_EXAMPLES} )
+endif()
+
+# Host-only examples have no .cu twin, so build and run them in every configuration.
+foreach( target IN ITEMS ${HOST_EXAMPLES} )
+   add_executable( ${target} ${target}.cpp )
+   add_custom_command( COMMAND ${target} > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/${target}.out OUTPUT ${target}.out )
+   list( APPEND HOST_OUTPUTS ${target}.out )
+endforeach()
+add_custom_target( RunExamples ALL DEPENDS ${HOST_OUTPUTS} )
\ No newline at end of file
diff --git a/Documentation/Examples/Matrices/DenseMatrix/CMakeLists.txt b/Documentation/Examples/Matrices/DenseMatrix/CMakeLists.txt
index e2814577681fa85d54b707b091f7d86fe7dcd49e..d88862afb2338e5e4581ce2f8204291f919a5cd8 100644
--- a/Documentation/Examples/Matrices/DenseMatrix/CMakeLists.txt
+++ b/Documentation/Examples/Matrices/DenseMatrix/CMakeLists.txt
@@ -26,6 +26,7 @@ set( COMMON_EXAMPLES
     DenseMatrixViewExample_forElements
     DenseMatrixViewExample_forRows
     DenseMatrixViewExample_forAllElements
+    DenseMatrixViewExample_wrap
 )
 
 if( BUILD_CUDA )
diff --git a/Documentation/Examples/Matrices/DenseMatrix/DenseMatrixExample_forAllElements.cpp b/Documentation/Examples/Matrices/DenseMatrix/DenseMatrixExample_forAllElements.cpp
index 4fd7d3b4727e8c4aca8d7c066a88038b80b4a887..f143164a25fd87c46abde31621fdc22552547256 100644
--- a/Documentation/Examples/Matrices/DenseMatrix/DenseMatrixExample_forAllElements.cpp
+++ b/Documentation/Examples/Matrices/DenseMatrix/DenseMatrixExample_forAllElements.cpp
@@ -8,10 +8,8 @@ void forAllElementsExample()
 {
    TNL::Matrices::DenseMatrix< double, Device > matrix( 5, 5 );
 
-   auto f = [=] __cuda_callable__ ( int rowIdx, int columnIdx, int columnIdx_, double& value, bool& compute ) {
-      if( rowIdx < columnIdx )
-         compute = false;
-      else
+   auto f = [=] __cuda_callable__ ( int rowIdx, int columnIdx, int columnIdx_, double& value ) {
+      if( rowIdx >= columnIdx )
          value = rowIdx + columnIdx;
    };
 
diff --git a/Documentation/Examples/Matrices/DenseMatrix/DenseMatrixExample_forElements.cpp b/Documentation/Examples/Matrices/DenseMatrix/DenseMatrixExample_forElements.cpp
index 0764eecdfc39c76ed05161ef7eee8512be3b08d6..b37470c43c6756f11e99eff932aea4200c48eb0c 100644
--- a/Documentation/Examples/Matrices/DenseMatrix/DenseMatrixExample_forElements.cpp
+++ b/Documentation/Examples/Matrices/DenseMatrix/DenseMatrixExample_forElements.cpp
@@ -8,10 +8,8 @@ void forElementsExample()
 {
    TNL::Matrices::DenseMatrix< double, Device > matrix( 5, 5 );
 
-   auto f = [=] __cuda_callable__ ( int rowIdx, int columnIdx, int columnIdx_, double& value, bool& compute ) {
-      if( rowIdx < columnIdx )
-         compute = false;
-      else
+   auto f = [=] __cuda_callable__ ( int rowIdx, int columnIdx, int columnIdx_, double& value ) {
+      if( rowIdx >= columnIdx )
          value = rowIdx + columnIdx;
    };
 
diff --git a/Documentation/Examples/Matrices/DenseMatrix/DenseMatrixViewExample_forAllElements.cpp b/Documentation/Examples/Matrices/DenseMatrix/DenseMatrixViewExample_forAllElements.cpp
index 66b39413017b41b98b67eb2a3ff48455110cd8ec..c5802a0e1f7415bb3f3b43627c3ae88d6010325c 100644
--- a/Documentation/Examples/Matrices/DenseMatrix/DenseMatrixViewExample_forAllElements.cpp
+++ b/Documentation/Examples/Matrices/DenseMatrix/DenseMatrixViewExample_forAllElements.cpp
@@ -9,10 +9,8 @@ void forAllElementsExample()
    TNL::Matrices::DenseMatrix< double, Device > matrix( 5, 5 );
    auto matrixView = matrix.getView();
 
-   auto f = [=] __cuda_callable__ ( int rowIdx, int columnIdx, int globalIdx, double& value, bool& compute ) {
-      if( rowIdx < columnIdx )
-         compute = false;
-      else
+   auto f = [=] __cuda_callable__ ( int rowIdx, int columnIdx, int globalIdx, double& value ) {
+      if( rowIdx >= columnIdx )
          value = rowIdx + columnIdx;
    };
 
diff --git a/Documentation/Examples/Matrices/DenseMatrix/DenseMatrixViewExample_forElements.cpp b/Documentation/Examples/Matrices/DenseMatrix/DenseMatrixViewExample_forElements.cpp
index 6a980d23c148599835882425db3a03475612ccef..572c526f02554bd66baaa2f7e900426edd9118bb 100644
--- a/Documentation/Examples/Matrices/DenseMatrix/DenseMatrixViewExample_forElements.cpp
+++ b/Documentation/Examples/Matrices/DenseMatrix/DenseMatrixViewExample_forElements.cpp
@@ -9,10 +9,8 @@ void forElementsExample()
    TNL::Matrices::DenseMatrix< double, Device > matrix( 5, 5 );
    auto matrixView = matrix.getView();
 
-   auto f = [=] __cuda_callable__ ( int rowIdx, int columnIdx, int globalIdx, double& value, bool& compute ) {
-      if( columnIdx > rowIdx )
-         compute = false;
-      else
+   auto f = [=] __cuda_callable__ ( int rowIdx, int columnIdx, int globalIdx, double& value ) {
+      if( columnIdx <= rowIdx )
          value = rowIdx + columnIdx;
    };
 
diff --git a/Documentation/Examples/Matrices/DenseMatrix/DenseMatrixViewExample_wrap.cpp b/Documentation/Examples/Matrices/DenseMatrix/DenseMatrixViewExample_wrap.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..81919146816ee771de4ac0626d14b2315c490460
--- /dev/null
+++ b/Documentation/Examples/Matrices/DenseMatrix/DenseMatrixViewExample_wrap.cpp
@@ -0,0 +1,34 @@
+#include <iostream>
+#include <TNL/Algorithms/ParallelFor.h>
+#include <TNL/Matrices/DenseMatrix.h>
+#include <TNL/Matrices/MatrixWrapping.h>
+#include <TNL/Devices/Host.h>
+#include <TNL/Devices/Cuda.h>
+
+template< typename Device >   // Device: TNL::Devices::Host or TNL::Devices::Cuda (see main)
+void wrapMatrixView()         // wraps a plain array into a dense matrix view and prints it
+{
+   const int rows( 3 ), columns( 4 );   // dimensions of the wrapped matrix
+   TNL::Containers::Vector< double, Device > valuesVector {
+      1,  2,  3,  4,
+      5,  6,  7,  8,
+      9, 10, 11, 12 };
+   double* values = valuesVector.getData();   // raw pointer to the vector's data
+
+   /***
+    * Wrap the raw array `values` into a dense matrix view with the given dimensions
+    */
+   auto matrix = TNL::Matrices::wrapDenseMatrix< Device >( rows, columns, values );
+   std::cout << "Matrix reads as: " << std::endl << matrix << std::endl;
+}
+
+int main( int argc, char* argv[] )
+{
+   std::cout << "Wrapping matrix view on host: " << std::endl;
+   wrapMatrixView< TNL::Devices::Host >();
+   // The CUDA run below is compiled in only when HAVE_CUDA is defined.
+#ifdef HAVE_CUDA
+   std::cout << "Wrapping matrix view on CUDA device: " << std::endl;
+   wrapMatrixView< TNL::Devices::Cuda >();
+#endif
+}
diff --git a/Documentation/Examples/Matrices/DenseMatrix/DenseMatrixViewExample_wrap.cu b/Documentation/Examples/Matrices/DenseMatrix/DenseMatrixViewExample_wrap.cu
new file mode 120000
index 0000000000000000000000000000000000000000..fbdc1d8bb41c18719f26e47cf4abb680d7d6e92f
--- /dev/null
+++ b/Documentation/Examples/Matrices/DenseMatrix/DenseMatrixViewExample_wrap.cu
@@ -0,0 +1 @@
+DenseMatrixViewExample_wrap.cpp
\ No newline at end of file
diff --git a/Documentation/Examples/Matrices/LambdaMatrix/LambdaMatrixExample_forAllElements.cpp b/Documentation/Examples/Matrices/LambdaMatrix/LambdaMatrixExample_forAllElements.cpp
index 293f173d2a0115ea46b03ac09018e399ad7c99f1..6b335f5f298f4d750a8f7ce22a42755bebfdbcef 100644
--- a/Documentation/Examples/Matrices/LambdaMatrix/LambdaMatrixExample_forAllElements.cpp
+++ b/Documentation/Examples/Matrices/LambdaMatrix/LambdaMatrixExample_forAllElements.cpp
@@ -22,7 +22,7 @@ void forAllElementsExample()
    TNL::Matrices::DenseMatrix< double, Device > denseMatrix( 5, 5 );
    auto denseView = denseMatrix.getView();
 
-   auto f = [=] __cuda_callable__ ( int rowIdx, int localIdx, int columnIdx, double value, bool& compute ) mutable {
+   auto f = [=] __cuda_callable__ ( int rowIdx, int localIdx, int columnIdx, double value ) mutable {
       denseView.setElement( rowIdx, columnIdx, value );
    };
 
diff --git a/Documentation/Examples/Matrices/LambdaMatrix/LambdaMatrixExample_forElements.cpp b/Documentation/Examples/Matrices/LambdaMatrix/LambdaMatrixExample_forElements.cpp
index f23f031b1ba3b4544ee1f20908d831c117879553..8472ef28d2a86955e03ade98929429b5ebadd0a6 100644
--- a/Documentation/Examples/Matrices/LambdaMatrix/LambdaMatrixExample_forElements.cpp
+++ b/Documentation/Examples/Matrices/LambdaMatrix/LambdaMatrixExample_forElements.cpp
@@ -22,7 +22,7 @@ void forElementsExample()
    TNL::Matrices::DenseMatrix< double, Device > denseMatrix( 5, 5 );
    auto denseView = denseMatrix.getView();
 
-   auto f = [=] __cuda_callable__ ( int rowIdx, int localIdx, int columnIdx, double value, bool& compute ) mutable {
+   auto f = [=] __cuda_callable__ ( int rowIdx, int localIdx, int columnIdx, double value ) mutable {
       denseView.setElement( rowIdx, columnIdx, value );
    };
 
diff --git a/Documentation/Examples/Matrices/MultidiagonalMatrix/MultidiagonalMatrixExample_forAllElements.cpp b/Documentation/Examples/Matrices/MultidiagonalMatrix/MultidiagonalMatrixExample_forAllElements.cpp
index b29543d9e0cb88da80f5ed0f9381d7be772c3f48..96a4668b9683ad577054673b94a2014f68387df9 100644
--- a/Documentation/Examples/Matrices/MultidiagonalMatrix/MultidiagonalMatrixExample_forAllElements.cpp
+++ b/Documentation/Examples/Matrices/MultidiagonalMatrix/MultidiagonalMatrixExample_forAllElements.cpp
@@ -23,7 +23,7 @@ void forAllElementsExample()
       5,               // number of matrix columns
       { -2, -1, 0 } ); // matrix diagonals offsets
 
-   auto f = [=] __cuda_callable__ ( int rowIdx, int localIdx, int columnIdx, double& value, bool& compute ) {
+   auto f = [=] __cuda_callable__ ( int rowIdx, int localIdx, int columnIdx, double& value ) {
       /***
        * 'forElements' method iterates only over matrix elements lying on given subdiagonals
        * and so we do not need to check anything. The element value can be expressed
diff --git a/Documentation/Examples/Matrices/MultidiagonalMatrix/MultidiagonalMatrixExample_forElements.cpp b/Documentation/Examples/Matrices/MultidiagonalMatrix/MultidiagonalMatrixExample_forElements.cpp
index dd30694e6f1fcae01f948b5d896a00eed23df20e..1dc957af245717e59c0082c3eb8fa6ac2932eef7 100644
--- a/Documentation/Examples/Matrices/MultidiagonalMatrix/MultidiagonalMatrixExample_forElements.cpp
+++ b/Documentation/Examples/Matrices/MultidiagonalMatrix/MultidiagonalMatrixExample_forElements.cpp
@@ -23,7 +23,7 @@ void forElementsExample()
       5,               // number of matrix columns
       { -2, -1, 0 } ); // matrix diagonals offsets
 
-   auto f = [=] __cuda_callable__ ( int rowIdx, int localIdx, int columnIdx, double& value, bool& compute ) {
+   auto f = [=] __cuda_callable__ ( int rowIdx, int localIdx, int columnIdx, double& value ) {
       /***
        * 'forElements' method iterates only over matrix elements lying on given subdiagonals
        * and so we do not need to check anything. The element value can be expressed
diff --git a/Documentation/Examples/Matrices/MultidiagonalMatrix/MultidiagonalMatrixViewExample_forAllElements.cpp b/Documentation/Examples/Matrices/MultidiagonalMatrix/MultidiagonalMatrixViewExample_forAllElements.cpp
index b05da1d8213143bc5ffd11625f3580c9185248e7..4ca0940cbedebc2e26ca628303f69841cdca3f3d 100644
--- a/Documentation/Examples/Matrices/MultidiagonalMatrix/MultidiagonalMatrixViewExample_forAllElements.cpp
+++ b/Documentation/Examples/Matrices/MultidiagonalMatrix/MultidiagonalMatrixViewExample_forAllElements.cpp
@@ -24,7 +24,7 @@ void forAllElementsExample()
       { -2, -1, 0 } ); // matrix diagonals offsets
    auto view = matrix.getView();
 
-   auto f = [=] __cuda_callable__ ( int rowIdx, int localIdx, int columnIdx, double& value, bool& compute ) {
+   auto f = [=] __cuda_callable__ ( int rowIdx, int localIdx, int columnIdx, double& value ) {
       /***
        * 'forElements' method iterates only over matrix elements lying on given subdiagonals
        * and so we do not need to check anything. The element value can be expressed
diff --git a/Documentation/Examples/Matrices/MultidiagonalMatrix/MultidiagonalMatrixViewExample_forElements.cpp b/Documentation/Examples/Matrices/MultidiagonalMatrix/MultidiagonalMatrixViewExample_forElements.cpp
index 9663a2c0d4648ef85c27d5a76f17454b7b6de55e..d941fc4a2039064d6e999dda5abef65df9d37a5e 100644
--- a/Documentation/Examples/Matrices/MultidiagonalMatrix/MultidiagonalMatrixViewExample_forElements.cpp
+++ b/Documentation/Examples/Matrices/MultidiagonalMatrix/MultidiagonalMatrixViewExample_forElements.cpp
@@ -24,7 +24,7 @@ void forElementsExample()
       { -2, -1, 0 } ); // matrix diagonals offsets
    auto view = matrix.getView();
 
-   auto f = [=] __cuda_callable__ ( int rowIdx, int localIdx, int columnIdx, double& value, bool& compute ) {
+   auto f = [=] __cuda_callable__ ( int rowIdx, int localIdx, int columnIdx, double& value ) {
       /***
        * 'forElements' method iterates only over matrix elements lying on given subdiagonals
        * and so we do not need to check anything. The element value can be expressed
diff --git a/Documentation/Examples/Matrices/SparseMatrix/CMakeLists.txt b/Documentation/Examples/Matrices/SparseMatrix/CMakeLists.txt
index c2db3879eea91791f4ee5239e3cb1acda1092702..641534bae21216ab91fe798af67715440f763c5f 100644
--- a/Documentation/Examples/Matrices/SparseMatrix/CMakeLists.txt
+++ b/Documentation/Examples/Matrices/SparseMatrix/CMakeLists.txt
@@ -30,6 +30,8 @@ set( COMMON_EXAMPLES
    SparseMatrixViewExample_forElements
    SparseMatrixViewExample_forRows
    SparseMatrixViewExample_forAllElements
+   SparseMatrixViewExample_wrapCSR
+   SparseMatrixViewExample_wrapEllpack
 )
 
 if( BUILD_CUDA )
diff --git a/Documentation/Examples/Matrices/SparseMatrix/SparseMatrixExample_forAllElements.cpp b/Documentation/Examples/Matrices/SparseMatrix/SparseMatrixExample_forAllElements.cpp
index c603fe32f9d345063975cbbae8ada8859c9285df..8c9fb368c15f85dfff6bcfca639ec59e0d0b9011 100644
--- a/Documentation/Examples/Matrices/SparseMatrix/SparseMatrixExample_forAllElements.cpp
+++ b/Documentation/Examples/Matrices/SparseMatrix/SparseMatrixExample_forAllElements.cpp
@@ -8,11 +8,9 @@ void forAllElementsExample()
 {
    TNL::Matrices::SparseMatrix< double, Device > matrix( { 1, 2, 3, 4, 5 }, 5 );
 
-   auto f = [=] __cuda_callable__ ( int rowIdx, int localIdx, int& columnIdx, double& value, bool& compute ) {
-      if( rowIdx < localIdx )  // This is important, some matrix formats may allocate more matrix elements
+   auto f = [=] __cuda_callable__ ( int rowIdx, int localIdx, int& columnIdx, double& value ) {
+      if( rowIdx >= localIdx )  // This is important, some matrix formats may allocate more matrix elements
                                // than we requested. These padding elements are processed here as well.
-         compute = false;
-      else
       {
          columnIdx = localIdx;
          value = rowIdx + localIdx + 1;
diff --git a/Documentation/Examples/Matrices/SparseMatrix/SparseMatrixExample_forElements.cpp b/Documentation/Examples/Matrices/SparseMatrix/SparseMatrixExample_forElements.cpp
index 216433b637bf4870b05ca160cc70bbb44bd00287..2d7bbeba55d62f0f18fa2f1402f4b159d57731a6 100644
--- a/Documentation/Examples/Matrices/SparseMatrix/SparseMatrixExample_forElements.cpp
+++ b/Documentation/Examples/Matrices/SparseMatrix/SparseMatrixExample_forElements.cpp
@@ -8,11 +8,9 @@ void forElementsExample()
 {
    TNL::Matrices::SparseMatrix< double, Device > matrix( { 1, 2, 3, 4, 5 }, 5 );
 
-   auto f = [=] __cuda_callable__ ( int rowIdx, int localIdx, int& columnIdx, double& value, bool& compute ) {
-      if( rowIdx < localIdx )  // This is important, some matrix formats may allocate more matrix elements
+   auto f = [=] __cuda_callable__ ( int rowIdx, int localIdx, int& columnIdx, double& value ) {
+      if( rowIdx >= localIdx )  // This is important, some matrix formats may allocate more matrix elements
                                // than we requested. These padding elements are processed here as well.
-         compute = false;
-      else
       {
          columnIdx = localIdx;
          value = rowIdx + localIdx + 1;
diff --git a/Documentation/Examples/Matrices/SparseMatrix/SparseMatrixViewExample_forAllElements.cpp b/Documentation/Examples/Matrices/SparseMatrix/SparseMatrixViewExample_forAllElements.cpp
index 4000107eb325c6abb8c8f79c379eecbdfd9f2386..79fb7890d5df862f79d2d65e60d88c6727c34748 100644
--- a/Documentation/Examples/Matrices/SparseMatrix/SparseMatrixViewExample_forAllElements.cpp
+++ b/Documentation/Examples/Matrices/SparseMatrix/SparseMatrixViewExample_forAllElements.cpp
@@ -9,11 +9,9 @@ void forAllElementsExample()
    TNL::Matrices::SparseMatrix< double, Device > matrix( { 1, 2, 3, 4, 5 }, 5 );
    auto view = matrix.getView();
 
-   auto f = [=] __cuda_callable__ ( int rowIdx, int localIdx, int& columnIdx, double& value, bool& compute ) {
-      if( rowIdx < localIdx )  // This is important, some matrix formats may allocate more matrix elements
+   auto f = [=] __cuda_callable__ ( int rowIdx, int localIdx, int& columnIdx, double& value ) {
+      if( rowIdx >= localIdx )  // This is important, some matrix formats may allocate more matrix elements
                                // than we requested. These padding elements are processed here as well.
-         compute = false;
-      else
       {
          columnIdx = localIdx;
          value = rowIdx + localIdx + 1;
diff --git a/Documentation/Examples/Matrices/SparseMatrix/SparseMatrixViewExample_forElements.cpp b/Documentation/Examples/Matrices/SparseMatrix/SparseMatrixViewExample_forElements.cpp
index 4ffb2ee834d8b76a5b557f71a5f45a674aa17b1c..6e296d3dec1fb78b72b6e4b140217f9007b02c68 100644
--- a/Documentation/Examples/Matrices/SparseMatrix/SparseMatrixViewExample_forElements.cpp
+++ b/Documentation/Examples/Matrices/SparseMatrix/SparseMatrixViewExample_forElements.cpp
@@ -9,11 +9,9 @@ void forElementsExample()
    TNL::Matrices::SparseMatrix< double, Device > matrix( { 1, 2, 3, 4, 5 }, 5 );
    auto view = matrix.getView();
 
-   auto f = [=] __cuda_callable__ ( int rowIdx, int localIdx, int& columnIdx, double& value, bool& compute ) {
-      if( rowIdx < localIdx )  // This is important, some matrix formats may allocate more matrix elements
+   auto f = [=] __cuda_callable__ ( int rowIdx, int localIdx, int& columnIdx, double& value ) {
+      if( rowIdx >= localIdx )  // This is important, some matrix formats may allocate more matrix elements
                                // than we requested. These padding elements are processed here as well.
-         compute = false;
-      else
       {
          columnIdx = localIdx;
          value = rowIdx + localIdx + 1;
diff --git a/Documentation/Examples/Matrices/SparseMatrix/SparseMatrixViewExample_wrapCSR.cpp b/Documentation/Examples/Matrices/SparseMatrix/SparseMatrixViewExample_wrapCSR.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..0c574bb7a0f52371cf40a35c5e8bba3f85c880ab
--- /dev/null
+++ b/Documentation/Examples/Matrices/SparseMatrix/SparseMatrixViewExample_wrapCSR.cpp
@@ -0,0 +1,45 @@
+#include <iostream>
+#include <TNL/Algorithms/ParallelFor.h>
+#include <TNL/Matrices/DenseMatrix.h>
+#include <TNL/Matrices/MatrixWrapping.h>
+#include <TNL/Devices/Host.h>
+#include <TNL/Devices/Cuda.h>
+
+template< typename Device >   // Device: TNL::Devices::Host or TNL::Devices::Cuda (see main)
+void wrapMatrixView()         // wraps CSR arrays into a sparse matrix view and prints it
+{
+   /***
+    * Encode the following matrix in the CSR format:
+    *
+    * /  1  2  0  0 \.
+    * |  0  6  0  0 |
+    * |  9  0  0  0 |
+    * \  0  0 15 16 /
+    */
+   const int rows( 4 ), columns( 4 );
+   TNL::Containers::Vector< double, Device > valuesVector     { 1, 2, 6, 9, 15, 16 };   // nonzero values, row by row
+   TNL::Containers::Vector< int, Device > columnIndexesVector { 0, 1, 1, 0,  2,  3 };   // column index of each value
+   TNL::Containers::Vector< int, Device > rowPointersVector   { 0, 2, 3, 4, 6 };        // start of each row; rows + 1 entries
+
+   double* values = valuesVector.getData();
+   int* columnIndexes = columnIndexesVector.getData();
+   int* rowPointers = rowPointersVector.getData();
+
+   /***
+    * Wrap the arrays `rowPointers`, `values` and `columnIndexes` into a sparse matrix view
+    */
+   auto matrix = TNL::Matrices::wrapCSRMatrix< Device >( rows, columns, rowPointers, values, columnIndexes );
+
+   std::cout << "Matrix reads as: " << std::endl << matrix << std::endl;
+}
+
+int main( int argc, char* argv[] )
+{
+   std::cout << "Wrapping matrix view on host: " << std::endl;
+   wrapMatrixView< TNL::Devices::Host >();
+   // The CUDA run below is compiled in only when HAVE_CUDA is defined.
+#ifdef HAVE_CUDA
+   std::cout << "Wrapping matrix view on CUDA device: " << std::endl;
+   wrapMatrixView< TNL::Devices::Cuda >();
+#endif
+}
diff --git a/Documentation/Examples/Matrices/SparseMatrix/SparseMatrixViewExample_wrapCSR.cu b/Documentation/Examples/Matrices/SparseMatrix/SparseMatrixViewExample_wrapCSR.cu
new file mode 120000
index 0000000000000000000000000000000000000000..6581b62dcbc8cc88eb84616f57dd933208df3823
--- /dev/null
+++ b/Documentation/Examples/Matrices/SparseMatrix/SparseMatrixViewExample_wrapCSR.cu
@@ -0,0 +1 @@
+SparseMatrixViewExample_wrapCSR.cpp
\ No newline at end of file
diff --git a/Documentation/Examples/Matrices/SparseMatrix/SparseMatrixViewExample_wrapEllpack.cpp b/Documentation/Examples/Matrices/SparseMatrix/SparseMatrixViewExample_wrapEllpack.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..67df09891612c6d10f0743a4796cbbac31f5af83
--- /dev/null
+++ b/Documentation/Examples/Matrices/SparseMatrix/SparseMatrixViewExample_wrapEllpack.cpp
@@ -0,0 +1,43 @@
+#include <iostream>
+#include <TNL/Algorithms/ParallelFor.h>
+#include <TNL/Matrices/DenseMatrix.h>
+#include <TNL/Matrices/MatrixWrapping.h>
+#include <TNL/Devices/Host.h>
+#include <TNL/Devices/Cuda.h>
+
+template< typename Device >   // Device: TNL::Devices::Host or TNL::Devices::Cuda (see main)
+void wrapMatrixView()         // wraps Ellpack arrays into a sparse matrix view and prints it
+{
+   /***
+    * Encode the following matrix in the Ellpack format with two slots per row;
+    * padding slots hold the value 0 and the column index -1.
+    * /  1  2  0  0 \.
+    * |  0  6  0  0 |
+    * |  9  0  0  0 |
+    * \  0  0 15 16 /
+    */
+   const int rows( 4 ), columns( 4 );
+   TNL::Containers::Vector< double, Device > valuesVector     { 1,  2,  6,  0,  9,  0, 15, 16 };
+   TNL::Containers::Vector< int, Device > columnIndexesVector { 0,  1,  1, -1,  0, -1,  2,  3 };
+
+   double* values = valuesVector.getData();
+   int* columnIndexes = columnIndexesVector.getData();
+
+   /***
+    * Wrap the arrays `values` and `columnIndexes` into a sparse matrix view
+    * (the argument 2 is presumably the number of slots per row - confirm against wrapEllpackMatrix)
+    */
+   auto matrix = TNL::Matrices::wrapEllpackMatrix< Device, TNL::Algorithms::Segments::RowMajorOrder >( rows, columns, 2, values, columnIndexes );
+
+   std::cout << "Matrix reads as: " << std::endl << matrix << std::endl;
+}
+
+int main( int argc, char* argv[] )
+{
+   std::cout << "Wrapping matrix view on host: " << std::endl;
+   wrapMatrixView< TNL::Devices::Host >();
+   // The CUDA run below is compiled in only when HAVE_CUDA is defined.
+#ifdef HAVE_CUDA
+   std::cout << "Wrapping matrix view on CUDA device: " << std::endl;
+   wrapMatrixView< TNL::Devices::Cuda >();
+#endif
+}
diff --git a/Documentation/Examples/Matrices/SparseMatrix/SparseMatrixViewExample_wrapEllpack.cu b/Documentation/Examples/Matrices/SparseMatrix/SparseMatrixViewExample_wrapEllpack.cu
new file mode 120000
index 0000000000000000000000000000000000000000..3d0a09594910fd369499289eb2a7be62ccdeb964
--- /dev/null
+++ b/Documentation/Examples/Matrices/SparseMatrix/SparseMatrixViewExample_wrapEllpack.cu
@@ -0,0 +1 @@
+SparseMatrixViewExample_wrapEllpack.cpp
\ No newline at end of file
diff --git a/Documentation/Examples/Matrices/TridiagonalMatrix/TridiagonalMatrixExample_forAllElements.cpp b/Documentation/Examples/Matrices/TridiagonalMatrix/TridiagonalMatrixExample_forAllElements.cpp
index c29b439a6b85274f863b110e80aea7ca8537fbb3..314cd6e4b8491b9f4ee482936100eaec432b9e84 100644
--- a/Documentation/Examples/Matrices/TridiagonalMatrix/TridiagonalMatrixExample_forAllElements.cpp
+++ b/Documentation/Examples/Matrices/TridiagonalMatrix/TridiagonalMatrixExample_forAllElements.cpp
@@ -20,7 +20,7 @@ void forAllElementsExample()
       5,      // number of matrix rows
       5 );    // number of matrix columns
 
-   auto f = [=] __cuda_callable__ ( int rowIdx, int localIdx, int columnIdx, double& value, bool& compute ) {
+   auto f = [=] __cuda_callable__ ( int rowIdx, int localIdx, int columnIdx, double& value ) {
       /***
        * 'forElements' method iterates only over matrix elements lying on given subdiagonals
        * and so we do not need to check anything. The element value can be expressed
diff --git a/Documentation/Examples/Matrices/TridiagonalMatrix/TridiagonalMatrixExample_forElements.cpp b/Documentation/Examples/Matrices/TridiagonalMatrix/TridiagonalMatrixExample_forElements.cpp
index 243e9468eb5fc6ea48fa521cd0a651f2f77798e4..b15a9f5818d16ad963a5ff8ab95d98a0f7da7f64 100644
--- a/Documentation/Examples/Matrices/TridiagonalMatrix/TridiagonalMatrixExample_forElements.cpp
+++ b/Documentation/Examples/Matrices/TridiagonalMatrix/TridiagonalMatrixExample_forElements.cpp
@@ -20,7 +20,7 @@ void forElementsExample()
       5,      // number of matrix rows
       5 );    // number of matrix columns
 
-   auto f = [=] __cuda_callable__ ( int rowIdx, int localIdx, int columnIdx, double& value, bool& compute ) {
+   auto f = [=] __cuda_callable__ ( int rowIdx, int localIdx, int columnIdx, double& value ) {
       /***
        * 'forElements' method iterates only over matrix elements lying on given subdiagonals
        * and so we do not need to check anything. The element value can be expressed
diff --git a/Documentation/Examples/Matrices/TridiagonalMatrix/TridiagonalMatrixViewExample_forAllElements.cpp b/Documentation/Examples/Matrices/TridiagonalMatrix/TridiagonalMatrixViewExample_forAllElements.cpp
index 0ef4304623f9d5e1a6330c04c46ef30332f9fd89..8d90c989ef08e7beceb3bc70842f93b88eaafa91 100644
--- a/Documentation/Examples/Matrices/TridiagonalMatrix/TridiagonalMatrixViewExample_forAllElements.cpp
+++ b/Documentation/Examples/Matrices/TridiagonalMatrix/TridiagonalMatrixViewExample_forAllElements.cpp
@@ -21,7 +21,7 @@ void forAllElementsExample()
       5 );    // number of matrix columns
    auto view = matrix.getView();
 
-   auto f = [=] __cuda_callable__ ( int rowIdx, int localIdx, int columnIdx, double& value, bool& compute ) {
+   auto f = [=] __cuda_callable__ ( int rowIdx, int localIdx, int columnIdx, double& value ) {
       /***
        * 'forElements' method iterates only over matrix elements lying on given subdiagonals
        * and so we do not need to check anything. The element value can be expressed
diff --git a/Documentation/Examples/Matrices/TridiagonalMatrix/TridiagonalMatrixViewExample_forElements.cpp b/Documentation/Examples/Matrices/TridiagonalMatrix/TridiagonalMatrixViewExample_forElements.cpp
index 3045bc655ef1271d5d04d0c807bd979f9a996fcb..b077c008c70507f991abd29935980c54d08c072b 100644
--- a/Documentation/Examples/Matrices/TridiagonalMatrix/TridiagonalMatrixViewExample_forElements.cpp
+++ b/Documentation/Examples/Matrices/TridiagonalMatrix/TridiagonalMatrixViewExample_forElements.cpp
@@ -21,7 +21,7 @@ void forElementsExample()
       5 );    // number of matrix columns
    auto view = matrix.getView();
 
-   auto f = [=] __cuda_callable__ ( int rowIdx, int localIdx, int columnIdx, double& value, bool& compute ) {
+   auto f = [=] __cuda_callable__ ( int rowIdx, int localIdx, int columnIdx, double& value ) {
       /***
        * 'forElements' method iterates only over matrix elements lying on given subdiagonals
        * and so we do not need to check anything. The element value can be expressed
diff --git a/Documentation/Examples/Pointers/CMakeLists.txt b/Documentation/Examples/Pointers/CMakeLists.txt
index ef7a5f6150c600bcc016ccad02c099052afd1ec4..2b08ac329958d7d1f7b95ccaf876fce45def420f 100644
--- a/Documentation/Examples/Pointers/CMakeLists.txt
+++ b/Documentation/Examples/Pointers/CMakeLists.txt
@@ -1,15 +1,26 @@
-IF( BUILD_CUDA )
-   CUDA_ADD_EXECUTABLE(UniquePointerExampleCuda UniquePointerExample.cu)
-   ADD_CUSTOM_COMMAND( COMMAND UniquePointerExampleCuda > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/UniquePointerExample.out OUTPUT UniquePointerExample.out )
-   CUDA_ADD_EXECUTABLE(SharedPointerExampleCuda SharedPointerExample.cu)
-   ADD_CUSTOM_COMMAND( COMMAND SharedPointerExampleCuda > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/SharedPointerExample.out OUTPUT SharedPointerExample.out )
-   CUDA_ADD_EXECUTABLE(DevicePointerExampleCuda DevicePointerExample.cu)
-   ADD_CUSTOM_COMMAND( COMMAND DevicePointerExampleCuda > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/DevicePointerExample.out OUTPUT DevicePointerExample.out )
+set( COMMON_EXAMPLES   # built from <name>.cu when BUILD_CUDA is set, from <name>.cpp otherwise
+   UniquePointerExample
+   SharedPointerExample
+   DevicePointerExample
+)
 
-ADD_CUSTOM_TARGET( RunPointersExamples ALL DEPENDS
-   UniquePointerExample.out
-   SharedPointerExample.out
-   DevicePointerExample.out
- )
+if( BUILD_CUDA )   # CUDA build: compile the .cu variant of every example
+   foreach( target IN ITEMS ${COMMON_EXAMPLES} )
+      cuda_add_executable( ${target}-cuda ${target}.cu OPTIONS )
+      add_custom_command( OUTPUT ${target}.out COMMAND ${target}-cuda > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/${target}.out )
+      list( APPEND CUDA_OUTPUTS ${target}.out )   # collected for the aggregate target
+   endforeach()
+else()             # host build: compile the .cpp variant of every example
+   foreach( target IN ITEMS ${COMMON_EXAMPLES} )
+      add_executable( ${target} ${target}.cpp )
+      add_custom_command( OUTPUT ${target}.out COMMAND ${target} > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/${target}.out )
+      list( APPEND HOST_OUTPUTS ${target}.out )   # collected for the aggregate target
+   endforeach()
+endif()
 
+IF( BUILD_CUDA )   # the aggregate target (part of ALL) is named after the active configuration
+   ADD_CUSTOM_TARGET( RunPointersExamples-cuda ALL DEPENDS ${CUDA_OUTPUTS} )
+ELSE()
+   ADD_CUSTOM_TARGET( RunPointersExamples ALL DEPENDS ${HOST_OUTPUTS} )
 ENDIF()
+
diff --git a/Documentation/Tutorials/Matrices/DenseMatrixSetup_Benchmark.cpp b/Documentation/Tutorials/Matrices/DenseMatrixSetup_Benchmark.cpp
index 7696e9d0d7e99e750b6f6d68e63d013c2938655e..64979b0d4162c9df5a69fbd527e5d667d4e9afb6 100644
--- a/Documentation/Tutorials/Matrices/DenseMatrixSetup_Benchmark.cpp
+++ b/Documentation/Tutorials/Matrices/DenseMatrixSetup_Benchmark.cpp
@@ -62,7 +62,7 @@ void forElements( const int matrixSize, Matrix& matrix )
 {
    matrix.setDimensions( matrixSize, matrixSize );
 
-   auto f = [=] __cuda_callable__ ( int rowIdx, int localIdx, int& columnIdx, float& value, bool& compute ) mutable {
+   auto f = [=] __cuda_callable__ ( int rowIdx, int localIdx, int& columnIdx, float& value ) mutable {
       value = rowIdx + columnIdx;
    };
    matrix.forElements( 0, matrixSize, f );
diff --git a/Documentation/Tutorials/Matrices/MultidiagonalMatrixSetup_Benchmark.cpp b/Documentation/Tutorials/Matrices/MultidiagonalMatrixSetup_Benchmark.cpp
index d323105cd08972c2ad4d3aba6bde982e38374948..75186957eaa6b469fcddf598645029d42626a3a0 100644
--- a/Documentation/Tutorials/Matrices/MultidiagonalMatrixSetup_Benchmark.cpp
+++ b/Documentation/Tutorials/Matrices/MultidiagonalMatrixSetup_Benchmark.cpp
@@ -143,7 +143,7 @@ void forElements( const int gridSize, Matrix& matrix )
    const int matrixSize = gridSize * gridSize;
    matrix.setDimensions( matrixSize, matrixSize, getOffsets< typename Matrix::DeviceType >( gridSize ) );
 
-   auto f = [=] __cuda_callable__ ( int rowIdx, int localIdx, int columnIdx, float& value, bool& compute ) mutable {
+   auto f = [=] __cuda_callable__ ( int rowIdx, int localIdx, int columnIdx, float& value ) mutable {
       const int i = rowIdx % gridSize;
       const int j = rowIdx / gridSize;
       if( ( i == 0 || j == 0 || i == gridSize - 1 || j == gridSize - 1 ) && localIdx == 0 )
diff --git a/Documentation/Tutorials/Matrices/SparseMatrixSetup_Benchmark.cpp b/Documentation/Tutorials/Matrices/SparseMatrixSetup_Benchmark.cpp
index 7af7de1e1031545ededa7e1ca29d333deaa9cc94..31a2a039c09abd3c8466c16e8dbcc6a7e9cd15bd 100644
--- a/Documentation/Tutorials/Matrices/SparseMatrixSetup_Benchmark.cpp
+++ b/Documentation/Tutorials/Matrices/SparseMatrixSetup_Benchmark.cpp
@@ -168,7 +168,7 @@ void forElements( const int gridSize, Matrix& matrix )
    matrix.setDimensions( matrixSize, matrixSize );
    matrix.setRowCapacities( rowCapacities );
 
-   auto f = [=] __cuda_callable__ ( int rowIdx, int localIdx, int& columnIdx, float& value, bool& compute ) mutable {
+   auto f = [=] __cuda_callable__ ( int rowIdx, int localIdx, int& columnIdx, float& value ) mutable {
       const int i = rowIdx % gridSize;
       const int j = rowIdx / gridSize;
       if( ( i == 0 || j == 0 || i == gridSize - 1 || j == gridSize - 1 ) && localIdx == 0 )
diff --git a/Documentation/Tutorials/Matrices/tutorial_Matrices.md b/Documentation/Tutorials/Matrices/tutorial_Matrices.md
index 5c60bece9e45b774ae332a531b1f5170ebd33d8b..efcf7a4bfe9dcbabc90983e7c064877ad1c1ef46 100644
--- a/Documentation/Tutorials/Matrices/tutorial_Matrices.md
+++ b/Documentation/Tutorials/Matrices/tutorial_Matrices.md
@@ -432,6 +432,16 @@ The result looks as follows:
 
 \include DenseMatrixExample_forElements.out
 
+#### Wrapping existing data to dense matrix view
+
+If you have already allocated the data for a dense matrix (for example in some other library), you can wrap it in a dense matrix view with the function \ref TNL::Matrices::wrapDenseMatrix . See the following example:
+
+\includelineno DenseMatrixViewExample_wrap.cpp
+
+Here we create a dense matrix with three rows and four columns. We use a TNL vector (\ref TNL::Containers::Vector) only to allocate the matrix elements (lines 12-15), and we immediately get a pointer to the allocated array (line 16). Next we use just the array to get a dense matrix view with the proper matrix dimensions (line 21). Note that we must explicitly state the device type as a template parameter of the function `wrapDenseMatrix` (\ref TNL::Matrices::wrapDenseMatrix). Finally, we print the matrix to see if it is correct (line 22). The result looks as follows:
+
+\include DenseMatrixViewExample_wrap.out
+
 ### Sparse matrices
 
 [Sparse matrices](https://en.wikipedia.org/wiki/Sparse_matrix) are extremely important in a lot of numerical algorithms. They are used at situations when we need to operate with matrices having majority of the matrix elements equal to zero. In this case, only the non-zero matrix elements are stored with possibly some *padding zeros* used for memory alignment. This is necessary mainly on GPUs. See the [Overview of matrix types](#overview_of_matrix_types) for the differences in memory requirements.
@@ -647,6 +657,30 @@ would not make sense. If we pass through this test, the matrix element lies in t
 
 \include SparseMatrixExample_forElements.out
 
+#### Wrapping existing data to sparse matrix view
+
+Standard sparse matrix formats like [CSR](https://en.wikipedia.org/wiki/Sparse_matrix#Compressed_sparse_row_(CSR,_CRS_or_Yale_format)) and [Ellpack](https://people.math.sc.edu/Burkardt/data/sparse_ellpack/sparse_ellpack.html) store the matrix elements in specifically defined arrays. If you have already allocated them (for example in some other library), they can be wrapped into a sparse matrix view with the given matrix format. This can be done by means of the functions \ref TNL::Matrices::wrapCSRMatrix and \ref TNL::Matrices::wrapEllpackMatrix . See the following example for demonstration of the CSR format:
+
+\includelineno SparseMatrixViewExample_wrapCSR.cpp
+
+We create sparse matrix having four rows and four columns (line 19). We use TNL vector (\ref TNL::Containers::Vector) to allocate arrays necessary for the CSR format:
+
+1. `valuesVector` (line 20) - contains values of the nonzero matrix elements.
+2. `columnIndexesVector` (line 21) - contains column indexes of the nonzero matrix elements.
+3. `rowPointersVector` (line 22) - contains positions of the first nonzero matrix elements in each row within `valuesVector` and `columnIndexesVector`. The size of this array equals number of matrix rows plus one.
+
+Next we turn the vectors into C style pointers (lines 24-26) to wrap them into sparse matrix view (line 31). Note that we must explicitly state the device on which the arrays are allocated. Finally, we print the matrix to check the correctness (line 33). The result looks as follows:
+
+\include SparseMatrixViewExample_wrapCSR.out
+
+Wrapping data corresponding with the Ellpack format is very similar as we can see in the following example:
+
+\includelineno SparseMatrixViewExample_wrapEllpack.cpp
+
+We encode the same sparse matrix as in the previous example. The essence of the Ellpack format is that we allocate the same number of matrix elements for each row which is two in our example. For some matrix rows we use the padding zeros for which we set the column index to -1 (line 21). Therefore the size of `valuesVector` and `columnIndexesVector` equals number of matrix rows times number of matrix elements allocated in each row. As before, we turn the vectors into C style pointers (lines 23-24) and wrap them into sparse matrix view with Ellpack format (line 29). Note that we must state the device on which the arrays are allocated explicitly and also the matrix elements organization, which is \ref TNL::Algorithms::Segments::RowMajorOrder in this case. For Ellpack matrix stored on GPU, \ref TNL::Algorithms::Segments::ColumnMajorOrder is preferred. The result looks as follows:
+
+\include SparseMatrixViewExample_wrapEllpack.out
+
 #### Symmetric sparse matrices
 
 For sparse [symmetric matrices](https://en.wikipedia.org/wiki/Symmetric_matrix), TNL offers a format storing only a half of the matrix elements. More precisely, ony the matrix diagonal and the elements bellow are stored in the memory. The matrix elements above the diagonal are deduced from those bellow. If such a symmetric format is used on GPU, atomic operations must be used in some matrix operations. For this reason, symmetric matrices can be combined only with matrix elements values expressed in `float` or `double` type. An advantage of the symmetric formats is lower memory consumption. Since less data need to be transferred from the memory, better performance might be observed. In some cases, however, the use of atomic operations on GPU may cause performance drop. Mostly we can see approximately the same performance compared to general formats but we can profit from lower memory requirements which is appreciated especially on GPU. The following example shows how to create symmetric sparse matrix.
@@ -833,7 +867,7 @@ The output of the example looks as:
 
 \include TridiagonalMatrixExample_Constructor_init_list_1.out
 
-#### Methods `setElement` and `addElement`
+#### Methods setElement and addElement
 
 Similar way of the tridiagonal matrix setup is offered by the method `setElements` (\ref TNL::Matrices::TridiagonalMatrix::setElements) as the following example demonstrates:
 
@@ -851,7 +885,7 @@ The result looks as follows:
 
 \include TridiagonalMatrixExample_setElement.out
 
-#### Method `getRow`
+#### Method getRow
 
  A bit different way of setting up the matrix, is the use of tridiagonal matrix view and the method `getRow` (\ref TNL::Matrices::TridiagonalMatrixView::getRow) as the following example demonstrates:
 
@@ -863,7 +897,7 @@ The result looks as follows:
 
 \include TridiagonalMatrixViewExample_getRow.out
 
-### Method `forRows`
+#### Method forRows
 
 As in the case of other matrix types, the method `forRows` (\ref TNL::Matrices::TridiagonalMatrix::forRows) calls the method `getRow` (\ref TNL::Matrices::TridiagonalMatrix::getRow) in parallel. It is demonstrated by the following example which we may directly compare with the previous one:
 
@@ -881,7 +915,7 @@ The result looks as follows:
 
 \include TridiagonalMatrixExample_forRows.out
 
-#### Method `forElements`
+#### Method forElements
 
 Finally, even a bit more simple way of matrix elements manipulation with the method `forElements` (\ref TNL::Matrices::TridiagonalMatrix::forElements) is demonstrated in the following example:
 
@@ -1043,7 +1077,7 @@ On the lines 25-46, we call the constructor which, in addition to matrix dimensi
 
 \include MultidiagonalMatrixExample_Constructor_init_list_2.out
 
-#### Methods `setElement` and `addElement`
+#### Methods setElement and addElement
 
 Another and more efficient way of setting the matrix elements is by means of the method `setElement` (\ref TNL::Matrices::MultidiagonalMatrix::setElement). It is demonstrated in the following example:
 
@@ -1053,7 +1087,7 @@ This examples shows that the method `setElement` can be used both on the host (C
 
 \include MultidiagonalMatrixViewExample_setElement.out
 
-#### Method `getRow`
+#### Method getRow
 
 Slightly more efficient way of the multidiagonal matrix setup is offered by the method `getRow` (\ref TNL::Matrices::MultidiagonalMatrix::getRow). We will use it to create a matrix of the following form:
 
@@ -1137,7 +1171,7 @@ We use `ParallelFor2D` (\ref TNL::Algorithms::ParallelFor2D) to iterate over all
 
 \include MultidiagonalMatrixExample_Constructor.out
 
-### Method `forRows`
+#### Method forRows
 
 As in the case of other matrix types, the method `forRows` (\ref TNL::Matrices::MultidiagonalMatrix::forRows) calls the method `getRow` (\ref TNL::Matrices::MultidiagonalMatrix::getRow) in parallel. It is demonstrated by the following example:
 
@@ -1151,7 +1185,7 @@ The result looks as follows:
 
 \include MultidiagonalMatrixExample_forRows.out
 
-#### Method `forElements`
+#### Method forElements
 
 Similar and even a bit simpler way of setting the matrix elements is offered by the method `forElements` (\ref TNL::Matrices::MultidiagonalMatrix::forElements, \ref TNL::Matrices::MultidiagonalMatrixView::forElements) as demonstrated in the following example:
 
@@ -1220,7 +1254,7 @@ The result looks as follows:
 
 \include LambdaMatrixExample_Constructor.out
 
-#### Method `forRows`
+#### Method forRows
 
 Method `forRows` (\ref TNL::Matrices::LambdaMatrix::forRows, \ref TNL::Matrices::LambdaMatrix::forAllRows) iterates in parallel over all matrix rows. In the case of lambda matrices, it cannot be used for changing the matrix elements since they cannot be changed. In the following example, we show how to use this method to copy the matrix elements values to the dense matrix:
 
@@ -1238,7 +1272,7 @@ The result looks as follows:
 
 \include LambdaMatrixExample_forRows.out
 
-#### Method `forElements`
+#### Method forElements
 
 The lambda matrix has the same interface as other matrix types except of the method `getRow`. The following example demonstrates the use of the method `forElements` (\ref TNL::Matrices::LambdaMatrix::forElements) to copy the lambda matrix into the dense matrix:
 
diff --git a/Documentation/Tutorials/Pointers/CMakeLists.txt b/Documentation/Tutorials/Pointers/CMakeLists.txt
index 0535e8fd5df0c242c4df984a483ec6a34dd32e46..9b83841fbf8a928e0ede88273e58bbac8722ce45 100644
--- a/Documentation/Tutorials/Pointers/CMakeLists.txt
+++ b/Documentation/Tutorials/Pointers/CMakeLists.txt
@@ -1,13 +1,13 @@
 IF( BUILD_CUDA )
-   CUDA_ADD_EXECUTABLE( UniquePointerExample UniquePointerExample.cu )
-   ADD_CUSTOM_COMMAND( COMMAND UniquePointerExample > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/UniquePointerExample.out OUTPUT UniquePointerExample.out )
+   CUDA_ADD_EXECUTABLE( UniquePointerExample_ UniquePointerExample.cu )  # NOTE(review): target gets a trailing underscore while the source and the .out snippet keep the plain name -- presumably to avoid a target-name clash; confirm
+   ADD_CUSTOM_COMMAND( COMMAND UniquePointerExample_ > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/UniquePointerExample.out OUTPUT UniquePointerExample.out )
    CUDA_ADD_EXECUTABLE( SharedPointerExample SharedPointerExample.cu )
    ADD_CUSTOM_COMMAND( COMMAND SharedPointerExample > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/SharedPointerExample.out OUTPUT SharedPointerExample.out )
    CUDA_ADD_EXECUTABLE( DevicePointerExample DevicePointerExample.cu )
    ADD_CUSTOM_COMMAND( COMMAND DevicePointerExample > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/DevicePointerExample.out OUTPUT DevicePointerExample.out )
 ELSE()
+   ADD_EXECUTABLE( UniquePointerExample_ UniquePointerExample.cpp )  # NOTE(review): same renamed target as in the CUDA branch; the generated snippet is still UniquePointerExample.out
+   ADD_CUSTOM_COMMAND( COMMAND UniquePointerExample_ > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/UniquePointerExample.out OUTPUT UniquePointerExample.out )
-   ADD_EXECUTABLE( UniquePointerExample UniquePointerExample.cpp )
-   ADD_CUSTOM_COMMAND( COMMAND UniquePointerExample > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/UniquePointerExample.out OUTPUT UniquePointerExample.out )
 ENDIF()
 
 ADD_EXECUTABLE( UniquePointerHostExample UniquePointerHostExample.cpp )
diff --git a/Documentation/Tutorials/ReductionAndScan/tutorial_ReductionAndScan.md b/Documentation/Tutorials/ReductionAndScan/tutorial_ReductionAndScan.md
index 59ab08d3f315de8f92211e4593a917c0ee6b4b1b..0c55abd2ef0f8650ecedfac2394c6d15cef42d61 100644
--- a/Documentation/Tutorials/ReductionAndScan/tutorial_ReductionAndScan.md
+++ b/Documentation/Tutorials/ReductionAndScan/tutorial_ReductionAndScan.md
@@ -216,7 +216,7 @@ and exclusive prefix sum of the same sequence is
 [0,1,4,9,16,25,36]
 ```
 
-Both kinds of [scan](https://en.wikipedia.org/wiki/Prefix_sum) are usually applied only on summation, however product or logical operations could be handy as well. In TNL, scan is implemented in similar way as reduction and uses the same functors as the reduction operation. The following example shows how it works:
+Both kinds of [scan](https://en.wikipedia.org/wiki/Prefix_sum) have many different [applications](https://www.cs.cmu.edu/~guyb/papers/Ble93.pdf) but they are usually applied only on summation, however product or logical operations could be handy as well. In TNL, prefix sum is implemented in similar way as reduction and so it can be easily modified by lambda functions. The following example shows how it works:
 
 ```
 inplaceInclusiveScan( array, 0, array.getSize(), TNL::Plus{} );
diff --git a/Documentation/Tutorials/Segments/CMakeLists.txt b/Documentation/Tutorials/Segments/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..04990870884a5538b0111e0186346e5ebe55ec53
--- /dev/null
+++ b/Documentation/Tutorials/Segments/CMakeLists.txt
@@ -0,0 +1,35 @@
+set( COMMON_EXAMPLES
+   # Base names of the tutorial examples; the list is currently empty, so the COMMON_EXAMPLES loops below do nothing.
+)
+
+# For each example: build an executable and register a rule that redirects its stdout into a documentation snippet.
+if( BUILD_CUDA )
+   foreach( target IN ITEMS ${COMMON_EXAMPLES} )
+      cuda_add_executable( ${target}-cuda ${target}.cu OPTIONS )
+      add_custom_command( COMMAND ${target}-cuda > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/${target}.out OUTPUT ${target}.out )
+      set( CUDA_OUTPUTS ${CUDA_OUTPUTS} ${target}.out )  # append the snippet to the list consumed by the custom target below
+   endforeach()
+   foreach( target IN ITEMS ${LONG_EXAMPLES} )  # NOTE(review): LONG_EXAMPLES is never set in this file -- confirm it is meant to come from an enclosing scope
+      cuda_add_executable( ${target}-cuda ${target}.cu OPTIONS )
+      #add_custom_command( COMMAND ${target}-cuda > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/${target}.out OUTPUT ${target}.out )
+      #set( CUDA_OUTPUTS ${CUDA_OUTPUTS} ${target}.out )
+   endforeach()
+else()
+   foreach( target IN ITEMS ${COMMON_EXAMPLES} )
+      add_executable( ${target} ${target}.cpp )
+      add_custom_command( COMMAND ${target} > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/${target}.out OUTPUT ${target}.out )
+      set( HOST_OUTPUTS ${HOST_OUTPUTS} ${target}.out )  # append the snippet to the list consumed by the custom target below
+   endforeach()
+   foreach( target IN ITEMS ${LONG_EXAMPLES} )  # long examples are only built; the rule that would run them is commented out
+      add_executable( ${target} ${target}.cpp )
+      #add_custom_command( COMMAND ${target} > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/${target}.out OUTPUT ${target}.out )
+      #set( HOST_OUTPUTS ${HOST_OUTPUTS} ${target}.out )
+   endforeach()
+endif()
+
+# Aggregate target, part of ALL, that depends on the .out snippets collected above.
+IF( BUILD_CUDA )
+   ADD_CUSTOM_TARGET( RunTutorialsSegmentsExamples-cuda ALL DEPENDS ${CUDA_OUTPUTS} )
+ELSE()
+   ADD_CUSTOM_TARGET( RunTutorialsSegmentsExamples ALL DEPENDS ${HOST_OUTPUTS} )
+ENDIF()
\ No newline at end of file
diff --git a/Documentation/Tutorials/Segments/tutorial_Segments.md b/Documentation/Tutorials/Segments/tutorial_Segments.md
new file mode 100644
index 0000000000000000000000000000000000000000..619f576824bdef4f7d644e95e4bf77694d81c786
--- /dev/null
+++ b/Documentation/Tutorials/Segments/tutorial_Segments.md
@@ -0,0 +1,212 @@
+\page tutorial_Segments  Segments tutorial
+
+[TOC]
+
+
+## Introduction
+
+*Segments* represent a data structure for manipulation with several local arrays (denoted also as segments) having different size in general. All the local arrays are supposed to be allocated in one contiguous global array. The data structure segments offers mapping between indexes of particular local arrays and indexes of the global array. Segments do not store any data; they just represent a layer for efficient access and operations with a group of segments of linear containers (i.e. local arrays) with different size in general. One can perform parallel operations like *for* or *flexible reduction* on particular segments (local arrays).
+
+A typical example of *segments* are different formats for sparse matrices. Sparse matrix like the following
+ \f[
+  \left(
+  \begin{array}{ccccc}
+   1  &  0  &  2  &  0  &  0 \\
+    0  &  0  &  5  &  0  &  0 \\
+    3  &  4  &  7  &  9  &  0 \\
+    0  &  0  &  0  &  0  & 12 \\
+   0  &  0  & 15  & 17  & 20
+  \end{array}
+  \right)
+ \f]
+ is usually first compressed which means that the zero elements are omitted to get the following "matrix":
+
+ \f[
+ \begin{array}{ccccc}
+    1  &   2  \\
+    5   \\
+    3  &   4  &  7 &  9   \\
+    12 \\
+    15 & 17  & 20
+ \end{array}
+ \f]
+ We have to store column index of each matrix elements as well in a "matrix" like this:
+ \f[
+ \begin{array}{ccccc}
+    0  &   2  \\
+    2   \\
+    0  &   1  &  2 &  3   \\
+    4 \\
+    2 & 3  & 4
+ \end{array}
+ \f]
+
+ Such "matrices" can be stored in memory in a row-wise manner in one contiguous array because of the performance reasons. The first "matrix" (i.e. values of the matrix elements)  would be stored as follows
+
+ \f[
+    \begin{array}{|cc|c|cccc|c|ccc|} 1 & 2 &  5 & 3 & 4 & 7 & 9 & 12 & 15 & 17 & 20 \end{array}
+ \f]
+
+and the second one (i.e. column indexes of the matrix values) as follows
+
+\f[
+    \begin{array}{|cc|c|cccc|c|ccc|} 0 & 2 & 2 & 0 & 1 & 2 & 3 & 4 & 2 & 3 & 4 \end{array}
+ \f]
+
+What we see above is so called [CSR sparse matrix format](https://en.wikipedia.org/wiki/Sparse_matrix#Compressed_sparse_row_(CSR,_CRS_or_Yale_format)). It is the most popular format for storage of sparse matrices designed for high performance. However, it may not be the most efficient format for storage of sparse matrices on GPUs. Therefore many other formats have been developed to get better performance. These formats often have different layout of the matrix elements in the memory. They have to deal especially with two difficulties:
+
+1. Efficient storage of matrix elements in the memory to fulfill the requirements of coalesced memory accesses on GPUs or good spatial locality for efficient use of caches on CPUs.
+2. Efficient mapping of GPU threads to different matrix rows.
+
+TNL offers the following sparse matrix formats in a form of segments (Ellpack formats often use so called *padding elements* like padding zeros in terms of sparse matrices):
+
+1. [CSR format](https://en.wikipedia.org/wiki/Sparse_matrix#Compressed_sparse_row_(CSR,_CRS_or_Yale_format)) (\ref TNL::Algorithms::Segments::CSR) is the most popular format for sparse matrices. It is simple and very efficient especially on CPUs and today there are efficient kernels even for GPUs. The following GPU kernels are implemented in TNL:
+   1. [Scalar](http://mgarland.org/files/papers/nvr-2008-004.pdf) which maps one GPU thread for each segment (matrix row).
+   2. [Vector](http://mgarland.org/files/papers/nvr-2008-004.pdf) which maps one warp of GPU threads for each segment (matrix row).
+   3. [Adaptive](https://ieeexplore.ieee.org/document/7397620) ...
+2. [Ellpack format](http://mgarland.org/files/papers/nvr-2008-004.pdf) (\ref TNL::Algorithms::Segments::Ellpack) uses padding elements to have the same number of elements in each segment. It can be highly inefficient in cases when one works with a few very long segments.
+3. [SlicedEllpack format](https://link.springer.com/chapter/10.1007/978-3-642-11515-8_10) (\ref TNL::Algorithms::Segments::SlicedEllpack) which was also presented as [Row-grouped CSR format](https://arxiv.org/abs/1012.2270) is similar to common Ellpack. However, SlicedEllpack first merges the segments into groups of 32. It also uses padding elements but only segments within the same group are aligned to have the same size. Therefore there is not such a high performance drop because of few long segments.
+4. [ChunkedEllpack format](http://geraldine.fjfi.cvut.cz/~oberhuber/data/vyzkum/publikace/12-heller-oberhuber-improved-rgcsr-format.pdf) (\ref TNL::Algorithms::Segments::ChunkedEllpack) is similar to SlicedEllpack but it splits segments into chunks which allows to map more GPU threads to one segment.
+5. [BiEllpack format](https://www.sciencedirect.com/science/article/pii/S0743731514000458?casa_token=2phrEj0Ef1gAAAAA:Lgf6rMBUN6T7TJne6mAgI_CSUJ-jR8jz7Eghdv6L0SJeGm4jfso-x6Wh8zgERk3Si7nFtTAJngg) (\ref TNL::Algorithms::Segments::BiEllpack) is similar to ChunkedEllpack. In addition it sorts segments within the same slice w.r.t. their length to achieve higher performance and better memory accesses.
+
+Especially in case of GPUs, the performance of each format strongly depends on distribution of the segment sizes. Therefore we cannot say that one of the previous formats would outperform the others in general. To get the best performance, one should try more of the formats and choose the best one. It is the reason why TNL offers more of them and additional formats will be added.
+
+Necessity of working with this kind of data structures is not limited only to sparse matrices. We could name at least few other applications for segments:
+
+1. [Graphs](https://en.wikipedia.org/wiki/Graph_(discrete_mathematics)) - one segment represents one graph node, the elements in one segments are indexes of its neighbors.
+2. [Unstructured numerical meshes](https://en.wikipedia.org/wiki/Types_of_mesh) - unstructured numerical mesh is a graph in fact.
+3. [Particle in cell method](https://en.wikipedia.org/wiki/Particle-in-cell) - one segment represents one cell, the elements in one segment are indexes of the particles.
+4. [K-means clustering](https://en.wikipedia.org/wiki/K-means_clustering) - one segment represents one cluster, the elements represent vectors belonging to the given cluster.
+5. [Hashing](https://arxiv.org/abs/1907.02900) - segments are particular rows of the hash table, elements in segments correspond to colliding hashed elements.
+
+In general, segments can be used for problems that somehow correspond to a 2D data structure where each row can have a different size and we need to perform miscellaneous operations within the rows. The name *segments* comes from segmented parallel reduction or [segmented scan (prefix-sum)](https://en.wikipedia.org/wiki/Segmented_scan).
+
+## Segments setup
+
+Segments are defined just by sizes of particular segments. The following example shows how to create them:
+
+\includelineno Algorithms/Segments/SegmentsPrintingExample-1.cpp
+
+We use constructor with initializer list (line 16) where each element of the list defines size of one segment. Next we print sizes of particular segments (line 17). We call this function for different segments types (excluding \ref TNL::Algorithms::Segments::SlicedEllpack since it would behave the same way as \ref TNL::Algorithms::Segments::Ellpack on this small example). The result looks as follows:
+
+\include SegmentsPrintingExample-1.out
+
+We can see, that real sizes of the segments are different for all Ellpack-based formats. As we said already, these formats often use padding elements to get more efficient access to the memory. For example \ref TNL::Algorithms::Segments::ChunkedEllpack format involves multiple of elements. It is, however, only because of very small example we present now, on large examples the overhead is not so significant.
+
+We remind that segments represent a sparse format rather than a data structure because they do not store any data. The following example shows how to connect segments with an array:
+
+\includelineno Algorithms/Segments/SegmentsPrintingExample-2.cpp
+
+On the line 19, we show how to create segments with vector (\ref TNL::Containers::Vector) carrying the segments sizes. Of course, the same constructor works even for arrays and views (i.e. \ref TNL::Containers::Array, \ref TNL::Containers::ArrayView and \ref TNL::Containers::VectorView). Next we print the real segment sizes depending on the format in the background (line 20) the same way as we did in the previous example. On the line 25, we allocate array having the size requested by the `segments` by means of method `getStorageSize` (\ref TNL::Algorithms::Segments::CSR::getStorageSize for example). This method says how many elements the segments need to be able to address all elements by their global index. On the lines 26-28, we mark each element of the array by its rank in the array. On the line 35, we use function \ref TNL::Algorithms::Segments::printSegments which accepts lambda function `fetch` as one of its parameters. The lambda function reads data from our array `data` (with the help of array view `data_view`) according to given global index `globalIdx` (line 34). The result looks as follows:
+
+\include SegmentsPrintingExample-2.out
+
+Frankly, what we see is not so important. It only shows that different segments formats can use very different mapping of elements identified by its *segment index* and *local index* (rank of the element in given segment) to a *global index* which serves as an address in the related container.
+
+## Iteration over elements of segments
+
+In this section, we show how to iterate over the elements of segments and how to manipulate with them. There are three possible ways:
+
+1. Method `forElements` (\ref TNL::Algorithms::Segments::CSR::forElements for example), which iterates in parallel over all elements of segments and performs given lambda function on each of them.
+2. Method `forSegments` (\ref TNL::Algorithms::Segments::CSR::forSegments for example), which iterates in parallel over all segments. It is a better choice when we need to process each segment sequentially or we have significant amount of computations common for all elements in each segment.
+3. Method `sequentialForSegments` (\ref TNL::Algorithms::Segments::CSR::sequentialForSegments for example), which iterates over all segments sequentially i.e. using only one thread even on GPUs. It is useful for debugging or for printing for example.
+
+Methods iterating over particular segments use a segment view (\ref TNL::Algorithms::Segments::SegmentView) to access the elements of given segment. The segment view offers iterator for better convenience.
+
+### Method forElements
+
+The following example shows use of the method `forElements`:
+
+\includelineno Algorithms/Segments/SegmentsExample_forElements.cpp
+
+On the line 7, we first create segments with linearly increasing size (so it is like lower triangular matrix). Next, we allocate array `data` (line 21) having the same size as the number of elements managed by the segments. It can be obtained by the method `getStorageSize` (\ref TNL::Algorithms::Segments::CSR::getStorageSize for example). We prepare array view `data_view` for the purpose of use in lambda functions (line 26). Finally, we call the method `forAllElements` (lines 27-29) which iterates in parallel over all elements in the segments and for each element it calls given lambda function. The lambda function receives three arguments - `segmentIdx` is an index of the segment the element belongs to, `localIdx` is the rank of the element within the segment and `globalIdx` is an index of the element in the array `data`. We use the global index to set proper element of the array `data` to the index of the segment. On the line 35, we print the array `data`. We can see elements belonging to particular segments by their indexes. The layout of the elements depends on the type of segments (which means sparse format in use). Next we print the elements of array `data` by segments (lines 36 and 37). The function `printSegments` iterates over all elements and it reads the elements of the array `data` with the help of the lambda function defined on the line 36.
+
+Note, that for the Ellpack format, the output looks as follows:
+
+```
+Seg. 0: [ 0, 0, 0, 0, 0 ]
+Seg. 1: [ 1, 1, 1, 1, 1 ]
+Seg. 2: [ 2, 2, 2, 2, 2 ]
+Seg. 3: [ 3, 3, 3, 3, 3 ]
+Seg. 4: [ 4, 4, 4, 4, 4 ]
+```
+
+We see more elements than we have requested. The reason is that the Ellpack format uses padding elements for optimizing access to the memory. Segments give access even to the padding elements, they can be used in case when we get to situation of need of additional elements. Therefore we need to check for relevant and padding elements each time we work with elements of segments. It is demonstrated on the lines 43-46 where we set the array `data` again but we check for the padding elements (line 44). After printing the segments the same way as before (line 53) we get correct result:
+
+```
+Seg. 0: [ 0, 0, 0, 0, 0 ]
+Seg. 1: [ 1, 1, 0, 0, 0 ]
+Seg. 2: [ 2, 2, 2, 0, 0 ]
+Seg. 3: [ 3, 3, 3, 3, 0 ]
+Seg. 4: [ 4, 4, 4, 4, 4 ]
+```
+
+The result of the whole example looks as follows:
+
+\include SegmentsExample_forElements.out
+
+### Method forSegments
+
+Method `forSegments` iterates in parallel over particular segments. Iteration over elements within the segment is sequential. There are two reasons for such proceeding:
+
+1. The iteration over the elements within the same segments must be sequential, i.e. the computation with one element depends on a result of the computation with the previous one.
+2. Some part of computations on all elements in one segment is common. In this case, we can first perform the common part and then iterate over the elements. If we would use the method `forElements`, the common part would have to be performed for each element.
+
+#### Sequential dependency
+
+The first situation is demonstrated in the following example:
+
+\includelineno Algorithms/Segments/SegmentsExample_forSegments-1.cpp
+
+Let us go through the code:
+
+The code is the same as in the previous example up to line 26. Instead of calling the method `forElements` we call the method `forSegments` (line 28) for which we need to define type  `SegmentViewType` (\ref TNL::Algorithms::Segments::CSR::SegmentViewType for example). The lambda function on the line 28 gets the segment view and it iterates over all elements of the segment by means of a for loop. We use auxiliary variable `sum` to compute cumulative sum of elements in each segment which is just the sequential dependency. The result looks as follows:
+
+\include SegmentsExample_forSegments-1.out
+
+#### Common computations
+
+Now let's take a look at the second situation, i.e. there are common computations for all elements of one segment. In the following example, we first set values of each element using the method `forElements` which we are already familiar with (lines 26-29). Next we print values of all elements (lines 34-36) and then we use the method `forAllSegments` (lines 41-52) to divide each element by a sum of values of all elements in a segment. So we first sum up all elements in the segment (lines 43-47). This is the common part of the computation for all elements in the segment. Next we perform the division of all elements by the value of the variable `sum` (lines 48-51).
+
+\includelineno Algorithms/Segments/SegmentsExample_forSegments-2.cpp
+
+The result looks as follows:
+
+\include SegmentsExample_forSegments-2.out
+
+## Flexible reduction within segments
+
+In this section we will explain the extension of [flexible reduction](tutorial_ReductionAndScan.html) to segments. It allows to reduce all elements within the same segment and store the result into an array. See the following example:
+
+\includelineno Algorithms/Segments/SegmentsExample_reduceSegments.cpp
+
+We first create the segments `segments` (line 18), related array `data` (line 23) and setup the elements (lines 28-32). After printing the segments (lines 37-39) we are ready for the parallel reduction. It requires three lambda functions:
+
+1. `fetch` which reads data belonging to particular elements of the segments. The fetch function can have two different forms - *brief* and *full*:
+   * *Brief form* - in this case the lambda function gets only global index and the `compute` flag:
+```
+      auto fetch = [=] __cuda_callable__ ( int globalIdx, bool& compute ) -> double { ... };
+```
+   * *Full form* - in this case the lambda function receives even the segment index and element index:
+```
+      auto fetch = [=] __cuda_callable__ ( int segmentIdx, int localIdx, int globalIdx, bool& compute ) -> double { ... };
+```
+   where `segmentIdx` is the index of the segment, `localIdx` is the rank of the element within the segment, `globalIdx` is index of the element in the related array and `compute` serves for the reduction interruption which means that the remaining elements in the segment can be omitted. Many formats used for segments are optimized for much higher performance if the brief variant is used. The form of the `fetch` lambda function is detected automatically using [SFINAE](https://en.cppreference.com/w/cpp/language/sfinae) and so the use of both is very easy for the user.
+2. `reduce` is a function representing the reduction operation, in our case it is defined as follows:
+```
+auto reduce = [=] __cuda_callable__ ( const double& a, const double& b ) -> double { return a + b; }
+```
+   or, in fact, we can use the function `std::plus`.
+3. `keep` is a lambda function responsible for storage of the results. It is supposed to be defined as:
+```
+auto keep = [=] __cuda_callable__ ( int segmentIdx, const double& value ) mutable { ... };
+```
+where `segmentIdx` is an index of the segment of which the reduction result we aim to store and `value` is the result of the reduction in the segment.
+
+We first create vector `sums` where we will store the results (line 44) and prepare a view to this vector for later use in the lambda functions. We demonstrate use of both variants - full by `fetch_full` (lines 46-54) and brief by `fetch_brief` (lines 55-57). The lambda function `keep` for storing the sums from particular segments into the vector `sums` is on the lines 59-60. Finally, we call the method `reduceAllSegments` (\ref TNL::Algorithms::Segments::CSR::reduceSegments for example) to compute the reductions in the segments - first with  `fetch_full` (line 61) and then with `fetch_brief` (line 63). In both cases, we use `std::plus` for the reduction and we pass zero (the last argument) as an idempotent element for summation. In both cases we print the results which are supposed to be the same. The result looks as follows:
+
+\include SegmentsExample_reduceSegments.out
+
+
+
+
diff --git a/Documentation/Tutorials/index.md b/Documentation/Tutorials/index.md
index 739d609acc543148ddb30aa21ee231a4527e02ee..031de3faee9cd64c347950dec75983fd71a27a36 100644
--- a/Documentation/Tutorials/index.md
+++ b/Documentation/Tutorials/index.md
@@ -10,4 +10,5 @@
 6. [Sorting](tutorial_Sorting.html)
 7. [Cross-device pointers](tutorial_Pointers.html)
 8. [Matrices](tutorial_Matrices.html)
-9. [Unstructured meshes](tutorial_Meshes.html)
+9. [Segments aka sparse formats](tutorial_Segments.html)
+10. [Unstructured meshes](tutorial_Meshes.html)
diff --git a/src/Benchmarks/BLAS/array-operations.h b/src/Benchmarks/BLAS/array-operations.h
index a8c606d386c32573599ad67144006bf56f19d9bb..38a58c4312b14bcc3231d3c097ff4ae6bd171e6b 100644
--- a/src/Benchmarks/BLAS/array-operations.h
+++ b/src/Benchmarks/BLAS/array-operations.h
@@ -26,7 +26,7 @@ template< typename Real = double,
           template<typename> class HostAllocator = Allocators::Default< Devices::Host >::Allocator,
           template<typename> class CudaAllocator = Allocators::Default< Devices::Cuda >::Allocator >
 void
-benchmarkArrayOperations( Benchmark & benchmark,
+benchmarkArrayOperations( Benchmark<> & benchmark,
                           const long & size )
 {
    using HostArray = Containers::Array< Real, Devices::Host, Index, HostAllocator< Real > >;
diff --git a/src/Benchmarks/BLAS/cublasWrappers.h b/src/Benchmarks/BLAS/cublasWrappers.h
index 1e63e139d6faa513706ed1b18a207e87ea1a079d..f0d8952e633627d503cc310aa24ee51ef48dcc34 100644
--- a/src/Benchmarks/BLAS/cublasWrappers.h
+++ b/src/Benchmarks/BLAS/cublasWrappers.h
@@ -118,4 +118,29 @@ cublasGscal( cublasHandle_t handle, int n,
    return cublasDscal( handle, n, alpha, x, incx );
 }
 
+
+// Single-precision overload of the type-generic gemv wrapper; forwards to cublasSgemv.
+inline cublasStatus_t
+cublasGemv( cublasHandle_t handle, cublasOperation_t trans, int m, int n,
+            const float* alpha,
+            const float* A, int lda,
+            const float* x, int incx,
+            const float* beta,
+            float* y, int incy )
+{
+   return cublasSgemv( handle, trans, m, n, alpha, A, lda, x, incx, beta, y, incy );
+}
+
+// Double-precision overload of the type-generic gemv wrapper; forwards to cublasDgemv.
+inline cublasStatus_t
+cublasGemv( cublasHandle_t handle, cublasOperation_t trans, int m, int n,
+            const double* alpha,
+            const double* A, int lda,
+            const double* x, int incx,
+            const double* beta,
+            double* y, int incy )
+{
+   return cublasDgemv( handle, trans, m, n, alpha, A, lda, x, incx, beta, y, incy );
+}
+
 #endif
diff --git a/src/Benchmarks/BLAS/dense-mv.h b/src/Benchmarks/BLAS/dense-mv.h
new file mode 100644
index 0000000000000000000000000000000000000000..1204257cce7f2b6fa354a045245129282992c1b3
--- /dev/null
+++ b/src/Benchmarks/BLAS/dense-mv.h
@@ -0,0 +1,146 @@
+/***************************************************************************
+                          dense-mv.h  -  description
+                             -------------------
+    begin                : Jul 8, 2021
+    copyright            : (C) 2021 by Tomas Oberhuber et al.
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+// Implemented by: Jakub Klinkovsky
+
+#pragma once
+
+#include "../Benchmarks.h"
+#include "cublasWrappers.h"
+
+#include <TNL/Containers/Vector.h>
+#include <TNL/Pointers/DevicePointer.h>
+#include <TNL/Matrices/DenseMatrix.h>
+#include <TNL/Devices/Cuda.h>
+#include <TNL/Devices/Host.h>
+
+namespace TNL {
+namespace Benchmarks {
+
+template< typename Matrix >
+void setMatrix( Matrix& matrix )
+{
+   // Overwrite every element visited by forAllElements with 1; the index arguments are not needed.
+   using Value = typename Matrix::RealType;
+   using Idx = typename Matrix::IndexType;
+   matrix.forAllElements( [] __cuda_callable__ ( Idx, Idx, Idx, Value& entry ) { entry = 1.0; } );
+}
+
+// Benchmarks dense matrix-vector multiplication with a size x size matrix of ones: the host product
+// and, in CUDA builds, the products of both CUDA matrix types and the cuBLAS gemv on the GPU.
+template< typename Real >
+void
+benchmarkDenseMVSynthetic( Benchmark<> & benchmark,
+                           const int & size )
+{
+   using HostMatrix = TNL::Matrices::DenseMatrix< Real, TNL::Devices::Host >;
+   using RowMajorCudaMatrix = TNL::Matrices::DenseMatrix< Real, TNL::Devices::Cuda, int, TNL::Algorithms::Segments::RowMajorOrder >;
+   using ColumnMajorCudaMatrix = TNL::Matrices::DenseMatrix< Real, TNL::Devices::Cuda >;
+   using HostVector = Containers::Vector< Real, Devices::Host, int >;
+   using CudaVector = Containers::Vector< Real, Devices::Cuda, int >;
+
+   HostMatrix hostMatrix;
+   RowMajorCudaMatrix rowMajorCudaMatrix;
+   ColumnMajorCudaMatrix columnMajorCudaMatrix;
+   HostVector inHostVector, outHostVector;
+   CudaVector inCudaVector, outCudaVector1, outCudaVector2;
+
+   // create benchmark group with one subcolumn per timed performer
+   const std::vector< String > parsedType = parseObjectType( getType< HostMatrix >() );
+#ifdef HAVE_CUDA
+   benchmark.createHorizontalGroup( parsedType[ 0 ], 4 );  // CPU, GPU col, GPU row, GPU cublas
+#else
+   benchmark.createHorizontalGroup( parsedType[ 0 ], 1 );
+#endif
+
+   hostMatrix.setDimensions( size, size );
+   inHostVector.setSize( size );
+   outHostVector.setSize( size );
+
+   setMatrix< HostMatrix >( hostMatrix );
+   // convert to double before multiplying so that size * size cannot overflow int
+   const double datasetSize = (double) size * size * ( 2 * sizeof( Real ) + sizeof( int ) ) / oneGB;
+
+   // reset function: restores the input vectors before each measurement
+   auto reset = [&]() {
+      inHostVector = 1.0;
+      outHostVector = 0.0;
+#ifdef HAVE_CUDA
+      inCudaVector = 1.0;
+#endif
+   };
+
+   // compute functions
+   auto mvHost = [&]() {
+      hostMatrix.vectorProduct( inHostVector, outHostVector );
+   };
+   benchmark.setOperation( datasetSize );
+   benchmark.time< Devices::Host >( reset, "CPU", mvHost );
+
+#ifdef HAVE_CUDA
+   columnMajorCudaMatrix.setDimensions( size, size );
+   inCudaVector.setSize( size );
+   outCudaVector1.setSize( size );
+   outCudaVector2.setSize( size );
+   setMatrix< ColumnMajorCudaMatrix >( columnMajorCudaMatrix );
+
+   auto columnMajorMvCuda = [&]() {
+      columnMajorCudaMatrix.vectorProduct( inCudaVector, outCudaVector1 );
+   };
+   benchmark.time< Devices::Cuda >( reset, "GPU col", columnMajorMvCuda );
+
+   columnMajorCudaMatrix.reset();
+
+   rowMajorCudaMatrix.setDimensions( size, size );
+   setMatrix< RowMajorCudaMatrix >( rowMajorCudaMatrix );
+
+   auto rowMajorMvCuda = [&]() {
+      rowMajorCudaMatrix.vectorProduct( inCudaVector, outCudaVector2 );
+   };
+   benchmark.time< Devices::Cuda >( reset, "GPU row", rowMajorMvCuda );
+
+   auto diff = TNL::max( abs( outCudaVector2 - outCudaVector1 ) );  // not reported; presumably a debugging aid
+
+   rowMajorCudaMatrix.reset();
+   columnMajorCudaMatrix.setDimensions( size, size );
+   setMatrix< ColumnMajorCudaMatrix >( columnMajorCudaMatrix );
+
+   cublasHandle_t cublasHandle;
+   cublasCreate( &cublasHandle );
+   auto mvCublas = [&] () {
+      Real alpha = 1.0;
+      Real beta = 0.0;
+      cublasGemv( cublasHandle, CUBLAS_OP_N, size, size, &alpha,
+                  columnMajorCudaMatrix.getValues().getData(), size,
+                  inCudaVector.getData(), 1, &beta,
+                  outCudaVector1.getData(), 1 );
+   };
+   benchmark.time< Devices::Cuda >( reset, "GPU cublas", mvCublas );
+   // release the cuBLAS context created above so repeated calls do not leak handles
+   cublasDestroy( cublasHandle );
+#endif
+}
+
+/*template< typename Real = double,
+          typename Index = int >
+void
+benchmarkDenseSynthetic( Benchmark<> & benchmark,
+                         const int & size )
+{
+   // TODO: benchmark all formats from tnl-benchmark-spmv (different parameters of the base formats)
+   // NOTE: CSR is disabled because it is very slow on GPU
+   //benchmarkSpMV< Real, SparseMatrixLegacy_CSR_Scalar >( benchmark, size, elementsPerRow );
+   benchmarkSpMV< Real, Benchmarks::SpMV::ReferenceFormats::Legacy::Ellpack >( benchmark, size, elementsPerRow );
+   benchmarkSpMV< Real, SlicedEllpack >( benchmark, size, elementsPerRow );
+   benchmarkSpMV< Real, Benchmarks::SpMV::ReferenceFormats::Legacy::ChunkedEllpack >( benchmark, size, elementsPerRow );
+}*/
+
+} // namespace Benchmarks
+} // namespace TNL
diff --git a/src/Benchmarks/BLAS/spmv.h b/src/Benchmarks/BLAS/spmv.h
index 587794f356c210e4469e4c1b6ca190c4a3c50b8a..6cd669dc085f2bbfc74e75f8c1f93fb5fb6682a6 100644
--- a/src/Benchmarks/BLAS/spmv.h
+++ b/src/Benchmarks/BLAS/spmv.h
@@ -97,7 +97,7 @@ void setCudaTestMatrix( Matrix& matrix,
 template< typename Real,
           template< typename, typename, typename > class Matrix >
 void
-benchmarkSpMV( Benchmark & benchmark,
+benchmarkSpMV( Benchmark<> & benchmark,
                const int & size,
                const int elementsPerRow = 5 )
 {
@@ -173,7 +173,7 @@ benchmarkSpMV( Benchmark & benchmark,
 template< typename Real = double,
           typename Index = int >
 void
-benchmarkSpmvSynthetic( Benchmark & benchmark,
+benchmarkSpmvSynthetic( Benchmark<> & benchmark,
                         const int & size,
                         const int & elementsPerRow )
 {
diff --git a/src/Benchmarks/BLAS/tnl-benchmark-blas.h b/src/Benchmarks/BLAS/tnl-benchmark-blas.h
index 3e05da6304d48d581434716deb2e7929f2c83f79..9b061adf65a752116bc75090d86af8509e4e764a 100644
--- a/src/Benchmarks/BLAS/tnl-benchmark-blas.h
+++ b/src/Benchmarks/BLAS/tnl-benchmark-blas.h
@@ -22,6 +22,8 @@
 #include "vector-operations.h"
 #include "triad.h"
 #include "spmv.h"
+#include "dense-mv.h"
+
 
 using namespace TNL;
 using namespace TNL::Benchmarks;
@@ -29,8 +31,8 @@ using namespace TNL::Benchmarks;
 
 template< typename Real >
 void
-runBlasBenchmarks( Benchmark & benchmark,
-                   Benchmark::MetadataMap metadata,
+runBlasBenchmarks( Benchmark<> & benchmark,
+                   Benchmark<>::MetadataMap metadata,
                    const std::size_t & minSize,
                    const std::size_t & maxSize,
                    const double & sizeStepFactor,
@@ -43,7 +45,7 @@ runBlasBenchmarks( Benchmark & benchmark,
    benchmark.newBenchmark( String("Array operations (") + precision + ", host allocator = Host)",
                            metadata );
    for( std::size_t size = minSize; size <= maxSize; size *= 2 ) {
-      benchmark.setMetadataColumns( Benchmark::MetadataColumns({
+      benchmark.setMetadataColumns( Benchmark<>::MetadataColumns({
          { "size", convertToString( size ) },
       } ));
       benchmarkArrayOperations< Real >( benchmark, size );
@@ -52,7 +54,7 @@ runBlasBenchmarks( Benchmark & benchmark,
    benchmark.newBenchmark( String("Array operations (") + precision + ", host allocator = CudaHost)",
                            metadata );
    for( std::size_t size = minSize; size <= maxSize; size *= 2 ) {
-      benchmark.setMetadataColumns( Benchmark::MetadataColumns({
+      benchmark.setMetadataColumns( Benchmark<>::MetadataColumns({
          { "size", convertToString( size ) },
       } ));
       benchmarkArrayOperations< Real, int, Allocators::CudaHost >( benchmark, size );
@@ -60,7 +62,7 @@ runBlasBenchmarks( Benchmark & benchmark,
    benchmark.newBenchmark( String("Array operations (") + precision + ", host allocator = CudaManaged)",
                            metadata );
    for( std::size_t size = minSize; size <= maxSize; size *= 2 ) {
-      benchmark.setMetadataColumns( Benchmark::MetadataColumns({
+      benchmark.setMetadataColumns( Benchmark<>::MetadataColumns({
          { "size", convertToString( size ) },
       } ));
       benchmarkArrayOperations< Real, int, Allocators::CudaManaged >( benchmark, size );
@@ -71,7 +73,7 @@ runBlasBenchmarks( Benchmark & benchmark,
    benchmark.newBenchmark( String("Vector operations (") + precision + ")",
                            metadata );
    for( std::size_t size = minSize; size <= maxSize; size *= sizeStepFactor ) {
-      benchmark.setMetadataColumns( Benchmark::MetadataColumns({
+      benchmark.setMetadataColumns( Benchmark<>::MetadataColumns({
          { "size", convertToString( size ) },
       } ));
       benchmarkVectorOperations< Real >( benchmark, size );
@@ -82,7 +84,7 @@ runBlasBenchmarks( Benchmark & benchmark,
    benchmark.newBenchmark( String("Triad benchmark (") + precision + ")",
                            metadata );
    for( std::size_t size = minSize; size <= maxSize; size *= 2 ) {
-      benchmark.setMetadataColumns( Benchmark::MetadataColumns({
+      benchmark.setMetadataColumns( Benchmark<>::MetadataColumns({
          { "size", convertToString( size ) },
       } ));
       benchmarkTriad< Real >( benchmark, size );
@@ -93,13 +95,25 @@ runBlasBenchmarks( Benchmark & benchmark,
    benchmark.newBenchmark( String("Sparse matrix-vector multiplication (") + precision + ")",
                            metadata );
    for( std::size_t size = minSize; size <= maxSize; size *= 2 ) {
-      benchmark.setMetadataColumns( Benchmark::MetadataColumns({
+      benchmark.setMetadataColumns( Benchmark<>::MetadataColumns({
          { "rows", convertToString( size ) },
          { "columns", convertToString( size ) },
          { "elements per row", convertToString( elementsPerRow ) },
       } ));
       benchmarkSpmvSynthetic< Real >( benchmark, size, elementsPerRow );
    }
+
+   // Dense matrix-vector multiplication
+   benchmark.newBenchmark( String("Dense matrix-vector multiplication (") + precision + ")",
+                           metadata );
+   for( std::size_t size = 10; size <= 20000; size *= 2 ) {
+      benchmark.setMetadataColumns( Benchmark<>::MetadataColumns({
+         { "rows", convertToString( size ) },
+         { "columns", convertToString( size ) }
+      } ));
+      benchmarkDenseMVSynthetic< Real >( benchmark, size );
+   }
+
 }
 
 void
@@ -168,10 +182,10 @@ main( int argc, char* argv[] )
    std::ofstream logFile( logFileName.getString(), mode );
 
    // init benchmark and common metadata
-   Benchmark benchmark( loops, verbose );
+   Benchmark<> benchmark( loops, verbose );
 
    // prepare global metadata
-   Benchmark::MetadataMap metadata = getHardwareMetadata();
+   Benchmark<>::MetadataMap metadata = getHardwareMetadata< Logging >();
 
    if( precision == "all" || precision == "float" )
       runBlasBenchmarks< float >( benchmark, metadata, minSize, maxSize, sizeStepFactor, elementsPerRow );
diff --git a/src/Benchmarks/BLAS/triad.h b/src/Benchmarks/BLAS/triad.h
index 3ac747fba5f386654a9558646868b8fb13671690..d2bdf12cf684268c4652171db2e53a518dbb7a11 100644
--- a/src/Benchmarks/BLAS/triad.h
+++ b/src/Benchmarks/BLAS/triad.h
@@ -24,7 +24,7 @@ namespace Benchmarks {
 template< typename Real = double,
           typename Index = int >
 void
-benchmarkTriad( Benchmark & benchmark,
+benchmarkTriad( Benchmark<> & benchmark,
                 const long & size )
 {
    using HostAllocator = Allocators::Host< Real >;
diff --git a/src/Benchmarks/BLAS/vector-operations.h b/src/Benchmarks/BLAS/vector-operations.h
index 3391f23fa766a1c5627610022c39d38b17cd30ae..c2a3ceab321b879eec052d8df24f7091cf778d05 100644
--- a/src/Benchmarks/BLAS/vector-operations.h
+++ b/src/Benchmarks/BLAS/vector-operations.h
@@ -36,7 +36,7 @@ namespace Benchmarks {
 template< typename Real = double,
           typename Index = int >
 void
-benchmarkVectorOperations( Benchmark & benchmark,
+benchmarkVectorOperations( Benchmark<> & benchmark,
                            const long & size )
 {
    using HostVector = Containers::Vector< Real, Devices::Host, Index >;
diff --git a/src/Benchmarks/Benchmark.hpp b/src/Benchmarks/Benchmark.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..e2357990a8285bdef417f738b317fde4e3942735
--- /dev/null
+++ b/src/Benchmarks/Benchmark.hpp
@@ -0,0 +1,312 @@
+/***************************************************************************
+                          Benchmark.hpp  -  description
+                             -------------------
+    begin                : Jun 7, 2021
+    copyright            : (C) 2021 by Tomas Oberhuber et al.
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+// Implemented by: Jakub Klinkovsky,
+//                 Tomas Oberhuber
+
+#pragma once
+
+#include "FunctionTimer.h"
+#include "Logging.h"
+
+#include <iostream>
+#include <exception>
+#include <limits>
+
+#include <TNL/String.h>
+
+#include <TNL/Devices/Host.h>
+#include <TNL/SystemInfo.h>
+#include <TNL/Cuda/DeviceInfo.h>
+#include <TNL/Config/ConfigDescription.h>
+#include <TNL/MPI/Wrappers.h>
+
+namespace TNL {
+namespace Benchmarks {
+
+
+// Forwards the logging options to the Logger base class and stores the loop count.
+template< typename Logger >
+Benchmark< Logger >::Benchmark( int loops,
+                                bool verbose,
+                                String outputMode,
+                                bool logFileAppend )
+: Logger( verbose, outputMode, logFileAppend ), loops( loops )
+{}
+
+// Registers the configuration entries ("loops", "reset", "min-time", "verbose") that setup() reads.
+template< typename Logger >
+void
+Benchmark< Logger >::configSetup( Config::ConfigDescription& config )
+{
+   config.addEntry< int >( "loops", "Number of iterations for every computation.", 10 );
+   config.addEntry< bool >( "reset", "Call reset function between loops.", true );
+   config.addEntry< double >( "min-time", "Minimal real time in seconds for every computation.", 0.0 );
+   config.addEntry< int >( "verbose", "Verbose mode, the higher number the more verbosity.", 1 );
+}
+
+// Reads the "loops", "reset", "min-time" and "verbose" parameters into the benchmark.
+template< typename Logger >
+void
+Benchmark< Logger >::setup( const Config::ParameterContainer& parameters )
+{
+   loops = parameters.getParameter< int >( "loops" );
+   reset = parameters.getParameter< bool >( "reset" );
+   minTime = parameters.getParameter< double >( "min-time" );
+   // the verbosity level is handed over to the Logger base class
+   Logger::setVerbose( parameters.getParameter< int >( "verbose" ) );
+}
+
+// Sets the number of loops handed to the function timer.
+template< typename Logger >
+void
+Benchmark< Logger >::setLoops( int loops )
+{
+   this->loops = loops;
+}
+
+// Sets the minimal time handed to the function timer.
+template< typename Logger >
+void
+Benchmark< Logger >::setMinTime( const double& minTime )
+{
+   this->minTime = minTime;
+}
+
+// Starts a new benchmark: closes the current table and writes the title.
+template< typename Logger >
+void
+Benchmark< Logger >::newBenchmark( const String & title )
+{
+   Logger::closeTable();
+   Logger::writeTitle( title );
+}
+
+// Starts a new benchmark and writes `metadata` extended by the loops, reset and min-time settings.
+template< typename Logger >
+void
+Benchmark< Logger >::newBenchmark( const String & title,
+                                   MetadataMap metadata )
+{
+   // close the current table and write the title, as the title-only overload does
+   newBenchmark( title );
+   // add loops and reset flag to metadata (the map is a by-value copy)
+   metadata["loops"] = convertToString( loops );
+   metadata["reset"] = convertToString( reset );
+   metadata["minimal test time"] = convertToString( minTime );
+   Logger::writeMetadata( metadata );
+}
+
+// Replaces the metadata columns and flags the header as changed when they differ.
+template< typename Logger >
+void
+Benchmark< Logger >::setMetadataColumns( const MetadataColumns & metadata )
+{
+   if( Logger::metadataColumns != metadata )
+      Logger::header_changed = true;
+   Logger::metadataColumns = metadata;
+}
+
+// Selects the current operation: sets the monitor stage and the leading "operation" metadata column.
+template< typename Logger >
+void
+Benchmark< Logger >::setOperation( const String & operation,
+                                   const double datasetSize,
+                                   const double baseTime )
+{
+   monitor.setStage( operation.getString() );
+   auto & columns = Logger::metadataColumns;
+   const bool hasOperationColumn = columns.size() > 0 && String( columns[ 0 ].first ) == "operation";
+   if( hasOperationColumn )
+      columns[ 0 ].second = operation;
+   else
+      columns.insert( columns.begin(), {"operation", operation} );
+   setOperation( datasetSize, baseTime );
+   Logger::header_changed = true;
+}
+
+// Stores the dataset size and base time used for the bandwidth and speedup of later timings.
+template< typename Logger >
+void
+Benchmark< Logger >::setOperation( const double datasetSize,
+                                   const double baseTime )
+{
+   this->datasetSize = datasetSize;
+   this->baseTime = baseTime;
+}
+
+// Appends a horizontal group `name` with `subcolumns` columns, or overwrites the last group.
+template< typename Logger >
+void
+Benchmark< Logger >::createHorizontalGroup( const String & name,
+                                            int subcolumns )
+{
+   auto & groups = Logger::horizontalGroups;
+   // A group is appended when there is none yet, or when the last one has
+   // a different name and a positive subcolumn count.
+   const bool append = groups.size() == 0 || ( groups.back().first != name && groups.back().second > 0 );
+   if( append ) {
+      groups.push_back( {name, subcolumns} );
+   }
+   else {
+      // otherwise the last group is overwritten in place
+      auto & last = groups.back();
+      last.first = name;
+      last.second = subcolumns;
+   }
+}
+
+// Times `compute`, passing `reset` to the timer when the reset flag is on, fills `result` and
+// writes the table header and row for `performer`. Returns the current base time.
+template< typename Logger >
+   template< typename Device, typename ResetFunction, typename ComputeFunction >
+double
+Benchmark< Logger >::time( ResetFunction reset,
+                           const String & performer,
+                           ComputeFunction & compute,
+                           BenchmarkResult< Logger > & result )
+{
+   const double notMeasured = std::numeric_limits<double>::quiet_NaN();
+   result.time = notMeasured;
+   result.stddev = notMeasured;
+   FunctionTimer< Device > functionTimer;
+   // one measurement, with or without the reset function depending on the reset flag
+   auto measure = [&]() {
+      if( this->reset )
+         std::tie( result.time, result.stddev ) = functionTimer.timeFunction( compute, reset, loops, minTime, Logger::verbose, monitor );
+      else
+         std::tie( result.time, result.stddev ) = functionTimer.timeFunction( compute, loops, minTime, Logger::verbose, monitor );
+   };
+   try {
+      if( Logger::verbose > 1 ) {
+         // run the monitor main loop for the duration of the measurement
+         Solvers::SolverMonitorThread monitor_thread( monitor );
+         measure();
+      }
+      else
+         measure();
+      this->performedLoops = functionTimer.getPerformedLoops();
+   }
+   catch ( const std::exception& e ) {
+      std::cerr << "timeFunction failed due to a C++ exception with description: " << e.what() << std::endl;
+   }
+
+   result.bandwidth = datasetSize / result.time;
+   result.speedup = this->baseTime / result.time;
+   if( this->baseTime == 0.0 )
+      this->baseTime = result.time;
+
+   Logger::writeTableHeader( performer, result.getTableHeader() );
+   Logger::writeTableRow( performer, result.getRowElements() );
+
+   return this->baseTime;
+}
+
+// Convenience overload of the timing with a reset function: the detailed
+// result is collected in a local object that is dropped on return.
+template< typename Logger >
+   template< typename Device, typename ResetFunction, typename ComputeFunction >
+inline double
+Benchmark< Logger >::time( ResetFunction reset,
+                           const String& performer,
+                           ComputeFunction& compute )
+{
+   // default-constructed result object
+   BenchmarkResult< Logger > scratch;
+   return time< Device, ResetFunction, ComputeFunction >( reset, performer, compute, scratch );
+}
+
+// Times `compute` without a reset function, fills `result` and writes the table header and row
+// for `performer`. Returns the current base time; a failed measurement leaves NaN in `result`.
+template< typename Logger >
+   template< typename Device, typename ComputeFunction >
+double
+Benchmark< Logger >::time( const String & performer,
+                           ComputeFunction & compute,
+                           BenchmarkResult< Logger > & result )
+{
+   result.time = std::numeric_limits<double>::quiet_NaN();
+   result.stddev = std::numeric_limits<double>::quiet_NaN();
+   FunctionTimer< Device > functionTimer;
+   try {
+      if( Logger::verbose > 1 ) {
+         // run the monitor main loop
+         Solvers::SolverMonitorThread monitor_thread( monitor );
+         std::tie( result.time, result.stddev ) = functionTimer.timeFunction( compute, loops, minTime, Logger::verbose, monitor );
+      }
+      else {
+         std::tie( result.time, result.stddev ) = functionTimer.timeFunction( compute, loops, minTime, Logger::verbose, monitor );
+      }
+      // keep getPerformedLoops() in sync, as the variant with a reset function does
+      this->performedLoops = functionTimer.getPerformedLoops();
+   }
+   catch ( const std::exception& e ) {
+      std::cerr << "Function timer failed due to a C++ exception with description: " << e.what() << std::endl;
+   }
+
+   result.bandwidth = datasetSize / result.time;
+   result.speedup = this->baseTime / result.time;
+   if( this->baseTime == 0.0 )
+      this->baseTime = result.time;
+
+   Logger::writeTableHeader( performer, result.getTableHeader() );
+   Logger::writeTableRow( performer, result.getRowElements() );
+   return this->baseTime;
+}
+
+// Convenience overload of the timing without a reset function: the detailed
+// result is collected in a local object that is dropped on return.
+template< typename Logger >
+   template< typename Device, typename ComputeFunction >
+inline double
+Benchmark< Logger >::time( const String & performer,
+                           ComputeFunction & compute )
+{
+   BenchmarkResult< Logger > scratch;
+   return time< Device, ComputeFunction >( performer, compute, scratch );
+}
+
+// Writes `msg` through the logger with a column span for `numberOfComputations` and echoes it to stderr.
+template< typename Logger >
+void
+Benchmark< Logger >::addErrorMessage( const char* msg,
+                                      int numberOfComputations )
+{
+   // each computation has 3 subcolumns; NOTE(review): the result header lists 5 columns - confirm
+   const int subcolumnsPerComputation = 3;
+   Logger::writeErrorMessage( msg, subcolumnsPerComputation * numberOfComputations );
+   std::cerr << msg << std::endl;
+}
+
+// Returns the solver monitor that is passed to the function timer.
+template< typename Logger >
+auto
+Benchmark< Logger >::getMonitor() -> SolverMonitorType&
+{
+   return this->monitor;
+}
+
+// Returns the loop count stored from the function timer by the last timing that recorded it.
+template< typename Logger >
+int
+Benchmark< Logger >::getPerformedLoops() const
+{
+   return this->performedLoops;
+}
+
+// Tells whether the reset function is passed to the timer in timings that take one.
+template< typename Logger >
+bool
+Benchmark< Logger >::isResetingOn() const
+{
+   return this->reset;
+}
+
+} // namespace Benchmarks
+} // namespace TNL
diff --git a/src/Benchmarks/Benchmarks.h b/src/Benchmarks/Benchmarks.h
index 770d38a3e323fd08dd22572695d9805d746b5572..77fa9e47c897617b6ddee3d5ab94e1510bd5e777 100644
--- a/src/Benchmarks/Benchmarks.h
+++ b/src/Benchmarks/Benchmarks.h
@@ -1,5 +1,5 @@
 /***************************************************************************
-                          benchmarks.h  -  description
+                          Benchmarks.h  -  description
                              -------------------
     begin                : Dec 30, 2015
     copyright            : (C) 2015 by Tomas Oberhuber et al.
@@ -34,10 +34,11 @@ namespace Benchmarks {
 const double oneGB = 1024.0 * 1024.0 * 1024.0;
 
 
+template< typename Logger = Logging >
 struct BenchmarkResult
 {
-   using HeaderElements = Logging::HeaderElements;
-   using RowElements = Logging::RowElements;
+   using HeaderElements = typename Logger::HeaderElements;
+   using RowElements = typename Logger::RowElements;
 
    double time = std::numeric_limits<double>::quiet_NaN();
    double stddev = std::numeric_limits<double>::quiet_NaN();
@@ -46,7 +47,12 @@ struct BenchmarkResult
 
    virtual HeaderElements getTableHeader() const
    {
-      return HeaderElements({ "time", "stddev", "stddev/time", "bandwidth", "speedup" });
+      return HeaderElements( {
+         std::pair< String, int >( "time", 8 ),
+         std::pair< String, int >( "stddev", 8 ),
+         std::pair< String, int >( "stddev/time", 8 ),
+         std::pair< String, int >( "bandwidth", 8 ),
+         std::pair< String, int >( "speedup", 8 ) } );
    }
 
    virtual RowElements getRowElements() const
@@ -61,280 +67,138 @@ struct BenchmarkResult
    }
 };
 
-
+template< typename Logger = Logging >
 class Benchmark
-: protected Logging
+: protected Logger
 {
-public:
-   using Logging::MetadataElement;
-   using Logging::MetadataMap;
-   using Logging::MetadataColumns;
-   using SolverMonitorType = Solvers::IterativeSolverMonitor< double, int >;
-
-   Benchmark( int loops = 10,
-              bool verbose = true )
-   : Logging(verbose), loops(loops)
-   {}
-
-   static void configSetup( Config::ConfigDescription& config )
-   {
-      config.addEntry< int >( "loops", "Number of iterations for every computation.", 10 );
-      config.addEntry< bool >( "reset", "Call reset function between loops.", true );
-      config.addEntry< double >( "min-time", "Minimal real time in seconds for every computation.", 0.0 );
-      config.addEntry< int >( "verbose", "Verbose mode, the higher number the more verbosity.", 1 );
-   }
-
-   void setup( const Config::ParameterContainer& parameters )
-   {
-      this->loops = parameters.getParameter< int >( "loops" );
-      this->reset = parameters.getParameter< bool >( "reset" );
-      this->minTime = parameters.getParameter< double >( "min-time" );
-      const int verbose = parameters.getParameter< int >( "verbose" );
-      Logging::setVerbose( verbose );
-   }
-   // TODO: ensure that this is not called in the middle of the benchmark
-   // (or just remove it completely?)
-   void
-   setLoops( int loops )
-   {
-      this->loops = loops;
-   }
-
-   void setMinTime( const double& minTime )
-   {
-      this->minTime = minTime;
-   }
-
-   // Marks the start of a new benchmark
-   void
-   newBenchmark( const String & title )
-   {
-      closeTable();
-      writeTitle( title );
-   }
-
-   // Marks the start of a new benchmark (with custom metadata)
-   void
-   newBenchmark( const String & title,
-                 MetadataMap metadata )
-   {
-      closeTable();
-      writeTitle( title );
-      // add loops and reset flag to metadata
-      metadata["loops"] = convertToString(loops);
-      metadata["reset"] = convertToString( reset );
-      metadata["minimal test time"] = convertToString( minTime );
-      writeMetadata( metadata );
-   }
-
-   // Sets metadata columns -- values used for all subsequent rows until
-   // the next call to this function.
-   void
-   setMetadataColumns( const MetadataColumns & metadata )
-   {
-      if( metadataColumns != metadata )
-         header_changed = true;
-      metadataColumns = metadata;
-   }
-
-   // TODO: maybe should be renamed to createVerticalGroup and ensured that vertical and horizontal groups are not used within the same "Benchmark"
-   // Sets current operation -- operations expand the table vertically
-   //  - baseTime should be reset to 0.0 for most operations, but sometimes
-   //    it is useful to override it
-   //  - Order of operations inside a "Benchmark" does not matter, rows can be
-   //    easily sorted while converting to HTML.)
-   void
-   setOperation( const String & operation,
-                 const double datasetSize = 0.0, // in GB
-                 const double baseTime = 0.0 )
-   {
-      monitor.setStage( operation.getString() );
-      if( metadataColumns.size() > 0 && String(metadataColumns[ 0 ].first) == "operation" ) {
-         metadataColumns[ 0 ].second = operation;
-      }
-      else {
-         metadataColumns.insert( metadataColumns.begin(), {"operation", operation} );
-      }
-      setOperation( datasetSize, baseTime );
-      header_changed = true;
-   }
-
-   void
-   setOperation( const double datasetSize = 0.0,
-                 const double baseTime = 0.0 )
-   {
-      this->datasetSize = datasetSize;
-      this->baseTime = baseTime;
-   }
-
-   // Creates new horizontal groups inside a benchmark -- increases the number
-   // of columns in the "Benchmark", implies column spanning.
-   // (Useful e.g. for SpMV formats, different configurations etc.)
-   void
-   createHorizontalGroup( const String & name,
-                          int subcolumns )
-   {
-      if( horizontalGroups.size() == 0 ) {
-         horizontalGroups.push_back( {name, subcolumns} );
-      }
-      else {
-         auto & last = horizontalGroups.back();
-         if( last.first != name && last.second > 0 ) {
-            horizontalGroups.push_back( {name, subcolumns} );
-         }
-         else {
-            last.first = name;
-            last.second = subcolumns;
-         }
-      }
-   }
-
-   // Times a single ComputeFunction. Subsequent calls implicitly split
-   // the current "horizontal group" into sub-columns identified by
-   // "performer", which are further split into "bandwidth", "time" and
-   // "speedup" columns.
-   // TODO: allow custom columns bound to lambda functions (e.g. for Gflops calculation)
-   // Also terminates the recursion of the following variadic template.
-   template< typename Device,
-             typename ResetFunction,
-             typename ComputeFunction >
-   double
-   time( ResetFunction reset,
-         const String & performer,
-         ComputeFunction & compute,
-         BenchmarkResult & result )
-   {
-      result.time = std::numeric_limits<double>::quiet_NaN();
-      result.stddev = std::numeric_limits<double>::quiet_NaN();
-      FunctionTimer< Device > functionTimer;
-      try {
-         if( verbose > 1 ) {
-            // run the monitor main loop
-            Solvers::SolverMonitorThread monitor_thread( monitor );
-            if( this->reset )
-               std::tie( result.time, result.stddev ) = functionTimer.timeFunction( compute, reset, loops, minTime, verbose, monitor );
-            else
-               std::tie( result.time, result.stddev ) = functionTimer.timeFunction( compute, loops, minTime, verbose, monitor );
-         }
-         else {
-            if( this->reset )
-               std::tie( result.time, result.stddev ) = functionTimer.timeFunction( compute, reset, loops, minTime, verbose, monitor );
-            else
-               std::tie( result.time, result.stddev ) = functionTimer.timeFunction( compute, loops, minTime, verbose, monitor );
-         }
-         this->performedLoops = functionTimer.getPerformedLoops();
-      }
-      catch ( const std::exception& e ) {
-         std::cerr << "timeFunction failed due to a C++ exception with description: " << e.what() << std::endl;
-      }
-
-      result.bandwidth = datasetSize / result.time;
-      result.speedup = this->baseTime / result.time;
-      if( this->baseTime == 0.0 )
-         this->baseTime = result.time;
-
-      writeTableHeader( performer, result.getTableHeader() );
-      writeTableRow( performer, result.getRowElements() );
-
-      return this->baseTime;
-   }
-
-   template< typename Device,
-             typename ResetFunction,
-             typename ComputeFunction >
-   inline double
-   time( ResetFunction reset,
-         const String & performer,
-         ComputeFunction & compute )
-   {
-      BenchmarkResult result;
-      return time< Device, ResetFunction, ComputeFunction >( reset, performer, compute, result );
-   }
-
-   /****
-    * The same methods as above but without reset function
-    */
-   template< typename Device,
-             typename ComputeFunction >
-   double
-   time( const String & performer,
-         ComputeFunction & compute,
-         BenchmarkResult & result )
-   {
-      result.time = std::numeric_limits<double>::quiet_NaN();
-      result.stddev = std::numeric_limits<double>::quiet_NaN();
-      FunctionTimer< Device > functionTimer;
-      try {
-         if( verbose > 1 ) {
-            // run the monitor main loop
-            Solvers::SolverMonitorThread monitor_thread( monitor );
-            std::tie( result.time, result.stddev ) = functionTimer.timeFunction( compute, loops, minTime, verbose, monitor );
-         }
-         else {
-            std::tie( result.time, result.stddev ) = functionTimer.timeFunction( compute, loops, minTime, verbose, monitor );
-         }
-      }
-      catch ( const std::exception& e ) {
-         std::cerr << "Function timer failed due to a C++ exception with description: " << e.what() << std::endl;
-      }
-
-      result.bandwidth = datasetSize / result.time;
-      result.speedup = this->baseTime / result.time;
-      if( this->baseTime == 0.0 )
-         this->baseTime = result.time;
-
-      writeTableHeader( performer, result.getTableHeader() );
-      writeTableRow( performer, result.getRowElements() );
-
-      return this->baseTime;
-   }
-
-   template< typename Device,
-             typename ComputeFunction >
-   inline double
-   time( const String & performer,
-         ComputeFunction & compute )
-   {
-      BenchmarkResult result;
-      return time< Device, ComputeFunction >( performer, compute, result );
-   }
-
-   // Adds an error message to the log. Should be called in places where the
-   // "time" method could not be called (e.g. due to failed allocation).
-   void
-   addErrorMessage( const char* msg,
-                    int numberOfComputations = 1 ) {
-      // each computation has 3 subcolumns
-      const int colspan = 3 * numberOfComputations;
-      writeErrorMessage( msg, colspan );
-      std::cerr << msg << std::endl;
-   }
-
-   using Logging::save;
-
-   SolverMonitorType& getMonitor() {
-      return monitor;
-   }
-
-   int getPerformedLoops() const {
-      return this->performedLoops;
-   }
-
-   bool isResetingOn() const {
-      return reset;
-   }
-
-protected:
-   int loops = 1, performedLoops = 0;
-   double minTime = 0.0;
-   double datasetSize = 0.0;
-   double baseTime = 0.0;
-   bool reset = true;
-   SolverMonitorType monitor;
+   public:
+      using typename Logger::MetadataElement;
+      using typename Logger::MetadataMap;
+      using typename Logger::MetadataColumns;
+      using SolverMonitorType = Solvers::IterativeSolverMonitor< double, int >;
+
+      using typename Logger::CommonLogs;
+      using Logger::addCommonLogs;
+      using Logger::addLogsMetadata;
+      using Logger::writeHeader;
+
+      Benchmark( int loops = 10,
+               bool verbose = true,
+               String outputMode = "",
+               bool logFileAppend = false );
+
+      static void configSetup( Config::ConfigDescription& config );
+
+      void setup( const Config::ParameterContainer& parameters );
+
+      // TODO: ensure that this is not called in the middle of the benchmark
+      // (or just remove it completely?)
+      void setLoops( int loops );
+
+      void setMinTime( const double& minTime );
+
+      // Marks the start of a new benchmark
+      void newBenchmark( const String & title );
+
+      // Marks the start of a new benchmark (with custom metadata)
+      void newBenchmark( const String & title,
+                        MetadataMap metadata );
+
+      // Sets metadata columns -- values used for all subsequent rows until
+      // the next call to this function.
+      void setMetadataColumns( const MetadataColumns & metadata );
+
+      // TODO: maybe should be renamed to createVerticalGroup and ensured that vertical and horizontal groups are not used within the same "Benchmark"
+      // Sets current operation -- operations expand the table vertically
+      //  - baseTime should be reset to 0.0 for most operations, but sometimes
+      //    it is useful to override it
+      //  - Order of operations inside a "Benchmark" does not matter, rows can be
+      //    easily sorted while converting to HTML.
+      void
+      setOperation( const String & operation,
+                  const double datasetSize = 0.0, // in GB
+                  const double baseTime = 0.0 );
+
+      void setOperation( const double datasetSize = 0.0,
+                        const double baseTime = 0.0 );
+
+      // Creates new horizontal groups inside a benchmark -- increases the number
+      // of columns in the "Benchmark", implies column spanning.
+      // (Useful e.g. for SpMV formats, different configurations etc.)
+      void
+      createHorizontalGroup( const String & name,
+                           int subcolumns );
+
+      // Times a single ComputeFunction. Subsequent calls implicitly split
+      // the current "horizontal group" into sub-columns identified by
+      // "performer", which are further split into "bandwidth", "time" and
+      // "speedup" columns.
+      // TODO: allow custom columns bound to lambda functions (e.g. for Gflops calculation)
+      // Also terminates the recursion of the following variadic template.
+      template< typename Device,
+               typename ResetFunction,
+               typename ComputeFunction >
+      double time( ResetFunction reset,
+                  const String & performer,
+                  ComputeFunction & compute,
+                  BenchmarkResult< Logger > & result );
+
+      template< typename Device,
+               typename ResetFunction,
+               typename ComputeFunction >
+      inline double time( ResetFunction reset,
+                        const String & performer,
+                        ComputeFunction & compute );
+      /*{
+         BenchmarkResult< Logger > result;
+         return time< Device, ResetFunction, ComputeFunction >( reset, performer, compute, result );
+      }*/
+
+      /****
+       * The same methods as above but without reset function
+       */
+      template< typename Device,
+               typename ComputeFunction >
+      double time( const String & performer,
+                  ComputeFunction & compute,
+                  BenchmarkResult< Logger > & result );
+
+      template< typename Device,
+               typename ComputeFunction >
+      inline double time( const String & performer,
+                        ComputeFunction & compute );
+
+      // Adds an error message to the log. Should be called in places where the
+      // "time" method could not be called (e.g. due to failed allocation).
+      void addErrorMessage( const char* msg,
+                           int numberOfComputations = 1 );
+
+      using Logger::save;
+
+      SolverMonitorType& getMonitor();
+
+      int getPerformedLoops() const;
+
+      bool isResetingOn() const;
+
+   protected:
+
+      int loops = 1, performedLoops = 0;
+
+      double minTime = 0.0;
+
+      double datasetSize = 0.0;
+
+      double baseTime = 0.0;
+
+      bool reset = true;
+
+      SolverMonitorType monitor;
 };
 
 
-inline Benchmark::MetadataMap getHardwareMetadata()
+template< typename Logger >
+inline typename Benchmark< Logger >::MetadataMap getHardwareMetadata()
 {
    const int cpu_id = 0;
    const CacheSizes cacheSizes = SystemInfo::getCPUCacheSizes( cpu_id );
@@ -356,7 +220,7 @@ inline Benchmark::MetadataMap getHardwareMetadata()
       nproc = TNL::MPI::GetSize();
 #endif
 
-   Benchmark::MetadataMap metadata {
+   typename Benchmark< Logger >::MetadataMap metadata {
        { "host name", SystemInfo::getHostname() },
        { "architecture", SystemInfo::getArchitecture() },
        { "system", SystemInfo::getSystemName() },
@@ -388,3 +252,5 @@ inline Benchmark::MetadataMap getHardwareMetadata()
 
 } // namespace Benchmarks
 } // namespace TNL
+
+#include <Benchmarks/Benchmark.hpp>
diff --git a/src/Benchmarks/DistSpMV/tnl-benchmark-distributed-spmv.h b/src/Benchmarks/DistSpMV/tnl-benchmark-distributed-spmv.h
index e178786965ac67518bae5cef6adaa004c981cdcd..b79d80ebf1e5bb9730357a690fe8824e7cc3864b 100644
--- a/src/Benchmarks/DistSpMV/tnl-benchmark-distributed-spmv.h
+++ b/src/Benchmarks/DistSpMV/tnl-benchmark-distributed-spmv.h
@@ -42,7 +42,7 @@ using namespace TNL::Benchmarks;
 
 template< typename Matrix, typename Vector >
 void
-benchmarkSpmv( Benchmark& benchmark,
+benchmarkSpmv( Benchmark<>& benchmark,
                const Matrix& matrix,
                const Vector& x,
                const char* performer = "CPU" )
@@ -65,7 +65,7 @@ benchmarkSpmv( Benchmark& benchmark,
 
 template< typename Matrix, typename Vector >
 void
-benchmarkSpmvCuda( Benchmark& benchmark,
+benchmarkSpmvCuda( Benchmark<>& benchmark,
                    const Matrix& matrix,
                    const Vector& x )
 {
@@ -91,7 +91,7 @@ benchmarkSpmvCuda( Benchmark& benchmark,
 
 template< typename Matrix, typename Vector >
 void
-benchmarkDistributedSpmv( Benchmark& benchmark,
+benchmarkDistributedSpmv( Benchmark<>& benchmark,
                           // TODO: cannot be const due to internal buffering
 //                          const Matrix& matrix,
                           Matrix& matrix,
@@ -117,7 +117,7 @@ benchmarkDistributedSpmv( Benchmark& benchmark,
 
 template< typename Matrix, typename Vector >
 void
-benchmarkDistributedSpmvCuda( Benchmark& benchmark,
+benchmarkDistributedSpmvCuda( Benchmark<>& benchmark,
                               const Matrix& matrix,
                               const Vector& x )
 {
@@ -156,8 +156,8 @@ struct SpmvBenchmark
    using DistributedRowLengths = typename DistributedMatrix::RowsCapacitiesType;
 
    static bool
-   run( Benchmark& benchmark,
-        Benchmark::MetadataMap metadata,
+   run( Benchmark<>& benchmark,
+        Benchmark<>::MetadataMap metadata,
         const Config::ParameterContainer& parameters )
    {
       MatrixType matrix;
@@ -172,7 +172,7 @@ struct SpmvBenchmark
       const String name = String( (TNL::MPI::GetSize() > 1) ? "DistSpMV" : "SpMV" )
                           + " (" + parameters.getParameter< String >( "name" ) + "): ";
       benchmark.newBenchmark( name, metadata );
-      benchmark.setMetadataColumns( Benchmark::MetadataColumns({
+      benchmark.setMetadataColumns( Benchmark<>::MetadataColumns({
          // TODO: strip the device
 //         { "matrix type", matrix.getType() },
          { "rows", convertToString( matrix.getRows() ) },
@@ -205,8 +205,8 @@ struct SpmvBenchmark
    }
 
    static void
-   runNonDistributed( Benchmark& benchmark,
-                      Benchmark::MetadataMap metadata,
+   runNonDistributed( Benchmark<>& benchmark,
+                      Benchmark<>::MetadataMap metadata,
                       const Config::ParameterContainer& parameters,
                       MatrixType& matrix,
                       VectorType& vector )
@@ -218,8 +218,8 @@ struct SpmvBenchmark
    }
 
    static void
-   runDistributed( Benchmark& benchmark,
-                   Benchmark::MetadataMap metadata,
+   runDistributed( Benchmark<>& benchmark,
+                   Benchmark<>::MetadataMap metadata,
                    const Config::ParameterContainer& parameters,
                    MatrixType& matrix,
                    VectorType& vector )
@@ -334,10 +334,10 @@ main( int argc, char* argv[] )
       logFile.open( logFileName.getString(), mode );
 
    // init benchmark and common metadata
-   Benchmark benchmark( loops, verbose );
+   Benchmark<> benchmark( loops, verbose );
 
    // prepare global metadata
-   Benchmark::MetadataMap metadata = getHardwareMetadata();
+   Benchmark<>::MetadataMap metadata = getHardwareMetadata< Logging >();
 
    // TODO: implement resolveMatrixType
 //   return ! Matrices::resolveMatrixType< MainConfig,
diff --git a/src/Benchmarks/JsonLogging.h b/src/Benchmarks/JsonLogging.h
new file mode 100644
index 0000000000000000000000000000000000000000..7d9817c654a36f937063b3f27a18bcf0dfed7cc7
--- /dev/null
+++ b/src/Benchmarks/JsonLogging.h
@@ -0,0 +1,267 @@
+/***************************************************************************
+                          JsonLogging.h  -  description
+                             -------------------
+    begin                : May 11, 2021
+    copyright            : (C) 2021 by Tomas Oberhuber et al.
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+// Implemented by: Jakub Klinkovsky,
+//                 Tomas Oberhuber
+
+#pragma once
+
+#include <map>
+#include <vector>
+#include <iostream>
+#include <iomanip>
+#include <string>
+#include <sstream>
+
+#include <TNL/String.h>
+
+namespace TNL {
+namespace Benchmarks {
+
+class JsonLoggingRowElements
+{
+   public:
+      // Values are formatted in fixed notation with 6 digits unless a manipulator changes it.
+      JsonLoggingRowElements()
+      {
+         stream << std::setprecision( 6 ) << std::fixed;
+      }
+
+      // Formats b with the current stream settings and appends the text as a new element.
+      template< typename T >
+      JsonLoggingRowElements& operator << ( const T& b )
+      {
+         stream << b;
+         elements.push_back( stream.str() );
+         stream.str( std::string() );
+         return *this;
+      }
+
+      // Applies a precision manipulator without adding an element. Taken by const
+      // reference so that a temporary such as std::setprecision( 2 ) selects this overload.
+      JsonLoggingRowElements& operator << ( const decltype( std::setprecision( 2 ) )& setprec )
+      {
+         stream << setprec;
+         return *this;
+      }
+
+      // Applies a float-format manipulator without adding an element.
+      JsonLoggingRowElements& operator << ( decltype( std::fixed )& setfixed ) // the same works also for std::scientific
+      {
+         stream << setfixed;
+         return *this;
+      }
+
+      // iterators
+      auto begin() noexcept { return elements.begin(); }
+      auto begin() const noexcept { return elements.begin(); }
+      auto cbegin() const noexcept { return elements.cbegin(); }
+      auto end() noexcept { return elements.end(); }
+      auto end() const noexcept { return elements.end(); }
+      auto cend() const noexcept { return elements.cend(); }
+
+      // Number of elements appended so far.
+      size_t size() const noexcept { return this->elements.size(); }
+
+   protected:
+      std::vector< String > elements;  // formatted elements in insertion order (<vector> is included by this file)
+      std::stringstream stream;  // scratch stream that carries the current formatting state
+};
+
+class JsonLogging
+{
+public:
+   using MetadataElement = std::pair< const char*, String >;
+   using MetadataMap = std::map< const char*, String >;
+   using MetadataColumns = std::vector<MetadataElement>;
+
+   using CommonLogs = std::vector< std::pair< const char*, String > >;
+   using LogsMetadata = std::vector< std::pair< String, int > >;
+
+   using HeaderElements = std::vector< std::pair< String, int > >;
+   using RowElements = JsonLoggingRowElements;
+
+   JsonLogging( int verbose = true,
+                String outputMode = "",
+                bool logFileAppend = false )
+   : verbose(verbose), outputMode( outputMode ), logFileAppend( logFileAppend )
+   {}
+
+   // Sets the verbosity; any non-zero value enables printing to stdout.
+   void setVerbose( int verbose)
+   {
+      this->verbose = verbose;
+   }
+
+   // Stores the key/value pairs that are written at the start of every JSON row
+   // and, when verbose, prints them to stdout under "Benchmark setup:".
+   void addCommonLogs( const CommonLogs& logs )
+   {
+      this->commonLogs = logs;
+      if( verbose )
+      {
+         std::cout << std::endl << "Benchmark setup:" << std::endl;
+         for( const auto& lg : logs )
+            std::cout << "   " << lg.first << " = " << lg.second << std::endl;
+         std::cout << std::endl;
+      }
+   }
+
+   // Removes all column descriptions added by addLogsMetadata().
+   void resetLogsMetada() { this->logsMetadata.clear(); }
+
+   // Appends (column name, column width) pairs; the width is used for the stdout output.
+   void addLogsMetadata( const std::vector< std::pair< String, int > >& md )
+   {
+      this->logsMetadata.insert( this->logsMetadata.end(), md.begin(), md.end() );
+   }
+
+   // Prints the column names to stdout when verbose; nothing is added to the JSON log.
+   void writeHeader()
+   {
+      if( ! verbose )
+         return;
+      for( const auto& md : this->logsMetadata )
+         std::cout << std::setw( md.second ) << md.first;
+      std::cout << std::endl;
+   }
+
+   // Appends one JSON object to the log: the common logs first, then each row
+   // element keyed by the column name at the same position in logsMetadata.
+   // When verbose, the row elements are also printed to stdout.
+   void writeRow( const RowElements& rowEls )
+   {
+      TNL_ASSERT_EQ( rowEls.size(), this->logsMetadata.size(), "" );
+      if( this->lineStarted )
+         log << "," << std::endl;
+
+      log << "      {" << std::endl;
+
+      // write common logs
+      int idx( 0 );
+      for( const auto& lg : this->commonLogs )
+      {
+         if( idx++ > 0 )
+            log << "," << std::endl;
+         log << "         \"" << lg.first << "\" : \"" << lg.second << "\"";
+      }
+
+      auto md = this->logsMetadata.begin();
+      for( const auto& el : rowEls )
+      {
+         if( verbose )
+            std::cout << std::setw( md->second ) << el;
+         if( idx++ > 0 )
+            log << "," << std::endl;
+         log << "         \"" << md++->first << "\" : \"" << el << "\"";
+      }
+      log << std::endl << "      }";
+      this->lineStarted = true;
+      if( verbose )
+         std::cout << std::endl;
+   }
+
+   // Prints the benchmark title to stdout unless outputMode is "append" or verbose is off.
+   void writeTitle( const String & title )
+   {
+      if( outputMode == "append" )
+         return;
+
+      if( verbose )
+         std::cout << std::endl << "== " << title << " ==" << std::endl << std::endl;
+   }
+
+   // Prints the metadata as "key = value" lines to stdout; skipped entirely when
+   // outputMode is "append" or verbose is off. Nothing is added to the JSON log.
+   void writeMetadata( const MetadataMap & metadata )
+   {
+      if( outputMode == "append" || ! verbose )
+         return;
+
+      std::cout << "properties:" << std::endl;
+      for( const auto& it : metadata )
+         std::cout << "   " << it.first << " = " << it.second << std::endl;
+      std::cout << std::endl;
+   }
+
+   // No-op in this logger; both arguments are ignored.
+   void writeTableHeader( const String & spanningElement,
+                          const HeaderElements & subElements )
+   {}
+
+   // Writes subElements via writeRow(); spanningElement is ignored.
+   void writeTableRow( const String & spanningElement,
+                       const RowElements & subElements )
+   {
+      writeRow( subElements );
+   }
+
+   // Appends an "error" entry carrying msg to the log; colspan is ignored.
+   // NOTE(review): the entry is neither wrapped in {} nor comma-separated from rows -- confirm consumers accept it.
+   void writeErrorMessage( const char* msg,
+                           int colspan = 1 )
+   {
+      log << "\"error\" : \"" << msg << "\"" << std::endl;
+   }
+
+   // No-op in this logger.
+   void closeTable() {}
+
+   // Writes the opening of the JSON document to logFile when logFileAppend is false,
+   // otherwise the rows accumulated so far; returns whether logFile is still good.
+   bool save( std::ostream & logFile )
+   {
+      if( ! this->logFileAppend )
+      {
+         logFile << "{" << std::endl;
+         logFile << "   \"results\" : [ " << std::endl;
+      }
+      else  // NOTE(review): rows are written only in append mode -- confirm this is intended
+         logFile << log.str();
+      if( ! logFile.good() )
+         return false;
+      log.str( std::string() );  // str() returns a copy, so assigning to its result would not clear the buffer
+      return true;
+   }
+
+protected:
+   // manual double -> String conversion with fixed precision
+   static String _to_string( double num, int precision = 0, bool fixed = false )
+   {
+      std::stringstream str;
+      if( fixed )
+         str << std::fixed;
+      if( precision )
+         str << std::setprecision( precision );
+      str << num;
+      return String( str.str().data() );
+   }
+
+   std::stringstream log;
+   std::string header_indent;
+   std::string body_indent;
+
+   int verbose;
+   MetadataColumns metadataColumns;
+   bool header_changed = true;
+   std::vector< std::pair< String, int > > horizontalGroups;
+
+   // new JSON implementation
+   LogsMetadata logsMetadata;
+   CommonLogs commonLogs;
+   String outputMode;
+
+   bool lineStarted = false;
+   bool resultsStarted = false;
+   bool logFileAppend = false;
+};
+
+} // namespace Benchmarks
+} // namespace TNL
diff --git a/src/Benchmarks/LinearSolvers/benchmarks.h b/src/Benchmarks/LinearSolvers/benchmarks.h
index b9e130c39e99cc67a7c86b1e8580384b48b8edc2..59d2ab3de327ced0beb8a986f44e938af4b4e5d0 100644
--- a/src/Benchmarks/LinearSolvers/benchmarks.h
+++ b/src/Benchmarks/LinearSolvers/benchmarks.h
@@ -54,7 +54,7 @@ bool checkDevice( const Config::ParameterContainer& parameters )
 
 template< template<typename> class Preconditioner, typename Matrix >
 void
-benchmarkPreconditionerUpdate( Benchmark& benchmark,
+benchmarkPreconditionerUpdate( Benchmark<>& benchmark,
                                const Config::ParameterContainer& parameters,
                                const SharedPointer< Matrix >& matrix )
 {
@@ -78,7 +78,7 @@ benchmarkPreconditionerUpdate( Benchmark& benchmark,
 
 template< template<typename> class Solver, template<typename> class Preconditioner, typename Matrix, typename Vector >
 void
-benchmarkSolver( Benchmark& benchmark,
+benchmarkSolver( Benchmark<>& benchmark,
                  const Config::ParameterContainer& parameters,
                  const SharedPointer< Matrix >& matrix,
                  const Vector& x0,
@@ -126,7 +126,7 @@ benchmarkSolver( Benchmark& benchmark,
 
    // subclass BenchmarkResult to add extra columns to the benchmark
    // (iterations, preconditioned residue, true residue)
-   struct MyBenchmarkResult : public BenchmarkResult
+   struct MyBenchmarkResult : public BenchmarkResult<>
    {
       using HeaderElements = BenchmarkResult::HeaderElements;
       using RowElements = BenchmarkResult::RowElements;
@@ -145,7 +145,15 @@ benchmarkSolver( Benchmark& benchmark,
 
       virtual HeaderElements getTableHeader() const override
       {
-         return HeaderElements({"time", "stddev", "stddev/time", "speedup", "converged", "iterations", "residue_precond", "residue_true"});
+         return HeaderElements( {
+            std::pair< String, int >( "time", 8 ),
+            std::pair< String, int >( "stddev", 8 ),
+            std::pair< String, int >( "stddev/time", 8 ),
+            std::pair< String, int >( "speedup", 8 ),
+            std::pair< String, int >( "converged", 8 ),
+            std::pair< String, int >( "iterations", 8 ),
+            std::pair< String, int >( "residue_precond", 8 ),
+            std::pair< String, int >( "residue_true", 8 ) } );
       }
 
       virtual RowElements getRowElements() const override
diff --git a/src/Benchmarks/LinearSolvers/tnl-benchmark-linear-solvers.h b/src/Benchmarks/LinearSolvers/tnl-benchmark-linear-solvers.h
index 393fafb493464cc6b6a70ed676dc08d9f577d7f2..0c16513203dff54edf82564434b58fa07d2493f9 100644
--- a/src/Benchmarks/LinearSolvers/tnl-benchmark-linear-solvers.h
+++ b/src/Benchmarks/LinearSolvers/tnl-benchmark-linear-solvers.h
@@ -145,7 +145,7 @@ void set_random_vector( Vector& v, typename Vector::RealType a, typename Vector:
 
 template< typename Matrix, typename Vector >
 void
-benchmarkIterativeSolvers( Benchmark& benchmark,
+benchmarkIterativeSolvers( Benchmark<>& benchmark,
                            Config::ParameterContainer parameters,
                            const SharedPointer< Matrix >& matrixPointer,
                            const Vector& x0,
@@ -337,8 +337,8 @@ struct LinearSolversBenchmark
    using DistributedRowLengths = typename DistributedMatrix::RowsCapacitiesType;
 
    static bool
-   run( Benchmark& benchmark,
-        Benchmark::MetadataMap metadata,
+   run( Benchmark<>& benchmark,
+        Benchmark<>::MetadataMap metadata,
         const Config::ParameterContainer& parameters )
    {
       const String file_matrix = parameters.getParameter< String >( "input-matrix" );
@@ -384,7 +384,7 @@ struct LinearSolversBenchmark
       const String name = String( (TNL::MPI::GetSize() > 1) ? "Distributed linear solvers" : "Linear solvers" )
                           + " (" + parameters.getParameter< String >( "name" ) + "): ";
       benchmark.newBenchmark( name, metadata );
-      benchmark.setMetadataColumns( Benchmark::MetadataColumns({
+      benchmark.setMetadataColumns( Benchmark<>::MetadataColumns({
          // TODO: strip the device
 //         { "matrix type", matrixPointer->getType() },
          { "rows", convertToString( matrixPointer->getRows() ) },
@@ -422,8 +422,8 @@ struct LinearSolversBenchmark
    }
 
    static void
-   runDistributed( Benchmark& benchmark,
-                   Benchmark::MetadataMap metadata,
+   runDistributed( Benchmark<>& benchmark,
+                   Benchmark<>::MetadataMap metadata,
                    const Config::ParameterContainer& parameters,
                    const SharedPointer< MatrixType >& matrixPointer,
                    const VectorType& x0,
@@ -466,8 +466,8 @@ struct LinearSolversBenchmark
    }
 
    static void
-   runNonDistributed( Benchmark& benchmark,
-                      Benchmark::MetadataMap metadata,
+   runNonDistributed( Benchmark<>& benchmark,
+                      Benchmark<>::MetadataMap metadata,
                       const Config::ParameterContainer& parameters,
                       const SharedPointer< MatrixType >& matrixPointer,
                       const VectorType& x0,
@@ -614,10 +614,10 @@ main( int argc, char* argv[] )
       logFile.open( logFileName.getString(), mode );
 
    // init benchmark and common metadata
-   Benchmark benchmark( loops, verbose );
+   Benchmark<> benchmark( loops, verbose );
 
    // prepare global metadata
-   Benchmark::MetadataMap metadata = getHardwareMetadata();
+   Benchmark<>::MetadataMap metadata = getHardwareMetadata< Logging >();
 
    // TODO: implement resolveMatrixType
 //   return ! Matrices::resolveMatrixType< MainConfig,
diff --git a/src/Benchmarks/Logging.h b/src/Benchmarks/Logging.h
index fb4426bb13daa9f59e2518c1ed11a971ccd525ab..2c8262d21018473c2013bec8d8f1f13f9f9d4e77 100644
--- a/src/Benchmarks/Logging.h
+++ b/src/Benchmarks/Logging.h
@@ -28,7 +28,7 @@ namespace Benchmarks {
 class LoggingRowElements
 {
    public:
-   
+
       LoggingRowElements()
       {
          stream << std::setprecision( 6 ) << std::fixed;
@@ -81,11 +81,15 @@ public:
    using MetadataMap = std::map< const char*, String >;
    using MetadataColumns = std::vector<MetadataElement>;
 
-   using HeaderElements = std::vector< String >;
+   using CommonLogs = std::vector< std::pair< const char*, String > >;
+
+   using HeaderElements = std::vector< std::pair< String, int > >;
    using RowElements = LoggingRowElements;
 
-   Logging( int verbose = true )
-   : verbose(verbose)
+   Logging( int verbose = true,
+            String outputMode = "",
+            bool logFileAppend = false ) // accepted but not stored by this logger
+   : verbose(verbose), outputMode( outputMode )
    {}
 
    void
@@ -102,6 +106,19 @@ public:
       log << ": title = " << title << std::endl;
    }
 
+   // Prints every common log entry as "name = value" to stdout when verbose is non-zero.
+   void addCommonLogs( const CommonLogs& logs )
+   {
+      if( ! verbose )
+         return;
+      for( const auto& entry : logs )
+         std::cout << entry.first << " = " << entry.second << std::endl;
+   }
+
+   void addLogsMetadata( const std::vector< String >& md ){}; // no-op: md is ignored by this logger
+
+   void writeHeader(){}; // no-op in this logger
+
    void
    writeMetadata( const MetadataMap & metadata )
    {
@@ -131,7 +148,7 @@ public:
          std::cout << std::setw( 15 ) << "";
 
          for( auto & it : subElements ) {
-            std::cout << std::setw( 15 ) << it;
+            std::cout << std::setw( 15 ) << it.first;
          }
          std::cout << std::endl;
 
@@ -160,7 +177,7 @@ public:
 
       log << header_indent << " " << spanningElement << std::endl;
       for( auto & it : subElements ) {
-         log << header_indent << "! " << it << std::endl;
+         log << header_indent << "! " << it.first << std::endl;
       }
 
       if( horizontalGroups.size() > 0 ) {
@@ -279,6 +296,8 @@ protected:
    MetadataColumns metadataColumns;
    bool header_changed = true;
    std::vector< std::pair< String, int > > horizontalGroups;
+
+   String outputMode;
 };
 
 } // namespace Benchmarks
diff --git a/src/Benchmarks/NDArray/tnl-benchmark-ndarray-boundary.h b/src/Benchmarks/NDArray/tnl-benchmark-ndarray-boundary.h
index 29445234c60b9d036d52c2c755f36e838a801a86..f7a485aa13f955ea22b97c830b0f43f37f10f529 100644
--- a/src/Benchmarks/NDArray/tnl-benchmark-ndarray-boundary.h
+++ b/src/Benchmarks/NDArray/tnl-benchmark-ndarray-boundary.h
@@ -81,7 +81,7 @@ void reset() {}
 // as "compile-time constants" and thus e.g. optimizing the 1D iterations with memcpy
 
 template< typename Device >
-void benchmark_1D( Benchmark& benchmark, index_type size = 500000000 )
+void benchmark_1D( Benchmark<>& benchmark, index_type size = 500000000 )
 {
    NDArray< value_type,
             SizesHolder< index_type, 0 >,
@@ -108,7 +108,7 @@ void benchmark_1D( Benchmark& benchmark, index_type size = 500000000 )
 }
 
 template< typename Device >
-void benchmark_2D( Benchmark& benchmark, index_type size = 22333 )
+void benchmark_2D( Benchmark<>& benchmark, index_type size = 22333 )
 {
    NDArray< value_type,
             SizesHolder< index_type, 0, 0 >,
@@ -135,7 +135,7 @@ void benchmark_2D( Benchmark& benchmark, index_type size = 22333 )
 }
 
 template< typename Device >
-void benchmark_3D( Benchmark& benchmark, index_type size = 800 )
+void benchmark_3D( Benchmark<>& benchmark, index_type size = 800 )
 {
    NDArray< value_type,
             SizesHolder< index_type, 0, 0, 0 >,
@@ -245,7 +245,7 @@ void benchmark_3D( Benchmark& benchmark, index_type size = 800 )
 
 
 template< typename Device >
-void benchmark_2D_perm( Benchmark& benchmark, index_type size = 22333 )
+void benchmark_2D_perm( Benchmark<>& benchmark, index_type size = 22333 )
 {
    NDArray< value_type,
             SizesHolder< index_type, 0, 0 >,
@@ -272,7 +272,7 @@ void benchmark_2D_perm( Benchmark& benchmark, index_type size = 22333 )
 }
 
 template< typename Device >
-void benchmark_3D_perm( Benchmark& benchmark, index_type size = 800 )
+void benchmark_3D_perm( Benchmark<>& benchmark, index_type size = 800 )
 {
    NDArray< value_type,
             SizesHolder< index_type, 0, 0, 0 >,
@@ -381,7 +381,7 @@ void benchmark_3D_perm( Benchmark& benchmark, index_type size = 800 )
 //}
 
 template< typename Device >
-void run_benchmarks( Benchmark& benchmark )
+void run_benchmarks( Benchmark<>& benchmark )
 {
    benchmark_1D< Device >( benchmark );
    benchmark_2D< Device >( benchmark );
@@ -443,10 +443,10 @@ int main( int argc, char* argv[] )
    std::ofstream logFile( logFileName.getString(), mode );
 
    // init benchmark and common metadata
-   Benchmark benchmark( loops, verbose );
+   Benchmark<> benchmark( loops, verbose );
 
    // prepare global metadata
-   Benchmark::MetadataMap metadata = getHardwareMetadata();
+   Benchmark<>::MetadataMap metadata = getHardwareMetadata< Logging >();
 
    const String devices = parameters.getParameter< String >( "devices" );
    if( devices == "all" || devices == "host" )
diff --git a/src/Benchmarks/NDArray/tnl-benchmark-ndarray.h b/src/Benchmarks/NDArray/tnl-benchmark-ndarray.h
index 9f17b8b5c79b8a245a08551f4a51a597a05c059b..8d4ac8e7ace879496e9c18cc25469e83efde3a1b 100644
--- a/src/Benchmarks/NDArray/tnl-benchmark-ndarray.h
+++ b/src/Benchmarks/NDArray/tnl-benchmark-ndarray.h
@@ -83,7 +83,7 @@ void reset() {}
 // as "compile-time constants" and thus e.g. optimizing the 1D iterations with memcpy
 
 template< typename Device >
-void benchmark_array( Benchmark& benchmark, index_type size = 500000000 )
+void benchmark_array( Benchmark<>& benchmark, index_type size = 500000000 )
 {
    Array< value_type, Device > a, b;
    a.setSize( size );
@@ -114,7 +114,7 @@ void benchmark_array( Benchmark& benchmark, index_type size = 500000000 )
 }
 
 template< typename Device >
-void benchmark_1D( Benchmark& benchmark, index_type size = 500000000 )
+void benchmark_1D( Benchmark<>& benchmark, index_type size = 500000000 )
 {
    NDArray< value_type,
             SizesHolder< index_type, 0 >,
@@ -137,7 +137,7 @@ void benchmark_1D( Benchmark& benchmark, index_type size = 500000000 )
 }
 
 template< typename Device >
-void benchmark_2D( Benchmark& benchmark, index_type size = 22333 )
+void benchmark_2D( Benchmark<>& benchmark, index_type size = 22333 )
 {
    NDArray< value_type,
             SizesHolder< index_type, 0, 0 >,
@@ -160,7 +160,7 @@ void benchmark_2D( Benchmark& benchmark, index_type size = 22333 )
 }
 
 template< typename Device >
-void benchmark_3D( Benchmark& benchmark, index_type size = 800 )
+void benchmark_3D( Benchmark<>& benchmark, index_type size = 800 )
 {
    NDArray< value_type,
             SizesHolder< index_type, 0, 0, 0 >,
@@ -183,7 +183,7 @@ void benchmark_3D( Benchmark& benchmark, index_type size = 800 )
 }
 
 template< typename Device >
-void benchmark_4D( Benchmark& benchmark, index_type size = 150 )
+void benchmark_4D( Benchmark<>& benchmark, index_type size = 150 )
 {
    NDArray< value_type,
             SizesHolder< index_type, 0, 0, 0, 0 >,
@@ -206,7 +206,7 @@ void benchmark_4D( Benchmark& benchmark, index_type size = 150 )
 }
 
 template< typename Device >
-void benchmark_5D( Benchmark& benchmark, index_type size = 56 )
+void benchmark_5D( Benchmark<>& benchmark, index_type size = 56 )
 {
    NDArray< value_type,
             SizesHolder< index_type, 0, 0, 0, 0, 0 >,
@@ -229,7 +229,7 @@ void benchmark_5D( Benchmark& benchmark, index_type size = 56 )
 }
 
 template< typename Device >
-void benchmark_6D( Benchmark& benchmark, index_type size = 28 )
+void benchmark_6D( Benchmark<>& benchmark, index_type size = 28 )
 {
    NDArray< value_type,
             SizesHolder< index_type, 0, 0, 0, 0, 0, 0 >,
@@ -253,7 +253,7 @@ void benchmark_6D( Benchmark& benchmark, index_type size = 28 )
 
 
 template< typename Device >
-void benchmark_2D_perm( Benchmark& benchmark, index_type size = 22333 )
+void benchmark_2D_perm( Benchmark<>& benchmark, index_type size = 22333 )
 {
    NDArray< value_type,
             SizesHolder< index_type, 0, 0 >,
@@ -276,7 +276,7 @@ void benchmark_2D_perm( Benchmark& benchmark, index_type size = 22333 )
 }
 
 template< typename Device >
-void benchmark_3D_perm( Benchmark& benchmark, index_type size = 800 )
+void benchmark_3D_perm( Benchmark<>& benchmark, index_type size = 800 )
 {
    NDArray< value_type,
             SizesHolder< index_type, 0, 0, 0 >,
@@ -299,7 +299,7 @@ void benchmark_3D_perm( Benchmark& benchmark, index_type size = 800 )
 }
 
 template< typename Device >
-void benchmark_4D_perm( Benchmark& benchmark, index_type size = 150 )
+void benchmark_4D_perm( Benchmark<>& benchmark, index_type size = 150 )
 {
    NDArray< value_type,
             SizesHolder< index_type, 0, 0, 0, 0 >,
@@ -322,7 +322,7 @@ void benchmark_4D_perm( Benchmark& benchmark, index_type size = 150 )
 }
 
 template< typename Device >
-void benchmark_5D_perm( Benchmark& benchmark, index_type size = 56 )
+void benchmark_5D_perm( Benchmark<>& benchmark, index_type size = 56 )
 {
    NDArray< value_type,
             SizesHolder< index_type, 0, 0, 0, 0, 0 >,
@@ -345,7 +345,7 @@ void benchmark_5D_perm( Benchmark& benchmark, index_type size = 56 )
 }
 
 template< typename Device >
-void benchmark_6D_perm( Benchmark& benchmark, index_type size = 28 )
+void benchmark_6D_perm( Benchmark<>& benchmark, index_type size = 28 )
 {
    NDArray< value_type,
             SizesHolder< index_type, 0, 0, 0, 0, 0, 0 >,
@@ -368,7 +368,7 @@ void benchmark_6D_perm( Benchmark& benchmark, index_type size = 28 )
 }
 
 template< typename Device >
-void run_benchmarks( Benchmark& benchmark )
+void run_benchmarks( Benchmark<>& benchmark )
 {
    benchmark_array< Device >( benchmark );
    benchmark_1D< Device >( benchmark );
@@ -431,10 +431,10 @@ int main( int argc, char* argv[] )
    std::ofstream logFile( logFileName.getString(), mode );
 
    // init benchmark and common metadata
-   Benchmark benchmark( loops, verbose );
+   Benchmark<> benchmark( loops, verbose );
 
    // prepare global metadata
-   Benchmark::MetadataMap metadata = getHardwareMetadata();
+   Benchmark<>::MetadataMap metadata = getHardwareMetadata< Logging >();
 
    const String devices = parameters.getParameter< String >( "devices" );
    if( devices == "all" || devices == "host" )
diff --git a/src/Benchmarks/ODESolvers/benchmarks.h b/src/Benchmarks/ODESolvers/benchmarks.h
index a6ee67a624a01443eeabb12d540fc7d6cecb58d8..f27d6962e33cb0cf5cd19373d734a65683f5e1c2 100644
--- a/src/Benchmarks/ODESolvers/benchmarks.h
+++ b/src/Benchmarks/ODESolvers/benchmarks.h
@@ -35,7 +35,7 @@ getPerformer()
 
 template< typename Solver, typename VectorPointer >
 void
-benchmarkSolver( Benchmark& benchmark,
+benchmarkSolver( Benchmark<>& benchmark,
                  const Config::ParameterContainer& parameters,
                  VectorPointer& u )
 {
diff --git a/src/Benchmarks/ODESolvers/tnl-benchmark-ode-solvers.h b/src/Benchmarks/ODESolvers/tnl-benchmark-ode-solvers.h
index 4def52d52d3bb277e09c0188536f404a3a6f7ce3..afdf33d3a4ade89f9444ddbc0e7b45542278ece3 100644
--- a/src/Benchmarks/ODESolvers/tnl-benchmark-ode-solvers.h
+++ b/src/Benchmarks/ODESolvers/tnl-benchmark-ode-solvers.h
@@ -41,7 +41,7 @@ using namespace TNL::Pointers;
 
 template< typename Real, typename Index >
 void
-benchmarkODESolvers( Benchmark& benchmark,
+benchmarkODESolvers( Benchmark<>& benchmark,
                      const Config::ParameterContainer& parameters,
                      size_t dofs )
 {
@@ -51,7 +51,7 @@ benchmarkODESolvers( Benchmark& benchmark,
    using CudaVectorPointer = Pointers::SharedPointer< CudaVectorType >;
    using HostProblem = SimpleProblem< Real, Devices::Host, Index >;
    using CudaProblem = SimpleProblem< Real, Devices::Cuda, Index >;
-   using SolverMonitorType = typename Benchmark::SolverMonitorType;
+   using SolverMonitorType = typename Benchmark<>::SolverMonitorType;
 
    const auto& solvers = parameters.getList< String >( "solvers" );
    for( auto&& solver : solvers )
@@ -107,15 +107,15 @@ struct ODESolversBenchmark
    using VectorPointer = Pointers::SharedPointer< VectorType >;
 
    static bool
-   run( Benchmark& benchmark,
-        Benchmark::MetadataMap metadata,
+   run( Benchmark<>& benchmark,
+        Benchmark<>::MetadataMap metadata,
         const Config::ParameterContainer& parameters )
    {
       const String name = String( (TNL::MPI::GetSize() > 1) ? "Distributed ODE solvers" : "ODE solvers" );
                           //+ " (" + parameters.getParameter< String >( "name" ) + "): ";
       benchmark.newBenchmark( name, metadata );
       for( size_t dofs = 25; dofs <= 10000000; dofs *= 2 ) {
-         benchmark.setMetadataColumns( Benchmark::MetadataColumns({
+         benchmark.setMetadataColumns( Benchmark<>::MetadataColumns({
             // TODO: strip the device
             { "DOFs", convertToString( dofs ) },
          } ));
@@ -129,8 +129,8 @@ struct ODESolversBenchmark
    }
 
    static void
-   runDistributed( Benchmark& benchmark,
-                   Benchmark::MetadataMap metadata,
+   runDistributed( Benchmark<>& benchmark,
+                   Benchmark<>::MetadataMap metadata,
                    const Config::ParameterContainer& parameters,
                    size_t dofs )
    {
@@ -139,8 +139,8 @@ struct ODESolversBenchmark
    }
 
    static void
-   runNonDistributed( Benchmark& benchmark,
-                      Benchmark::MetadataMap metadata,
+   runNonDistributed( Benchmark<>& benchmark,
+                      Benchmark<>::MetadataMap metadata,
                       const Config::ParameterContainer& parameters,
                       size_t dofs )
    {
@@ -150,8 +150,8 @@ struct ODESolversBenchmark
 };
 
 template< typename Real >
-bool resolveIndexType( Benchmark& benchmark,
-   Benchmark::MetadataMap& metadata,
+bool resolveIndexType( Benchmark<>& benchmark,
+   Benchmark<>::MetadataMap& metadata,
    Config::ParameterContainer& parameters )
 {
    const String& index = parameters.getParameter< String >( "index-type" );
@@ -159,8 +159,8 @@ bool resolveIndexType( Benchmark& benchmark,
    return ODESolversBenchmark< Real, long int >::run( benchmark, metadata, parameters );
 }
 
-bool resolveRealTypes( Benchmark& benchmark,
-   Benchmark::MetadataMap& metadata,
+bool resolveRealTypes( Benchmark<>& benchmark,
+   Benchmark<>::MetadataMap& metadata,
    Config::ParameterContainer& parameters )
 {
    const String& realType = parameters.getParameter< String >( "real-type" );
@@ -245,10 +245,10 @@ main( int argc, char* argv[] )
       logFile.open( logFileName.getString(), mode );
 
    // init benchmark and common metadata
-   Benchmark benchmark( loops, verbose );
+   Benchmark<> benchmark( loops, verbose );
 
    // prepare global metadata
-   Benchmark::MetadataMap metadata = getHardwareMetadata();
+   Benchmark<>::MetadataMap metadata = getHardwareMetadata< Logging >();
 
    const bool status = resolveRealTypes( benchmark, metadata, parameters );
 
diff --git a/src/Benchmarks/SpMV/CMakeLists.txt b/src/Benchmarks/SpMV/CMakeLists.txt
index 6af6965345eeacee224edd7b44dc55f389cd7fbe..93dccab0dc793ea2f3218eb6ba0cb9f326450425 100644
--- a/src/Benchmarks/SpMV/CMakeLists.txt
+++ b/src/Benchmarks/SpMV/CMakeLists.txt
@@ -1,8 +1,23 @@
+# CSR5 does not work properly yet:
+#
+# https://github.com/weifengliu-ssslab/Benchmark_SpMV_using_CSR5/issues/9
+# https://github.com/weifengliu-ssslab/Benchmark_SpMV_using_CSR5/issues/10
+#
+# We can build it with TNL but it crashes with many CUDA errors. We should first check it
+# with the original build.
+#
+#include( cmake/BuildCSR5.cmake )
+
 if( BUILD_CUDA )
-    CUDA_ADD_EXECUTABLE( tnl-benchmark-spmv tnl-benchmark-spmv.cu )
-    TARGET_LINK_LIBRARIES( tnl-benchmark-spmv ${CUDA_cusparse_LIBRARY} ${CUDA_cudadevrt_LIBRARY} )
+    cuda_include_directories( ${CXX_BENCHMARKS_INCLUDE_DIRS} )
+    message( STATUS "CXX_BENCHMARKS_FLAGS: ${CXX_BENCHMARKS_FLAGS}" )  # quoted: an unquoted list would be printed glued together
+    CUDA_ADD_EXECUTABLE( tnl-benchmark-spmv tnl-benchmark-spmv.cu OPTIONS ${CXX_BENCHMARKS_FLAGS} ${PETSC_CXX_FLAGS} )
+    TARGET_LINK_LIBRARIES( tnl-benchmark-spmv ${CUDA_cusparse_LIBRARY} ${CUDA_cudadevrt_LIBRARY} ${PETSC_LINKER_FLAGS})
 else()
     ADD_EXECUTABLE( tnl-benchmark-spmv tnl-benchmark-spmv.cpp )
+    target_compile_options( tnl-benchmark-spmv  PRIVATE ${CXX_BENCHMARKS_FLAGS} ${PETSC_CXX_FLAGS} )  # NOTE(review): if PETSC_CXX_FLAGS is one space-separated string it is passed as a single option -- confirm it is a list
+    target_include_directories( tnl-benchmark-spmv PRIVATE ${CXX_BENCHMARKS_INCLUDE_DIRS} )
+    TARGET_LINK_LIBRARIES( tnl-benchmark-spmv ${PETSC_LINKER_FLAGS} )
 endif()
 
 install( TARGETS tnl-benchmark-spmv RUNTIME DESTINATION bin )
diff --git a/src/Benchmarks/SpMV/ReferenceFormats/CSR5Benchmark.h b/src/Benchmarks/SpMV/ReferenceFormats/CSR5Benchmark.h
new file mode 100644
index 0000000000000000000000000000000000000000..8cfd8f453a3cc5a0126103a85f89614bc7619efc
--- /dev/null
+++ b/src/Benchmarks/SpMV/ReferenceFormats/CSR5Benchmark.h
@@ -0,0 +1,136 @@
+/***************************************************************************
+                          CSR5Benchmark.h  -  description
+                             -------------------
+    begin                : Apr 23, 2021
+    copyright            : (C) 2021 by Tomas Oberhuber et al.
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+/***
+ * Wrapper of original CSR5 kernels for TNL benchmarks.
+ */
+
+#include <stdexcept>
+#include <type_traits>
+
+namespace TNL {
+/////
+// Currently CSR5 for CUDA cannot be built because of a conflicting definition of atomicAdd for the `double` type:
+//   https://github.com/weifengliu-ssslab/Benchmark_SpMV_using_CSR5/issues/9
+// The workaround is to put the whole benchmark into a separate namespace. In that case, however,
+// CSR5 does not work with `float`. So far, this seems to be the best solution.
+namespace CSR5Benchmark {
+
+#ifdef HAVE_CSR5
+#include <CSR5_cuda/anonymouslib_cuda.h>
+#endif
+
+#ifdef HAVE_CSR5
+/// Generic caller: forwards to csr5.spmv with the constant 1.0 (cast to RealType) and the raw data of outVector.
+template< typename CsrMatrix, typename Real = typename CsrMatrix::RealType >
+struct CSR5SpMVCaller
+{
+   static_assert( std::is_same< typename CsrMatrix::DeviceType, TNL::Devices::Cuda >::value, "Only CUDA device is allowed for CSR matrix for CSR5 benchmark." );
+   using RealType = typename CsrMatrix::RealType;
+   using DeviceType = TNL::Devices::Cuda;
+   using IndexType = typename CsrMatrix::IndexType;
+   using VectorType = TNL::Containers::Vector< RealType, DeviceType, IndexType >;
+   using VectorView = typename VectorType::ViewType;
+   using CSR5Type = anonymouslibHandle< IndexType, typename std::make_unsigned< IndexType >::type, RealType >;
+   /// The result is written through the pointer returned by outVector.getData().
+   static void spmv( CSR5Type& csr5, VectorView& outVector ) {
+      csr5.spmv( static_cast< RealType >( 1.0 ), outVector.getData() );
+   }
+};
+
+/// Specialization selected when Real is float: spmv() deliberately does nothing.
+template< typename CsrMatrix >
+struct CSR5SpMVCaller< CsrMatrix, float >
+{
+   static_assert( std::is_same< typename CsrMatrix::DeviceType, TNL::Devices::Cuda >::value, "Only CUDA device is allowed for CSR matrix for CSR5 benchmark." );
+   using RealType = typename CsrMatrix::RealType;
+   using DeviceType = TNL::Devices::Cuda;
+   using IndexType = typename CsrMatrix::IndexType;
+   using VectorType = TNL::Containers::Vector< RealType, DeviceType, IndexType >;
+   using VectorView = typename VectorType::ViewType;
+   using CSR5Type = anonymouslibHandle< IndexType, typename std::make_unsigned< IndexType >::type, RealType >;
+
+   // Disabled call (CSR5 does not work with `float` here): csr5.spmv( ( RealType ) 1.0, outVector.getData() );
+   static void spmv( CSR5Type& csr5, VectorView& outVector )
+   {}
+};
+#endif
+
+
+template< typename CsrMatrix >
+struct CSR5Benchmark // wraps a CSR5 handle built from a CUDA CSR matrix; without HAVE_CSR5 every CSR5 call is compiled out
+{
+   static_assert( std::is_same< typename CsrMatrix::DeviceType, TNL::Devices::Cuda >::value, "Only CUDA device is allowed for CSR matrix for CSR5 benchmark." );
+   using RealType = typename CsrMatrix::RealType;
+   using DeviceType = TNL::Devices::Cuda;
+   using IndexType = typename CsrMatrix::IndexType;
+   using VectorType = TNL::Containers::Vector< RealType, DeviceType, IndexType >;
+   using VectorView = typename VectorType::ViewType;
+#ifdef HAVE_CSR5
+   using CSR5Type = anonymouslibHandle< IndexType, typename std::make_unsigned< IndexType >::type, RealType >;
+#endif
+
+   CSR5Benchmark( CsrMatrix& matrix, VectorType& inVector, VectorType& outVector ) // keeps views of both vectors, not copies
+   :
+#ifdef HAVE_CSR5
+   csr5( matrix.getRows(), matrix.getColumns() ),
+#endif
+     inVectorView( inVector ), outVectorView( outVector )
+   {
+#ifdef HAVE_CSR5
+      // Pass the CSR arrays of `matrix` to CSR5: number of stored values, row pointers, column indexes, values.
+      // NOTE(review): the upstream snippet captured an `err` result here; no result is checked in this wrapper -- confirm.
+      this->csr5.inputCSR( matrix.getValues().getSize(),
+                           matrix.getRowPointers().getData(),
+                           matrix.getColumnIndexes().getData(),
+                           matrix.getValues().getData() );
+
+      // Register the input vector; according to the upstream example this needs to be done only once.
+      // NOTE(review): as above, any result of setX is not inspected -- confirm.
+      this->csr5.setX( inVector.getData() );
+
+      this->csr5.setSigma(ANONYMOUSLIB_AUTO_TUNED_SIGMA);
+
+      // warmup device
+      this->csr5.warmup();
+
+      // Presumably converts the input CSR data to the CSR5 format -- TODO confirm against anonymouslib.
+      this->csr5.asCSR5();
+#endif
+   }
+
+   void vectorProduct() // runs one SpMV through CSR5SpMVCaller into outVectorView; does nothing without HAVE_CSR5
+   {
+#ifdef HAVE_CSR5
+      CSR5SpMVCaller< CsrMatrix >::spmv( this->csr5, outVectorView );
+#endif
+   }
+
+   const VectorView& getCudaOutVector() // view of the output vector given to the constructor
+   {
+      return this->outVectorView;
+   }
+
+   ~CSR5Benchmark() // NOTE(review): copy operations are not deleted; a copy would presumably call destroy() on the same handle twice -- confirm no copies are made
+   {
+#ifdef HAVE_CSR5
+      this->csr5.destroy();
+#endif
+   }
+
+   protected:
+#ifdef HAVE_CSR5
+      CSR5Type csr5;
+#endif
+      VectorView inVectorView, outVectorView; // inVectorView is stored but not read anywhere in this struct
+};
+
+   } // namespace CSR5Benchmark
+} // namespace TNL
diff --git a/src/Benchmarks/SpMV/ReferenceFormats/Legacy/CSR.h b/src/Benchmarks/SpMV/ReferenceFormats/Legacy/CSR.h
index 2db4c9f0c78f94670839059f13fc664345985521..efbd997e26f2c2ef0d0cc7a428bc64854bc32345 100644
--- a/src/Benchmarks/SpMV/ReferenceFormats/Legacy/CSR.h
+++ b/src/Benchmarks/SpMV/ReferenceFormats/Legacy/CSR.h
@@ -53,6 +53,17 @@ union Block {
 
    Block() = default;
 
+   template< typename Index2 >
+   Block& operator=( const Block< Index2 >& source ) { // converting assignment from a Block with another index type
+      index[ 0 ] = source.index[ 0 ];
+      index[ 1 ] = source.index[ 1 ]; // NOTE(review): Block is a union, so the byte/twobytes copies below write over the same storage -- confirm intended
+      for( int i = 0; i < ( sizeof(Index) == 4 ? 8 : 16); i ++ ) // NOTE(review): bound depends on this Block's Index, not Index2 -- confirm source.byte is long enough
+         byte[ i ] = source.byte[ i ];
+      for( int i = 0; i < (sizeof(Index) == 4 ? 4 : 8); i++ ) // NOTE(review): same question for source.twobytes
+         twobytes[ i ] = source.twobytes[ i ];
+      return *this;
+   }
+
    Index index[2]; // index[0] is row pointer, index[1] is index in warp
    uint8_t byte[sizeof(Index) == 4 ? 8 : 16]; // byte[7/15] is type specificator
    uint16_t twobytes[sizeof(Index) == 4 ? 4 : 8]; //twobytes[2/4] is maxID - minID
@@ -262,8 +273,8 @@ public:
    // copy assignment
    CSR& operator=( const CSR& matrix );
 
-   template< CSRKernel KernelType2 >
-   CSR& operator=( const CSR< RealType, DeviceType, IndexType, KernelType2 >& matrix );
+   template< typename IndexType2, CSRKernel KernelType2 >
+   CSR& operator=( const CSR< RealType, DeviceType, IndexType2, KernelType2 >& matrix );
 
    // cross-device copy assignment
    template< typename Real2, typename Device2, typename Index2, CSRKernel KernelType2,
diff --git a/src/Benchmarks/SpMV/ReferenceFormats/Legacy/CSR_impl.h b/src/Benchmarks/SpMV/ReferenceFormats/Legacy/CSR_impl.h
index 2cb2b4784d37266cd79dce0968845de0dc3ef705..f71eba123464cd32095ba027791350a26ed3693d 100644
--- a/src/Benchmarks/SpMV/ReferenceFormats/Legacy/CSR_impl.h
+++ b/src/Benchmarks/SpMV/ReferenceFormats/Legacy/CSR_impl.h
@@ -690,10 +690,10 @@ template< typename Real,
           typename Device,
           typename Index,
           CSRKernel KernelType >
-   template< CSRKernel KernelType2 >
+   template< typename IndexType2, CSRKernel KernelType2 >
 CSR< Real, Device, Index, KernelType >&
 CSR< Real, Device, Index, KernelType >::
-operator=( const CSR< Real, Device, Index, KernelType2 >& matrix )
+operator=( const CSR< Real, Device, IndexType2, KernelType2 >& matrix )
 {
    this->setLike( matrix );
    this->values = matrix.values;
diff --git a/src/Benchmarks/SpMV/ReferenceFormats/Legacy/Sparse_impl.h b/src/Benchmarks/SpMV/ReferenceFormats/Legacy/Sparse_impl.h
index d87c80eee51b8115c6b5fb5e80a899ac72f7b22c..ddc851022f8478571c000e63d86d3c6b6cf4a39d 100644
--- a/src/Benchmarks/SpMV/ReferenceFormats/Legacy/Sparse_impl.h
+++ b/src/Benchmarks/SpMV/ReferenceFormats/Legacy/Sparse_impl.h
@@ -104,7 +104,7 @@ template< typename Real,
           typename Index >
 void Sparse< Real, Device, Index >::allocateMatrixElements( const IndexType& numberOfMatrixElements )
 {
-   TNL_ASSERT_GE( numberOfMatrixElements, 0, "Number of matrix elements must be non-negative." );
+   TNL_ASSERT_GE( numberOfMatrixElements, ( IndexType ) 0, "Number of matrix elements must be non-negative." );
 
    this->values.setSize( numberOfMatrixElements );
    this->columnIndexes.setSize( numberOfMatrixElements );
diff --git a/src/Benchmarks/SpMV/ReferenceFormats/LightSpMV-1.0/Options.cu b/src/Benchmarks/SpMV/ReferenceFormats/LightSpMV-1.0/Options.cu
new file mode 100644
index 0000000000000000000000000000000000000000..f9bbeae70f327fda0afc93bef7f90c5dc173bab3
--- /dev/null
+++ b/src/Benchmarks/SpMV/ReferenceFormats/LightSpMV-1.0/Options.cu
@@ -0,0 +1,381 @@
+/*
+ * Options.cu
+ *
+ *  Created on: Nov 24, 2014
+ *      Author: yongchao
+ */
+
+#include "Options.h"
+
+/*print the command-line help to stderr; the defaults shown are the current values of the members*/
+void Options::printUsage() {
+	cerr << endl << "LightSpMV (" << VERSION << ")"
+			<< ": GPU-based sparse matrix-vector multiplication using CSR storage format"
+			<< endl;
+	cerr << "Usage: lightspmv -i matrix [options]" << endl << endl;
+	cerr << "Options:" << endl;
+	cerr << "Input:" << endl
+			<< "\t-i <string> sparse matrix A file (in Matrix Market format)"
+			<< endl
+			<< "\t-x <string> vector X file (one element per line) [otherwise, set each element to 1.0]"
+			<< endl
+			<< "\t-y <string> vector Y file (one element per line) [otherwise, set each element to 0.0]"
+			<< endl << "Output:" << endl
+			<< "\t-o <string> output file (one element per line) [otherwise, no output]"
+			<< endl << "Compute:" << endl
+			<< "\t-a <float> alpha value, default = " << _alpha << endl
+			<< "\t-b <float> beta value, default = " << _beta << endl
+			<< "\t-f <int> formula used, default = " << _formula << endl
+			<< "\t    0: y = Ax" << endl << "\t    1: y = alpha * Ax + beta * y"
+			<< endl << "\t-r <int> select the routine to use, default = "
+			<< _routine << endl
+			<< "\t    0: vector-based row dynamic distribution" << endl
+			<< "\t    1: warp-based row dynamic distribution" << endl
+			<< "\t-d <int> double-precision floating point, default = "
+			<< (_singlePrecision ? 0 : 1) << endl
+			<< "\t-g <int> index of the single GPU used, default = "
+			<< _gpuIndex << endl
+			<< "\t-m <int> number of SpMV iterations, default = " << _numIters
+			<< endl << endl;
+}
+/*parse the command line, then load the GPU list, the matrix and the vectors X and Y; returns false on any failure*/
+bool Options::parseArgs(int32_t argc, char* argv[]) {
+	int32_t c;
+	if (argc < 2) {
+		printUsage();
+		return false;
+	}
+
+	while ((c = getopt(argc, argv, "i:x:y:o:a:b:g:f:r:d:m:")) != -1) { /*-a and -b are listed so their cases are reachable*/
+		switch (c) {
+		case 'i':
+			_mmFileName = optarg;
+			break;
+		case 'x':
+			_vecXFileName = optarg;
+			break;
+		case 'y':
+			_vecYFileName = optarg;
+			break;
+		case 'o':
+			_outFileName = optarg;
+			break;
+		case 'a':
+			_alpha = atof(optarg);
+			break;
+		case 'b':
+			_beta = atof(optarg);
+			break;
+		case 'f':
+			_formula = atoi(optarg);
+			break;
+		case 'g':
+			_gpuIndex = atoi(optarg);
+			if (_gpuIndex < 0) {
+				_gpuIndex = 0;
+			}
+			break;
+		case 'r':
+			_routine = atoi(optarg);
+			if (_routine < 0) {
+				_routine = 0;
+			}
+			break;
+		case 'd':
+			_singlePrecision = atoi(optarg) ? false : true;
+			break;
+		case 'm':
+			_numIters = atoi(optarg);
+			if(_numIters < 1){
+				_numIters = 1;
+			}
+			break;
+		default:
+			cerr << "Unknown parameter: " << (char) optopt << endl; /*optarg is NULL here, optopt holds the offending option*/
+			return false;
+		}
+	}
+
+	/*check the file length*/
+	if (_mmFileName.length() == 0) {
+		cerr << "Matrix file should be specified" << endl;
+		return false;
+	}
+
+	/*load the list of GPUs*/
+	if (!getGPUs()) {
+		return false;
+	}
+
+	/*load the matrix*/
+	if (!loadMatrixMarketFile(_mmFileName.c_str())) {
+		return false;
+	}
+
+	/*load vector X*/
+	int64_t elementSize = _singlePrecision ? sizeof(float) : sizeof(double);
+	int64_t numBytes = _numCols * elementSize;
+
+	/*allocate space*/
+	cudaMallocHost(&_vectorX, numBytes);
+	CudaCheckError();
+
+	/*load the vector X*/
+	if (_vecXFileName.length() == 0) {
+		/*initialize X*/
+		cerr << "Initialize each element of vector X to 1.0" << endl;
+		if (_singlePrecision) {
+			float* p = (float*) _vectorX;
+			for (uint32_t i = 0; i < _numCols; ++i) {
+				p[i] = 1.0;
+			}
+		} else {
+			double* p = (double*) _vectorX;
+			for (uint32_t i = 0; i < _numCols; ++i) {
+				p[i] = 1.0;
+			}
+		}
+	} else {
+		cerr << "Load vector X from file" << endl;
+		/*could not get the data*/
+		if (!loadVector(_vecXFileName, _vectorX, _numCols)) {
+			return false;
+		}
+	}
+
+	/*load vector Y*/
+	numBytes = _numRows * elementSize;
+
+	/*allocate space*/
+	cudaMallocHost(&_vectorY, numBytes);
+	CudaCheckError();
+
+	/*load the vector Y*/
+	if (_vecYFileName.length() == 0) {
+		/*initialize Y*/
+		cerr << "Initialize each element of vector Y to 0" << endl;
+
+		memset(_vectorY, 0, numBytes);
+	} else {
+		cerr << "Load vector Y from file" << endl;
+		/*could not get the data*/
+		if (!loadVector(_vecYFileName, _vectorY, _numRows)) {
+			return false;
+		}
+	}
+
+	return true;
+}
+/*read a Matrix Market file through cusp as CSR and copy its arrays into host buffers; no path returns false*/
+bool Options::loadMatrixMarketFile(const char* fileName) {
+	uint64_t numBytes;
+
+	cerr << "loading sparse matrix" << endl;
+	if (_singlePrecision) {
+		/*create an empty CSR sparse matrix object*/
+		cusp::csr_matrix<uint32_t, float, cusp::host_memory> matrix;
+
+		// load a matrix stored in MatrixMarket format
+		cusp::io::read_matrix_market_file(matrix, fileName);
+
+		/*save the matrix information*/
+		_numRows = matrix.num_rows;
+		_numCols = matrix.num_cols;
+		_numValues = matrix.num_entries;
+
+		/*allocate host buffers with cudaMallocHost: row offsets, column indices, values*/
+		cudaMallocHost(&_rowOffsets, (_numRows + 1) * sizeof(uint32_t));
+		CudaCheckError();
+
+		cudaMallocHost(&_colIndexValues, _numValues * sizeof(uint32_t));
+		CudaCheckError();
+
+		cudaMallocHost(&_numericalValues, _numValues * sizeof(float));
+		CudaCheckError();
+
+		/*copy the three arrays out of the cusp matrix (host to host)*/
+		numBytes = (_numRows + 1) * sizeof(uint32_t);
+		cudaMemcpy(_rowOffsets, &matrix.row_offsets[0], numBytes,
+				cudaMemcpyHostToHost);
+		CudaCheckError();
+
+		numBytes = _numValues * sizeof(uint32_t);
+		cudaMemcpy(_colIndexValues, &matrix.column_indices[0], numBytes,
+				cudaMemcpyHostToHost);
+		CudaCheckError();
+
+		numBytes = _numValues * sizeof(float);
+		cudaMemcpy(_numericalValues, &matrix.values[0], numBytes,
+				cudaMemcpyHostToHost);
+		CudaCheckError();
+	} else { /*double precision: the same steps with double values*/
+		/*create an empty CSR sparse matrix object*/
+		cusp::csr_matrix<uint32_t, double, cusp::host_memory> matrix;
+
+		// load a matrix stored in MatrixMarket format
+		cusp::io::read_matrix_market_file(matrix, fileName);
+
+		/*save the matrix information*/
+		_numRows = matrix.num_rows;
+		_numCols = matrix.num_cols;
+		_numValues = matrix.num_entries;
+
+		/*allocate host buffers with cudaMallocHost: row offsets, column indices, values*/
+		cudaMallocHost(&_rowOffsets, (_numRows + 1) * sizeof(uint32_t));
+		CudaCheckError();
+
+		cudaMallocHost(&_colIndexValues, _numValues * sizeof(uint32_t));
+		CudaCheckError();
+
+		cudaMallocHost(&_numericalValues, _numValues * sizeof(double));
+		CudaCheckError();
+
+		/*copy the three arrays out of the cusp matrix (host to host)*/
+		numBytes = (_numRows + 1) * sizeof(uint32_t);
+		cudaMemcpy(_rowOffsets, &matrix.row_offsets[0], numBytes,
+				cudaMemcpyHostToHost);
+		CudaCheckError();
+
+		numBytes = _numValues * sizeof(uint32_t);
+		cudaMemcpy(_colIndexValues, &matrix.column_indices[0], numBytes,
+				cudaMemcpyHostToHost);
+		CudaCheckError();
+
+		numBytes = _numValues * sizeof(double);
+		cudaMemcpy(_numericalValues, &matrix.values[0], numBytes,
+				cudaMemcpyHostToHost);
+		CudaCheckError();
+	}
+
+	return true;
+}
+/*read maxNumValues numbers (one per line) from fileName into vector (float* or double* per _singlePrecision)*/
+bool Options::loadVector(const string& fileName, void* vector,
+		const uint32_t maxNumValues) {
+	char buffer[1024];
+	uint32_t pos = 0;
+	bool ok = true;
+	float* fptr = (float*) vector;
+	double* dptr = (double*) vector;
+
+	cerr << "loading vector from " << fileName << endl;
+	/*open the file*/
+	if (fileName.length() == 0) {
+		return false;
+	}
+	FILE* file = fopen(fileName.c_str(), "r");
+	if (!file) {
+		cerr << "Failed to open file " << fileName << endl;
+		return false;
+	}
+
+	/*read lines until enough numbers are stored, a line fails to parse, or the file ends*/
+	while (ok && pos < maxNumValues && fgets(buffer, sizeof(buffer), file)) {
+		/*remove the end of line*/
+		for (int32_t i = strlen(buffer) - 1;
+				i >= 0 && (buffer[i] == '\n' || buffer[i] == '\r'); --i) {
+			buffer[i] = '\0';
+		}
+		if (strlen(buffer) == 0) {
+			continue;
+		}
+		/*parse the number and store it only when the conversion succeeded*/
+		if (_singlePrecision) {
+			float value;
+			ok = sscanf(buffer, "%f", &value) == 1;
+			if (ok) fptr[pos++] = value;
+		} else {
+			double value;
+			ok = sscanf(buffer, "%lf", &value) == 1;
+			if (ok) dptr[pos++] = value;
+		}
+	}
+	fclose(file); /*close the file on every path that opened it*/
+	if (!ok) {
+		cerr << "Invalid number in the file: " << buffer << endl;
+		return false;
+	}
+	if (pos < maxNumValues) {
+		cerr << "Do not have enough numbers in the file" << endl;
+		return false;
+	}
+	cerr << "Finished loading vector from " << fileName << endl;
+	return true;
+}
+/*compute the rounded mean row length and the rounded sample standard deviation (stored in _variance), then print them*/
+void Options::getRowSizeVariance() {
+	_mean = 0;
+	_variance = 0;
+	/*an empty or not yet loaded matrix has no rows to inspect; avoid 0/0 and a NULL dereference*/
+	if (_numRows > 0 && _rowOffsets) {
+		_mean = rint((double) _numValues / _numRows);
+		for (uint32_t i = 0; i < _numRows; ++i) {
+			/*row length is computed in double so that the subtraction cannot wrap around*/
+			const double deviation = (double) _rowOffsets[i + 1] - _rowOffsets[i] - _mean;
+			_variance += deviation * deviation;
+		}
+		_variance = rint(sqrt(_variance / (_numRows > 1 ? _numRows - 1 : 1)));
+	}
+
+	/*information*/
+	cerr << "Rows: " << _numRows << " Cols: " << _numCols << " Elements: "
+			<< _numValues << " Mean: " << _mean << " Standard deviation: "
+			<< _variance << endl;
+}
+bool Options::getGPUs() { /*fill _gpus with the qualifying devices and move the selected one to the front*/
+	int32_t numGPUs;
+
+	/*get the number of GPUs*/
+	if (cudaGetDeviceCount(&numGPUs) != cudaSuccess) {
+		cerr << "No CUDA-enabled GPU is available in the host" << endl;
+		return false;
+	}
+
+#if defined(HAVE_SM_35)
+	cerr << "Require GPUs with compute capability >= 3.5" << endl;
+#else
+	cerr << "Require GPUs with compute capability >= 3.0" << endl;
+#endif
+
+	/*iterate each GPU*/
+	cudaDeviceProp prop;
+	for (int32_t i = 0; i < numGPUs; ++i) {
+
+		/*get the property of the device; NOTE(review): the return status is not checked*/
+		cudaGetDeviceProperties(&prop, i);
+
+		/*compare major * 10 + minor with the required capability; both branches open the same block*/
+#if defined(HAVE_SM_35)
+		if ((prop.major * 10 + prop.minor) >= 35) {
+#else
+		if ((prop.major * 10 + prop.minor) >= 30) {
+#endif
+			cerr << "GPU " << _gpus.size() << ": " << prop.name
+					<< " (capability " << prop.major << "." << prop.minor << ")"
+					<< endl;
+
+			/*remember the (device index, properties) pair of every qualifying GPU*/
+			_gpus.push_back(make_pair(i, prop));
+		}
+	}
+	/*check the number of qualified GPUs*/
+	if (_gpus.size() == 0) {
+		cerr << "No qualified GPU is available" << endl;
+		return false;
+	}
+
+	/*_gpuIndex is an index into _gpus (the qualifying devices), not a raw CUDA device index*/
+
+	/*clamp _gpuIndex into [0, _gpus.size() - 1]*/
+	if (_gpuIndex >= (int32_t) _gpus.size()) {
+		_gpuIndex = _gpus.size() - 1;
+	}
+	if (_gpuIndex < 0) {
+		_gpuIndex = 0;
+	}
+
+	/*move the selected gpu to the first*/
+	swap(_gpus[0], _gpus[_gpuIndex]);
+
+	return true;
+}
diff --git a/src/Benchmarks/SpMV/ReferenceFormats/LightSpMV-1.0/Options.h b/src/Benchmarks/SpMV/ReferenceFormats/LightSpMV-1.0/Options.h
new file mode 100644
index 0000000000000000000000000000000000000000..877e36352fc89eb5168b56f33018bbd0edcb8c4e
--- /dev/null
+++ b/src/Benchmarks/SpMV/ReferenceFormats/LightSpMV-1.0/Options.h
@@ -0,0 +1,123 @@
+/*
+ * Options.h
+ *
+ *  Created on: Nov 21, 2014
+ *      Author: yongchao
+ */
+
+#ifndef OPTIONS_H_
+#define OPTIONS_H_
+
+#include "Types.h"
+//#include <cusp/io/matrix_market.h>
+
+struct Options { /*command-line settings plus the matrix and vectors loaded on the host*/
+	Options() {
+
+		/*input*/
+		_routine = 1;
+		_formula = 1;
+		_numIters = 1000;
+		_singlePrecision = true;
+
+		/*matrix data*/
+		_numRows = 0;
+		_numCols = 0;
+		_rowOffsets = NULL;
+		_numValues = 0;
+		_colIndexValues = NULL;
+		_numericalValues = NULL;
+		_alpha = 1.0;
+		_beta = 1.0;
+
+		/*vector data*/
+		_vectorX = NULL;
+		_vectorY = NULL;
+
+		/*the number of GPUs*/
+		_numGPUs = 1;
+
+		/*GPU index used*/
+		_gpuIndex = 0;
+
+		/*for debug*/
+		_mean = 0;
+		_variance = 0;
+	}
+	~Options() { /*releases every non-NULL buffer with cudaFreeHost; NOTE(review): copying is not disabled, a copy would presumably free the same buffers twice -- confirm*/
+		if (_rowOffsets) {
+			cudaFreeHost(_rowOffsets);
+		}
+		if (_colIndexValues) {
+			cudaFreeHost(_colIndexValues);
+		}
+		if (_numericalValues) {
+			cudaFreeHost(_numericalValues);
+		}
+
+		if (_vectorX) {
+			cudaFreeHost(_vectorX);
+		}
+		if (_vectorY) {
+			cudaFreeHost(_vectorY);
+		}
+	}
+
+	/*parse parameters*/
+	bool parseArgs(int32_t argc, char* argv[]);
+
+	/*load Matrix Market file*/
+	bool loadMatrixMarketFile(const char* fileName);
+
+	/*load vector*/
+	bool loadVector(const string& fileName, void* vector,
+			const uint32_t maxNumValues);
+
+	/*print out usage*/
+	void printUsage();
+
+	/*get row distribution*/
+	void getRowSizeVariance();
+
+	/*retrieve GPU list*/
+	bool getGPUs();
+
+	/*input files*/
+	string _mmFileName;
+	string _vecXFileName;
+	string _vecYFileName;
+	string _outFileName;
+	bool _singlePrecision; /*true: values are float, false: double (default true)*/
+	int32_t _routine; /*default 1*/
+	int32_t _formula; /*default 1*/
+	int32_t _numIters; /*default 1000*/
+	double _alpha; /*default 1.0*/
+	double _beta; /*default 1.0*/
+
+	/*for debugging*/
+	double _mean;
+	double _variance;
+
+	/*matrix data*/
+	uint32_t _numRows;
+	uint32_t _numCols;
+	uint32_t *_rowOffsets;
+	uint32_t _numValues;
+	uint32_t *_colIndexValues;
+	void *_numericalValues; /*float or double array, depending on _singlePrecision*/
+
+	/*vector data*/
+	void *_vectorX; /*float or double array, depending on _singlePrecision*/
+	void *_vectorY; /*float or double array, depending on _singlePrecision*/
+
+	/*number of GPUs to be used*/
+	int32_t _numGPUs; /*default 1*/
+
+	/*GPU index used*/
+	int32_t _gpuIndex; /*default 0*/
+
+	/*GPU device list*/
+	vector<pair<int32_t, struct cudaDeviceProp> > _gpus; /*(CUDA device index, properties) pairs*/
+};
+
+#endif /* OPTIONS_H_ */
diff --git a/src/Benchmarks/SpMV/ReferenceFormats/LightSpMV-1.0/SpMV.cu b/src/Benchmarks/SpMV/ReferenceFormats/LightSpMV-1.0/SpMV.cu
new file mode 100644
index 0000000000000000000000000000000000000000..c4ef3fda50db8ee47926be4cfadd57a9d2241f0f
--- /dev/null
+++ b/src/Benchmarks/SpMV/ReferenceFormats/LightSpMV-1.0/SpMV.cu
@@ -0,0 +1,874 @@
+/*
+ * SpMV.cu
+ *
+ *  Created on: Nov 21, 2014
+ *      Author: yongchao
+ */
+#include "SpMV.h"
+#include "SpMVCSR.h"
+
+extern __constant__ uint32_t _cudaNumRows;
+
+/*Set up the state shared by all SpMV variants: keep the options, derive
+ * the mean row length and create one stream per GPU.*/
+SpMV::SpMV(Options* opt) {
+	_opt = opt;
+	_numGPUs = _opt->_numGPUs;
+	/*mean number of stored values per row, rounded to nearest; a matrix
+	 * without rows yields 0 instead of converting inf/NaN to an integer*/
+	_meanElementsPerRow = 0;
+	if (_opt->_numRows > 0) {
+		_meanElementsPerRow = (int32_t) rint(
+				(double) _opt->_numValues / _opt->_numRows);
+	}
+
+	/*one row counter slot and one stream slot per GPU; the counters
+	 * themselves are allocated later by loadData()*/
+	_cudaRowCounters.resize(_numGPUs, NULL);
+	_streams.resize(_numGPUs, 0);
+	for (int32_t i = 0; i < _numGPUs; ++i) {
+		cudaSetDevice(_opt->_gpus[i].first);
+		CudaCheckError();
+		cudaStreamCreate(&_streams[i]);
+		CudaCheckError();
+	}
+#if defined(FLOAT_USE_TEXTURE_MEMORY) || defined(DOUBLE_USE_TEXTURE_MEMORY)
+	_texVectorX.resize(_numGPUs, 0);
+#endif
+}
+/*Release the per-GPU resources held by the base class: the stream, the row
+ * counter allocated by loadData() and, if enabled, the texture object.*/
+SpMV::~SpMV() {
+	for (int32_t i = 0; i < _numGPUs; ++i) {
+		cudaSetDevice(_opt->_gpus[i].first);
+		CudaCheckError();
+		cudaStreamDestroy(_streams[i]);
+		CudaCheckError();
+		/*the counter was never released before; cudaFree(NULL) is a no-op*/
+		cudaFree(_cudaRowCounters[i]);
+		CudaCheckError();
+#if defined(FLOAT_USE_TEXTURE_MEMORY) || defined(DOUBLE_USE_TEXTURE_MEMORY)
+		if (_texVectorX[i]) {
+			cudaDestroyTextureObject(_texVectorX[i]);
+		}
+		CudaCheckError();
+#endif
+	}
+}
+
+/*run one SpMV pass using the buffers of GPU slot 0*/
+void SpMV::spmvKernel() {
+	/*zero the row counter of slot 0 before the launch*/
+	cudaMemset(_cudaRowCounters[0], 0, sizeof(uint32_t));
+
+	/*formula 0 is y = Ax; any other value takes the alpha/beta variant*/
+	const bool useBLAS = (_opt->_formula != 0);
+	if (useBLAS) {
+		invokeKernelBLAS(0);
+		return;
+	}
+	invokeKernel(0);
+}
+void SpMV::invokeKernel(const int32_t i) {
+	/*intentionally empty: body for the pure virtual declared in SpMV.h*/
+}
+void SpMV::invokeKernelBLAS(const int32_t i) {
+	/*intentionally empty: body for the pure virtual declared in SpMV.h*/
+}
+
+/*single-precision floating point*/
+SpMVFloatVector::SpMVFloatVector(Options* opt) : SpMV(opt) {
+	/*scaling factors for the alpha/beta kernels, taken from the options*/
+	_alpha = _opt->_alpha;
+	_beta = _opt->_beta;
+
+	/*one NULL device pointer per GPU; loadData() fills these slots later*/
+	_vectorX.resize(_numGPUs, NULL);
+	_vectorY.resize(_numGPUs, NULL);
+	_numericalValues.resize(_numGPUs, NULL);
+	_colIndexValues.resize(_numGPUs, NULL);
+	_rowOffsets.resize(_numGPUs, NULL);
+}
+/*Free the device copies of the matrix and of both vectors on every GPU.*/
+SpMVFloatVector::~SpMVFloatVector() {
+	for (int32_t i = 0; i < _numGPUs; ++i) {
+		/*select the device the buffers of slot i were allocated on*/
+		cudaSetDevice(_opt->_gpus[i].first);
+		CudaCheckError();
+
+		/*matrix data*/
+		if (_rowOffsets[i]) {
+			cudaFree(_rowOffsets[i]);
+		}
+		if (_colIndexValues[i]) {
+			cudaFree(_colIndexValues[i]);
+		}
+		if (_numericalValues[i]) {
+			cudaFree(_numericalValues[i]);
+		}
+		/*loadData() allocates vector Y on each GPU, so release it on each
+		 * one as well; an (i == 0) guard would leak the other copies*/
+		if (_vectorY[i]) {
+			cudaFree(_vectorY[i]);
+		}
+		if (_vectorX[i]) {
+			cudaFree(_vectorX[i]);
+		}
+	}
+}
+/*Upload the CSR matrix and both vectors to every GPU in use.
+ *
+ * For each GPU this allocates the row counter, copies the matrix
+ * dimensions to the device symbols and creates device copies of the row
+ * offsets, column indices, values, vector X and vector Y. When
+ * FLOAT_USE_TEXTURE_MEMORY is defined a texture object is created over
+ * the device copy of vector X.*/
+void SpMVFloatVector::loadData() {
+	/*host-side sizes, read once from the options*/
+	const uint32_t numRows = _opt->_numRows;
+	const uint32_t numCols = _opt->_numCols;
+	const uint32_t numValues = _opt->_numValues;
+	size_t bytes;
+
+#ifdef FLOAT_USE_TEXTURE_MEMORY
+	/*texture settings shared by all GPUs*/
+	cudaResourceDesc resDesc;
+	cudaTextureDesc texDesc;
+
+	memset(&texDesc, 0, sizeof(texDesc));
+	texDesc.readMode = cudaReadModeElementType;
+	texDesc.filterMode = cudaFilterModePoint;
+	texDesc.addressMode[0] = cudaAddressModeClamp;
+	texDesc.addressMode[1] = cudaAddressModeClamp;
+#endif
+
+	for (int32_t i = 0; i < _numGPUs; ++i) {
+		/*make GPU i the current device for the calls below*/
+		cudaSetDevice(_opt->_gpus[i].first);
+		CudaCheckError();
+
+		/*row counter handed to the kernels*/
+		cudaMalloc(&_cudaRowCounters[i], sizeof(uint32_t));
+		CudaCheckError();
+
+		/*matrix dimensions are copied to the device symbols*/
+		cudaMemcpyToSymbol(_cudaNumRows, &_opt->_numRows, sizeof(uint32_t));
+		CudaCheckError();
+
+		cudaMemcpyToSymbol(_cudaNumCols, &_opt->_numCols, sizeof(uint32_t));
+		CudaCheckError();
+
+		/*row offsets: numRows + 1 entries*/
+		bytes = (numRows + 1) * sizeof(uint32_t);
+		cudaMalloc(&_rowOffsets[i], bytes);
+		CudaCheckError();
+		cudaMemcpy(_rowOffsets[i], _opt->_rowOffsets, bytes,
+				cudaMemcpyHostToDevice);
+		CudaCheckError();
+
+		/*column indices: one per stored value*/
+		bytes = numValues * sizeof(uint32_t);
+		cudaMalloc(&_colIndexValues[i], bytes);
+		CudaCheckError();
+		cudaMemcpy(_colIndexValues[i], _opt->_colIndexValues, bytes,
+				cudaMemcpyHostToDevice);
+		CudaCheckError();
+
+		/*numerical values: one float per stored value*/
+		bytes = numValues * sizeof(float);
+		cudaMalloc(&_numericalValues[i], bytes);
+		CudaCheckError();
+		cudaMemcpy(_numericalValues[i], _opt->_numericalValues, bytes,
+				cudaMemcpyHostToDevice);
+		CudaCheckError();
+
+		/*vector X: numCols floats*/
+		bytes = numCols * sizeof(float);
+		cudaMalloc(&_vectorX[i], bytes);
+		CudaCheckError();
+		cudaMemcpy(_vectorX[i], _opt->_vectorX, bytes,
+				cudaMemcpyHostToDevice);
+		CudaCheckError();
+
+#ifdef FLOAT_USE_TEXTURE_MEMORY
+		/*texture object over the device copy of vector X; bytes still
+		 * holds the size of vector X at this point*/
+		memset(&resDesc, 0, sizeof(resDesc));
+		resDesc.resType = cudaResourceTypeLinear;
+		resDesc.res.linear.devPtr = _vectorX[i];
+		resDesc.res.linear.sizeInBytes = bytes;
+		resDesc.res.linear.desc = cudaCreateChannelDesc(32, 0, 0, 0,
+				cudaChannelFormatKindFloat);
+		cudaCreateTextureObject(&_texVectorX[i], &resDesc, &texDesc, NULL);
+		CudaCheckError();
+#endif
+
+		/*vector Y: numRows floats, initialised from the host copy*/
+		bytes = numRows * sizeof(float);
+		cudaMalloc(&_vectorY[i], bytes);
+		CudaCheckError();
+		cudaMemcpy(_vectorY[i], _opt->_vectorY, bytes,
+				cudaMemcpyHostToDevice);
+		CudaCheckError();
+	}
+}
+/*Copy vector Y back from GPU slot 0 into the host buffer and, when an
+ * output file name was given, write it to that file, one value per line.*/
+void SpMVFloatVector::storeData() {
+	const uint64_t bytes = _opt->_numRows * sizeof(float);
+
+	/*the result is read from the buffers of GPU slot 0*/
+	cudaSetDevice(_opt->_gpus[0].first);
+	CudaCheckError();
+	cudaMemcpy(_opt->_vectorY, _vectorY[0], bytes, cudaMemcpyDeviceToHost);
+	CudaCheckError();
+
+	/*no output file requested: the host copy is the only result*/
+	if (_opt->_outFileName.empty()) {
+		return;
+	}
+
+	/*open the output file for writing*/
+	FILE* out = fopen(_opt->_outFileName.c_str(), "w");
+	if (out == NULL) {
+		cerr << "Failed to open file: " << _opt->_outFileName << endl;
+		return;
+	}
+
+	/*one value per line, printed with "%f"*/
+	const float* values = (const float*) _opt->_vectorY;
+	const uint32_t numRows = _opt->_numRows;
+	for (uint32_t row = 0; row < numRows; ++row) {
+		fprintf(out, "%f\n", values[row]);
+	}
+
+	/*the handle is only closed when it is not stdout*/
+	if (out != stdout) {
+		fclose(out);
+	}
+}
+void SpMVFloatVector::invokeKernel(const int32_t i) {
+	int32_t numThreadsPerBlock;
+	int32_t numThreadBlocks;
+	/*launch csr32DynamicVector (y = Ax) on the buffers of GPU slot i*/
+	/*block size and block count are derived from the properties of GPU i*/
+	getKernelGridInfo(i, numThreadsPerBlock, numThreadBlocks);
+
+	/*pick the instantiation by mean row length; X is passed as texture object or raw pointer depending on the build*/
+#ifdef FLOAT_USE_TEXTURE_MEMORY
+	if (_meanElementsPerRow <= 2) {
+		spmv_csr::csr32DynamicVector<float, 2, MAX_NUM_THREADS_PER_BLOCK / 2><<<
+				numThreadBlocks, numThreadsPerBlock>>>(_cudaRowCounters[i],
+				_rowOffsets[i], _colIndexValues[i], _numericalValues[i],
+				_texVectorX[i], _vectorY[i]);
+	} else if (_meanElementsPerRow <= 4) {
+		spmv_csr::csr32DynamicVector<float, 4, MAX_NUM_THREADS_PER_BLOCK / 4><<<
+				numThreadBlocks, numThreadsPerBlock>>>(_cudaRowCounters[i],
+				_rowOffsets[i], _colIndexValues[i], _numericalValues[i],
+				_texVectorX[i], _vectorY[i]);
+	} else if (_meanElementsPerRow <= 64) {
+		spmv_csr::csr32DynamicVector<float, 8, MAX_NUM_THREADS_PER_BLOCK / 8><<<
+				numThreadBlocks, numThreadsPerBlock>>>(_cudaRowCounters[i],
+				_rowOffsets[i], _colIndexValues[i], _numericalValues[i],
+				_texVectorX[i], _vectorY[i]);
+	} else {
+		spmv_csr::csr32DynamicVector<float, 32, MAX_NUM_THREADS_PER_BLOCK / 32><<<
+				numThreadBlocks, numThreadsPerBlock>>>(_cudaRowCounters[i],
+				_rowOffsets[i], _colIndexValues[i], _numericalValues[i],
+				_texVectorX[i], _vectorY[i]);
+	}
+#else
+	if (_meanElementsPerRow <= 2) {
+		spmv_csr::csr32DynamicVector<float, 2, MAX_NUM_THREADS_PER_BLOCK / 2><<<
+		numThreadBlocks, numThreadsPerBlock>>>(
+				_cudaRowCounters[i], _rowOffsets[i], _colIndexValues[i],
+				_numericalValues[i], _vectorX[i], _vectorY[i]);
+	} else if (_meanElementsPerRow <= 4) {
+		spmv_csr::csr32DynamicVector<float, 4, MAX_NUM_THREADS_PER_BLOCK / 4><<<
+		numThreadBlocks, numThreadsPerBlock>>>(
+				_cudaRowCounters[i], _rowOffsets[i], _colIndexValues[i],
+				_numericalValues[i], _vectorX[i], _vectorY[i]);
+	} else if(_meanElementsPerRow <= 64) {
+		spmv_csr::csr32DynamicVector<float, 8, MAX_NUM_THREADS_PER_BLOCK / 8><<<
+		numThreadBlocks, numThreadsPerBlock>>>(
+				_cudaRowCounters[i], _rowOffsets[i], _colIndexValues[i],
+				_numericalValues[i], _vectorX[i], _vectorY[i]);
+	} else {
+		spmv_csr::csr32DynamicVector<float, 32, MAX_NUM_THREADS_PER_BLOCK / 32><<<
+		numThreadBlocks, numThreadsPerBlock>>>(
+				_cudaRowCounters[i], _rowOffsets[i], _colIndexValues[i],
+				_numericalValues[i], _vectorX[i], _vectorY[i]);
+	}
+
+#endif
+}
+
+void SpMVFloatVector::invokeKernelBLAS(const int32_t i) {
+	int32_t numThreadsPerBlock;
+	int32_t numThreadBlocks;
+	/*launch csr32DynamicVectorBLAS (alpha/beta form) on the buffers of GPU slot i*/
+	/*block size and block count are derived from the properties of GPU i*/
+	getKernelGridInfo(i, numThreadsPerBlock, numThreadBlocks);
+
+	/*pick the instantiation by mean row length; X is passed as texture object or raw pointer depending on the build*/
+#ifdef FLOAT_USE_TEXTURE_MEMORY
+	if (_meanElementsPerRow <= 2) {
+		spmv_csr::csr32DynamicVectorBLAS<float, 2, MAX_NUM_THREADS_PER_BLOCK / 2><<<
+				numThreadBlocks, numThreadsPerBlock>>>(_cudaRowCounters[i],
+				_rowOffsets[i], _colIndexValues[i], _numericalValues[i],
+				_texVectorX[i], _vectorY[i], _alpha, _beta);
+	} else if (_meanElementsPerRow <= 4) {
+		spmv_csr::csr32DynamicVectorBLAS<float, 4, MAX_NUM_THREADS_PER_BLOCK / 4><<<
+				numThreadBlocks, numThreadsPerBlock>>>(_cudaRowCounters[i],
+				_rowOffsets[i], _colIndexValues[i], _numericalValues[i],
+				_texVectorX[i], _vectorY[i], _alpha, _beta);
+	} else if (_meanElementsPerRow <= 64) {
+		spmv_csr::csr32DynamicVectorBLAS<float, 8, MAX_NUM_THREADS_PER_BLOCK / 8><<<
+				numThreadBlocks, numThreadsPerBlock>>>(_cudaRowCounters[i],
+				_rowOffsets[i], _colIndexValues[i], _numericalValues[i],
+				_texVectorX[i], _vectorY[i], _alpha, _beta);
+	} else {
+		spmv_csr::csr32DynamicVectorBLAS<float, 32,
+				MAX_NUM_THREADS_PER_BLOCK / 32><<<numThreadBlocks,
+				numThreadsPerBlock>>>(_cudaRowCounters[i], _rowOffsets[i],
+				_colIndexValues[i], _numericalValues[i], _texVectorX[i],
+				_vectorY[i], _alpha, _beta);
+	}
+#else
+	if (_meanElementsPerRow <= 2) {
+		spmv_csr::csr32DynamicVectorBLAS<float, 2, MAX_NUM_THREADS_PER_BLOCK / 2><<<
+		numThreadBlocks, numThreadsPerBlock>>>(
+				_cudaRowCounters[i], _rowOffsets[i], _colIndexValues[i],
+				_numericalValues[i], _vectorX[i], _vectorY[i], _alpha, _beta);
+	} else if (_meanElementsPerRow <= 4) {
+		spmv_csr::csr32DynamicVectorBLAS<float, 4, MAX_NUM_THREADS_PER_BLOCK / 4><<<
+		numThreadBlocks, numThreadsPerBlock>>>(
+				_cudaRowCounters[i], _rowOffsets[i], _colIndexValues[i],
+				_numericalValues[i], _vectorX[i], _vectorY[i], _alpha, _beta);
+	} else if(_meanElementsPerRow <= 64) {
+		spmv_csr::csr32DynamicVectorBLAS<float, 8, MAX_NUM_THREADS_PER_BLOCK / 8><<<
+		numThreadBlocks, numThreadsPerBlock>>>(
+				_cudaRowCounters[i], _rowOffsets[i], _colIndexValues[i],
+				_numericalValues[i], _vectorX[i], _vectorY[i], _alpha, _beta);
+	} else {
+		spmv_csr::csr32DynamicVectorBLAS<float, 32, MAX_NUM_THREADS_PER_BLOCK / 32><<<
+		numThreadBlocks, numThreadsPerBlock>>>(
+				_cudaRowCounters[i], _rowOffsets[i], _colIndexValues[i],
+				_numericalValues[i], _vectorX[i], _vectorY[i], _alpha, _beta);
+	}
+
+#endif
+}
+
+void SpMVFloatWarp::invokeKernel(const int32_t i) {
+	int32_t numThreadsPerBlock;
+	int32_t numThreadBlocks;
+	/*launch csr32DynamicWarp (y = Ax) on the buffers of GPU slot i*/
+	/*block size and block count are derived from the properties of GPU i*/
+	getKernelGridInfo(i, numThreadsPerBlock, numThreadBlocks);
+
+	/*pick the instantiation by mean row length; X is passed as texture object or raw pointer depending on the build*/
+#ifdef FLOAT_USE_TEXTURE_MEMORY
+	if (_meanElementsPerRow <= 2) {
+		spmv_csr::csr32DynamicWarp<float, 2, MAX_NUM_THREADS_PER_BLOCK / 2><<<
+				numThreadBlocks, numThreadsPerBlock>>>(_cudaRowCounters[i],
+				_rowOffsets[i], _colIndexValues[i], _numericalValues[i],
+				_texVectorX[i], _vectorY[i]);
+	} else if (_meanElementsPerRow <= 4) {
+		spmv_csr::csr32DynamicWarp<float, 4, MAX_NUM_THREADS_PER_BLOCK / 4><<<
+				numThreadBlocks, numThreadsPerBlock>>>(_cudaRowCounters[i],
+				_rowOffsets[i], _colIndexValues[i], _numericalValues[i],
+				_texVectorX[i], _vectorY[i]);
+	} else if (_meanElementsPerRow <= 64) {
+		spmv_csr::csr32DynamicWarp<float, 8, MAX_NUM_THREADS_PER_BLOCK / 8><<<
+				numThreadBlocks, numThreadsPerBlock>>>(_cudaRowCounters[i],
+				_rowOffsets[i], _colIndexValues[i], _numericalValues[i],
+				_texVectorX[i], _vectorY[i]);
+	} else {
+		spmv_csr::csr32DynamicWarp<float, 32, MAX_NUM_THREADS_PER_BLOCK / 32><<<
+				numThreadBlocks, numThreadsPerBlock>>>(_cudaRowCounters[i],
+				_rowOffsets[i], _colIndexValues[i], _numericalValues[i],
+				_texVectorX[i], _vectorY[i]);
+	}
+#else
+	if (_meanElementsPerRow <= 2) {
+		spmv_csr::csr32DynamicWarp<float, 2, MAX_NUM_THREADS_PER_BLOCK / 2><<<
+		numThreadBlocks, numThreadsPerBlock>>>(
+				_cudaRowCounters[i], _rowOffsets[i], _colIndexValues[i],
+				_numericalValues[i], _vectorX[i],_vectorY[i]);
+	} else if (_meanElementsPerRow <= 4) {
+		spmv_csr::csr32DynamicWarp<float, 4, MAX_NUM_THREADS_PER_BLOCK / 4><<<
+		numThreadBlocks, numThreadsPerBlock>>>(
+				_cudaRowCounters[i], _rowOffsets[i], _colIndexValues[i],
+				_numericalValues[i], _vectorX[i], _vectorY[i]);
+	} else if(_meanElementsPerRow <= 64) {
+		spmv_csr::csr32DynamicWarp<float, 8, MAX_NUM_THREADS_PER_BLOCK / 8><<<
+		numThreadBlocks, numThreadsPerBlock>>>(
+				_cudaRowCounters[i], _rowOffsets[i], _colIndexValues[i],
+				_numericalValues[i], _vectorX[i], _vectorY[i]);
+	} else {
+		spmv_csr::csr32DynamicWarp<float, 32, MAX_NUM_THREADS_PER_BLOCK / 32><<<
+		numThreadBlocks, numThreadsPerBlock>>>(
+				_cudaRowCounters[i], _rowOffsets[i], _colIndexValues[i],
+				_numericalValues[i], _vectorX[i], _vectorY[i]);
+	}
+
+#endif
+}
+
+void SpMVFloatWarp::invokeKernelBLAS(const int32_t i) {
+	int32_t numThreadsPerBlock;
+	int32_t numThreadBlocks;
+	/*launch csr32DynamicWarpBLAS (alpha/beta form) on the buffers of GPU slot i*/
+	/*block size and block count are derived from the properties of GPU i*/
+	getKernelGridInfo(i, numThreadsPerBlock, numThreadBlocks);
+
+	/*pick the instantiation by mean row length; X is passed as texture object or raw pointer depending on the build*/
+#ifdef FLOAT_USE_TEXTURE_MEMORY
+	if (_meanElementsPerRow <= 2) {
+		spmv_csr::csr32DynamicWarpBLAS<float, 2, MAX_NUM_THREADS_PER_BLOCK / 2><<<
+				numThreadBlocks, numThreadsPerBlock>>>(_cudaRowCounters[i],
+				_rowOffsets[i], _colIndexValues[i], _numericalValues[i],
+				_texVectorX[i], _vectorY[i], _alpha, _beta);
+	} else if (_meanElementsPerRow <= 4) {
+		spmv_csr::csr32DynamicWarpBLAS<float, 4, MAX_NUM_THREADS_PER_BLOCK / 4><<<
+				numThreadBlocks, numThreadsPerBlock>>>(_cudaRowCounters[i],
+				_rowOffsets[i], _colIndexValues[i], _numericalValues[i],
+				_texVectorX[i], _vectorY[i], _alpha, _beta);
+	} else if (_meanElementsPerRow <= 64) {
+		spmv_csr::csr32DynamicWarpBLAS<float, 8, MAX_NUM_THREADS_PER_BLOCK / 8><<<
+				numThreadBlocks, numThreadsPerBlock>>>(_cudaRowCounters[i],
+				_rowOffsets[i], _colIndexValues[i], _numericalValues[i],
+				_texVectorX[i], _vectorY[i], _alpha, _beta);
+	} else {
+		spmv_csr::csr32DynamicWarpBLAS<float, 32, MAX_NUM_THREADS_PER_BLOCK / 32><<<
+				numThreadBlocks, numThreadsPerBlock>>>(_cudaRowCounters[i],
+				_rowOffsets[i], _colIndexValues[i], _numericalValues[i],
+				_texVectorX[i], _vectorY[i], _alpha, _beta);
+	}
+#else
+	if (_meanElementsPerRow <= 2) {
+		spmv_csr::csr32DynamicWarpBLAS<float, 2, MAX_NUM_THREADS_PER_BLOCK / 2><<<
+		numThreadBlocks, numThreadsPerBlock>>>(
+				_cudaRowCounters[i], _rowOffsets[i], _colIndexValues[i],
+				_numericalValues[i], _vectorX[i],_vectorY[i], _alpha, _beta);
+	} else if (_meanElementsPerRow <= 4) {
+		spmv_csr::csr32DynamicWarpBLAS<float, 4, MAX_NUM_THREADS_PER_BLOCK / 4><<<
+		numThreadBlocks, numThreadsPerBlock>>>(
+				_cudaRowCounters[i], _rowOffsets[i], _colIndexValues[i],
+				_numericalValues[i], _vectorX[i], _vectorY[i], _alpha, _beta);
+	} else if(_meanElementsPerRow <= 64) {
+		spmv_csr::csr32DynamicWarpBLAS<float, 8, MAX_NUM_THREADS_PER_BLOCK / 8><<<
+		numThreadBlocks, numThreadsPerBlock>>>(
+				_cudaRowCounters[i], _rowOffsets[i], _colIndexValues[i],
+				_numericalValues[i], _vectorX[i], _vectorY[i], _alpha, _beta);
+	} else {
+		spmv_csr::csr32DynamicWarpBLAS<float, 32, MAX_NUM_THREADS_PER_BLOCK / 32><<<
+		numThreadBlocks, numThreadsPerBlock>>>(
+				_cudaRowCounters[i], _rowOffsets[i], _colIndexValues[i],
+				_numericalValues[i], _vectorX[i], _vectorY[i], _alpha, _beta);
+	}
+
+#endif
+}
+
+/*double-precision floating point*/
+/*Create the double-precision driver: base-class setup plus one NULL device
+ * pointer slot per GPU for each buffer.*/
+SpMVDoubleVector::SpMVDoubleVector(Options* opt) : SpMV(opt) {
+	/*scaling factors for the alpha/beta kernels, taken from the options*/
+	_alpha = _opt->_alpha;
+	_beta = _opt->_beta;
+
+	/*loadData() fills these slots later*/
+	_vectorX.resize(_numGPUs, NULL);
+	_vectorY.resize(_numGPUs, NULL);
+	_numericalValues.resize(_numGPUs, NULL);
+	_colIndexValues.resize(_numGPUs, NULL);
+	_rowOffsets.resize(_numGPUs, NULL);
+}
+/*Free the device copies of the matrix and of both vectors on every GPU.*/
+SpMVDoubleVector::~SpMVDoubleVector() {
+	for (int32_t i = 0; i < _numGPUs; ++i) {
+		/*select the device the buffers of slot i were allocated on*/
+		cudaSetDevice(_opt->_gpus[i].first);
+		CudaCheckError();
+
+		/*matrix data*/
+		if (_rowOffsets[i]) {
+			cudaFree(_rowOffsets[i]);
+		}
+		if (_colIndexValues[i]) {
+			cudaFree(_colIndexValues[i]);
+		}
+		if (_numericalValues[i]) {
+			cudaFree(_numericalValues[i]);
+		}
+		/*loadData() allocates vector Y on each GPU, so release it on each
+		 * one as well; an (i == 0) guard would leak the other copies*/
+		if (_vectorY[i]) {
+			cudaFree(_vectorY[i]);
+		}
+		if (_vectorX[i]) {
+			cudaFree(_vectorX[i]);
+		}
+	}
+}
+/*Upload the CSR matrix and both vectors to every GPU in use.
+ *
+ * For each GPU this allocates the row counter, copies the matrix
+ * dimensions to the device symbols and creates device copies of the row
+ * offsets, column indices, values, vector X and vector Y. When
+ * DOUBLE_USE_TEXTURE_MEMORY is defined a texture object is created over
+ * the device copy of vector X.*/
+void SpMVDoubleVector::loadData() {
+	/*host-side sizes, read once from the options*/
+	const uint32_t numRows = _opt->_numRows;
+	const uint32_t numCols = _opt->_numCols;
+	const uint32_t numValues = _opt->_numValues;
+	size_t bytes;
+
+#ifdef DOUBLE_USE_TEXTURE_MEMORY
+	/*texture settings shared by all GPUs*/
+	cudaResourceDesc resDesc;
+	cudaTextureDesc texDesc;
+
+	memset(&texDesc, 0, sizeof(texDesc));
+	texDesc.readMode = cudaReadModeElementType;
+	texDesc.filterMode = cudaFilterModePoint;
+	texDesc.addressMode[0] = cudaAddressModeClamp;
+	texDesc.addressMode[1] = cudaAddressModeClamp;
+#endif
+
+	for (int32_t i = 0; i < _numGPUs; ++i) {
+		/*make GPU i the current device for the calls below*/
+		cudaSetDevice(_opt->_gpus[i].first);
+		CudaCheckError();
+
+		/*row counter handed to the kernels*/
+		cudaMalloc(&_cudaRowCounters[i], sizeof(uint32_t));
+		CudaCheckError();
+
+		/*matrix dimensions are copied to the device symbols*/
+		cudaMemcpyToSymbol(_cudaNumRows, &_opt->_numRows, sizeof(uint32_t));
+		CudaCheckError();
+
+		cudaMemcpyToSymbol(_cudaNumCols, &_opt->_numCols, sizeof(uint32_t));
+		CudaCheckError();
+
+		/*row offsets: numRows + 1 entries*/
+		bytes = (numRows + 1) * sizeof(uint32_t);
+		cudaMalloc(&_rowOffsets[i], bytes);
+		CudaCheckError();
+		cudaMemcpy(_rowOffsets[i], _opt->_rowOffsets, bytes,
+				cudaMemcpyHostToDevice);
+		CudaCheckError();
+
+		/*column indices: one per stored value*/
+		bytes = numValues * sizeof(uint32_t);
+		cudaMalloc(&_colIndexValues[i], bytes);
+		CudaCheckError();
+		cudaMemcpy(_colIndexValues[i], _opt->_colIndexValues, bytes,
+				cudaMemcpyHostToDevice);
+		CudaCheckError();
+
+		/*numerical values: one double per stored value*/
+		bytes = numValues * sizeof(double);
+		cudaMalloc(&_numericalValues[i], bytes);
+		CudaCheckError();
+		cudaMemcpy(_numericalValues[i], _opt->_numericalValues, bytes,
+				cudaMemcpyHostToDevice);
+		CudaCheckError();
+
+		/*vector X: numCols doubles*/
+		bytes = numCols * sizeof(double);
+		cudaMalloc(&_vectorX[i], bytes);
+		CudaCheckError();
+		cudaMemcpy(_vectorX[i], _opt->_vectorX, bytes,
+				cudaMemcpyHostToDevice);
+		CudaCheckError();
+
+#ifdef DOUBLE_USE_TEXTURE_MEMORY
+		/*texture object over the device copy of vector X, described as two
+		 * 32-bit signed channels; bytes still holds the size of vector X*/
+		memset(&resDesc, 0, sizeof(resDesc));
+		resDesc.resType = cudaResourceTypeLinear;
+		resDesc.res.linear.devPtr = _vectorX[i];
+		resDesc.res.linear.sizeInBytes = bytes;
+		resDesc.res.linear.desc = cudaCreateChannelDesc(32, 32, 0, 0,
+				cudaChannelFormatKindSigned);
+		cudaCreateTextureObject(&_texVectorX[i], &resDesc, &texDesc, NULL);
+		CudaCheckError();
+#endif
+
+		/*vector Y: numRows doubles on this GPU, initialised from the host*/
+		bytes = numRows * sizeof(double);
+		cudaMalloc(&_vectorY[i], bytes);
+		CudaCheckError();
+		cudaMemcpy(_vectorY[i], _opt->_vectorY, bytes,
+				cudaMemcpyHostToDevice);
+		CudaCheckError();
+	}
+}
+/*Copy vector Y back from GPU slot 0 into the host buffer and, when an
+ * output file name was given, write it to that file, one value per line.*/
+void SpMVDoubleVector::storeData() {
+	const uint64_t bytes = _opt->_numRows * sizeof(double);
+
+	/*the result is read from the buffers of GPU slot 0*/
+	cudaSetDevice(_opt->_gpus[0].first);
+	CudaCheckError();
+	cudaMemcpy(_opt->_vectorY, _vectorY[0], bytes, cudaMemcpyDeviceToHost);
+	CudaCheckError();
+
+	/*no output file requested: the host copy is the only result*/
+	if (_opt->_outFileName.empty()) {
+		return;
+	}
+
+	/*open the output file for writing*/
+	FILE* out = fopen(_opt->_outFileName.c_str(), "w");
+	if (out == NULL) {
+		cerr << "Failed to open file: " << _opt->_outFileName << endl;
+		return;
+	}
+
+	/*one value per line, printed with "%lf"*/
+	const double* values = (const double*) _opt->_vectorY;
+	const uint32_t numRows = _opt->_numRows;
+	for (uint32_t row = 0; row < numRows; ++row) {
+		fprintf(out, "%lf\n", values[row]);
+	}
+
+	/*the handle is only closed when it is not stdout*/
+	if (out != stdout) {
+		fclose(out);
+	}
+}
+void SpMVDoubleVector::invokeKernel(const int32_t i) {
+	int32_t numThreadsPerBlock;
+	int32_t numThreadBlocks;
+	/*launch csr64DynamicVector (y = Ax) on the buffers of GPU slot i*/
+	/*block size and block count are derived from the properties of GPU i*/
+	getKernelGridInfo(i, numThreadsPerBlock, numThreadBlocks);
+
+	/*pick the instantiation by mean row length; X is passed as texture object or raw pointer depending on the build*/
+#ifdef DOUBLE_USE_TEXTURE_MEMORY
+	if (_meanElementsPerRow <= 2) {
+		spmv_csr::csr64DynamicVector<double, 2, MAX_NUM_THREADS_PER_BLOCK / 2><<<
+				numThreadBlocks, numThreadsPerBlock>>>(_cudaRowCounters[i],
+				_rowOffsets[i], _colIndexValues[i], _numericalValues[i],
+				_texVectorX[i], _vectorY[i]);
+	} else if (_meanElementsPerRow <= 4) {
+		spmv_csr::csr64DynamicVector<double, 4, MAX_NUM_THREADS_PER_BLOCK / 4><<<
+				numThreadBlocks, numThreadsPerBlock>>>(_cudaRowCounters[i],
+				_rowOffsets[i], _colIndexValues[i], _numericalValues[i],
+				_texVectorX[i], _vectorY[i]);
+	} else if (_meanElementsPerRow <= 64) {
+		spmv_csr::csr64DynamicVector<double, 8, MAX_NUM_THREADS_PER_BLOCK / 8><<<
+				numThreadBlocks, numThreadsPerBlock>>>(_cudaRowCounters[i],
+				_rowOffsets[i], _colIndexValues[i], _numericalValues[i],
+				_texVectorX[i], _vectorY[i]);
+	} else {
+		spmv_csr::csr64DynamicVector<double, 32, MAX_NUM_THREADS_PER_BLOCK / 32><<<
+				numThreadBlocks, numThreadsPerBlock>>>(_cudaRowCounters[i],
+				_rowOffsets[i], _colIndexValues[i], _numericalValues[i],
+				_texVectorX[i], _vectorY[i]);
+	}
+#else
+	if (_meanElementsPerRow <= 2) {
+		spmv_csr::csr64DynamicVector<double, 2, MAX_NUM_THREADS_PER_BLOCK / 2><<<
+		numThreadBlocks, numThreadsPerBlock>>>(
+				_cudaRowCounters[i], _rowOffsets[i], _colIndexValues[i],
+				_numericalValues[i], _vectorX[i], _vectorY[i]);
+	} else if (_meanElementsPerRow <= 4) {
+		spmv_csr::csr64DynamicVector<double, 4, MAX_NUM_THREADS_PER_BLOCK / 4><<<
+		numThreadBlocks, numThreadsPerBlock>>>(
+				_cudaRowCounters[i], _rowOffsets[i], _colIndexValues[i],
+				_numericalValues[i], _vectorX[i], _vectorY[i]);
+	} else if(_meanElementsPerRow <= 64) {
+		spmv_csr::csr64DynamicVector<double, 8, MAX_NUM_THREADS_PER_BLOCK / 8><<<
+		numThreadBlocks, numThreadsPerBlock>>>(
+				_cudaRowCounters[i], _rowOffsets[i], _colIndexValues[i],
+				_numericalValues[i], _vectorX[i], _vectorY[i]);
+	} else {
+		spmv_csr::csr64DynamicVector<double, 32, MAX_NUM_THREADS_PER_BLOCK / 32><<<
+		numThreadBlocks, numThreadsPerBlock>>>(
+				_cudaRowCounters[i], _rowOffsets[i], _colIndexValues[i],
+				_numericalValues[i], _vectorX[i], _vectorY[i]);
+	}
+
+#endif
+}
+
+void SpMVDoubleVector::invokeKernelBLAS(const int32_t i) {
+	int32_t numThreadsPerBlock;
+	int32_t numThreadBlocks;
+	/*launch csr64DynamicVectorBLAS (alpha/beta form) on the buffers of GPU slot i*/
+	/*block size and block count are derived from the properties of GPU i*/
+	getKernelGridInfo(i, numThreadsPerBlock, numThreadBlocks);
+
+	/*pick the instantiation by mean row length; _vectorY[i] is passed twice, presumably as input and output - confirm against SpMVCSR.h*/
+#ifdef DOUBLE_USE_TEXTURE_MEMORY
+	if (_meanElementsPerRow <= 2) {
+		spmv_csr::csr64DynamicVectorBLAS<double, 2,
+				MAX_NUM_THREADS_PER_BLOCK / 2><<<numThreadBlocks,
+				numThreadsPerBlock>>>(_cudaRowCounters[i], _rowOffsets[i],
+				_colIndexValues[i], _numericalValues[i], _texVectorX[i],
+				_vectorY[i], _vectorY[i], _alpha, _beta);
+	} else if (_meanElementsPerRow <= 4) {
+		spmv_csr::csr64DynamicVectorBLAS<double, 4,
+				MAX_NUM_THREADS_PER_BLOCK / 4><<<numThreadBlocks,
+				numThreadsPerBlock>>>(_cudaRowCounters[i], _rowOffsets[i],
+				_colIndexValues[i], _numericalValues[i], _texVectorX[i],
+				_vectorY[i], _vectorY[i], _alpha, _beta);
+	} else if (_meanElementsPerRow <= 64) {
+		spmv_csr::csr64DynamicVectorBLAS<double, 8,
+				MAX_NUM_THREADS_PER_BLOCK / 8><<<numThreadBlocks,
+				numThreadsPerBlock>>>(_cudaRowCounters[i], _rowOffsets[i],
+				_colIndexValues[i], _numericalValues[i], _texVectorX[i],
+				_vectorY[i], _vectorY[i], _alpha, _beta);
+	} else {
+		spmv_csr::csr64DynamicVectorBLAS<double, 32,
+				MAX_NUM_THREADS_PER_BLOCK / 32><<<numThreadBlocks,
+				numThreadsPerBlock>>>(_cudaRowCounters[i], _rowOffsets[i],
+				_colIndexValues[i], _numericalValues[i], _texVectorX[i],
+				_vectorY[i], _vectorY[i], _alpha, _beta);
+	}
+#else
+	if (_meanElementsPerRow <= 2) {
+		spmv_csr::csr64DynamicVectorBLAS<double, 2, MAX_NUM_THREADS_PER_BLOCK / 2><<<
+		numThreadBlocks, numThreadsPerBlock>>>(
+				_cudaRowCounters[i], _rowOffsets[i], _colIndexValues[i],
+				_numericalValues[i], _vectorX[i], _vectorY[i], _vectorY[i], _alpha, _beta);
+	} else if (_meanElementsPerRow <= 4) {
+		spmv_csr::csr64DynamicVectorBLAS<double, 4, MAX_NUM_THREADS_PER_BLOCK / 4><<<
+		numThreadBlocks, numThreadsPerBlock>>>(
+				_cudaRowCounters[i], _rowOffsets[i], _colIndexValues[i],
+				_numericalValues[i], _vectorX[i], _vectorY[i], _vectorY[i], _alpha, _beta);
+	} else if(_meanElementsPerRow <= 64) {
+		spmv_csr::csr64DynamicVectorBLAS<double, 8, MAX_NUM_THREADS_PER_BLOCK / 8><<<
+		numThreadBlocks, numThreadsPerBlock>>>(
+				_cudaRowCounters[i], _rowOffsets[i], _colIndexValues[i],
+				_numericalValues[i], _vectorX[i], _vectorY[i], _vectorY[i], _alpha, _beta);
+	} else {
+		spmv_csr::csr64DynamicVectorBLAS<double, 32, MAX_NUM_THREADS_PER_BLOCK / 32><<<
+		numThreadBlocks, numThreadsPerBlock>>>(
+				_cudaRowCounters[i], _rowOffsets[i], _colIndexValues[i],
+				_numericalValues[i], _vectorX[i], _vectorY[i], _vectorY[i], _alpha, _beta);
+	}
+#endif
+}
+
+void SpMVDoubleWarp::invokeKernel(const int32_t i) {
+	int32_t numThreadsPerBlock;
+	int32_t numThreadBlocks;
+	/*launch csr64DynamicWarp (y = Ax) on the buffers of GPU slot i*/
+	/*block size and block count are derived from the properties of GPU i*/
+	getKernelGridInfo(i, numThreadsPerBlock, numThreadBlocks);
+
+	/*pick the instantiation by mean row length; X is passed as texture object or raw pointer depending on the build*/
+#ifdef DOUBLE_USE_TEXTURE_MEMORY
+	if (_meanElementsPerRow <= 2) {
+		spmv_csr::csr64DynamicWarp<double, 2, MAX_NUM_THREADS_PER_BLOCK / 2><<<
+				numThreadBlocks, numThreadsPerBlock>>>(_cudaRowCounters[i],
+				_rowOffsets[i], _colIndexValues[i], _numericalValues[i],
+				_texVectorX[i], _vectorY[i]);
+	} else if (_meanElementsPerRow <= 4) {
+		spmv_csr::csr64DynamicWarp<double, 4, MAX_NUM_THREADS_PER_BLOCK / 4><<<
+				numThreadBlocks, numThreadsPerBlock>>>(_cudaRowCounters[i],
+				_rowOffsets[i], _colIndexValues[i], _numericalValues[i],
+				_texVectorX[i], _vectorY[i]);
+	} else if (_meanElementsPerRow <= 64) {
+		spmv_csr::csr64DynamicWarp<double, 8, MAX_NUM_THREADS_PER_BLOCK / 8><<<
+				numThreadBlocks, numThreadsPerBlock>>>(_cudaRowCounters[i],
+				_rowOffsets[i], _colIndexValues[i], _numericalValues[i],
+				_texVectorX[i], _vectorY[i]);
+	} else {
+		spmv_csr::csr64DynamicWarp<double, 32, MAX_NUM_THREADS_PER_BLOCK / 32><<<
+				numThreadBlocks, numThreadsPerBlock>>>(_cudaRowCounters[i],
+				_rowOffsets[i], _colIndexValues[i], _numericalValues[i],
+				_texVectorX[i], _vectorY[i]);
+	}
+#else
+	if (_meanElementsPerRow <= 2) {
+		spmv_csr::csr64DynamicWarp<double, 2, MAX_NUM_THREADS_PER_BLOCK / 2><<<
+		numThreadBlocks, numThreadsPerBlock>>>(
+				_cudaRowCounters[i], _rowOffsets[i], _colIndexValues[i],
+				_numericalValues[i], _vectorX[i], _vectorY[i]);
+	} else if (_meanElementsPerRow <= 4) {
+		spmv_csr::csr64DynamicWarp<double, 4, MAX_NUM_THREADS_PER_BLOCK / 4><<<
+		numThreadBlocks, numThreadsPerBlock>>>(
+				_cudaRowCounters[i], _rowOffsets[i], _colIndexValues[i],
+				_numericalValues[i], _vectorX[i], _vectorY[i]);
+	} else if(_meanElementsPerRow <= 64) {
+		spmv_csr::csr64DynamicWarp<double, 8, MAX_NUM_THREADS_PER_BLOCK / 8><<<
+		numThreadBlocks, numThreadsPerBlock>>>(
+				_cudaRowCounters[i], _rowOffsets[i], _colIndexValues[i],
+				_numericalValues[i], _vectorX[i], _vectorY[i]);
+	} else {
+		spmv_csr::csr64DynamicWarp<double, 32, MAX_NUM_THREADS_PER_BLOCK / 32><<<
+		numThreadBlocks, numThreadsPerBlock>>>(
+				_cudaRowCounters[i], _rowOffsets[i], _colIndexValues[i],
+				_numericalValues[i], _vectorX[i], _vectorY[i]);
+	}
+
+#endif
+}
+
+void SpMVDoubleWarp::invokeKernelBLAS(const int32_t i) {
+	int32_t numThreadsPerBlock;
+	int32_t numThreadBlocks;
+	/*launch csr64DynamicWarpBLAS (alpha/beta form) on the buffers of GPU slot i*/
+	/*block size and block count are derived from the properties of GPU i*/
+	getKernelGridInfo(i, numThreadsPerBlock, numThreadBlocks);
+
+	/*pick the instantiation by mean row length; _vectorY[i] is passed twice, presumably as input and output - confirm against SpMVCSR.h*/
+#ifdef DOUBLE_USE_TEXTURE_MEMORY
+	if (_meanElementsPerRow <= 2) {
+		spmv_csr::csr64DynamicWarpBLAS<double, 2, MAX_NUM_THREADS_PER_BLOCK / 2><<<
+				numThreadBlocks, numThreadsPerBlock>>>(_cudaRowCounters[i],
+				_rowOffsets[i], _colIndexValues[i], _numericalValues[i],
+				_texVectorX[i], _vectorY[i], _vectorY[i], _alpha, _beta);
+	} else if (_meanElementsPerRow <= 4) {
+		spmv_csr::csr64DynamicWarpBLAS<double, 4, MAX_NUM_THREADS_PER_BLOCK / 4><<<
+				numThreadBlocks, numThreadsPerBlock>>>(_cudaRowCounters[i],
+				_rowOffsets[i], _colIndexValues[i], _numericalValues[i],
+				_texVectorX[i], _vectorY[i], _vectorY[i], _alpha, _beta);
+	} else if (_meanElementsPerRow <= 64) {
+		spmv_csr::csr64DynamicWarpBLAS<double, 8, MAX_NUM_THREADS_PER_BLOCK / 8><<<
+				numThreadBlocks, numThreadsPerBlock>>>(_cudaRowCounters[i],
+				_rowOffsets[i], _colIndexValues[i], _numericalValues[i],
+				_texVectorX[i], _vectorY[i], _vectorY[i], _alpha, _beta);
+	} else {
+		spmv_csr::csr64DynamicWarpBLAS<double, 32,
+				MAX_NUM_THREADS_PER_BLOCK / 32><<<numThreadBlocks,
+				numThreadsPerBlock>>>(_cudaRowCounters[i], _rowOffsets[i],
+				_colIndexValues[i], _numericalValues[i], _texVectorX[i],
+				_vectorY[i], _vectorY[i], _alpha, _beta);
+	}
+#else
+	if (_meanElementsPerRow <= 2) {
+		spmv_csr::csr64DynamicWarpBLAS<double, 2, MAX_NUM_THREADS_PER_BLOCK / 2><<<
+		numThreadBlocks, numThreadsPerBlock>>>(
+				_cudaRowCounters[i], _rowOffsets[i], _colIndexValues[i],
+				_numericalValues[i], _vectorX[i], _vectorY[i], _vectorY[i], _alpha, _beta);
+	} else if (_meanElementsPerRow <= 4) {
+		spmv_csr::csr64DynamicWarpBLAS<double, 4, MAX_NUM_THREADS_PER_BLOCK / 4><<<
+		numThreadBlocks, numThreadsPerBlock>>>(
+				_cudaRowCounters[i], _rowOffsets[i], _colIndexValues[i],
+				_numericalValues[i], _vectorX[i], _vectorY[i], _vectorY[i], _alpha, _beta);
+	} else if(_meanElementsPerRow <= 64) {
+		spmv_csr::csr64DynamicWarpBLAS<double, 8, MAX_NUM_THREADS_PER_BLOCK / 8><<<
+		numThreadBlocks, numThreadsPerBlock>>>(
+				_cudaRowCounters[i], _rowOffsets[i], _colIndexValues[i],
+				_numericalValues[i], _vectorX[i], _vectorY[i], _vectorY[i], _alpha, _beta);
+	} else {
+		spmv_csr::csr64DynamicWarpBLAS<double, 32, MAX_NUM_THREADS_PER_BLOCK / 32><<<
+		numThreadBlocks, numThreadsPerBlock>>>(
+				_cudaRowCounters[i], _rowOffsets[i], _colIndexValues[i],
+				_numericalValues[i], _vectorX[i], _vectorY[i], _vectorY[i], _alpha, _beta);
+	}
+
+#endif
+}
diff --git a/src/Benchmarks/SpMV/ReferenceFormats/LightSpMV-1.0/SpMV.h b/src/Benchmarks/SpMV/ReferenceFormats/LightSpMV-1.0/SpMV.h
new file mode 100644
index 0000000000000000000000000000000000000000..55f89d3b3725ab15f49f014c2e964d20f7585a69
--- /dev/null
+++ b/src/Benchmarks/SpMV/ReferenceFormats/LightSpMV-1.0/SpMV.h
@@ -0,0 +1,152 @@
+/*
+ * SpMV.h
+ *
+ *  Created on: Nov 21, 2014
+ *      Author: yongchao
+ */
+
+#ifndef SPMV_H_
+#define SPMV_H_
+#include "Options.h"
+#include "sys/time.h"
+
+class SpMV {
+public:
+	SpMV(Options* opt);
+	virtual ~SpMV() = 0;
+
+	/*
+	 * Choose the launch configuration for GPU `dev` from the device
+	 * properties stored in _opt->_gpus[dev].second; both results are
+	 * returned through the reference parameters.
+	 */
+	inline void getKernelGridInfo(const int32_t dev,
+			int32_t & numThreadsPerBlock, int32_t &numThreadBlocks) {
+		/*every block uses the largest block size the device reports*/
+		numThreadsPerBlock = _opt->_gpus[dev].second.maxThreadsPerBlock;
+
+		/*blocks of that size within the per-multiprocessor thread limit, times the multiprocessor count*/
+		numThreadBlocks = (_opt->_gpus[dev].second.maxThreadsPerMultiProcessor
+				/ numThreadsPerBlock)
+				* _opt->_gpus[dev].second.multiProcessorCount;
+	}
+
+	/*time of day from gettimeofday(), expressed in milliseconds*/
+	inline double getSysTime() {
+		struct timeval now;
+		gettimeofday(&now, NULL);
+
+		/*the seconds and the microseconds fields are both scaled to milliseconds*/
+		const double fromSeconds = ((double) now.tv_sec) * 1000.0;
+		const double fromMicroseconds = ((double) now.tv_usec) / 1000.0;
+		return fromSeconds + fromMicroseconds;
+	}
+
+	void spmvKernel();
+	virtual void loadData() = 0;
+	virtual void storeData() = 0;
+
+	/*y = Ax*/
+	virtual void invokeKernel(const int32_t i) = 0;
+	/*y = alpha * Ax + beta * y*/
+	virtual void invokeKernelBLAS(const int32_t i) = 0;
+
+protected:
+	/*program options; getKernelGridInfo() reads the per-GPU properties through it*/
+	Options* _opt;
+
+	/*number of GPUs*/
+	int32_t _numGPUs;
+
+	/*average number of elements per row*/
+	int32_t _meanElementsPerRow;
+
+	/*CUDA streams*/
+	vector<cudaStream_t> _streams;
+
+	/*row counters*/
+	vector<uint32_t*> _cudaRowCounters;
+
+#if defined(FLOAT_USE_TEXTURE_MEMORY) || defined(DOUBLE_USE_TEXTURE_MEMORY)
+	/*texture objects for x; declared only when a texture switch is enabled*/
+	vector<cudaTextureObject_t> _texVectorX;
+#endif
+};
+
+/*single-precision SpMV (float values, float x and y); original note: use global memory*/
+/*vector-based row dynamic distribution*/
+class SpMVFloatVector: public SpMV {
+public:
+	SpMVFloatVector(Options* opt);
+	virtual ~SpMVFloatVector();
+
+	void loadData();	/*implements SpMV::loadData()*/
+	void storeData();	/*implements SpMV::storeData()*/
+
+	/*y = Ax*/
+	virtual void invokeKernel(const int32_t i);
+	/*y = alpha * Ax + beta * y*/
+	virtual void invokeKernelBLAS(const int32_t i);
+
+//protected: (left commented out, so the members below are public)
+	vector<uint32_t*> _rowOffsets;	/*presumably the CSR row offsets, one pointer per GPU -- TODO confirm*/
+	vector<uint32_t*> _colIndexValues;	/*presumably the CSR column indices -- TODO confirm*/
+	vector<float*> _numericalValues;	/*presumably the CSR non-zero values -- TODO confirm*/
+	vector<float*> _vectorY;	/*presumably the output vector y -- TODO confirm*/
+	vector<float*> _vectorX;	/*presumably the input vector x -- TODO confirm*/
+
+	float _alpha;	/*presumably alpha of y = alpha * Ax + beta * y -- TODO confirm*/
+	float _beta;	/*presumably beta of y = alpha * Ax + beta * y -- TODO confirm*/
+};
+
+/*warp-based row dynamic distribution: inherits every data member from SpMVFloatVector and overrides only the two kernel launchers*/
+class SpMVFloatWarp: public SpMVFloatVector {
+public:
+	SpMVFloatWarp(Options* opt) :
+			SpMVFloatVector(opt) {	/*construction is delegated entirely to the base class*/
+	}
+
+	/*y = Ax*/
+	void invokeKernel(const int32_t i);
+	/*y = alpha * Ax + beta * y*/
+	void invokeKernelBLAS(const int32_t i);
+};
+
+class SpMVDoubleVector: public SpMV {	/*double-precision counterpart of SpMVFloatVector*/
+public:
+	SpMVDoubleVector(Options* opt);
+	virtual ~SpMVDoubleVector();
+
+	void loadData();	/*implements SpMV::loadData()*/
+	void storeData();	/*implements SpMV::storeData()*/
+
+	/*y = Ax*/
+	virtual void invokeKernel(const int32_t i);
+
+	/*y = alpha * Ax + beta * y*/
+	virtual void invokeKernelBLAS(const int32_t i);
+
+//protected: (left commented out, so the members below are public)
+	vector<uint32_t*> _rowOffsets;	/*element [i] is passed to the csr64 kernels as rowOffsets*/
+	vector<uint32_t*> _colIndexValues;	/*element [i] is passed to the csr64 kernels as colIndexValues*/
+	vector<double*> _numericalValues;	/*element [i] is passed to the csr64 kernels as numericalValues*/
+	vector<double*> _vectorY;	/*element [i] is passed to the BLAS kernels as both inVectorY and vectorY*/
+	vector<double*> _vectorX;	/*element [i] is passed as vectorX when texture memory is disabled*/
+
+	double _alpha;	/*forwarded to the BLAS kernels as alpha*/
+	double _beta;	/*forwarded to the BLAS kernels as beta*/
+};
+
+/*warp-based row dynamic distribution: inherits every data member from SpMVDoubleVector and overrides only the two kernel launchers*/
+class SpMVDoubleWarp: public SpMVDoubleVector {
+public:
+	SpMVDoubleWarp(Options* opt) :
+			SpMVDoubleVector(opt) {	/*construction is delegated entirely to the base class*/
+	}
+	/*y = Ax*/
+	void invokeKernel(const int32_t i);
+
+	/*y = alpha * Ax + beta * y*/
+	void invokeKernelBLAS(const int32_t i);
+};
+#endif /* SPMV_H_ */
diff --git a/src/Benchmarks/SpMV/ReferenceFormats/LightSpMV-1.0/SpMVCSR.cu b/src/Benchmarks/SpMV/ReferenceFormats/LightSpMV-1.0/SpMVCSR.cu
new file mode 100644
index 0000000000000000000000000000000000000000..74ed616271c2600dab9dbc79a6600115fff16d45
--- /dev/null
+++ b/src/Benchmarks/SpMV/ReferenceFormats/LightSpMV-1.0/SpMVCSR.cu
@@ -0,0 +1,12 @@
+/*
+ * SpMVCSR.cu
+ *
+ *  Created on: Nov 25, 2014
+ *      Author: yongchao
+ */
+#include "SpMVCSR.h"
+
+/*device variables in constant memory, declared extern in SpMVCSR.h; the kernels there bound their row loop with _cudaNumRows*/
+__constant__ uint32_t _cudaNumRows;
+__constant__ uint32_t _cudaNumCols;	/*not referenced by the kernels in SpMVCSR.h*/
+
diff --git a/src/Benchmarks/SpMV/ReferenceFormats/LightSpMV-1.0/SpMVCSR.h b/src/Benchmarks/SpMV/ReferenceFormats/LightSpMV-1.0/SpMVCSR.h
new file mode 100644
index 0000000000000000000000000000000000000000..fb004308fb2815aeaddc8b78df97229917a4c64c
--- /dev/null
+++ b/src/Benchmarks/SpMV/ReferenceFormats/LightSpMV-1.0/SpMVCSR.h
@@ -0,0 +1,696 @@
+/*
+ * SpMVCSR.h
+ *
+ *  Created on: Nov 25, 2014
+ *      Author: yongchao
+ */
+
+#ifndef SPMVCSR_H_
+#define SPMVCSR_H_
+#include "Types.h"
+
+#pragma once
+
+extern __constant__ uint32_t _cudaNumRows;	/*defined in SpMVCSR.cu; upper bound of the row loop in every kernel below*/
+extern __constant__ uint32_t _cudaNumCols;	/*defined in SpMVCSR.cu; not read by the kernels below*/
+
+namespace spmv_csr {
+
+/*device functions*/
+template < typename T>
+__device__ inline T shfl_down_64bits(T var, int32_t srcLane,
+		int32_t width) {
+	/*shuffle-down for a 64-bit value: `var` is viewed as two 32-bit words, each word goes*/
+	/*through __shfl_down_sync with `srcLane` as the lane delta, and the pair is returned as T*/
+	int2 words = *reinterpret_cast<int2*>(&var);
+
+	words.x = __shfl_down_sync(0xffffffff, words.x, srcLane, width);
+	words.y = __shfl_down_sync(0xffffffff, words.y, srcLane, width);
+
+	return *reinterpret_cast<T*>(&words);
+}
+
+/*read element `index` of x (overloaded functions, not macros): the first fetches through a texture object, the second indexes a plain pointer*/
+__device__ inline float FLOAT_VECTOR_GET(const cudaTextureObject_t vectorX, uint32_t index){
+	return tex1Dfetch<float>(vectorX, index);
+}
+__device__ inline float FLOAT_VECTOR_GET (const float* __restrict vectorX, uint32_t index){
+	return vectorX[index];
+}
+
+__device__ inline double DOUBLE_VECTOR_GET (const cudaTextureObject_t vectorX, uint32_t index){
+	/*texture overload: the element is fetched as an int2 and rebuilt with .y as the high word and .x as the low word*/
+	const int2 words = tex1Dfetch<int2>(vectorX, index);
+	const int highWord = words.y;
+	const int lowWord = words.x;
+	return __hiloint2double(highWord, lowWord);
+}
+__device__ inline double DOUBLE_VECTOR_GET (const double* __restrict vectorX, uint32_t index){
+	return *(vectorX + index);
+}
+
+
+/*32-bit y = Ax for a CSR matrix: each group of THREADS_PER_VECTOR consecutive threads (a "vector") computes one row; lane 0 of every warp reserves 32 / THREADS_PER_VECTOR consecutive rows from cudaRowCounter and each vector of the warp takes one of them; x is read through FLOAT_VECTOR_GET*/
+template < typename T, uint32_t THREADS_PER_VECTOR, uint32_t MAX_NUM_VECTORS_PER_BLOCK>
+#ifdef FLOAT_USE_TEXTURE_MEMORY
+__global__ void csr32DynamicWarp(uint32_t* __restrict cudaRowCounter, const uint32_t* __restrict rowOffsets, const uint32_t* __restrict colIndexValues,
+		const T* __restrict numericalValues, const cudaTextureObject_t vectorX,  T* vectorY) {
+#else
+	__global__ void csr32DynamicWarp(uint32_t* __restrict cudaRowCounter, const uint32_t* __restrict rowOffsets, const uint32_t* __restrict colIndexValues,
+			const T* __restrict numericalValues, const T* __restrict vectorX, T* vectorY) {
+#endif
+	uint32_t i;
+	T sum;
+	uint32_t row;	/*assigned only on warp lane 0 until the broadcast below*/
+	uint32_t rowStart, rowEnd;
+	const uint32_t laneId = threadIdx.x % THREADS_PER_VECTOR; /*lane index in the vector*/
+	const uint32_t vectorId = threadIdx.x / THREADS_PER_VECTOR; /*vector index in the thread block*/
+	const uint32_t warpLaneId = threadIdx.x & 31;	/*lane index in the warp*/
+	const uint32_t warpVectorId = warpLaneId / THREADS_PER_VECTOR;	/*vector index in the warp*/
+
+	__shared__ volatile uint32_t space[MAX_NUM_VECTORS_PER_BLOCK][2];	/*one (start, end) offset pair per vector, indexed by vectorId*/
+
+	/*warp lane 0 reserves the next 32 / THREADS_PER_VECTOR rows; atomicAdd returns the first of them*/
+	if (warpLaneId == 0) {
+		row = atomicAdd(cudaRowCounter, 32 / THREADS_PER_VECTOR);
+	}
+	/*broadcast the value to other threads in the same warp and compute the row index of each vector*/
+	row = __shfl_sync(0xffffffff,row, 0) + warpVectorId;
+
+	/*keep going while this vector's row lies inside the matrix*/
+	while (row < _cudaNumRows) {
+
+		/*lane 0 loads rowOffsets[row] and lane 1 loads rowOffsets[row + 1]; NOTE(review): no __syncwarp() separates this write from the reads below -- confirm for the target architecture*/
+		if (laneId < 2) {
+			space[vectorId][laneId] = rowOffsets[row + laneId];
+		}
+		rowStart = space[vectorId][0];
+		rowEnd = space[vectorId][1];
+
+		/*reset the accumulator; a row with rowStart == rowEnd enters none of the loops below and yields 0*/
+		sum = 0;
+		/*compute dot product*/
+		if (THREADS_PER_VECTOR == 32) {
+
+			/*start from rowStart rounded down to a multiple of THREADS_PER_VECTOR, plus the lane index*/
+			i = rowStart - (rowStart & (THREADS_PER_VECTOR - 1)) + laneId;
+
+			/*first chunk: lanes whose index lies before rowStart or at/after rowEnd contribute nothing*/
+			if (i >= rowStart && i < rowEnd) {
+				sum += numericalValues[i] * FLOAT_VECTOR_GET(vectorX, colIndexValues[i]);
+			}
+
+			/*remaining chunks, one stride of THREADS_PER_VECTOR elements at a time*/
+			for (i += THREADS_PER_VECTOR; i < rowEnd; i += THREADS_PER_VECTOR) {
+				sum += numericalValues[i] * FLOAT_VECTOR_GET(vectorX, colIndexValues[i]);
+			}
+		} else {
+			/*regardless of the global memory access alignment*/
+			for (i = rowStart + laneId; i < rowEnd; i +=
+					THREADS_PER_VECTOR) {
+				sum += numericalValues[i] * FLOAT_VECTOR_GET(vectorX, colIndexValues[i]);
+			}
+		}
+		/*intra-vector reduction: the shuffle offset is halved each step, accumulating the partial sums towards lane 0*/
+		for (i = THREADS_PER_VECTOR >> 1; i > 0; i >>= 1) {
+			sum += __shfl_down_sync(0xffffffff,sum, i, THREADS_PER_VECTOR);
+		}
+
+		/*only lane 0 of the vector writes the row result*/
+		if (laneId == 0) {
+			/*save the results*/
+			vectorY[row] = sum;
+		}
+
+		/*warp lane 0 reserves the next batch of rows*/
+		if(warpLaneId == 0){
+			row = atomicAdd(cudaRowCounter, 32 / THREADS_PER_VECTOR);
+		}
+		/*broadcast the row index to the other threads in the same warp and compute the row index of each vector*/
+		row = __shfl_sync(0xffffffff,row, 0) + warpVectorId;
+
+	}/*while*/
+}
+
+/*vector-based row dynamic distribution, 32-bit y = Ax: each group of THREADS_PER_VECTOR consecutive threads (a "vector") computes one row and its lane 0 takes one row at a time from cudaRowCounter; x is read through FLOAT_VECTOR_GET*/
+template < typename T, uint32_t THREADS_PER_VECTOR, uint32_t MAX_NUM_VECTORS_PER_BLOCK>
+#ifdef FLOAT_USE_TEXTURE_MEMORY
+__global__ void csr32DynamicVector(uint32_t* __restrict cudaRowCounter, const uint32_t* __restrict rowOffsets, const uint32_t* __restrict colIndexValues,
+		const T* __restrict numericalValues, const cudaTextureObject_t vectorX, T* vectorY) {
+#else
+	__global__ void csr32DynamicVector(uint32_t* __restrict cudaRowCounter, const uint32_t* __restrict rowOffsets, const uint32_t* __restrict colIndexValues,
+			const T* __restrict numericalValues, const T* __restrict vectorX, T* vectorY) {
+#endif
+
+	uint32_t i;
+	T sum;
+	uint32_t row;	/*assigned only on lane 0 of the vector until the broadcast below*/
+	uint32_t rowStart, rowEnd;
+	const uint32_t laneId = threadIdx.x % THREADS_PER_VECTOR; /*lane index in the vector*/
+	const uint32_t vectorId = threadIdx.x / THREADS_PER_VECTOR; /*vector index in the block*/
+	__shared__ volatile uint32_t space[MAX_NUM_VECTORS_PER_BLOCK][2];	/*one (start, end) offset pair per vector, indexed by vectorId*/
+
+	/*lane 0 of the vector takes the next row; atomicAdd returns its index*/
+	if (laneId == 0) {
+		row = atomicAdd(cudaRowCounter, 1);
+	}
+	/*broadcast the value to other lanes from lane 0*/
+	row = __shfl_sync(0xffffffff,row, 0, THREADS_PER_VECTOR);
+
+	/*keep going while this vector's row lies inside the matrix*/
+	while (row < _cudaNumRows) {
+
+		/*lane 0 loads rowOffsets[row] and lane 1 loads rowOffsets[row + 1]; NOTE(review): no __syncwarp() separates this write from the reads below -- confirm for the target architecture*/
+		if (laneId < 2) {
+			space[vectorId][laneId] = rowOffsets[row + laneId];
+		}
+		rowStart = space[vectorId][0];
+		rowEnd = space[vectorId][1];
+
+		/*reset the accumulator; a row with rowStart == rowEnd enters none of the loops below and yields 0*/
+		sum = 0;
+		/*compute dot product*/
+		if (THREADS_PER_VECTOR == 32) {
+
+			/*start from rowStart rounded down to a multiple of THREADS_PER_VECTOR, plus the lane index*/
+			i = rowStart - (rowStart & (THREADS_PER_VECTOR - 1)) + laneId;
+
+			/*first chunk: lanes whose index lies before rowStart or at/after rowEnd contribute nothing*/
+			if (i >= rowStart && i < rowEnd) {
+				sum += numericalValues[i] * FLOAT_VECTOR_GET(vectorX, colIndexValues[i]);
+			}
+
+			/*remaining chunks, one stride of THREADS_PER_VECTOR elements at a time*/
+			for (i += THREADS_PER_VECTOR; i < rowEnd; i += THREADS_PER_VECTOR) {
+				sum += numericalValues[i] * FLOAT_VECTOR_GET(vectorX, colIndexValues[i]);
+			}
+		} else {
+			/*regardless of the global memory access alignment*/
+			for (i = rowStart + laneId; i < rowEnd; i +=
+					THREADS_PER_VECTOR) {
+				sum += numericalValues[i] * FLOAT_VECTOR_GET(vectorX, colIndexValues[i]);
+			}
+		}
+		/*intra-vector reduction: the shuffle offset is halved each step, accumulating the partial sums towards lane 0*/
+		for (i = THREADS_PER_VECTOR >> 1; i > 0; i >>= 1) {
+			sum += __shfl_down_sync(0xffffffff,sum, i, THREADS_PER_VECTOR);
+		}
+
+		/*lane 0 of the vector stores the row result, then fetches the next row index*/
+		if (laneId == 0) {
+			/*save the results*/
+			vectorY[row] = sum;
+
+			/*get a new row index*/
+			row = atomicAdd(cudaRowCounter, 1);
+		}
+		row = __shfl_sync(0xffffffff,row, 0, THREADS_PER_VECTOR);	/*broadcast lane 0's new row to the rest of the vector*/
+	}/*while*/
+}
+
+	/*32-bit y = alpha * Ax + beta * y for a CSR matrix, updated in place in vectorY: each group of THREADS_PER_VECTOR consecutive threads (a "vector") computes one row; lane 0 of every warp reserves 32 / THREADS_PER_VECTOR consecutive rows from cudaRowCounter; x is read through FLOAT_VECTOR_GET*/
+	template < typename T, uint32_t THREADS_PER_VECTOR, uint32_t MAX_NUM_VECTORS_PER_BLOCK>
+	#ifdef FLOAT_USE_TEXTURE_MEMORY
+	__global__ void csr32DynamicWarpBLAS(uint32_t* __restrict cudaRowCounter, const uint32_t* __restrict rowOffsets, const uint32_t* __restrict colIndexValues,
+			const T* __restrict numericalValues, const cudaTextureObject_t vectorX,  T* vectorY, const T alpha, const T beta) {
+	#else
+		__global__ void csr32DynamicWarpBLAS(uint32_t* __restrict cudaRowCounter, const uint32_t* __restrict rowOffsets, const uint32_t* __restrict colIndexValues,
+				const T* __restrict numericalValues, const T* __restrict vectorX, T* vectorY, const T alpha, const T beta) {
+	#endif
+		uint32_t i;
+		T sum;
+		uint32_t row;	/*assigned only on warp lane 0 until the broadcast below*/
+		uint32_t rowStart, rowEnd;
+		const uint32_t laneId = threadIdx.x % THREADS_PER_VECTOR; /*lane index in the vector*/
+		const uint32_t vectorId = threadIdx.x / THREADS_PER_VECTOR; /*vector index in the thread block*/
+		const uint32_t warpLaneId = threadIdx.x & 31;	/*lane index in the warp*/
+		const uint32_t warpVectorId = warpLaneId / THREADS_PER_VECTOR;	/*vector index in the warp*/
+
+		__shared__ volatile uint32_t space[MAX_NUM_VECTORS_PER_BLOCK][2];	/*one (start, end) offset pair per vector, indexed by vectorId*/
+
+		/*warp lane 0 reserves the next 32 / THREADS_PER_VECTOR rows; atomicAdd returns the first of them*/
+		if (warpLaneId == 0) {
+			row = atomicAdd(cudaRowCounter, 32 / THREADS_PER_VECTOR);
+		}
+		/*broadcast the value to other threads in the same warp and compute the row index of each vector*/
+		row = __shfl_sync(0xffffffff,row, 0) + warpVectorId;
+
+		/*keep going while this vector's row lies inside the matrix*/
+		while (row < _cudaNumRows) {
+
+			/*lane 0 loads rowOffsets[row] and lane 1 loads rowOffsets[row + 1]; NOTE(review): no __syncwarp() separates this write from the reads below -- confirm for the target architecture*/
+			if (laneId < 2) {
+				space[vectorId][laneId] = rowOffsets[row + laneId];
+			}
+			rowStart = space[vectorId][0];
+			rowEnd = space[vectorId][1];
+
+			/*reset the accumulator; a row with rowStart == rowEnd enters none of the loops below*/
+			sum = 0;
+			/*compute dot product*/
+			if (THREADS_PER_VECTOR == 32) {
+
+				/*start from rowStart rounded down to a multiple of THREADS_PER_VECTOR, plus the lane index*/
+				i = rowStart - (rowStart & (THREADS_PER_VECTOR - 1)) + laneId;
+
+				/*first chunk: lanes whose index lies before rowStart or at/after rowEnd contribute nothing*/
+				if (i >= rowStart && i < rowEnd) {
+					sum += numericalValues[i] * FLOAT_VECTOR_GET(vectorX, colIndexValues[i]);
+				}
+
+				/*remaining chunks, one stride of THREADS_PER_VECTOR elements at a time*/
+				for (i += THREADS_PER_VECTOR; i < rowEnd; i += THREADS_PER_VECTOR) {
+					sum += numericalValues[i] * FLOAT_VECTOR_GET(vectorX, colIndexValues[i]);
+				}
+			} else {
+				/*regardless of the global memory access alignment*/
+				for (i = rowStart + laneId; i < rowEnd; i +=
+						THREADS_PER_VECTOR) {
+					sum += numericalValues[i] * FLOAT_VECTOR_GET(vectorX, colIndexValues[i]);
+				}
+			}
+			/*every lane scales its partial sum by alpha, then the intra-vector reduction accumulates towards lane 0*/
+			sum *= alpha;
+			for (i = THREADS_PER_VECTOR >> 1; i > 0; i >>= 1) {
+				sum += __shfl_down_sync(0xffffffff,sum, i, THREADS_PER_VECTOR);
+			}
+
+			/*only lane 0 of the vector writes the row result*/
+			if (laneId == 0) {
+				/*add beta times the previous y value and store in place*/
+				vectorY[row] = sum + beta * vectorY[row];
+			}
+
+			/*warp lane 0 reserves the next batch of rows*/
+			if(warpLaneId == 0){
+				row = atomicAdd(cudaRowCounter, 32 / THREADS_PER_VECTOR);
+			}
+			/*broadcast the row index to the other threads in the same warp and compute the row index of each vector*/
+			row = __shfl_sync(0xffffffff,row, 0) + warpVectorId;
+
+		}/*while*/
+	}
+
+	/*vector-based row dynamic distribution, 32-bit y = alpha * Ax + beta * y updated in place in vectorY: each group of THREADS_PER_VECTOR consecutive threads (a "vector") computes one row and its lane 0 takes one row at a time from cudaRowCounter; x is read through FLOAT_VECTOR_GET*/
+	template < typename T, uint32_t THREADS_PER_VECTOR, uint32_t MAX_NUM_VECTORS_PER_BLOCK>
+	#ifdef FLOAT_USE_TEXTURE_MEMORY
+	__global__ void csr32DynamicVectorBLAS(uint32_t* __restrict cudaRowCounter, const uint32_t* __restrict rowOffsets, const uint32_t* __restrict colIndexValues,
+			const T* __restrict numericalValues, const cudaTextureObject_t vectorX, T* vectorY, const T alpha, const T beta) {
+	#else
+		__global__ void csr32DynamicVectorBLAS(uint32_t* __restrict cudaRowCounter, const uint32_t* __restrict rowOffsets, const uint32_t* __restrict colIndexValues,
+				const T* __restrict numericalValues, const T* __restrict vectorX, T* vectorY, const T alpha, const T beta) {
+	#endif
+
+		uint32_t i;
+		T sum;
+		uint32_t row;	/*assigned only on lane 0 of the vector until the broadcast below*/
+		uint32_t rowStart, rowEnd;
+		const uint32_t laneId = threadIdx.x % THREADS_PER_VECTOR; /*lane index in the vector*/
+		const uint32_t vectorId = threadIdx.x / THREADS_PER_VECTOR; /*vector index in the block*/
+		__shared__ volatile uint32_t space[MAX_NUM_VECTORS_PER_BLOCK][2];	/*one (start, end) offset pair per vector, indexed by vectorId*/
+
+		/*lane 0 of the vector takes the next row; atomicAdd returns its index*/
+		if (laneId == 0) {
+			row = atomicAdd(cudaRowCounter, 1);
+		}
+		/*broadcast the value to other lanes from lane 0*/
+		row = __shfl_sync(0xffffffff,row, 0, THREADS_PER_VECTOR);
+
+		/*keep going while this vector's row lies inside the matrix*/
+		while (row < _cudaNumRows) {
+
+			/*lane 0 loads rowOffsets[row] and lane 1 loads rowOffsets[row + 1]; NOTE(review): no __syncwarp() separates this write from the reads below -- confirm for the target architecture*/
+			if (laneId < 2) {
+				space[vectorId][laneId] = rowOffsets[row + laneId];
+			}
+			rowStart = space[vectorId][0];
+			rowEnd = space[vectorId][1];
+
+			/*reset the accumulator; a row with rowStart == rowEnd enters none of the loops below*/
+			sum = 0;
+			/*compute dot product*/
+			if (THREADS_PER_VECTOR == 32) {
+
+				/*start from rowStart rounded down to a multiple of THREADS_PER_VECTOR, plus the lane index*/
+				i = rowStart - (rowStart & (THREADS_PER_VECTOR - 1)) + laneId;
+
+				/*first chunk: lanes whose index lies before rowStart or at/after rowEnd contribute nothing*/
+				if (i >= rowStart && i < rowEnd) {
+					sum += numericalValues[i] * FLOAT_VECTOR_GET(vectorX, colIndexValues[i]);
+				}
+
+				/*remaining chunks, one stride of THREADS_PER_VECTOR elements at a time*/
+				for (i += THREADS_PER_VECTOR; i < rowEnd; i += THREADS_PER_VECTOR) {
+					sum += numericalValues[i] * FLOAT_VECTOR_GET(vectorX, colIndexValues[i]);
+				}
+			} else {
+				/*regardless of the global memory access alignment*/
+				for (i = rowStart + laneId; i < rowEnd; i +=
+						THREADS_PER_VECTOR) {
+					sum += numericalValues[i] * FLOAT_VECTOR_GET(vectorX, colIndexValues[i]);
+				}
+			}
+			/*every lane scales its partial sum by alpha, then the intra-vector reduction accumulates towards lane 0*/
+			sum *= alpha;
+			for (i = THREADS_PER_VECTOR >> 1; i > 0; i >>= 1) {
+				sum += __shfl_down_sync(0xffffffff,sum, i, THREADS_PER_VECTOR);
+			}
+
+			/*lane 0 of the vector stores the row result, then fetches the next row index*/
+			if (laneId == 0) {
+				/*add beta times the previous y value and store in place*/
+				vectorY[row] = sum + beta * vectorY[row];
+
+				/*get a new row index*/
+				row = atomicAdd(cudaRowCounter, 1);
+			}
+			row = __shfl_sync(0xffffffff,row, 0, THREADS_PER_VECTOR);	/*broadcast lane 0's new row to the rest of the vector*/
+		}/*while*/
+	}
+
+/*64-bit functions. csr64DynamicVector computes y = Ax for a CSR matrix: each group of THREADS_PER_VECTOR consecutive threads (a "vector") computes one row and its lane 0 takes one row at a time from cudaRowCounter; x is read through DOUBLE_VECTOR_GET and partial sums are exchanged with shfl_down_64bits*/
+template < typename T, uint32_t THREADS_PER_VECTOR, uint32_t MAX_NUM_VECTORS_PER_BLOCK>
+#ifdef DOUBLE_USE_TEXTURE_MEMORY
+__global__ void csr64DynamicVector(uint32_t* __restrict cudaRowCounter, const uint32_t* __restrict rowOffsets, const uint32_t* __restrict colIndexValues,
+		const T* __restrict numericalValues, const cudaTextureObject_t vectorX, T* vectorY)
+#else
+__global__ void csr64DynamicVector(uint32_t* __restrict cudaRowCounter, const uint32_t* __restrict rowOffsets, const uint32_t* __restrict colIndexValues,
+		const T* __restrict numericalValues, const T* __restrict vectorX, T* vectorY)
+#endif
+{
+	uint32_t i;
+	T sum;
+	uint32_t row;	/*assigned only on lane 0 of the vector until the broadcast below*/
+	uint32_t rowStart, rowEnd;
+	const uint32_t laneId = threadIdx.x % THREADS_PER_VECTOR; /*lane index in the vector*/
+	const uint32_t vectorId = threadIdx.x / THREADS_PER_VECTOR; /*vector index in the block*/
+
+	__shared__ volatile uint32_t space[MAX_NUM_VECTORS_PER_BLOCK][2];	/*one (start, end) offset pair per vector, indexed by vectorId*/
+
+	/*lane 0 of the vector takes the next row; atomicAdd returns its index*/
+	if (laneId == 0) {
+		row = atomicAdd(cudaRowCounter, 1);
+	}
+	/*broadcast the value to other lanes from lane 0*/
+	row = __shfl_sync(0xffffffff,row, 0, THREADS_PER_VECTOR);
+
+	/*keep going while this vector's row lies inside the matrix*/
+	while (row < _cudaNumRows) {
+
+		/*lane 0 loads rowOffsets[row] and lane 1 loads rowOffsets[row + 1]; NOTE(review): no __syncwarp() separates this write from the reads below -- confirm for the target architecture*/
+		if (laneId < 2) {
+			space[vectorId][laneId] = rowOffsets[row + laneId];
+		}
+		rowStart = space[vectorId][0];
+		rowEnd = space[vectorId][1];
+
+		/*reset the accumulator; a row with rowStart == rowEnd enters none of the loops below and yields 0*/
+		sum = 0;
+		/*compute dot product*/
+		if (THREADS_PER_VECTOR == 32) {
+
+			/*start from rowStart rounded down to a multiple of THREADS_PER_VECTOR, plus the lane index*/
+			i = rowStart - (rowStart & (THREADS_PER_VECTOR - 1)) + laneId;
+
+			/*first chunk: lanes whose index lies before rowStart or at/after rowEnd contribute nothing*/
+			if (i >= rowStart && i < rowEnd) {
+				sum += numericalValues[i] * DOUBLE_VECTOR_GET(vectorX, colIndexValues[i]);
+			}
+
+			/*remaining chunks, one stride of THREADS_PER_VECTOR elements at a time*/
+			for (i += THREADS_PER_VECTOR; i < rowEnd; i += THREADS_PER_VECTOR) {
+				sum += numericalValues[i] * DOUBLE_VECTOR_GET(vectorX, colIndexValues[i]);
+			}
+		} else {
+			/*regardless of the global memory access alignment*/
+			for (i = rowStart + laneId; i < rowEnd; i +=
+					THREADS_PER_VECTOR) {
+				sum += numericalValues[i] * DOUBLE_VECTOR_GET(vectorX, colIndexValues[i]);
+			}
+		}
+		/*intra-vector reduction: the shuffle offset is halved each step, accumulating the partial sums towards lane 0*/
+		for (i = THREADS_PER_VECTOR >> 1; i > 0; i >>= 1) {
+			sum += shfl_down_64bits<T>(sum, i, THREADS_PER_VECTOR);
+		}
+
+		/*lane 0 of the vector stores the row result, then fetches the next row index*/
+		if (laneId == 0) {
+			/*save the results*/
+			vectorY[row] = sum;
+
+			/*get a new row index*/
+			row = atomicAdd(cudaRowCounter, 1);
+		}
+		row = __shfl_sync( 0xffffffff, row, 0, THREADS_PER_VECTOR);	/*broadcast lane 0's new row to the rest of the vector*/
+	}/*while*/
+}
+
+template < typename T, uint32_t THREADS_PER_VECTOR, uint32_t MAX_NUM_VECTORS_PER_BLOCK>
+#ifdef DOUBLE_USE_TEXTURE_MEMORY
+__global__ void csr64DynamicWarp(uint32_t* __restrict cudaRowCounter, const uint32_t* __restrict rowOffsets, const uint32_t* __restrict colIndexValues,
+		const T* __restrict numericalValues, const cudaTextureObject_t vectorX, T* vectorY)
+#else
+__global__ void csr64DynamicWarp(uint32_t* __restrict cudaRowCounter, const uint32_t* __restrict rowOffsets, const uint32_t* __restrict colIndexValues,
+		const T* __restrict numericalValues, const T* __restrict vectorX, T* vectorY)
+#endif
+{	/*64-bit y = Ax for a CSR matrix: each group of THREADS_PER_VECTOR consecutive threads (a "vector") computes one row; lane 0 of every warp reserves 32 / THREADS_PER_VECTOR consecutive rows from cudaRowCounter; x is read through DOUBLE_VECTOR_GET*/
+	uint32_t i;
+	T sum;
+	uint32_t row;	/*assigned only on warp lane 0 until the broadcast below*/
+	uint32_t rowStart, rowEnd;
+	const uint32_t laneId = threadIdx.x % THREADS_PER_VECTOR; /*lane index in the vector*/
+	const uint32_t vectorId = threadIdx.x / THREADS_PER_VECTOR; /*vector index in the thread block*/
+	const uint32_t warpLaneId = threadIdx.x & 31;	/*lane index in the warp*/
+	const uint32_t warpVectorId = warpLaneId / THREADS_PER_VECTOR;	/*vector index in the warp*/
+
+	__shared__ volatile uint32_t space[MAX_NUM_VECTORS_PER_BLOCK][2];	/*one (start, end) offset pair per vector, indexed by vectorId*/
+
+	/*warp lane 0 reserves the next 32 / THREADS_PER_VECTOR rows; atomicAdd returns the first of them*/
+	if (warpLaneId == 0) {
+		row = atomicAdd(cudaRowCounter, 32 / THREADS_PER_VECTOR);
+	}
+	/*broadcast the value to other threads in the same warp and add each vector's index within the warp*/
+	row = __shfl_sync(0xffffffff,row, 0) + warpVectorId;
+
+	/*keep going while this vector's row lies inside the matrix*/
+	while (row < _cudaNumRows) {
+
+		/*lane 0 loads rowOffsets[row] and lane 1 loads rowOffsets[row + 1]; NOTE(review): no __syncwarp() separates this write from the reads below -- confirm for the target architecture*/
+		if (laneId < 2) {
+			space[vectorId][laneId] = rowOffsets[row + laneId];
+		}
+		rowStart = space[vectorId][0];
+		rowEnd = space[vectorId][1];
+
+		/*reset the accumulator; a row with rowStart == rowEnd enters none of the loops below and yields 0*/
+		sum = 0;
+		/*compute dot product*/
+		if (THREADS_PER_VECTOR == 32) {
+
+			/*start from rowStart rounded down to a multiple of THREADS_PER_VECTOR, plus the lane index*/
+			i = rowStart - (rowStart & (THREADS_PER_VECTOR - 1)) + laneId;
+
+			/*first chunk: lanes whose index lies before rowStart or at/after rowEnd contribute nothing*/
+			if (i >= rowStart && i < rowEnd) {
+				sum += numericalValues[i] * DOUBLE_VECTOR_GET(vectorX, colIndexValues[i]);
+			}
+
+			/*remaining chunks, one stride of THREADS_PER_VECTOR elements at a time*/
+			for (i += THREADS_PER_VECTOR; i < rowEnd; i += THREADS_PER_VECTOR) {
+				sum += numericalValues[i] * DOUBLE_VECTOR_GET(vectorX, colIndexValues[i]);
+			}
+		} else {
+			/*regardless of the global memory access alignment*/
+			for (i = rowStart + laneId; i < rowEnd; i +=
+					THREADS_PER_VECTOR) {
+				sum += numericalValues[i] * DOUBLE_VECTOR_GET(vectorX, colIndexValues[i]);
+			}
+		}
+
+		/*intra-vector reduction: the shuffle offset is halved each step, accumulating the partial sums towards lane 0*/
+		for (i = THREADS_PER_VECTOR >> 1; i > 0; i >>= 1) {
+			sum += shfl_down_64bits<T>(sum, i, THREADS_PER_VECTOR);
+		}
+
+		/*only lane 0 of the vector writes the row result*/
+		if (laneId == 0) {
+			/*save the results*/
+			vectorY[row] = sum;
+		}
+
+		/*warp lane 0 reserves the next batch of rows*/
+		if(warpLaneId == 0){
+			row = atomicAdd(cudaRowCounter, 32 / THREADS_PER_VECTOR);
+		}
+		/*broadcast the value to other threads in the same warp and add each vector's index within the warp*/
+		row = __shfl_sync(0xffffffff,row, 0) + warpVectorId;
+
+	}/*while*/
+}
+
+/*64-bit y = alpha * Ax + beta * inVectorY for a CSR matrix: each group of THREADS_PER_VECTOR consecutive threads (a "vector") computes one row and its lane 0 takes one row at a time from cudaRowCounter. inVectorY is deliberately not __restrict so that passing the same buffer as vectorY (in-place update) is well defined.*/
+template < typename T, uint32_t THREADS_PER_VECTOR, uint32_t MAX_NUM_VECTORS_PER_BLOCK>
+#ifdef DOUBLE_USE_TEXTURE_MEMORY
+__global__ void csr64DynamicVectorBLAS(uint32_t* __restrict cudaRowCounter, const uint32_t* __restrict rowOffsets, const uint32_t* __restrict colIndexValues,
+		const T* __restrict numericalValues, const cudaTextureObject_t vectorX, const T* inVectorY, T* vectorY, const T alpha, const T beta)
+#else
+__global__ void csr64DynamicVectorBLAS(uint32_t* __restrict cudaRowCounter, const uint32_t* __restrict rowOffsets, const uint32_t* __restrict colIndexValues,
+		const T* __restrict numericalValues, const T* __restrict vectorX, const T* inVectorY, T* vectorY, const T alpha, const T beta)
+#endif
+{
+	uint32_t i;
+	T sum;
+	uint32_t row;	/*assigned only on lane 0 of the vector until the broadcast below*/
+	uint32_t rowStart, rowEnd;
+	const uint32_t laneId = threadIdx.x % THREADS_PER_VECTOR; /*lane index in the vector*/
+	const uint32_t vectorId = threadIdx.x / THREADS_PER_VECTOR; /*vector index in the block*/
+
+	__shared__ volatile uint32_t space[MAX_NUM_VECTORS_PER_BLOCK][2];	/*one (start, end) offset pair per vector, indexed by vectorId*/
+
+	/*lane 0 of the vector takes the next row; atomicAdd returns its index*/
+	if (laneId == 0) {
+		row = atomicAdd(cudaRowCounter, 1);
+	}
+	/*broadcast the value to other lanes from lane 0*/
+	row = __shfl_sync(0xffffffff,row, 0, THREADS_PER_VECTOR);
+
+	/*keep going while this vector's row lies inside the matrix*/
+	while (row < _cudaNumRows) {
+
+		/*lane 0 loads rowOffsets[row] and lane 1 loads rowOffsets[row + 1]; NOTE(review): no __syncwarp() separates this write from the reads below -- confirm for the target architecture*/
+		if (laneId < 2) {
+			space[vectorId][laneId] = rowOffsets[row + laneId];
+		}
+		rowStart = space[vectorId][0];
+		rowEnd = space[vectorId][1];
+
+		/*reset the accumulator; a row with rowStart == rowEnd enters none of the loops below*/
+		sum = 0;
+		/*compute dot product*/
+		if (THREADS_PER_VECTOR == 32) {
+
+			/*start from rowStart rounded down to a multiple of THREADS_PER_VECTOR, plus the lane index*/
+			i = rowStart - (rowStart & (THREADS_PER_VECTOR - 1)) + laneId;
+
+			/*first chunk: lanes whose index lies before rowStart or at/after rowEnd contribute nothing*/
+			if (i >= rowStart && i < rowEnd) {
+				sum += numericalValues[i] * DOUBLE_VECTOR_GET(vectorX, colIndexValues[i]);
+			}
+
+			/*remaining chunks, one stride of THREADS_PER_VECTOR elements at a time*/
+			for (i += THREADS_PER_VECTOR; i < rowEnd; i += THREADS_PER_VECTOR) {
+				sum += numericalValues[i] * DOUBLE_VECTOR_GET(vectorX, colIndexValues[i]);
+			}
+		} else {
+			/*regardless of the global memory access alignment*/
+			for (i = rowStart + laneId; i < rowEnd; i +=
+					THREADS_PER_VECTOR) {
+				sum += numericalValues[i] * DOUBLE_VECTOR_GET(vectorX, colIndexValues[i]);
+			}
+		}
+		/*every lane scales its partial sum by alpha, then the intra-vector reduction accumulates towards lane 0*/
+		sum *= alpha;
+		for (i = THREADS_PER_VECTOR >> 1; i > 0; i >>= 1) {
+			sum += shfl_down_64bits<T>(sum, i, THREADS_PER_VECTOR);
+		}
+
+		/*lane 0 of the vector stores the row result, then fetches the next row index*/
+		if (laneId == 0) {
+			/*the old y value is read before the store, so inVectorY == vectorY is fine*/
+			vectorY[row] = sum + beta * DOUBLE_VECTOR_GET(inVectorY, row);
+
+			/*get a new row index*/
+			row = atomicAdd(cudaRowCounter, 1);
+		}
+		row = __shfl_sync(0xffffffff,row, 0, THREADS_PER_VECTOR);	/*broadcast lane 0's new row to the rest of the vector*/
+	}/*while*/
+}
+
+template < typename T, uint32_t THREADS_PER_VECTOR, uint32_t MAX_NUM_VECTORS_PER_BLOCK>
+#ifdef DOUBLE_USE_TEXTURE_MEMORY
+__global__ void csr64DynamicWarpBLAS(uint32_t* __restrict cudaRowCounter, const uint32_t* __restrict rowOffsets, const uint32_t* __restrict colIndexValues,
+		const T* __restrict numericalValues, const cudaTextureObject_t vectorX, const T* inVectorY, T* vectorY, const T alpha, const T beta)
+#else
+__global__ void csr64DynamicWarpBLAS(uint32_t* __restrict cudaRowCounter, const uint32_t* __restrict rowOffsets, const uint32_t* __restrict colIndexValues,
+		const T* __restrict numericalValues, const T* __restrict vectorX, const T* inVectorY, T* vectorY, const T alpha, const T beta)
+#endif
+{	/*64-bit y = alpha * Ax + beta * inVectorY; lane 0 of every warp reserves 32 / THREADS_PER_VECTOR rows from cudaRowCounter. inVectorY is deliberately not __restrict: the launches in this project pass the same buffer for inVectorY and vectorY.*/
+	uint32_t i;
+	T sum;
+	uint32_t row;	/*assigned only on warp lane 0 until the broadcast below*/
+	uint32_t rowStart, rowEnd;
+	const uint32_t laneId = threadIdx.x % THREADS_PER_VECTOR; /*lane index in the vector*/
+	const uint32_t vectorId = threadIdx.x / THREADS_PER_VECTOR; /*vector index in the thread block*/
+	const uint32_t warpLaneId = threadIdx.x & 31;	/*lane index in the warp*/
+	const uint32_t warpVectorId = warpLaneId / THREADS_PER_VECTOR;	/*vector index in the warp*/
+
+	__shared__ volatile uint32_t space[MAX_NUM_VECTORS_PER_BLOCK][2];	/*one (start, end) offset pair per vector, indexed by vectorId*/
+
+	/*warp lane 0 reserves the next 32 / THREADS_PER_VECTOR rows; atomicAdd returns the first of them*/
+	if (warpLaneId == 0) {
+		row = atomicAdd(cudaRowCounter, 32 / THREADS_PER_VECTOR);
+	}
+	/*broadcast the value to other threads in the same warp and add each vector's index within the warp*/
+	row = __shfl_sync(0xffffffff,row, 0) + warpVectorId;
+
+	/*keep going while this vector's row lies inside the matrix*/
+	while (row < _cudaNumRows) {
+
+		/*lane 0 loads rowOffsets[row] and lane 1 loads rowOffsets[row + 1]; NOTE(review): no __syncwarp() separates this write from the reads below -- confirm for the target architecture*/
+		if (laneId < 2) {
+			space[vectorId][laneId] = rowOffsets[row + laneId];
+		}
+		rowStart = space[vectorId][0];
+		rowEnd = space[vectorId][1];
+
+		/*reset the accumulator; a row with rowStart == rowEnd enters none of the loops below*/
+		sum = 0;
+		/*compute dot product*/
+		if (THREADS_PER_VECTOR == 32) {
+
+			/*start from rowStart rounded down to a multiple of THREADS_PER_VECTOR, plus the lane index*/
+			i = rowStart - (rowStart & (THREADS_PER_VECTOR - 1)) + laneId;
+
+			/*first chunk: lanes whose index lies before rowStart or at/after rowEnd contribute nothing*/
+			if (i >= rowStart && i < rowEnd) {
+				sum += numericalValues[i] * DOUBLE_VECTOR_GET(vectorX, colIndexValues[i]);
+			}
+
+			/*remaining chunks, one stride of THREADS_PER_VECTOR elements at a time*/
+			for (i += THREADS_PER_VECTOR; i < rowEnd; i += THREADS_PER_VECTOR) {
+				sum += numericalValues[i] * DOUBLE_VECTOR_GET(vectorX, colIndexValues[i]);
+			}
+		} else {
+			/*regardless of the global memory access alignment*/
+			for (i = rowStart + laneId; i < rowEnd; i +=
+					THREADS_PER_VECTOR) {
+				sum += numericalValues[i] * DOUBLE_VECTOR_GET(vectorX, colIndexValues[i]);
+			}
+		}
+
+		/*every lane scales its partial sum by alpha, then the intra-vector reduction accumulates towards lane 0*/
+		sum *= alpha;
+		for (i = THREADS_PER_VECTOR >> 1; i > 0; i >>= 1) {
+			sum += shfl_down_64bits<T>(sum, i, THREADS_PER_VECTOR);
+		}
+
+		/*only lane 0 of the vector writes the row result*/
+		if (laneId == 0) {
+			/*the old y value is read before the store, so inVectorY == vectorY is fine*/
+			vectorY[row] = sum + beta * DOUBLE_VECTOR_GET(inVectorY, row);
+		}
+
+		/*warp lane 0 reserves the next batch of rows*/
+		if(warpLaneId == 0){
+			row = atomicAdd(cudaRowCounter, 32 / THREADS_PER_VECTOR);
+		}
+		/*broadcast the value to other threads in the same warp and add each vector's index within the warp*/
+		row = __shfl_sync(0xffffffff,row, 0) + warpVectorId;
+
+	}/*while*/
+}
+
+
+}/*namespace*/
+
+#endif /* SPMVCSR_H_ */
diff --git a/src/Benchmarks/SpMV/ReferenceFormats/LightSpMV-1.0/Types.h b/src/Benchmarks/SpMV/ReferenceFormats/LightSpMV-1.0/Types.h
new file mode 100644
index 0000000000000000000000000000000000000000..6aca384ffa96a8982c058a784a3acd186c137168
--- /dev/null
+++ b/src/Benchmarks/SpMV/ReferenceFormats/LightSpMV-1.0/Types.h
@@ -0,0 +1,59 @@
+/*
+ * Types.h
+ *
+ *  Created on: Nov 21, 2014
+ *      Author: yongchao
+ */
+
+#ifndef TYPES_H_
+#define TYPES_H_
+
+#include <cuda.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdio.h>
+#include <unistd.h>
+#include <sys/time.h>
+#include <vector>
+#include <iostream>
+using namespace std;
+
+/*program version*/
+#define VERSION "v1.0"
+
+/*macros for cuda array: the shift falls back to 15 when it is not predefined or lies outside [10, 16]; the #undef keeps that fallback from redefining a user-supplied value*/
+#if !defined(SPMV_CUDA_ARRAY_WIDTH_SHIFT) || SPMV_CUDA_ARRAY_WIDTH_SHIFT < 10 || SPMV_CUDA_ARRAY_WIDTH_SHIFT > 16
+#undef SPMV_CUDA_ARRAY_WIDTH_SHIFT
+#define SPMV_CUDA_ARRAY_WIDTH_SHIFT		15
+#endif
+#define SPMV_CUDA_ARRAY_WIDTH_MASK		((1 << SPMV_CUDA_ARRAY_WIDTH_SHIFT) - 1)
+#define SPMV_CUDA_ARRAY_WIDTH 			(1 << SPMV_CUDA_ARRAY_WIDTH_SHIFT)
+/*texture memory: FLOAT_USE_TEXTURE_MEMORY is defined unless NO_FLOAT_TEXTURE_MEMORY is; it selects the cudaTextureObject_t signatures of the csr32 kernels*/
+#ifdef NO_FLOAT_TEXTURE_MEMORY
+#undef FLOAT_USE_TEXTURE_MEMORY
+#else
+#define FLOAT_USE_TEXTURE_MEMORY
+#endif
+
+#ifdef NO_DOUBLE_TEXTURE_MEMORY	/*same switch for the csr64 kernels*/
+#undef DOUBLE_USE_TEXTURE_MEMORY
+#else
+#define DOUBLE_USE_TEXTURE_MEMORY
+#endif
+
+/*maximum number of threads per block; kernel launches divide it by THREADS_PER_VECTOR to get the MAX_NUM_VECTORS_PER_BLOCK template argument*/
+#define MAX_NUM_THREADS_PER_BLOCK			1024
+
+/*error check*/
+#define CudaCheckError() __cudaCheckError( __FILE__, __LINE__ )
+inline void __cudaCheckError(const char* file, const int32_t line) {
+	cudaError err = cudaGetLastError();
+	if (cudaSuccess != err) {
+		cerr << "cudaCheckError() failed at " << file << ":" << line << " : "
+				<< cudaGetErrorString(err) << endl;
+		exit(-1);
+	}
+}
+
+#endif /* TYPES_H_ */
diff --git a/src/Benchmarks/SpMV/ReferenceFormats/LightSpMV-1.0/main.cu b/src/Benchmarks/SpMV/ReferenceFormats/LightSpMV-1.0/main.cu
new file mode 100644
index 0000000000000000000000000000000000000000..0345d0b39c1df757f01d86a302f006c179a0ab61
--- /dev/null
+++ b/src/Benchmarks/SpMV/ReferenceFormats/LightSpMV-1.0/main.cu
@@ -0,0 +1,99 @@
+/*
+ * main.cu
+ *
+ *  Created on: Nov 21, 2014
+ *      Author: yongchao
+ */
+#include "Options.h"
+#include "SpMV.h"
+
+int32_t main(int32_t argc, char* argv[]) {
+	Options opt;
+	float runtime;
+	double gflops;
+	int32_t numIters;
+
+	/*parse the parameters; a failed parse ends the program with -1*/
+	if (!opt.parseArgs(argc, argv)) {
+		return -1;
+	}
+	numIters = opt._numIters;
+
+	/*create the SpMV kernel object: precision picks float/double, _routine picks vector (0) or warp (1)*/
+	SpMV* spmv;
+	if (opt._singlePrecision) {
+		switch (opt._routine) {
+		case 0:
+			spmv = new SpMVFloatVector(&opt);
+			break;
+		case 1:
+			spmv = new SpMVFloatWarp(&opt);
+			break;
+		default:
+			cerr << "Error: unsupported routine number for FLOAT" << endl;
+			return -1;
+		}
+	} else {
+		switch (opt._routine) {
+		case 0:
+			spmv = new SpMVDoubleVector(&opt);
+			break;
+		case 1:
+			spmv = new SpMVDoubleWarp(&opt);
+			break;
+		default:
+			cerr << "Error: unsupported routine number for DOUBLE" << endl;
+			return -1;
+		}
+	}
+
+	/*set device cache; NOTE(review): _routine == 2 never gets here (both switches above return for it), so PreferL1 is always used*/
+	if (opt._routine == 2) {
+		cudaDeviceSetCacheConfig (cudaFuncCachePreferShared);
+	} else {
+		cudaDeviceSetCacheConfig (cudaFuncCachePreferL1);
+	}
+
+	if (opt._singlePrecision) {
+		cerr << "Use single-precision floating point" << endl;
+	} else {
+		cerr << "Use double-precision floating point" << endl;
+	}
+
+	/*print out the statistical information of the sparse matrix*/
+	opt.getRowSizeVariance();
+
+	/*load the data*/
+	spmv->loadData();
+
+	/*run the kernel numIters times between two getSysTime() timestamps*/
+	double stime = spmv->getSysTime();
+	for (int32_t i = 0; i < numIters; ++i) {
+		spmv->spmvKernel();
+	}
+	/*synchronize all kernels once, before the end timestamp is taken*/
+	cudaDeviceSynchronize();
+	double etime = spmv->getSysTime();
+
+	runtime = etime - stime;
+	runtime /= 1000.0 * (float) numIters; /*average per iteration; presumably getSysTime() is in milliseconds (hence 1000) - confirm*/
+	cerr << "Average runtime: " << runtime << " seconds (in " << numIters
+			<< " iterations)" << endl;
+
+	/*compute the GFLOPS: _formula == 0 counts 2 * numValues - 1 operations, otherwise 2 * (numValues + numRows)*/
+	gflops =
+			opt._formula == 0 ?
+					2 * opt._numValues - 1 :
+					2 * (opt._numValues + opt._numRows);
+	cerr << "Total FLOPs: " << (uint64_t) gflops << endl;
+	gflops /= runtime * 1000000000;
+	cerr << "GFLOPS: " << gflops << endl;
+
+	/*store the data*/
+	spmv->storeData();
+
+	/*release the data*/
+	delete spmv;
+
+	return 0;
+}
diff --git a/src/Benchmarks/SpMV/ReferenceFormats/LightSpMVBenchmark.h b/src/Benchmarks/SpMV/ReferenceFormats/LightSpMVBenchmark.h
new file mode 100644
index 0000000000000000000000000000000000000000..7d6ffde49960a1e53b563222af8ab4861accba0f
--- /dev/null
+++ b/src/Benchmarks/SpMV/ReferenceFormats/LightSpMVBenchmark.h
@@ -0,0 +1,176 @@
+/***************************************************************************
+                          LightSpMVBenchmark.h  -  description
+                             -------------------
+    begin                : Apr 23, 2021
+    copyright            : (C) 2021 by Tomas Oberhuber et al.
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+/***
+ * Wrapper of original LightSpMV kernels for TNL benchmarks.
+ */
+
+#include <stdexcept>
+#ifdef HAVE_CUDA
+#pragma push
+#pragma diag_suppress = 1444
+#include "LightSpMV-1.0/SpMV.h"
+#include "LightSpMV-1.0/SpMV.cu"
+#include "LightSpMV-1.0/SpMVCSR.cu"
+#pragma pop
+#endif
+#include <TNL/Matrices/SparseMatrix.h>
+
+namespace TNL {
+
+enum LightSpMVBenchmarkKernelType { LightSpMVBenchmarkKernelVector, LightSpMVBenchmarkKernelWarp };
+
+// Generic case: the view and the data use different real types, so bind() does nothing.
+template< typename Real1, typename Real2 >
+struct LightSpMVVectorsBinder
+{
+   template< typename Index >
+   static void bind( TNL::Containers::VectorView< Real1, TNL::Devices::Cuda, Index >&, Real2*, Index ) {}
+};
+
+// Matching real types: the view is bound to the given data with the given size.
+template< typename Real >
+struct LightSpMVVectorsBinder< Real, Real >
+{
+   template< typename Index >
+   static void bind( TNL::Containers::VectorView< Real, TNL::Devices::Cuda, Index >& view, Real* buffer, Index length )
+   { view.bind( buffer, length ); }
+};
+
+/***
+ * Adapter running the original LightSpMV CUDA kernels within the TNL SpMV benchmark.
+ */
+template< typename Real >
+struct LightSpMVBenchmark
+{
+   using RealType = Real;
+   using DeviceType = TNL::Devices::Host;
+   using IndexType = uint32_t;
+   using VectorType = TNL::Containers::Vector< RealType, DeviceType, IndexType >;
+   using CudaVectorView = TNL::Containers::VectorView< RealType, TNL::Devices::Cuda, IndexType >;
+
+   /***
+    * Passes the raw CSR arrays of the host matrix and the host in/out vectors to the
+    * LightSpMV options (pointers only, nothing is copied here), creates the kernel object
+    * selected by kernelType, calls its loadData() and binds the CUDA views to its first
+    * X/Y buffers. Without HAVE_CUDA only the host vectors are initialized.
+    * Throws std::runtime_error if Real is neither float nor double (CUDA builds only).
+    */
+   template< typename Matrix >
+   LightSpMVBenchmark( Matrix& matrix, LightSpMVBenchmarkKernelType kernelType )
+   : inVector( matrix.getColumns(), 1.0 ),
+     outVector( matrix.getRows(), 0.0 ),
+     kernelType( kernelType )
+   {
+      static_assert( std::is_same< typename Matrix::DeviceType, TNL::Devices::Host >::value, "The only device type accepted here is TNL::Devices::Host." );
+#ifdef HAVE_CUDA
+      // LightSpMV is set up for a single GPU, device 0.
+      cudaDeviceProp prop;
+      cudaGetDeviceProperties(&prop, 0);
+      opt._gpus.push_back(make_pair(0, prop));
+      opt._numGPUs = 1;
+      opt._numRows = matrix.getRows();
+      opt._numCols = matrix.getColumns();
+      opt._rowOffsets = matrix.getRowPointers().getData();
+      opt._numValues = matrix.getValues().getSize();
+      opt._colIndexValues = matrix.getColumnIndexes().getData();
+      opt._numericalValues = matrix.getValues().getData();
+      opt._alpha = 1.0; // matrix multiplicator
+      opt._beta = 0.0;  // output vector multiplicator
+      opt._vectorX = inVector.getData();
+      opt._vectorY = outVector.getData();
+      opt._formula = 0;
+      if( std::is_same< Real, float >::value )
+      {
+         if( kernelType == LightSpMVBenchmarkKernelVector )
+            this->spmv = new SpMVFloatVector( &opt );
+         else
+            this->spmv = new SpMVFloatWarp( &opt );
+      }
+      else if( std::is_same< Real, double >::value )
+      {
+         if( kernelType == LightSpMVBenchmarkKernelVector )
+            this->spmv = new SpMVDoubleVector( &opt );
+         else
+            this->spmv = new SpMVDoubleWarp( &opt );
+      }
+      else throw std::runtime_error( "Unknown real type for LightSpMV." );
+      this->spmv->loadData();
+      // Both branches are compiled for every Real; LightSpMVVectorsBinder makes the mismatching one a no-op.
+      // The warp kernels are reached through the vector kernel pointer types, which they convert to.
+      if( std::is_same< Real, float >::value )
+      {
+         SpMVFloatVector* floatSpMV = dynamic_cast< SpMVFloatVector* >( this->spmv );
+         LightSpMVVectorsBinder< Real, float >::bind( this->inVectorView, floatSpMV->_vectorX[ 0 ], matrix.getColumns() );
+         LightSpMVVectorsBinder< Real, float >::bind( this->outVectorView, floatSpMV->_vectorY[ 0 ], matrix.getRows() );
+      }
+      else if( std::is_same< Real, double >::value )
+      {
+         SpMVDoubleVector* doubleSpMV = dynamic_cast< SpMVDoubleVector* >( this->spmv );
+         LightSpMVVectorsBinder< Real, double >::bind( this->inVectorView, doubleSpMV->_vectorX[ 0 ], matrix.getColumns() );
+         LightSpMVVectorsBinder< Real, double >::bind( this->outVectorView, doubleSpMV->_vectorY[ 0 ], matrix.getRows() );
+      }
+      else throw std::runtime_error( "Unknown real type for LightSpMV." );
+#endif
+   }
+
+   // Only stores the value; the kernel object created in the constructor is not replaced.
+   void setKernelType( LightSpMVBenchmarkKernelType type )
+   {
+      this->kernelType = type;
+   }
+
+   // Sets every element of the input view to 1 and of the output view to 0.
+   void resetVectors()
+   {
+      this->inVectorView = 1.0;
+      this->outVectorView = 0.0;
+   }
+
+   // Runs the LightSpMV kernel once and waits for the device; does nothing without HAVE_CUDA.
+   void vectorProduct()
+   {
+#ifdef HAVE_CUDA
+      this->spmv->spmvKernel();
+      cudaDeviceSynchronize();
+#endif
+   }
+
+   // Returns the view bound to the first Y buffer of the kernel object.
+   const CudaVectorView& getCudaOutVector()
+   {
+      return this->outVectorView;
+   }
+
+   // Deletes the kernel object and clears the borrowed pointers stored in the options.
+   ~LightSpMVBenchmark()
+   {
+#ifdef HAVE_CUDA
+      delete spmv;
+      // Presumably Options would otherwise release arrays owned by the matrix and the vectors - confirm.
+      opt._rowOffsets = nullptr;
+      opt._colIndexValues = nullptr;
+      opt._numericalValues = nullptr;
+      opt._vectorX = nullptr;
+      opt._vectorY = nullptr;
+#endif
+   }
+
+   protected:
+#ifdef HAVE_CUDA
+      Options opt;
+      SpMV* spmv = nullptr;
+#endif
+      VectorType  inVector, outVector;
+      CudaVectorView inVectorView, outVectorView;
+      LightSpMVBenchmarkKernelType kernelType = LightSpMVBenchmarkKernelVector;
+};
+
+} // namespace TNL
diff --git a/src/Benchmarks/SpMV/SpmvBenchmarkResult.h b/src/Benchmarks/SpMV/SpmvBenchmarkResult.h
index 7f688b7cb7f55b3c78bba51570bc124748cf2cf5..61fae4f609601f6a35a08160c9132f471cd4d72c 100644
--- a/src/Benchmarks/SpMV/SpmvBenchmarkResult.h
+++ b/src/Benchmarks/SpMV/SpmvBenchmarkResult.h
@@ -17,33 +17,58 @@ namespace Benchmarks {
 
 template< typename Real,
           typename Device,
-          typename Index >
+          typename Index,
+          typename ResultReal = Real,
+          typename Logger = JsonLogging >
 struct SpmvBenchmarkResult
-: public BenchmarkResult
+: public BenchmarkResult< Logger >
 {
    using RealType = Real;
    using DeviceType = Device;
    using IndexType = Index;
    using HostVector = Containers::Vector< Real, Devices::Host, Index >;
-   using BenchmarkVector = Containers::Vector< Real, Device, Index >;
+   using BenchmarkVector = Containers::Vector< ResultReal, Device, Index >;
 
-   SpmvBenchmarkResult( const HostVector& csrResult,
+   using typename BenchmarkResult< Logger >::HeaderElements;
+   using typename BenchmarkResult< Logger >::RowElements;
+   using BenchmarkResult< Logger >::stddev;
+   using BenchmarkResult< Logger >::bandwidth;
+   using BenchmarkResult< Logger >::speedup;
+   using BenchmarkResult< Logger >::time;
+
+
+   SpmvBenchmarkResult( const String& format,
+                        const HostVector& csrResult,
                         const BenchmarkVector& benchmarkResult,
                         const IndexType nonzeros )
-   : csrResult( csrResult ), benchmarkResult( benchmarkResult ), nonzeros( nonzeros ){};
+   : format( format ), csrResult( csrResult ), benchmarkResult( benchmarkResult ), nonzeros( nonzeros ){};
 
    virtual HeaderElements getTableHeader() const override
    {
-      return HeaderElements( {"non-zeros", "time", "stddev", "stddev/time", "bandwidth", "speedup", "CSR Diff.Max", "CSR Diff.L2"} );
+      return HeaderElements( {
+         std::pair< String, int >( "format", 35 ),
+         std::pair< String, int >( "device", 12 ),
+         std::pair< String, int >( "non-zeros", 12 ),
+         std::pair< String, int >( "time", 12 ),
+         std::pair< String, int >( "stddev", 12 ),
+         std::pair< String, int >( "stddev/time", 14 ),
+         std::pair< String, int >( "bandwidth", 12 ),
+         std::pair< String, int >( "speedup", 12 ),
+         std::pair< String, int >( "CSR Diff.Max", 14 ),
+         std::pair< String, int >( "CSR Diff.L2", 14 ) } );
    }
 
+   void setFormat( const String& format ) { this->format = format; };
+
    virtual RowElements getRowElements() const override
    {
       HostVector benchmarkResultCopy;
       benchmarkResultCopy = benchmarkResult;
       auto diff = csrResult - benchmarkResultCopy;
       RowElements elements;
-      elements << nonzeros << time << stddev << stddev/time << bandwidth;
+      elements << format
+               << ( std::is_same< Device, Devices::Host >::value ? "CPU" : "GPU" )
+               << nonzeros << time << stddev << stddev/time << bandwidth;
       if( speedup != 0.0 )
          elements << speedup;
       else elements << "N/A";
@@ -51,10 +76,11 @@ struct SpmvBenchmarkResult
       return elements;
    }
 
+   String format;
    const HostVector& csrResult;
    const BenchmarkVector& benchmarkResult;
    const IndexType nonzeros;
 };
-   
+
 } //namespace Benchmarks
 } //namespace TNL
diff --git a/src/Benchmarks/SpMV/cmake/BuildCSR5.cmake b/src/Benchmarks/SpMV/cmake/BuildCSR5.cmake
new file mode 100644
index 0000000000000000000000000000000000000000..51a4a0be3a833831ce9d2d89fb34fdb94c1d82d6
--- /dev/null
+++ b/src/Benchmarks/SpMV/cmake/BuildCSR5.cmake
@@ -0,0 +1,28 @@
+# Optional CSR5 reference benchmark: the CSR5 sources are downloaded at configure time.
+# The CUDA_SAMPLES_DIR environment variable is required; its common/inc directory is added to the include paths.
+set( CUDA_SAMPLES_DIR $ENV{CUDA_SAMPLES_DIR} )
+if( NOT CUDA_SAMPLES_DIR )
+    message( WARNING "CUDA_SAMPLES_DIR variable was not set and it is required by CSR5 benchmark - CSR5 benchmark is disabled.")
+else()
+    # Generate the stand-alone download project and run its configure and build steps right now
+    message( STATUS "CUDA_SAMPLES_DIR set to ${CUDA_SAMPLES_DIR}")
+    configure_file(cmake/CSR5.cmake.in csr5-download/CMakeLists.txt)
+    execute_process(COMMAND ${CMAKE_COMMAND} -G "${CMAKE_GENERATOR}" .
+        RESULT_VARIABLE result
+        WORKING_DIRECTORY "${CMAKE_BINARY_DIR}/src/Benchmarks/SpMV/csr5-download" )
+    if(result)
+        message(WARNING "CMake step for CSR5 failed: ${result}")
+    else()
+        execute_process(COMMAND ${CMAKE_COMMAND} --build .
+            RESULT_VARIABLE result
+            WORKING_DIRECTORY "${CMAKE_BINARY_DIR}/src/Benchmarks/SpMV/csr5-download" )
+        if(result)
+            message( "${CMAKE_COMMAND} --build ${CMAKE_BINARY_DIR}/src/Benchmarks/SpMV/csr5-download" ) # one string: message() joins separate arguments without spaces
+            message(WARNING "Build step for CSR5 failed: ${result}")
+        else()
+            list( APPEND CXX_BENCHMARKS_FLAGS "-DHAVE_CSR5" )
+            list( APPEND CXX_BENCHMARKS_INCLUDE_DIRS "${CMAKE_BINARY_DIR}/src/Benchmarks/SpMV/csr5-src" "${CUDA_SAMPLES_DIR}/common/inc" )
+            message( STATUS "CSR5 build was successful.")
+        endif()
+    endif()
+endif()
diff --git a/src/Benchmarks/SpMV/cmake/CSR5.cmake.in b/src/Benchmarks/SpMV/cmake/CSR5.cmake.in
new file mode 100644
index 0000000000000000000000000000000000000000..14a0c61ae0c76d81c3188dd2aa4dbc0564d68c93
--- /dev/null
+++ b/src/Benchmarks/SpMV/cmake/CSR5.cmake.in
@@ -0,0 +1,24 @@
+# vim: ft=cmake
+
+# Template of a stand-alone CMakeLists.txt that fetches the CSR5 sources as a separate project
+
+cmake_minimum_required(VERSION 2.8.2)
+
+project(csr5-download NONE)
+
+include(ExternalProject)
+ExternalProject_Add(csr5
+  GIT_REPOSITORY    https://github.com/weifengliu-ssslab/Benchmark_SpMV_using_CSR5.git
+  #GIT_TAG           master
+  # NOTE(review): no GIT_TAG is active, so ExternalProject uses its default; the tag below looks like a leftover from the GTest template - confirm
+  #GIT_TAG           v1.10.x
+  SOURCE_DIR        "${CMAKE_BINARY_DIR}/src/Benchmarks/SpMV/csr5-src"
+  BINARY_DIR        "${CMAKE_BINARY_DIR}/src/Benchmarks/SpMV/csr5-build"
+  CONFIGURE_COMMAND ""
+  BUILD_COMMAND     ""
+  INSTALL_COMMAND   ""
+  TEST_COMMAND      ""
+  # Disable update of the external project in an offline build
+  # reference: https://stackoverflow.com/a/40423683
+  UPDATE_DISCONNECTED ${OFFLINE_BUILD}
+)
diff --git a/src/Benchmarks/SpMV/spmv.h b/src/Benchmarks/SpMV/spmv.h
index 652ed94053ba5a9dd0e8c06f661b539f61fe6fc6..559adadfff6078cd09d98daa006a21e8f52add8c 100644
--- a/src/Benchmarks/SpMV/spmv.h
+++ b/src/Benchmarks/SpMV/spmv.h
@@ -14,7 +14,10 @@
 
 #pragma once
 
+#include <cstdint>
+
 #include "../Benchmarks.h"
+#include "../JsonLogging.h"
 #include "SpmvBenchmarkResult.h"
 
 #include <TNL/Pointers/DevicePointer.h>
@@ -35,14 +38,35 @@
 #include <TNL/Algorithms/Segments/SlicedEllpack.h>
 #include <TNL/Algorithms/Segments/ChunkedEllpack.h>
 #include <TNL/Algorithms/Segments/BiEllpack.h>
+
+#ifdef HAVE_PETSC
+#include <petscmat.h>
+#endif
+
+// Comment out any of the following defines to turn off a group of SpMV benchmarks and speed up the compilation
+#define WITH_TNL_BENCHMARK_SPMV_GENERAL_MATRICES
+#define WITH_TNL_BENCHMARK_SPMV_SYMMETRIC_MATRICES
+#define WITH_TNL_BENCHMARK_SPMV_BINARY_MATRICES
+#define WITH_TNL_BENCHMARK_SPMV_LEGACY_FORMATS
+
+// Uncomment the following line to enable benchmarking the sandbox sparse matrix.
+//#define WITH_TNL_BENCHMARK_SPMV_SANDBOX_MATRIX
+#ifdef WITH_TNL_BENCHMARK_SPMV_SANDBOX_MATRIX
+#include <TNL/Matrices/Sandbox/SparseSandboxMatrix.h>
+#endif
+
 using namespace TNL::Matrices;
 
 #include <Benchmarks/SpMV/ReferenceFormats/cusparseCSRMatrix.h>
 #include <Benchmarks/SpMV/ReferenceFormats/cusparseCSRMatrixLegacy.h>
+#include <Benchmarks/SpMV/ReferenceFormats/LightSpMVBenchmark.h>
+#include <Benchmarks/SpMV/ReferenceFormats/CSR5Benchmark.h>
 
 namespace TNL {
    namespace Benchmarks {
-      namespace SpMVLegacy {
+      namespace SpMV {
+
+using BenchmarkType = TNL::Benchmarks::Benchmark< JsonLogging >;
 
 /////
 // General sparse matrix aliases
@@ -56,6 +80,9 @@ using SparseMatrix_CSR_Vector = Matrices::SparseMatrix< Real, Device, Index, Mat
 template< typename Real, typename Device, typename Index >
 using SparseMatrix_CSR_Hybrid = Matrices::SparseMatrix< Real, Device, Index, Matrices::GeneralMatrix, Algorithms::Segments::CSRHybrid >;
 
+template< typename Real, typename Device, typename Index >
+using SparseMatrix_CSR_Light = Matrices::SparseMatrix< Real, Device, Index, Matrices::GeneralMatrix, Algorithms::Segments::CSRLight >;
+
 template< typename Real, typename Device, typename Index >
 using SparseMatrix_CSR_Adaptive = Matrices::SparseMatrix< Real, Device, Index, Matrices::GeneralMatrix, Algorithms::Segments::CSRAdaptive >;
 
@@ -95,6 +122,9 @@ using SymmetricSparseMatrix_CSR_Vector = Matrices::SparseMatrix< Real, Device, I
 template< typename Real, typename Device, typename Index >
 using SymmetricSparseMatrix_CSR_Hybrid = Matrices::SparseMatrix< Real, Device, Index, Matrices::SymmetricMatrix, Algorithms::Segments::CSRHybrid >;
 
+template< typename Real, typename Device, typename Index >
+using SymmetricSparseMatrix_CSR_Light = Matrices::SparseMatrix< Real, Device, Index, Matrices::SymmetricMatrix, Algorithms::Segments::CSRLight >;
+
 template< typename Real, typename Device, typename Index >
 using SymmetricSparseMatrix_CSR_Adaptive = Matrices::SparseMatrix< Real, Device, Index, Matrices::SymmetricMatrix, Algorithms::Segments::CSRAdaptive >;
 
@@ -122,6 +152,10 @@ using BiEllpackSegments = Algorithms::Segments::BiEllpack< Device, Index, IndexA
 template< typename Real, typename Device, typename Index >
 using SymmetricSparseMatrix_BiEllpack = Matrices::SparseMatrix< Real, Device, Index, Matrices::SymmetricMatrix, BiEllpackSegments >;
 
+#ifdef WITH_TNL_BENCHMARK_SPMV_SANDBOX_MATRIX
+template< typename Real, typename Device, typename Index >
+using SparseSandboxMatrix = Matrices::Sandbox::SparseSandboxMatrix< Real, Device, Index, Matrices::GeneralMatrix >;
+#endif
 
 /////
 // Legacy formats
@@ -162,59 +196,14 @@ using SparseMatrixLegacy_CSR_LightWithoutAtomic = Benchmarks::SpMV::ReferenceFor
 template< typename Real, typename Device, typename Index >
 using SlicedEllpackAlias = Benchmarks::SpMV::ReferenceFormats::Legacy::SlicedEllpack< Real, Device, Index >;
 
-// Get the name (with extension) of input matrix file
-std::string getMatrixFileName( const String& InputFileName )
-{
-    std::string fileName = InputFileName;
-
-    const size_t last_slash_idx = fileName.find_last_of( "/\\" );
-    if( std::string::npos != last_slash_idx )
-        fileName.erase( 0, last_slash_idx + 1 );
-
-    return fileName;
-}
-
-// Get only the name of the format from getType()
-template< typename Matrix >
-std::string getMatrixFormat( const Matrix& matrix )
-{
-    std::string mtrxFullType = getType( matrix );
-    std::string mtrxType = mtrxFullType.substr( 0, mtrxFullType.find( "<" ) );
-    std::string format = mtrxType.substr( mtrxType.find( ':' ) + 2 );
-
-    return format;
-}
-
-template< typename Matrix >
-std::string getFormatShort( const Matrix& matrix )
-{
-    std::string mtrxFullType = getType( matrix );
-    std::string mtrxType = mtrxFullType.substr( 0, mtrxFullType.find( "<" ) );
-    std::string format = mtrxType.substr( mtrxType.find( ':' ) + 2 );
-    format = format.substr( format.find(':') + 2);
-    format = format.substr( 0, 3 );
-
-    return format;
-}
-
-// Print information about the matrix.
-template< typename Matrix >
-void printMatrixInfo( const Matrix& matrix,
-                      std::ostream& str )
-{
-    str << "\n Format: " << getMatrixFormat( matrix ) << std::endl;
-    str << " Rows: " << matrix.getRows() << std::endl;
-    str << " Cols: " << matrix.getColumns() << std::endl;
-    str << " Nonzero Elements: " << matrix.getNumberOfNonzeroMatrixElements() << std::endl;
-}
-
 template< typename Real,
           template< typename, typename, typename > class Matrix,
           template< typename, typename, typename, typename > class Vector = Containers::Vector >
 void
-benchmarkSpMVLegacy( Benchmark& benchmark,
+benchmarkSpMVLegacy( BenchmarkType& benchmark,
                      const TNL::Containers::Vector< Real, Devices::Host, int >& csrResultVector,
                      const String& inputFileName,
+                     bool allCpuTests,
                      bool verboseMR )
 {
    using HostMatrix = Matrix< Real, TNL::Devices::Host, int >;
@@ -225,14 +214,16 @@ benchmarkSpMVLegacy( Benchmark& benchmark,
    HostMatrix hostMatrix;
    CudaMatrix cudaMatrix;
 
-   SpMV::ReferenceFormats::Legacy::LegacyMatrixReader< HostMatrix >::readMtxFile( inputFileName, hostMatrix, verboseMR );
+   try
+   {
+      SpMV::ReferenceFormats::Legacy::LegacyMatrixReader< HostMatrix >::readMtxFile( inputFileName, hostMatrix, verboseMR );
+   }
+   catch(const std::exception& e)
+   {
+      std::cerr << "Unable to read the matrix: " << e.what() << std::endl;
+      return;
+   }
 
-   benchmark.setMetadataColumns( Benchmark::MetadataColumns({
-         { "matrix name", convertToString( inputFileName ) },
-         { "rows", convertToString( hostMatrix.getRows() ) },
-         { "columns", convertToString( hostMatrix.getColumns() ) },
-         { "matrix format", MatrixInfo< HostMatrix >::getFormat() }
-      } ));
    const int elements = hostMatrix.getNonzeroElementsCount();
    const double datasetSize = (double) elements * ( 2 * sizeof( Real ) + sizeof( int ) ) / oneGB;
    benchmark.setOperation( datasetSize );
@@ -240,25 +231,37 @@ benchmarkSpMVLegacy( Benchmark& benchmark,
    /////
    // Benchmark SpMV on host
    //
-   HostVector hostInVector( hostMatrix.getColumns() ), hostOutVector( hostMatrix.getRows() );
+   if( allCpuTests )
+   {
+      HostVector hostInVector( hostMatrix.getColumns() ), hostOutVector( hostMatrix.getRows() );
 
-   auto resetHostVectors = [&]() {
-      hostInVector = 1.0;
-      hostOutVector = 0.0;
-   };
+      auto resetHostVectors = [&]() {
+         hostInVector = 1.0;
+         hostOutVector = 0.0;
+      };
 
-   auto spmvHost = [&]() {
-      hostMatrix.vectorProduct( hostInVector, hostOutVector );
+      auto spmvHost = [&]() {
+         hostMatrix.vectorProduct( hostInVector, hostOutVector );
 
-   };
-   SpmvBenchmarkResult< Real, Devices::Host, int > hostBenchmarkResults( csrResultVector, hostOutVector, hostMatrix.getNonzeroElementsCount() );
-   benchmark.time< Devices::Host >( resetHostVectors, "CPU", spmvHost, hostBenchmarkResults );
+      };
+      SpmvBenchmarkResult< Real, Devices::Host, int > hostBenchmarkResults( MatrixInfo< HostMatrix >::getFormat(), csrResultVector, hostOutVector, hostMatrix.getNonzeroElementsCount() );
+      benchmark.time< Devices::Host >( resetHostVectors, "CPU", spmvHost, hostBenchmarkResults );
+   }
 
    /////
    // Benchmark SpMV on CUDA
    //
 #ifdef HAVE_CUDA
-   cudaMatrix = hostMatrix;
+   try
+   {
+      cudaMatrix = hostMatrix;
+   }
+   catch(const std::exception& e)
+   {
+      std::cerr << "Unable to copy the matrix on GPU: " << e.what() << std::endl;
+      return;
+   }
+
    CudaVector cudaInVector( hostMatrix.getColumns() ), cudaOutVector( hostMatrix.getRows() );
 
    auto resetCudaVectors = [&]() {
@@ -269,10 +272,9 @@ benchmarkSpMVLegacy( Benchmark& benchmark,
    auto spmvCuda = [&]() {
       cudaMatrix.vectorProduct( cudaInVector, cudaOutVector );
    };
-   SpmvBenchmarkResult< Real, Devices::Cuda, int > cudaBenchmarkResults( csrResultVector, cudaOutVector, cudaMatrix.getNonzeroElementsCount() );
+   SpmvBenchmarkResult< Real, Devices::Cuda, int > cudaBenchmarkResults( MatrixInfo< HostMatrix >::getFormat(), csrResultVector, cudaOutVector, cudaMatrix.getNonzeroElementsCount() );
    benchmark.time< Devices::Cuda >( resetCudaVectors, "GPU", spmvCuda, cudaBenchmarkResults );
  #endif
-    std::cout << std::endl;
 }
 
 template< typename Real,
@@ -280,10 +282,11 @@ template< typename Real,
           template< typename, typename, typename > class Matrix,
           template< typename, typename, typename, typename > class Vector = Containers::Vector >
 void
-benchmarkSpMV( Benchmark& benchmark,
+benchmarkSpMV( BenchmarkType& benchmark,
                const InputMatrix& inputMatrix,
                const TNL::Containers::Vector< Real, Devices::Host, int >& csrResultVector,
                const String& inputFileName,
+               bool allCpuTests,
                bool verboseMR )
 {
    using HostMatrix = Matrix< Real, TNL::Devices::Host, int >;
@@ -298,16 +301,10 @@ benchmarkSpMV( Benchmark& benchmark,
    }
    catch(const std::exception& e)
    {
-      std::cerr << "Unable to convert the matrix to the target format." << std::endl;
+      std::cerr << "Unable to convert the matrix to the target format:"  << e.what() << std::endl;
       return;
    }
 
-   benchmark.setMetadataColumns( Benchmark::MetadataColumns({
-         { "matrix name", convertToString( inputFileName ) },
-         { "rows", convertToString( hostMatrix.getRows() ) },
-         { "columns", convertToString( hostMatrix.getColumns() ) },
-         { "matrix format", MatrixInfo< HostMatrix >::getFormat() }
-      } ));
    const int elements = hostMatrix.getNonzeroElementsCount();
    const double datasetSize = (double) elements * ( 2 * sizeof( Real ) + sizeof( int ) ) / oneGB;
    benchmark.setOperation( datasetSize );
@@ -315,26 +312,224 @@ benchmarkSpMV( Benchmark& benchmark,
    /////
    // Benchmark SpMV on host
    //
-   HostVector hostInVector( hostMatrix.getColumns() ), hostOutVector( hostMatrix.getRows() );
+   if( allCpuTests )
+   {
+      HostVector hostInVector( hostMatrix.getColumns() ), hostOutVector( hostMatrix.getRows() );
 
-   auto resetHostVectors = [&]() {
-      hostInVector = 1.0;
-      hostOutVector = 0.0;
+      auto resetHostVectors = [&]() {
+         hostInVector = 1.0;
+         hostOutVector = 0.0;
+      };
+
+      auto spmvHost = [&]() {
+         hostMatrix.vectorProduct( hostInVector, hostOutVector );
+
+      };
+      SpmvBenchmarkResult< Real, Devices::Host, int > hostBenchmarkResults( MatrixInfo< HostMatrix >::getFormat(), csrResultVector, hostOutVector, hostMatrix.getNonzeroElementsCount() );
+      benchmark.time< Devices::Host >( resetHostVectors, "CPU", spmvHost, hostBenchmarkResults );
+   }
+
+   /////
+   // Benchmark SpMV on CUDA
+   //
+#ifdef HAVE_CUDA
+   CudaMatrix cudaMatrix;
+   try
+   {
+      cudaMatrix = inputMatrix;
+   }
+   catch(const std::exception& e)
+   {
+      std::cerr << "Unable to copy the matrix on GPU:" << e.what() << std::endl;
+      return;
+   }
+
+   CudaVector cudaInVector( hostMatrix.getColumns() ), cudaOutVector( hostMatrix.getRows() );
+
+   auto resetCudaVectors = [&]() {
+      cudaInVector = 1.0;
+      cudaOutVector = 0.0;
+   };
+
+   auto spmvCuda = [&]() {
+      cudaMatrix.vectorProduct( cudaInVector, cudaOutVector );
+   };
+   SpmvBenchmarkResult< Real, Devices::Cuda, int > cudaBenchmarkResults( MatrixInfo< HostMatrix >::getFormat(), csrResultVector, cudaOutVector, cudaMatrix.getNonzeroElementsCount() );
+   benchmark.time< Devices::Cuda >( resetCudaVectors, "GPU", spmvCuda, cudaBenchmarkResults );
+ #endif
+}
+
+template< typename Real,
+          typename InputMatrix,
+          template< typename, typename, typename > class Matrix,
+          typename TestReal = Real,
+          template< typename, typename, typename, typename > class Vector = Containers::Vector >
+void
+benchmarkSpMVCSRLight( BenchmarkType& benchmark,
+                       const InputMatrix& inputMatrix,
+                       const TNL::Containers::Vector< Real, Devices::Host, int >& csrResultVector,
+                       const String& inputFileName,
+                       bool allCpuTests,
+                       bool verboseMR )
+{
+   using HostMatrix = Matrix< TestReal, TNL::Devices::Host, int >;
+   using CudaMatrix = Matrix< TestReal, TNL::Devices::Cuda, int >;
+   using HostVector = Containers::Vector< Real, Devices::Host, int >;
+   using CudaVector = Containers::Vector< Real, Devices::Cuda, int >;
+
+   HostMatrix hostMatrix;
+   try
+   {
+      hostMatrix = inputMatrix;
+   }
+   catch(const std::exception& e)
+   {
+      std::cerr << "Unable to convert the matrix to the target format:"  << e.what() << std::endl;
+      return;
+   }
+
+   const int elements = hostMatrix.getNonzeroElementsCount();
+   const double datasetSize = (double) elements * ( 2 * sizeof( Real ) + sizeof( int ) ) / oneGB;
+   benchmark.setOperation( datasetSize );
+
+   /////
+   // Benchmark SpMV on host
+   //
+   if( allCpuTests )
+   {
+      HostVector hostInVector( hostMatrix.getColumns() ), hostOutVector( hostMatrix.getRows() );
+
+      auto resetHostVectors = [&]() {
+         hostInVector = 1.0;
+         hostOutVector = 0.0;
+      };
+
+      auto spmvHost = [&]() {
+         hostMatrix.vectorProduct( hostInVector, hostOutVector );
+
+      };
+      SpmvBenchmarkResult< Real, Devices::Host, int > hostBenchmarkResults( MatrixInfo< HostMatrix >::getFormat(), csrResultVector, hostOutVector, hostMatrix.getNonzeroElementsCount() );
+      benchmark.time< Devices::Host >( resetHostVectors, "CPU", spmvHost, hostBenchmarkResults );
+   }
+
+   /////
+   // Benchmark SpMV on CUDA
+   //
+#ifdef HAVE_CUDA
+   CudaMatrix cudaMatrix;
+   try
+   {
+      cudaMatrix = inputMatrix;
+   }
+   catch(const std::exception& e)
+   {
+      std::cerr << "Unable to copy the matrix on GPU:" << e.what() << std::endl;
+      return;
+   }
+
+   CudaVector cudaInVector( hostMatrix.getColumns() ), cudaOutVector( hostMatrix.getRows() );
+
+   auto resetCudaVectors = [&]() {
+      cudaInVector = 1.0;
+      cudaOutVector = 0.0;
+   };
+
+   auto spmvCuda = [&]() {
+      cudaMatrix.vectorProduct( cudaInVector, cudaOutVector );
    };
 
-   auto spmvHost = [&]() {
-      hostMatrix.vectorProduct( hostInVector, hostOutVector );
+   {
+      cudaMatrix.getSegments().getKernel().setThreadsMapping( Algorithms::Segments::CSRLightAutomaticThreads );
+      String format = MatrixInfo< HostMatrix >::getFormat() + " Automatic";
+      SpmvBenchmarkResult< Real, Devices::Cuda, int > cudaBenchmarkResults( format, csrResultVector, cudaOutVector, cudaMatrix.getNonzeroElementsCount() );
+      benchmark.time< Devices::Cuda >( resetCudaVectors, "GPU", spmvCuda, cudaBenchmarkResults );
+   };
 
+   {
+      cudaMatrix.getSegments().getKernel().setThreadsMapping( Algorithms::Segments::CSRLightAutomaticThreadsLightSpMV );
+      String format = MatrixInfo< HostMatrix >::getFormat() + " Automatic Light";
+      SpmvBenchmarkResult< Real, Devices::Cuda, int > cudaBenchmarkResults( format, csrResultVector, cudaOutVector, cudaMatrix.getNonzeroElementsCount() );
+      benchmark.time< Devices::Cuda >( resetCudaVectors, "GPU", spmvCuda, cudaBenchmarkResults );
    };
-   SpmvBenchmarkResult< Real, Devices::Host, int > hostBenchmarkResults( csrResultVector, hostOutVector, hostMatrix.getNonzeroElementsCount() );
-   benchmark.time< Devices::Host >( resetHostVectors, "CPU", spmvHost, hostBenchmarkResults );
+
+   /*for( auto threadsPerRow : std::vector< int >{ 1, 2, 4, 8, 16, 32 } )
+   {
+      cudaMatrix.getSegments().getKernel().setThreadsPerSegment( threadsPerRow );
+      String format = MatrixInfo< HostMatrix >::getFormat() + " " + convertToString( threadsPerRow );
+      SpmvBenchmarkResult< Real, Devices::Cuda, int > cudaBenchmarkResults( format, csrResultVector, cudaOutVector, cudaMatrix.getNonzeroElementsCount() );
+      benchmark.time< Devices::Cuda >( resetCudaVectors, "GPU", spmvCuda, cudaBenchmarkResults );
+   }*/
+ #endif
+}
+
+
+template< typename Real,
+          typename InputMatrix,
+          template< typename, typename, typename > class Matrix,
+          template< typename, typename, typename, typename > class Vector = Containers::Vector >
+void
+benchmarkBinarySpMV( BenchmarkType& benchmark,
+                     const InputMatrix& inputMatrix,
+                     const TNL::Containers::Vector< Real, Devices::Host, int >& csrResultVector,
+                     const String& inputFileName,
+                     bool allCpuTests,
+                     bool verboseMR )
+{
+   using HostMatrix = Matrix< bool, TNL::Devices::Host, int >;
+   using CudaMatrix = Matrix< bool, TNL::Devices::Cuda, int >;
+   using HostVector = Containers::Vector< Real, Devices::Host, int >;
+   using CudaVector = Containers::Vector< Real, Devices::Cuda, int >;
+
+   HostMatrix hostMatrix;
+   try
+   {
+      hostMatrix = inputMatrix;
+   }
+   catch(const std::exception& e)
+   {
+      std::cerr << "Unable to convert the matrix to the target format:" << e.what() << std::endl;
+      return;
+   }
+
+   const int elements = hostMatrix.getNonzeroElementsCount();
+   const double datasetSize = (double) elements * ( 2 * sizeof( Real ) + sizeof( int ) ) / oneGB;
+   benchmark.setOperation( datasetSize );
+
+   /////
+   // Benchmark SpMV on host
+   //
+   if( allCpuTests )
+   {
+      HostVector hostInVector( hostMatrix.getColumns() ), hostOutVector( hostMatrix.getRows() );
+
+      auto resetHostVectors = [&]() {
+         hostInVector = 1.0;
+         hostOutVector = 0.0;
+      };
+
+      auto spmvHost = [&]() {
+         hostMatrix.vectorProduct( hostInVector, hostOutVector );
+
+      };
+      SpmvBenchmarkResult< Real, Devices::Host, int > hostBenchmarkResults( MatrixInfo< HostMatrix >::getFormat(), csrResultVector, hostOutVector, hostMatrix.getNonzeroElementsCount() );
+      benchmark.time< Devices::Host >( resetHostVectors, "CPU", spmvHost, hostBenchmarkResults );
+   }
 
    /////
    // Benchmark SpMV on CUDA
    //
 #ifdef HAVE_CUDA
    CudaMatrix cudaMatrix;
-   cudaMatrix = inputMatrix;
+   try
+   {
+      cudaMatrix = inputMatrix;
+   }
+   catch(const std::exception& e)
+   {
+      std::cerr << "Unable to copy the matrix on GPU:" << e.what() << std::endl;
+      return;
+   }
+
    CudaVector cudaInVector( hostMatrix.getColumns() ), cudaOutVector( hostMatrix.getRows() );
 
    auto resetCudaVectors = [&]() {
@@ -345,19 +540,18 @@ benchmarkSpMV( Benchmark& benchmark,
    auto spmvCuda = [&]() {
       cudaMatrix.vectorProduct( cudaInVector, cudaOutVector );
    };
-   SpmvBenchmarkResult< Real, Devices::Cuda, int > cudaBenchmarkResults( csrResultVector, cudaOutVector, cudaMatrix.getNonzeroElementsCount() );
+   SpmvBenchmarkResult< Real, Devices::Cuda, int > cudaBenchmarkResults( MatrixInfo< HostMatrix >::getFormat(), csrResultVector, cudaOutVector, cudaMatrix.getNonzeroElementsCount() );
    benchmark.time< Devices::Cuda >( resetCudaVectors, "GPU", spmvCuda, cudaBenchmarkResults );
  #endif
-    std::cout << std::endl;
 }
 
 template< typename Real = double,
           typename Index = int >
 void
-benchmarkSpmvSynthetic( Benchmark& benchmark,
-                        const String& inputFileName,
-                        const Config::ParameterContainer& parameters,
-                        bool verboseMR )
+benchmarkSpmv( BenchmarkType& benchmark,
+               const String& inputFileName,
+               const Config::ParameterContainer& parameters,
+               bool verboseMR )
 {
    // The following is another workaround because of a bug in nvcc versions 10 and 11.
    // If we use the current matrix formats, not the legacy ones, we get
@@ -372,6 +566,7 @@ benchmarkSpmvSynthetic( Benchmark& benchmark,
    using CSRHostMatrix = SpMV::ReferenceFormats::Legacy::CSR< Real, Devices::Host, int >;
    using CSRCudaMatrix = SpMV::ReferenceFormats::Legacy::CSR< Real, Devices::Cuda, int >;
    using CusparseMatrix = TNL::CusparseCSRLegacy< Real >;
+   using LightSpMVCSRHostMatrix = SpMV::ReferenceFormats::Legacy::CSR< Real, Devices::Host, uint32_t >;
 #else
    // Here we use 'int' instead of 'Index' because of compatibility with cusparse.
    using CSRHostMatrix = TNL::Matrices::SparseMatrix< Real, TNL::Devices::Host, int >;
@@ -379,9 +574,9 @@ benchmarkSpmvSynthetic( Benchmark& benchmark,
    using CusparseMatrix = TNL::CusparseCSR< Real >;
 #endif
 
-
    using HostVector = Containers::Vector< Real, Devices::Host, int >;
    using CudaVector = Containers::Vector< Real, Devices::Cuda, int >;
+   using BinaryHostVector = Containers::Vector< int, Devices::Host, int >;
 
    CSRHostMatrix csrHostMatrix;
 
@@ -396,12 +591,13 @@ benchmarkSpmvSynthetic( Benchmark& benchmark,
    ////
    // Perform benchmark on host with CSR as a reference CPU format
    //
-   benchmark.setMetadataColumns( Benchmark::MetadataColumns({
-         { "matrix name", convertToString( inputFileName ) },
-         { "rows", convertToString( csrHostMatrix.getRows() ) },
-         { "columns", convertToString( csrHostMatrix.getColumns() ) },
-         { "matrix format", String( "CSR" ) }
-      } ));
+   auto nonzeros = csrHostMatrix.getNonzeroElementsCount();
+   benchmark.addCommonLogs( BenchmarkType::CommonLogs( {
+      { "matrix name", convertToString( inputFileName ) },
+      { "rows", convertToString( csrHostMatrix.getRows() ) },
+      { "columns", convertToString( csrHostMatrix.getColumns() ) },
+      { "nonzeros", convertToString( nonzeros ) },
+      { "nonzeros per row", convertToString( ( double ) nonzeros / ( double ) csrHostMatrix.getRows() ) } } ) );
 
    HostVector hostInVector( csrHostMatrix.getRows() ), hostOutVector( csrHostMatrix.getRows() );
 
@@ -414,90 +610,172 @@ benchmarkSpmvSynthetic( Benchmark& benchmark,
        csrHostMatrix.vectorProduct( hostInVector, hostOutVector );
    };
 
-   SpmvBenchmarkResult< Real, Devices::Host, int > csrBenchmarkResults( hostOutVector, hostOutVector, csrHostMatrix.getNonzeroElementsCount() );
-   benchmark.time< Devices::Cuda >( resetHostVectors, "CPU", spmvCSRHost, csrBenchmarkResults );
+   SpmvBenchmarkResult< Real, Devices::Host, int > csrBenchmarkResults( String( "CSR" ), hostOutVector, hostOutVector, csrHostMatrix.getNonzeroElementsCount() );
+   benchmark.addLogsMetadata( csrBenchmarkResults.getTableHeader() );
+   benchmark.writeHeader();
+   benchmark.time< Devices::Host >( resetHostVectors, "", spmvCSRHost, csrBenchmarkResults );
+
+#ifdef HAVE_PETSC
+   // PETSc reference SpMV over the same CSR arrays; sequential (Seq*) objects must be created on a single-rank communicator.
+   Mat petscMatrix;
+   Containers::Vector< PetscInt, Devices::Host, PetscInt > petscRowPointers( csrHostMatrix.getRowPointers() );
+   Containers::Vector< PetscInt, Devices::Host, PetscInt > petscColumns( csrHostMatrix.getColumnIndexes() );
+   Containers::Vector< PetscScalar, Devices::Host, PetscInt > petscValues( csrHostMatrix.getValues() );
+   MatCreateSeqAIJWithArrays( PETSC_COMM_SELF,
+                              csrHostMatrix.getRows(),
+                              csrHostMatrix.getColumns(),
+                              petscRowPointers.getData(),
+                              petscColumns.getData(),
+                              petscValues.getData(),
+                              &petscMatrix );
+   Vec inVector, outVector;
+   VecCreateSeq( PETSC_COMM_SELF, csrHostMatrix.getColumns(), &inVector );
+   VecCreateSeq( PETSC_COMM_SELF, csrHostMatrix.getRows(), &outVector );
+
+   auto resetPetscVectors = [&]() {
+      VecSet( inVector, 1.0 );
+      VecSet( outVector, 0.0 );
+   };
+   auto petscSpmvCSRHost = [&]() {
+      MatMult( petscMatrix, inVector, outVector );
+   };
+   SpmvBenchmarkResult< Real, Devices::Host, int > petscBenchmarkResults( String( "Petsc" ), hostOutVector, hostOutVector, csrHostMatrix.getNonzeroElementsCount() );
+   benchmark.time< Devices::Host >( resetPetscVectors, "", petscSpmvCSRHost, petscBenchmarkResults );
+   MatDestroy( &petscMatrix );  // releases the PETSc handle only; the CSR arrays stay owned by the TNL vectors above
+   VecDestroy( &inVector );
+   VecDestroy( &outVector );
+#endif
 
+
+#ifdef HAVE_CUDA
    ////
    // Perform benchmark on CUDA device with cuSparse as a reference GPU format
    //
-#ifdef HAVE_CUDA
-   benchmark.setMetadataColumns( Benchmark::MetadataColumns({
-         { "matrix name", convertToString( inputFileName ) },
-         { "rows", convertToString( csrHostMatrix.getRows() ) },
-         { "columns", convertToString( csrHostMatrix.getColumns() ) },
-         { "matrix format", String( "cuSparse" ) }
-      } ));
-
    cusparseHandle_t cusparseHandle;
    cusparseCreate( &cusparseHandle );
 
    CSRCudaMatrix csrCudaMatrix;
    csrCudaMatrix = csrHostMatrix;
 
-   // Delete the CSRhostMatrix, so it doesn't take up unnecessary space
-   csrHostMatrix.reset();
-
    CusparseMatrix cusparseMatrix;
    cusparseMatrix.init( csrCudaMatrix, &cusparseHandle );
 
-   CudaVector cusparseInVector( csrCudaMatrix.getColumns() ), cusparseOutVector( csrCudaMatrix.getRows() );
+   CudaVector cudaInVector( csrCudaMatrix.getColumns() ), cudaOutVector( csrCudaMatrix.getRows() );
 
    auto resetCusparseVectors = [&]() {
-      cusparseInVector = 1.0;
-      cusparseOutVector = 0.0;
+      cudaInVector = 1.0;
+      cudaOutVector = 0.0;
    };
 
    auto spmvCusparse = [&]() {
-       cusparseMatrix.vectorProduct( cusparseInVector, cusparseOutVector );
+       cusparseMatrix.vectorProduct( cudaInVector, cudaOutVector );
    };
 
-   SpmvBenchmarkResult< Real, Devices::Host, int > cusparseBenchmarkResults( hostOutVector, hostOutVector, csrHostMatrix.getNonzeroElementsCount() );
-   benchmark.time< Devices::Cuda >( resetCusparseVectors, "GPU", spmvCusparse, cusparseBenchmarkResults );
+   SpmvBenchmarkResult< Real, Devices::Cuda, int > cudaBenchmarkResults( String( "cusparse" ), hostOutVector, cudaOutVector, csrHostMatrix.getNonzeroElementsCount() );
+   benchmark.time< Devices::Cuda >( resetCusparseVectors, "GPU", spmvCusparse, cudaBenchmarkResults );
+
+#ifdef HAVE_CSR5
+   ////
+   // Perform benchmark on CUDA device with CSR5 as a reference GPU format
+   //
+   cudaBenchmarkResults.setFormat( String( "CSR5" ) );
+
+   CudaVector cudaOutVector2( cudaOutVector );
+   CSR5Benchmark::CSR5Benchmark< CSRCudaMatrix > csr5Benchmark( csrCudaMatrix, cudaInVector, cudaOutVector );
+
+   auto csr5SpMV = [&]() {
+       csr5Benchmark.vectorProduct();
+   };
+
+   benchmark.time< Devices::Cuda >( resetCusparseVectors, "GPU", csr5SpMV, cudaBenchmarkResults );
+   std::cerr << "CSR5 error = " << max( abs( cudaOutVector - cudaOutVector2 ) ) << std::endl;
    csrCudaMatrix.reset();
 #endif
+
+   ////
+   // Perform benchmark on CUDA device with LightSpMV as a reference GPU format
+   //
+   cudaBenchmarkResults.setFormat( String( "LightSpMV Vector" ) );
+
+   LightSpMVCSRHostMatrix lightSpMVCSRHostMatrix;
+   lightSpMVCSRHostMatrix = csrHostMatrix;
+   LightSpMVBenchmark< Real > lightSpMVBenchmark( lightSpMVCSRHostMatrix, LightSpMVBenchmarkKernelVector );
+   auto resetLightSpMVVectors = [&]() {
+      lightSpMVBenchmark.resetVectors();
+   };
+
+   auto spmvLightSpMV = [&]() {
+       lightSpMVBenchmark.vectorProduct();
+   };
+   benchmark.time< Devices::Cuda >( resetLightSpMVVectors, "GPU", spmvLightSpMV, cudaBenchmarkResults );
+
+   cudaBenchmarkResults.setFormat( String( "LightSpMV Warp" ) );
+   lightSpMVBenchmark.setKernelType( LightSpMVBenchmarkKernelWarp );
+   benchmark.time< Devices::Cuda >( resetLightSpMVVectors, "GPU", spmvLightSpMV, cudaBenchmarkResults );
+#endif
    csrHostMatrix.reset();
 
+   bool allCpuTests = parameters.getParameter< bool >( "with-all-cpu-tests" );
+#ifdef WITH_TNL_BENCHMARK_SPMV_LEGACY_FORMATS
    /////
    // Benchmarking of TNL legacy formats
    //
    if( parameters.getParameter< bool >("with-legacy-matrices") )
    {
       using namespace Benchmarks::SpMV::ReferenceFormats;
-      benchmarkSpMVLegacy< Real, SparseMatrixLegacy_CSR_Scalar             >( benchmark, hostOutVector, inputFileName, verboseMR );
-      benchmarkSpMVLegacy< Real, SparseMatrixLegacy_CSR_Vector             >( benchmark, hostOutVector, inputFileName, verboseMR );
-      benchmarkSpMVLegacy< Real, SparseMatrixLegacy_CSR_Light              >( benchmark, hostOutVector, inputFileName, verboseMR );
-      benchmarkSpMVLegacy< Real, SparseMatrixLegacy_CSR_Light2             >( benchmark, hostOutVector, inputFileName, verboseMR );
-      benchmarkSpMVLegacy< Real, SparseMatrixLegacy_CSR_Light3             >( benchmark, hostOutVector, inputFileName, verboseMR );
-      benchmarkSpMVLegacy< Real, SparseMatrixLegacy_CSR_Light4             >( benchmark, hostOutVector, inputFileName, verboseMR );
-      benchmarkSpMVLegacy< Real, SparseMatrixLegacy_CSR_Light5             >( benchmark, hostOutVector, inputFileName, verboseMR );
-      benchmarkSpMVLegacy< Real, SparseMatrixLegacy_CSR_Light6             >( benchmark, hostOutVector, inputFileName, verboseMR );
-      benchmarkSpMVLegacy< Real, SparseMatrixLegacy_CSR_Adaptive           >( benchmark, hostOutVector, inputFileName, verboseMR );
-      benchmarkSpMVLegacy< Real, SparseMatrixLegacy_CSR_MultiVector        >( benchmark, hostOutVector, inputFileName, verboseMR );
-      benchmarkSpMVLegacy< Real, SparseMatrixLegacy_CSR_LightWithoutAtomic >( benchmark, hostOutVector, inputFileName, verboseMR );
-      benchmarkSpMVLegacy< Real, Legacy::Ellpack                           >( benchmark, hostOutVector, inputFileName, verboseMR );
-      benchmarkSpMVLegacy< Real, SlicedEllpackAlias                        >( benchmark, hostOutVector, inputFileName, verboseMR );
-      benchmarkSpMVLegacy< Real, Legacy::ChunkedEllpack                    >( benchmark, hostOutVector, inputFileName, verboseMR );
-      benchmarkSpMVLegacy< Real, Legacy::BiEllpack                         >( benchmark, hostOutVector, inputFileName, verboseMR );
+      benchmarkSpMVLegacy< Real, SparseMatrixLegacy_CSR_Scalar             >( benchmark, hostOutVector, inputFileName, allCpuTests, verboseMR );
+      benchmarkSpMVLegacy< Real, SparseMatrixLegacy_CSR_Vector             >( benchmark, hostOutVector, inputFileName, allCpuTests, verboseMR );
+      //benchmarkSpMVLegacy< Real, SparseMatrixLegacy_CSR_Light              >( benchmark, hostOutVector, inputFileName, verboseMR );
+      //benchmarkSpMVLegacy< Real, SparseMatrixLegacy_CSR_Light2             >( benchmark, hostOutVector, inputFileName, verboseMR );
+      //benchmarkSpMVLegacy< Real, SparseMatrixLegacy_CSR_Light3             >( benchmark, hostOutVector, inputFileName, verboseMR );
+      //benchmarkSpMVLegacy< Real, SparseMatrixLegacy_CSR_Light4             >( benchmark, hostOutVector, inputFileName, verboseMR );
+      //benchmarkSpMVLegacy< Real, SparseMatrixLegacy_CSR_Light5             >( benchmark, hostOutVector, inputFileName, verboseMR );
+      //benchmarkSpMVLegacy< Real, SparseMatrixLegacy_CSR_Light6             >( benchmark, hostOutVector, inputFileName, verboseMR );
+      benchmarkSpMVLegacy< Real, SparseMatrixLegacy_CSR_Adaptive           >( benchmark, hostOutVector, inputFileName, allCpuTests, verboseMR );
+      benchmarkSpMVLegacy< Real, SparseMatrixLegacy_CSR_MultiVector        >( benchmark, hostOutVector, inputFileName, allCpuTests, verboseMR );
+      benchmarkSpMVLegacy< Real, SparseMatrixLegacy_CSR_LightWithoutAtomic >( benchmark, hostOutVector, inputFileName, allCpuTests, verboseMR );
+      benchmarkSpMVLegacy< Real, Legacy::Ellpack                           >( benchmark, hostOutVector, inputFileName, allCpuTests, verboseMR );
+      benchmarkSpMVLegacy< Real, SlicedEllpackAlias                        >( benchmark, hostOutVector, inputFileName, allCpuTests, verboseMR );
+      benchmarkSpMVLegacy< Real, Legacy::ChunkedEllpack                    >( benchmark, hostOutVector, inputFileName, allCpuTests, verboseMR );
+      benchmarkSpMVLegacy< Real, Legacy::BiEllpack                         >( benchmark, hostOutVector, inputFileName, allCpuTests, verboseMR );
    }
    // AdEllpack is broken
    //benchmarkSpMV< Real, Matrices::AdEllpack              >( benchmark, hostOutVector, inputFileName, verboseMR );
+#endif
 
+#ifdef WITH_TNL_BENCHMARK_SPMV_GENERAL_MATRICES
    /////
    // Benchmarking TNL formats
    //
    using HostMatrixType = TNL::Matrices::SparseMatrix< Real, TNL::Devices::Host >;
    HostMatrixType hostMatrix;
    TNL::Matrices::MatrixReader< HostMatrixType >::readMtx( inputFileName, hostMatrix, verboseMR );
-   benchmarkSpMV< Real, HostMatrixType, SparseMatrix_CSR_Scalar                   >( benchmark, hostMatrix, hostOutVector, inputFileName, verboseMR );
-   benchmarkSpMV< Real, HostMatrixType, SparseMatrix_CSR_Vector                   >( benchmark, hostMatrix, hostOutVector, inputFileName, verboseMR );
-   benchmarkSpMV< Real, HostMatrixType, SparseMatrix_CSR_Hybrid                   >( benchmark, hostMatrix, hostOutVector, inputFileName, verboseMR );
-   benchmarkSpMV< Real, HostMatrixType, SparseMatrix_CSR_Adaptive                 >( benchmark, hostMatrix, hostOutVector, inputFileName, verboseMR );
-   benchmarkSpMV< Real, HostMatrixType, SparseMatrix_Ellpack                      >( benchmark, hostMatrix, hostOutVector, inputFileName, verboseMR );
-   benchmarkSpMV< Real, HostMatrixType, SparseMatrix_SlicedEllpack                >( benchmark, hostMatrix, hostOutVector, inputFileName, verboseMR );
-   benchmarkSpMV< Real, HostMatrixType, SparseMatrix_ChunkedEllpack               >( benchmark, hostMatrix, hostOutVector, inputFileName, verboseMR );
-   benchmarkSpMV< Real, HostMatrixType, SparseMatrix_BiEllpack                    >( benchmark, hostMatrix, hostOutVector, inputFileName, verboseMR );
+   benchmarkSpMV< Real, HostMatrixType, SparseMatrix_CSR_Scalar                   >( benchmark, hostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR );
+   benchmarkSpMV< Real, HostMatrixType, SparseMatrix_CSR_Vector                   >( benchmark, hostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR );
+   //benchmarkSpMV< Real, HostMatrixType, SparseMatrix_CSR_Hybrid                   >( benchmark, hostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR );
+   benchmarkSpMVCSRLight< Real, HostMatrixType, SparseMatrix_CSR_Light            >( benchmark, hostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR );
+   benchmarkSpMV< Real, HostMatrixType, SparseMatrix_CSR_Adaptive                 >( benchmark, hostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR );
+   benchmarkSpMV< Real, HostMatrixType, SparseMatrix_Ellpack                      >( benchmark, hostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR );
+   benchmarkSpMV< Real, HostMatrixType, SparseMatrix_SlicedEllpack                >( benchmark, hostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR );
+   benchmarkSpMV< Real, HostMatrixType, SparseMatrix_ChunkedEllpack               >( benchmark, hostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR );
+   benchmarkSpMV< Real, HostMatrixType, SparseMatrix_BiEllpack                    >( benchmark, hostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR );
+#ifdef WITH_TNL_BENCHMARK_SPMV_BINARY_MATRICES
+   benchmarkBinarySpMV< Real, HostMatrixType, SparseMatrix_CSR_Scalar              >( benchmark, hostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR );
+   benchmarkBinarySpMV< Real, HostMatrixType, SparseMatrix_CSR_Vector              >( benchmark, hostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR );
+   benchmarkSpMVCSRLight< Real, HostMatrixType, SparseMatrix_CSR_Light, bool >( benchmark, hostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR );
+   benchmarkBinarySpMV< Real, HostMatrixType, SparseMatrix_CSR_Adaptive            >( benchmark, hostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR );
+   benchmarkBinarySpMV< Real, HostMatrixType, SparseMatrix_Ellpack                 >( benchmark, hostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR );
+   benchmarkBinarySpMV< Real, HostMatrixType, SparseMatrix_SlicedEllpack           >( benchmark, hostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR );
+   benchmarkBinarySpMV< Real, HostMatrixType, SparseMatrix_ChunkedEllpack          >( benchmark, hostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR );
+   benchmarkBinarySpMV< Real, HostMatrixType, SparseMatrix_BiEllpack               >( benchmark, hostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR );
+#endif
+#ifdef WITH_TNL_BENCHMARK_SPMV_SANDBOX_MATRIX
+   benchmarkSpMV< Real, HostMatrixType, SparseSandboxMatrix                       >( benchmark, hostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR );
+#endif
    hostMatrix.reset();
+#endif
 
+#ifdef WITH_TNL_BENCHMARK_SPMV_SYMMETRIC_MATRICES
    /////
    // Benchmarking symmetric sparse matrices
    //
@@ -517,21 +795,35 @@ benchmarkSpmvSynthetic( Benchmark& benchmark,
       }
       InputMatrix hostMatrix;
       TNL::Matrices::MatrixReader< InputMatrix >::readMtx( inputFileName, hostMatrix, verboseMR );
-      if( hostMatrix != symmetricHostMatrix )
-      {
-         std::cerr << "ERROR !!!!!! " << std::endl;
-      }
-      benchmarkSpMV< Real, SymmetricInputMatrix, SymmetricSparseMatrix_CSR_Scalar                   >( benchmark, symmetricHostMatrix, hostOutVector, inputFileName, verboseMR );
-      benchmarkSpMV< Real, SymmetricInputMatrix, SymmetricSparseMatrix_CSR_Vector                   >( benchmark, symmetricHostMatrix, hostOutVector, inputFileName, verboseMR );
-      benchmarkSpMV< Real, SymmetricInputMatrix, SymmetricSparseMatrix_CSR_Hybrid                   >( benchmark, symmetricHostMatrix, hostOutVector, inputFileName, verboseMR );
-      benchmarkSpMV< Real, SymmetricInputMatrix, SymmetricSparseMatrix_CSR_Adaptive                 >( benchmark, symmetricHostMatrix, hostOutVector, inputFileName, verboseMR );
-      benchmarkSpMV< Real, SymmetricInputMatrix, SymmetricSparseMatrix_Ellpack                      >( benchmark, symmetricHostMatrix, hostOutVector, inputFileName, verboseMR );
-      benchmarkSpMV< Real, SymmetricInputMatrix, SymmetricSparseMatrix_SlicedEllpack                >( benchmark, symmetricHostMatrix, hostOutVector, inputFileName, verboseMR );
-      benchmarkSpMV< Real, SymmetricInputMatrix, SymmetricSparseMatrix_ChunkedEllpack               >( benchmark, symmetricHostMatrix, hostOutVector, inputFileName, verboseMR );
-      benchmarkSpMV< Real, SymmetricInputMatrix, SymmetricSparseMatrix_BiEllpack                    >( benchmark, symmetricHostMatrix, hostOutVector, inputFileName, verboseMR );
+      // TODO: Comparison of symmetric and general matrix does not work yet.
+      //if( hostMatrix != symmetricHostMatrix )
+      //{
+      //   std::cerr << "ERROR: Symmetric matrices do not match !!!" << std::endl;
+      //}
+      benchmarkSpMV< Real, SymmetricInputMatrix, SymmetricSparseMatrix_CSR_Scalar                    >( benchmark, symmetricHostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR );
+      benchmarkSpMV< Real, SymmetricInputMatrix, SymmetricSparseMatrix_CSR_Vector                    >( benchmark, symmetricHostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR );
+      //benchmarkSpMV< Real, SymmetricInputMatrix, SymmetricSparseMatrix_CSR_Hybrid                   >( benchmark, symmetricHostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR );
+      benchmarkSpMVCSRLight< Real, SymmetricInputMatrix, SymmetricSparseMatrix_CSR_Light             >( benchmark, symmetricHostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR );
+      benchmarkSpMV< Real, SymmetricInputMatrix, SymmetricSparseMatrix_CSR_Adaptive                  >( benchmark, symmetricHostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR );
+      benchmarkSpMV< Real, SymmetricInputMatrix, SymmetricSparseMatrix_Ellpack                       >( benchmark, symmetricHostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR );
+      benchmarkSpMV< Real, SymmetricInputMatrix, SymmetricSparseMatrix_SlicedEllpack                 >( benchmark, symmetricHostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR );
+      benchmarkSpMV< Real, SymmetricInputMatrix, SymmetricSparseMatrix_ChunkedEllpack                >( benchmark, symmetricHostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR );
+      benchmarkSpMV< Real, SymmetricInputMatrix, SymmetricSparseMatrix_BiEllpack                     >( benchmark, symmetricHostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR );
+#ifdef WITH_TNL_BENCHMARK_SPMV_BINARY_MATRICES
+      benchmarkBinarySpMV< Real, SymmetricInputMatrix, SymmetricSparseMatrix_CSR_Scalar              >( benchmark, symmetricHostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR );
+      benchmarkBinarySpMV< Real, SymmetricInputMatrix, SymmetricSparseMatrix_CSR_Vector              >( benchmark, symmetricHostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR );
+      //benchmarkBinarySpMV< Real, SymmetricInputMatrix, SymmetricSparseMatrix_CSR_Hybrid            >( benchmark, symmetricHostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR );
+      benchmarkSpMVCSRLight< Real, SymmetricInputMatrix, SymmetricSparseMatrix_CSR_Light, bool       >( benchmark, symmetricHostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR );
+      benchmarkBinarySpMV< Real, SymmetricInputMatrix, SymmetricSparseMatrix_CSR_Adaptive            >( benchmark, symmetricHostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR );
+      benchmarkBinarySpMV< Real, SymmetricInputMatrix, SymmetricSparseMatrix_Ellpack                 >( benchmark, symmetricHostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR );
+      benchmarkBinarySpMV< Real, SymmetricInputMatrix, SymmetricSparseMatrix_SlicedEllpack           >( benchmark, symmetricHostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR );
+      benchmarkBinarySpMV< Real, SymmetricInputMatrix, SymmetricSparseMatrix_ChunkedEllpack          >( benchmark, symmetricHostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR );
+      benchmarkBinarySpMV< Real, SymmetricInputMatrix, SymmetricSparseMatrix_BiEllpack               >( benchmark, symmetricHostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR );
+#endif
    }
+#endif
 }
 
-} // namespace SpMVLegacy
-} // namespace Benchmarks
+      } // namespace SpMV
+   } // namespace Benchmarks
 } // namespace TNL
diff --git a/src/Benchmarks/SpMV/tnl-benchmark-spmv.h b/src/Benchmarks/SpMV/tnl-benchmark-spmv.h
index 9a5005de73d06fb3d99709f89a32d8036722cac3..c5ff2bb3fa90dd9cf88207d7b087767db8377825 100644
--- a/src/Benchmarks/SpMV/tnl-benchmark-spmv.h
+++ b/src/Benchmarks/SpMV/tnl-benchmark-spmv.h
@@ -21,18 +21,24 @@
 #include "spmv.h"
 
 #include <TNL/Matrices/MatrixReader.h>
+
+#ifdef HAVE_PETSC
+#include <petscmat.h>
+#endif
+
 using namespace TNL::Matrices;
 
 #include <exception>
 #include <ctime> // Used for file naming, so logs don't get overwritten.
+#include <experimental/filesystem> // check file existence
 
 using namespace TNL;
 using namespace TNL::Benchmarks;
 
 template< typename Real >
 void
-runSpMVBenchmarks( Benchmark & benchmark,
-                   Benchmark::MetadataMap metadata,
+runSpMVBenchmarks( TNL::Benchmarks::SpMV::BenchmarkType & benchmark,
+                   TNL::Benchmarks::SpMV::BenchmarkType::MetadataMap metadata,
                    const String & inputFileName,
                    const Config::ParameterContainer& parameters,
                    bool verboseMR = false )
@@ -45,7 +51,7 @@ runSpMVBenchmarks( Benchmark & benchmark,
                            metadata );
    // Start the actual benchmark in spmv.h
    try {
-      SpMVLegacy::benchmarkSpmvSynthetic< Real >( benchmark, inputFileName, parameters, verboseMR );
+      TNL::Benchmarks::SpMV::benchmarkSpmv< Real >( benchmark, inputFileName, parameters, verboseMR );
    }
    catch( const std::exception& ex ) {
       std::cerr << ex.what() << std::endl;
@@ -69,13 +75,15 @@ void
 setupConfig( Config::ConfigDescription & config )
 {
    config.addDelimiter( "Benchmark settings:" );
-   config.addRequiredEntry< String >( "input-file", "Input file name." );
+   config.addEntry< String >( "input-file", "Input file name.", "" );
    config.addEntry< bool >( "with-symmetric-matrices", "Perform benchmark even for symmetric matrix formats.", true );
    config.addEntry< bool >( "with-legacy-matrices", "Perform benchmark even for legacy TNL matrix formats.", true );
+   config.addEntry< bool >( "with-all-cpu-tests", "All matrix formats are tested on both CPU and GPU. ", false );
    config.addEntry< String >( "log-file", "Log file name.", "tnl-benchmark-spmv::" + getCurrDateTime() + ".log");
-   config.addEntry< String >( "output-mode", "Mode for opening the log file.", "overwrite" );
+   config.addEntry< String >( "output-mode", "Mode for opening the log file - 'close' will only finalize the log file.", "append" );
    config.addEntryEnum( "append" );
    config.addEntryEnum( "overwrite" );
+   config.addEntryEnum( "close" );
    config.addEntry< String >( "precision", "Precision of the arithmetics.", "double" );
    config.addEntryEnum( "float" );
    config.addEntryEnum( "double" );
@@ -92,6 +100,9 @@ setupConfig( Config::ConfigDescription & config )
 int
 main( int argc, char* argv[] )
 {
+#ifdef HAVE_PETSC
+   PetscInitialize( &argc, &argv, nullptr, nullptr );
+#endif
    Config::ParameterContainer parameters;
    Config::ConfigDescription conf_desc;
 
@@ -100,7 +111,7 @@ main( int argc, char* argv[] )
    // FIXME: When ./tnl-benchmark-spmv-dbg is called without parameters:
    //           * The guide on what parameters to use prints twice.
    // FIXME: When ./tnl-benchmark-spmv-dbg is called with '--help':
-   //           * The guide on what parameter to use print once. 
+   //           * The guide on what parameters to use prints once.
    //              But then it CRASHES due to segfault:
    //              The program attempts to get unknown parameter openmp-enabled
    //              Aborting the program.
@@ -116,23 +127,46 @@ main( int argc, char* argv[] )
 
    const String & inputFileName = parameters.getParameter< String >( "input-file" );
    const String & logFileName = parameters.getParameter< String >( "log-file" );
-   const String & outputMode = parameters.getParameter< String >( "output-mode" );
+   String outputMode = parameters.getParameter< String >( "output-mode" );
    const String & precision = parameters.getParameter< String >( "precision" );
    const int loops = parameters.getParameter< int >( "loops" );
    const int verbose = parameters.getParameter< int >( "verbose" );
    const int verboseMR = parameters.getParameter< int >( "verbose-MReader" );
 
    // open log file
+   if( outputMode == "close" )
+   {
+      std::fstream file;
+      file.open( logFileName.getString(), std::ios::out | std::ios::app );
+      file << std::endl << "   ]" << std::endl << "}";
+      return EXIT_SUCCESS;
+   }
+   if( inputFileName == "" )
+   {
+      std::cerr << "ERROR: Input file name is required." << std::endl;
+      return EXIT_FAILURE;
+   }
+   bool logFileAppend( false );
+   if( std::experimental::filesystem::exists(logFileName.getString()) )
+   {
+      logFileAppend = true;
+      std::cout << "Log file " << logFileName << " exists and ";
+      if( outputMode == "append" )
+         std::cout << "new logs will be appended." << std::endl;
+      else
+         std::cout << "will be overwritten." << std::endl;
+   }
+
    auto mode = std::ios::out;
    if( outputMode == "append" )
        mode |= std::ios::app;
    std::ofstream logFile( logFileName.getString(), mode );
 
    // init benchmark and common metadata
-   Benchmark benchmark( loops, verbose );
+   TNL::Benchmarks::SpMV::BenchmarkType benchmark( loops, verbose, outputMode, logFileAppend );
 
    // prepare global metadata
-   Benchmark::MetadataMap metadata = getHardwareMetadata();
+   TNL::Benchmarks::SpMV::BenchmarkType::MetadataMap metadata = getHardwareMetadata< Logging >();
 
    // Initiate setup of benchmarks
    if( precision == "all" || precision == "float" )
diff --git a/src/Benchmarks/scripts/run-tnl-benchmark-spmv b/src/Benchmarks/scripts/run-tnl-benchmark-spmv
index 88b4d70d02104c74e250b0e80b3e7b22dd02f131..84d2536ef695a03f06419da15ede38c06ddac1cf 100755
--- a/src/Benchmarks/scripts/run-tnl-benchmark-spmv
+++ b/src/Benchmarks/scripts/run-tnl-benchmark-spmv
@@ -1,5 +1,5 @@
 #!/usr/bin/env bash
-                
+
 DEBUG="no"
 STOP_TIME="1"
 export CUDA_PROFILE=0
@@ -7,45 +7,43 @@ export CUDA_PROFILE=0
 PWD=`pwd`
 IWD="$PWD"
 BASE="ftp://math.nist.gov/pub/MatrixMarket2/Harwell-Boeing/"
-BENCHMARK="tnl-benchmark-spmv"
-BENCHMARK_DBG="tnl-benchmark-spmv-dbg"
+BENCHMARK="tnl-benchmark-spmv --with-legacy-matrices yes --precision double --openmp-enabled no"
+BENCHMARK_DBG="tnl-benchmark-spmv-dbg --with-legacy-matrices no"
 
 export CUDA_PROFILE_CONFIG="$IWD/cuda-profiler.conf"
 PROCESS_CUDA_PROFILE="$IWD/process-cuda-profile.pl"
-#source matrix-market
+source matrix-market
+#MM_MATRICES=""
 source florida-matrix-market
+#FLORIDA_MM_MATRICES=""
 
-# !!!Matrices in MatrixMarket2 don't load properly, formatting issues with every file. MatrixReader fails. 
-#for link in $MM_MATRICES;
-#do
-#   echo "======================================================================================================"
-#   matrix=matrices`echo $link | sed 's/ftp:\/\/math.nist.gov\/pub//'`
-#   unzipped_matrix=`echo $matrix | sed 's/.gz//'`
-#   if test ! -e $matrix;
-#   then
-#      echo "Matrix $matrix is missing !!! Run the script 'get-matrices' first."
-#      #echo "Matrix $matrix is missing !!! Run the script 'get-matrices' first." >> sparse-matrix-benchmark.log            
-#   else
-#      gunzip -c ${matrix} > ${unzipped_matrix}      
-#      echo "Benchmarking with the matrix $unzipped_matrix ..."
-#      export CUDA_PROFILE_LOG=$unzipped_matrix.float.log
-#      if test x$DEBUG = xyes;
-#      then
-#         gdb --args ${BENCHMARK_DBG} --input-file $unzipped_matrix --log-file sparse-matrix-benchmark.log --verbose 1
-#      else
-#         $BENCHMARK --input-file $unzipped_matrix --log-file sparse-matrix-benchmark.log --verbose 1
-#      fi
-#      #perl $PROCESS_CUDA_PROFILE $unzipped_matrix.float.log sparse-matrix-profiling-float.log          
-#   fi
-#done
+for link in $MM_MATRICES;  # benchmark every Matrix Market archive listed by the sourced 'matrix-market' file
+do
+   echo "======================================================================================================"
+   matrix=matrices$(echo "$link" | sed 's/ftp:\/\/math.nist.gov\/pub//')
+   unzipped_matrix=$(echo "$matrix" | sed 's/\.gz$//')  # strip only the trailing .gz suffix
+   if test ! -e "$matrix";
+   then
+      echo "Matrix $matrix is missing !!! Run the script 'get-matrices' first."
+   else
+      gunzip -c "${matrix}" > "${unzipped_matrix}"
+      echo "Benchmarking with the matrix $unzipped_matrix ..."
+      export CUDA_PROFILE_LOG="$unzipped_matrix.float.log"
+      if test x$DEBUG = xyes;
+      then
+         gdb --args ${BENCHMARK_DBG} --input-file "$unzipped_matrix" --log-file log-files/sparse-matrix-benchmark.log --verbose 1  # BENCHMARK_DBG stays unquoted: it carries extra arguments
+      else
+         $BENCHMARK --input-file "$unzipped_matrix" --log-file log-files/sparse-matrix-benchmark.log --verbose 1
+      fi
+   fi
+done
 
 for link in $FLORIDA_MM_MATRICES;
 do
    matrix=matrices`echo $link | sed 's/http:\/\/www.cise.ufl.edu\/research\/sparse//'`
    if test ! -e $matrix;
-   then      
+   then
       echo "Matrix $matrix is missing !!! Run the script 'get-matrices' first."
-      #echo "Matrix $matrix is missing !!! Run the script 'get-matrices' first." >> sparse-matrix-benchmark.log
    else
      DIRNAME=`dirname $matrix`
      FILENAME=`basename $matrix`
@@ -59,18 +57,18 @@ do
      SUBDIRNAME=`echo $FILENAME | sed 's/.tar.gz//'`
      rm -f $DIRNAME/$SUBDIRNAME/*_b.mtx # these are usualy in array format
      for file in $DIRNAME/$SUBDIRNAME/*.mtx;
-     do        
+     do
          echo "======================================================================================================"
-         echo "Benchmarking with the matrix $file ..."
-	 mtx_file_name=`basename $file`
-	 mtx_file_name=${mtx_file_name%.mtx}	 
+         mtx_file_name=`basename $file`
+         mtx_file_name=${mtx_file_name%.mtx}
          if test x$DEBUG = xyes;
          then
-            gdb --args $BENCHMARK --input-file $file --log-file log-files/sparse-matrix-benchmark.log --output-mode append --verbose 1
+            gdb --args ${BENCHMARK_DBG} --input-file $file --log-file log-files/sparse-matrix-benchmark.log --output-mode append --verbose 1
          else
             $BENCHMARK --input-file $file --log-file log-files/sparse-matrix-benchmark.log --output-mode append --verbose 1
          fi
      done
    fi
 done
+$BENCHMARK --log-file log-files/sparse-matrix-benchmark.log --output-mode close --verbose 1
 
diff --git a/src/Benchmarks/scripts/tnl-run-spmv-benchmark b/src/Benchmarks/scripts/tnl-run-spmv-benchmark
deleted file mode 100755
index a20c179d764404038e767ea9079d61f9ce168df1..0000000000000000000000000000000000000000
--- a/src/Benchmarks/scripts/tnl-run-spmv-benchmark
+++ /dev/null
@@ -1,501 +0,0 @@
-#!/usr/bin/env bash
-                
-DEBUG="no"
-FORMAT_TEST="yes"
-STOP_TIME="1"
-MAX_ITERATIONS="10"
-export CUDA_PROFILE=0
-
-PWD=`pwd`
-IWD="$PWD"
-BASE="ftp://math.nist.gov/pub/MatrixMarket2/Harwell-Boeing/"
-SPARSE_MATRIX_BENCHMARK="tnl-sparse-matrix-benchmark"
-SPARSE_MATRIX_BENCHMARK_DBG="tnl-sparse-matrix-benchmark-dbg"
-#SPARSE_MATRIX_BENCHMARK="tnl-sparse-matrix-benchmark-dbg"
-
-export CUDA_PROFILE_CONFIG="$IWD/cuda-profiler.conf"
-PROCESS_CUDA_PROFILE="$IWD/process-cuda-profile.pl"
-source matrix-market
-source florida-matrix-market
-
-write_header()
-{
-   echo "<html>" > $1
-   echo "   <body>" >> $1
-   echo "      <table border=1>" >> $1
-   echo "          <tr>" >> $1
-   echo "             <td rowspan=4 colspan=4 align=center>Matrix</td>" >> $1
-   echo "             <td rowspan=4 colspan=2 align=center>CSR</td>" >> $1
-   echo "             <td rowspan=4 colspan=3 align=center>Cusparse</td>" >> $1
-   echo "             <td rowspan=4 colspan=3 align=center>Hybrid</td>" >> $1
-   echo "             <td colspan=68 align=center>Row-Grouped CSR</td>" >> $1   
-   echo "             <td colspan=68 align=center>Row-Grouped CSR with rows sorted decreasingly by the number of the nonzeros</td>" >> $1
-   echo "             <td colspan=120 align=center>Adaptive Row-Grouped CSR</td>" >> $1   
-   echo "          </tr>" >> $1
-   
-   echo "          <tr>" >> $1
-   echo "             <td colspan=17>Group Size = 16</td>" >> $1       # RgCSR
-   echo "             <td colspan=17>Group Size = 32</td>" >> $1
-   echo "             <td colspan=17>Group Size = 64</td>" >> $1
-   echo "             <td colspan=17>Group Size Variable</td>" >> $1    # RgCSR adaptive group size
-   echo "             <td colspan=17>Group Size = 16</td>" >> $1       # RgCSR rows sorted decreasingly
-   echo "             <td colspan=17>Group Size = 32</td>" >> $1
-   echo "             <td colspan=17>Group Size = 64</td>" >> $1
-   echo "             <td colspan=17>Group Size Variable</td>" >> $1      # RgCSR rows sorted decreasingly, adaptive group size
-   echo "             <td rowspan=2 colspan=20>Chunk Size = 1</td>" >> $1          # Adaptive RgCSR 
-   echo "             <td rowspan=2 colspan=20>Chunk Size = 2</td>" >> $1
-   echo "             <td rowspan=2 colspan=20>Chunk Size = 4</td>" >> $1
-   echo "             <td rowspan=2 colspan=20>Chunk Size = 8</td>" >> $1
-   echo "             <td rowspan=2 colspan=20>Chunk Size = 16</td>" >> $1
-   echo "             <td rowspan=2 colspan=20>Chunk Size = 32</td>" >> $1   
-   echo "          </tr>" >> $1
-   
-   echo "          <tr>" >> $1
-   echo "             <td rowspan=2 colspan=2></td>" >> $1                         # RgCSR format with the group size = 16
-   echo "             <td rowspan=2 colspan=3>CPU</td>" >> $1
-   echo "             <td colspan=12>GPU</td>" >> $1
-   echo "             <td rowspan=2 colspan=2></td>" >> $1                         # RgCSR format with the group size = 32
-   echo "             <td rowspan=2 colspan=3>CPU</td>" >> $1
-   echo "             <td colspan=12>GPU</td>" >> $1   
-   echo "             <td rowspan=2 colspan=2></td>" >> $1                         # RgCSR format with the group size = 64
-   echo "             <td rowspan=2 colspan=3>CPU</td>" >> $1
-   echo "             <td colspan=12>GPU</td>" >> $1
-
-   echo "             <td rowspan=2 colspan=2></td>" >> $1                         # RgCSR format with the variable group size 
-   echo "             <td rowspan=2 colspan=3>CPU</td>" >> $1
-   echo "             <td colspan=12>GPU</td>" >> $1
-   
-   echo "             <td rowspan=2 colspan=2></td>" >> $1                         # RgCSR (sorted rows) format with the group size = 16
-   echo "             <td rowspan=2 colspan=3>CPU</td>" >> $1
-   echo "             <td colspan=12>GPU</td>" >> $1
-   echo "             <td rowspan=2 colspan=2></td>" >> $1                         # RgCSR (sorted rows) format with the group size = 32
-   echo "             <td rowspan=2 colspan=3>CPU</td>" >> $1
-   echo "             <td colspan=12>GPU</td>" >> $1   
-   echo "             <td rowspan=2 colspan=2></td>" >> $1                         # RgCSR (sorted rows) format with the group size = 64
-   echo "             <td rowspan=2 colspan=3>CPU</td>" >> $1
-   echo "             <td colspan=12>GPU</td>" >> $1
-      
-   echo "             <td rowspan=2 colspan=2></td>" >> $1                         # RgCSR (sorted rows) format with variable group size
-   echo "             <td rowspan=2 colspan=3>CPU</td>" >> $1
-   echo "             <td colspan=12>GPU</td>" >> $1
-   
-#   echo "             <td colspan=20>GPU</td>" >> $1                     # Adaptive RgCSR format with the chunk size 1
-#   echo "             <td colspan=20>GPU</td>" >> $1                     # Adaptive RgCSR format with the chunk size 2
-#   echo "             <td colspan=20>GPU</td>" >> $1                     # Adaptive RgCSR format with the chunk size 4
-#   echo "             <td colspan=20>GPU</td>" >> $1                     # Adaptive RgCSR format with the chunk size 8
-#   echo "             <td colspan=20>GPU</td>" >> $1                     # Adaptive RgCSR format with the chunk size 16
-#   echo "             <td colspan=20>GPU</td>" >> $1                     # Adaptive RgCSR format with the chunk size 32
-   
-   
-   
-                  
-   echo "          </tr>" >> $1
-   
-   echo "          <tr>" >> $1
-   echo "             <td colspan=3>CUDA Block Size = 32</td>" >> $1        # RgCSR format with the group size = 16
-   echo "             <td colspan=3>CUDA Block Size = 64</td>" >> $1
-   echo "             <td colspan=3>CUDA Block Size = 128</td>" >> $1
-   echo "             <td colspan=3>CUDA Block Size = 256</td>" >> $1
-   echo "             <td colspan=3>CUDA Block Size = 32</td>" >> $1        # RgCSR format with the group size = 32
-   echo "             <td colspan=3>CUDA Block Size = 64</td>" >> $1
-   echo "             <td colspan=3>CUDA Block Size = 128</td>" >> $1
-   echo "             <td colspan=3>CUDA Block Size = 256</td>" >> $1         
-   echo "             <td colspan=3>CUDA Block Size = 32</td>" >> $1        # RgCSR format with the group size = 64
-   echo "             <td colspan=3>CUDA Block Size = 64</td>" >> $1
-   echo "             <td colspan=3>CUDA Block Size = 128</td>" >> $1
-   echo "             <td colspan=3>CUDA Block Size = 256</td>" >> $1
-   
-   echo "             <td colspan=3>CUDA Block Size = 32</td>" >> $1        # RgCSR format with the group size cca 16
-   echo "             <td colspan=3>CUDA Block Size = 64</td>" >> $1
-   echo "             <td colspan=3>CUDA Block Size = 128</td>" >> $1
-   echo "             <td colspan=3>CUDA Block Size = 256</td>" >> $1
-   
-   
-   echo "             <td colspan=3>CUDA Block Size = 32</td>" >> $1        # RgCSR (sorted rows) format with the group size = 16
-   echo "             <td colspan=3>CUDA Block Size = 64</td>" >> $1
-   echo "             <td colspan=3>CUDA Block Size = 128</td>" >> $1
-   echo "             <td colspan=3>CUDA Block Size = 256</td>" >> $1
-   echo "             <td colspan=3>CUDA Block Size = 32</td>" >> $1        # RgCSR (sorted rows) format with the group size = 32
-   echo "             <td colspan=3>CUDA Block Size = 64</td>" >> $1
-   echo "             <td colspan=3>CUDA Block Size = 128</td>" >> $1
-   echo "             <td colspan=3>CUDA Block Size = 256</td>" >> $1         
-   echo "             <td colspan=3>CUDA Block Size = 32</td>" >> $1        # RgCSR (sorted rows) format with the group size = 64
-   echo "             <td colspan=3>CUDA Block Size = 64</td>" >> $1
-   echo "             <td colspan=3>CUDA Block Size = 128</td>" >> $1
-   echo "             <td colspan=3>CUDA Block Size = 256</td>" >> $1
-   
-   echo "             <td colspan=3>CUDA Block Size = 32</td>" >> $1        # RgCSR (sorted rows) format with the group size >= 16
-   echo "             <td colspan=3>CUDA Block Size = 64</td>" >> $1
-   echo "             <td colspan=3>CUDA Block Size = 128</td>" >> $1
-   echo "             <td colspan=3>CUDA Block Size = 256</td>" >> $1
-   
-   echo "             <td colspan=5>CUDA Block Size = 32</td>" >> $1        # Adaptive RgCSR format with the chunk size = 1
-   echo "             <td colspan=5>CUDA Block Size = 64</td>" >> $1
-   echo "             <td colspan=5>CUDA Block Size = 128</td>" >> $1
-   echo "             <td colspan=5>CUDA Block Size = 256</td>" >> $1
-   echo "             <td colspan=5>CUDA Block Size = 32</td>" >> $1        # Adaptive RgCSR format with the chunk size = 2
-   echo "             <td colspan=5>CUDA Block Size = 64</td>" >> $1
-   echo "             <td colspan=5>CUDA Block Size = 128</td>" >> $1
-   echo "             <td colspan=5>CUDA Block Size = 256</td>" >> $1         
-   echo "             <td colspan=5>CUDA Block Size = 32</td>" >> $1        # Adaptive RgCSR format with the chunk size = 4
-   echo "             <td colspan=5>CUDA Block Size = 64</td>" >> $1
-   echo "             <td colspan=5>CUDA Block Size = 128</td>" >> $1
-   echo "             <td colspan=5>CUDA Block Size = 256</td>" >> $1
-   echo "             <td colspan=5>CUDA Block Size = 32</td>" >> $1        # Adaptive RgCSR format with the chunk size = 8
-   echo "             <td colspan=5>CUDA Block Size = 64</td>" >> $1
-   echo "             <td colspan=5>CUDA Block Size = 128</td>" >> $1
-   echo "             <td colspan=5>CUDA Block Size = 256</td>" >> $1  
-   echo "             <td colspan=5>CUDA Block Size = 32</td>" >> $1        # Adaptive RgCSR format with the chunk size = 16
-   echo "             <td colspan=5>CUDA Block Size = 64</td>" >> $1
-   echo "             <td colspan=5>CUDA Block Size = 128</td>" >> $1
-   echo "             <td colspan=5>CUDA Block Size = 256</td>" >> $1  
-   echo "             <td colspan=5>CUDA Block Size = 32</td>" >> $1        # Adaptive RgCSR format with the chunk size = 32
-   echo "             <td colspan=5>CUDA Block Size = 64</td>" >> $1
-   echo "             <td colspan=5>CUDA Block Size = 128</td>" >> $1
-   echo "             <td colspan=5>CUDA Block Size = 256</td>" >> $1  
-     
-            
-   echo "          </tr>" >> $1      
-         
-   echo "          <tr>" >> $1
-   echo "             <td>Name</td>" >> $1                      # Matrix description
-   echo "             <td>Size</td>" >> $1
-   echo "             <td>NonZeros No.</td>" >> $1
-   echo "             <td>NonZeros %</td>" >> $1
-   echo "             <td>Time/sec.</td>" >> $1                 # CSR format on CPU
-   echo "             <td>GFlops</td>" >> $1                    
-
-   echo "             <td>Time/sec.</td>" >> $1                 # Cusparse
-   echo "             <td>GFlops</td>" >> $1                    
-   echo "             <td>Speed-up</td>" >> $1
-   
-   echo "             <td>Time/sec.</td>" >> $1                 # Hybrid format Bell, Garland on GPU
-   echo "             <td>GFlops</td>" >> $1                    
-   echo "             <td>Speed-up</td>" >> $1
-   
-   echo "             <td>Info</td>" >> $1                      # RgCSR format with the group size = 16
-   echo "             <td>Artificial Zeros</td>" >> $1    
-   echo "             <td>Time/sec.</td>" >> $1  
-   echo "             <td>GFlops</td>" >> $1
-   echo "             <td>Speed-up</td>" >> $1
-   echo "             <td>Time/sec.</td>" >> $1
-   echo "             <td>GFlops</td>" >> $1
-   echo "             <td>Speed-up</td>" >> $1
-   echo "             <td>Time/sec.</td>" >> $1
-   echo "             <td>GFlops</td>" >> $1
-   echo "             <td>Speed-up</td>" >> $1
-   echo "             <td>Time/sec.</td>" >> $1
-   echo "             <td>GFlops</td>" >> $1
-   echo "             <td>Speed-up</td>" >> $1
-   echo "             <td>Time/sec.</td>" >> $1
-   echo "             <td>GFlops</td>" >> $1
-   echo "             <td>Speed-up</td>" >> $1      
-   echo "             <td>Info</td>" >> $1                      # RgCSR format with the group size = 32
-   echo "             <td>Artificial Zeros</td>" >> $1
-   echo "             <td>Time/sec.</td>" >> $1
-   echo "             <td>GFlops</td>" >> $1
-   echo "             <td>Speed-up</td>" >> $1
-   echo "             <td>Time/sec.</td>" >> $1
-   echo "             <td>GFlops</td>" >> $1
-   echo "             <td>Speed-up</td>" >> $1
-   echo "             <td>Time/sec.</td>" >> $1
-   echo "             <td>GFlops</td>" >> $1
-   echo "             <td>Speed-up</td>" >> $1
-   echo "             <td>Time/sec.</td>" >> $1
-   echo "             <td>GFlops</td>" >> $1
-   echo "             <td>Speed-up</td>" >> $1
-   echo "             <td>Time/sec.</td>" >> $1
-   echo "             <td>GFlops</td>" >> $1
-   echo "             <td>Speed-up</td>" >> $1      
-   echo "             <td>Info</td>" >> $1                      # RgCSR format with the group size = 64   
-   echo "             <td>Artificial Zeros</td>" >> $1
-   echo "             <td>Time/sec.</td>" >> $1          
-   echo "             <td>GFlops</td>" >> $1
-   echo "             <td>Speed-up</td>" >> $1
-   echo "             <td>Time/sec.</td>" >> $1
-   echo "             <td>GFlops</td>" >> $1
-   echo "             <td>Speed-up</td>" >> $1
-   echo "             <td>Time/sec.</td>" >> $1
-   echo "             <td>GFlops</td>" >> $1
-   echo "             <td>Speed-up</td>" >> $1
-   echo "             <td>Time/sec.</td>" >> $1
-   echo "             <td>GFlops</td>" >> $1
-   echo "             <td>Speed-up</td>" >> $1
-   echo "             <td>Time/sec.</td>" >> $1
-   echo "             <td>GFlops</td>" >> $1
-   echo "             <td>Speed-up</td>" >> $1
-   
-   echo "             <td>Info</td>" >> $1                        # RgCSR format with the group size variable
-   echo "             <td>Artificial Zeros</td>" >> $1
-   echo "             <td>Time/sec.</td>" >> $1
-   echo "             <td>GFlops</td>" >> $1
-   echo "             <td>Speed-up</td>" >> $1
-   echo "             <td>Time/sec.</td>" >> $1
-   echo "             <td>GFlops</td>" >> $1
-   echo "             <td>Speed-up</td>" >> $1
-   echo "             <td>Time/sec.</td>" >> $1
-   echo "             <td>GFlops</td>" >> $1
-   echo "             <td>Speed-up</td>" >> $1
-   echo "             <td>Time/sec.</td>" >> $1
-   echo "             <td>GFlops</td>" >> $1
-   echo "             <td>Speed-up</td>" >> $1
-   echo "             <td>Time/sec.</td>" >> $1
-   echo "             <td>GFlops</td>" >> $1
-   echo "             <td>Speed-up</td>" >> $1      
-   
-   echo "             <td>Info</td>" >> $1                        # RgCSR (sorted rows) format with the group size = 16
-   echo "             <td>Artificial Zeros</td>" >> $1
-   echo "             <td>Time/sec.</td>" >> $1
-   echo "             <td>GFlops</td>" >> $1
-   echo "             <td>Speed-up</td>" >> $1
-   echo "             <td>Time/sec.</td>" >> $1
-   echo "             <td>GFlops</td>" >> $1
-   echo "             <td>Speed-up</td>" >> $1
-   echo "             <td>Time/sec.</td>" >> $1
-   echo "             <td>GFlops</td>" >> $1
-   echo "             <td>Speed-up</td>" >> $1
-   echo "             <td>Time/sec.</td>" >> $1
-   echo "             <td>GFlops</td>" >> $1
-   echo "             <td>Speed-up</td>" >> $1
-   echo "             <td>Time/sec.</td>" >> $1
-   echo "             <td>GFlops</td>" >> $1
-   echo "             <td>Speed-up</td>" >> $1      
-   echo "             <td>Info</td>" >> $1                        # RgCSR (sorted rows) format with the group size = 32   
-   echo "             <td>Artificial Zeros</td>" >> $1
-   echo "             <td>Time/sec.</td>" >> $1
-   echo "             <td>GFlops</td>" >> $1
-   echo "             <td>Speed-up</td>" >> $1
-   echo "             <td>Time/sec.</td>" >> $1
-   echo "             <td>GFlops</td>" >> $1
-   echo "             <td>Speed-up</td>" >> $1
-   echo "             <td>Time/sec.</td>" >> $1
-   echo "             <td>GFlops</td>" >> $1
-   echo "             <td>Speed-up</td>" >> $1
-   echo "             <td>Time/sec.</td>" >> $1
-   echo "             <td>GFlops</td>" >> $1
-   echo "             <td>Speed-up</td>" >> $1
-   echo "             <td>Time/sec.</td>" >> $1
-   echo "             <td>GFlops</td>" >> $1
-   echo "             <td>Speed-up</td>" >> $1      
-   echo "             <td>Info</td>" >> $1                         # RgCSR (sorted rows) format with the group size = 64   
-   echo "             <td>Artificial Zeros</td>" >> $1
-   echo "             <td>Time/sec.</td>" >> $1
-   echo "             <td>GFlops</td>" >> $1
-   echo "             <td>Speed-up</td>" >> $1
-   echo "             <td>Time/sec.</td>" >> $1
-   echo "             <td>GFlops</td>" >> $1
-   echo "             <td>Speed-up</td>" >> $1
-   echo "             <td>Time/sec.</td>" >> $1
-   echo "             <td>GFlops</td>" >> $1
-   echo "             <td>Speed-up</td>" >> $1
-   echo "             <td>Time/sec.</td>" >> $1
-   echo "             <td>GFlops</td>" >> $1
-   echo "             <td>Speed-up</td>" >> $1
-   echo "             <td>Time/sec.</td>" >> $1
-   echo "             <td>GFlops</td>" >> $1
-   echo "             <td>Speed-up</td>" >> $1
-   
-   echo "             <td>Info</td>" >> $1                          # RgCSR (sorted rows) format with the group size variable
-   echo "             <td>Artificial Zeros</td>" >> $1
-   echo "             <td>Time/sec.</td>" >> $1
-   echo "             <td>GFlops</td>" >> $1
-   echo "             <td>Speed-up</td>" >> $1
-   echo "             <td>Time/sec.</td>" >> $1
-   echo "             <td>GFlops</td>" >> $1
-   echo "             <td>Speed-up</td>" >> $1
-   echo "             <td>Time/sec.</td>" >> $1
-   echo "             <td>GFlops</td>" >> $1
-   echo "             <td>Speed-up</td>" >> $1
-   echo "             <td>Time/sec.</td>" >> $1
-   echo "             <td>GFlops</td>" >> $1
-   echo "             <td>Speed-up</td>" >> $1
-   echo "             <td>Time/sec.</td>" >> $1
-   echo "             <td>GFlops</td>" >> $1
-   echo "             <td>Speed-up</td>" >> $1      
-   
-   echo "             <td>Info</td>" >> $1                         # Adaptive RgCSR format with the chunk size = 1
-   echo "             <td>Artificial Zeros</td>" >> $1
-   echo "             <td>Time/sec.</td>" >> $1
-   echo "             <td>GFlops</td>" >> $1
-   echo "             <td>Speed-up</td>" >> $1
-   echo "             <td>Info</td>" >> $1
-   echo "             <td>Artificial Zeros</td>" >> $1
-   echo "             <td>Time/sec.</td>" >> $1
-   echo "             <td>GFlops</td>" >> $1
-   echo "             <td>Speed-up</td>" >> $1
-   echo "             <td>Info</td>" >> $1
-   echo "             <td>Artificial Zeros</td>" >> $1
-   echo "             <td>Time/sec.</td>" >> $1   
-   echo "             <td>GFlops</td>" >> $1
-   echo "             <td>Speed-up</td>" >> $1
-   echo "             <td>Info</td>" >> $1
-   echo "             <td>Artificial Zeros</td>" >> $1
-   echo "             <td>Time/sec.</td>" >> $1   
-   echo "             <td>GFlops</td>" >> $1
-   echo "             <td>Speed-up</td>" >> $1
-   echo "             <td>Info</td>" >> $1                         # Adaptive RgCSR format with the chunk size = 2   
-   echo "             <td>Artificial Zeros</td>" >> $1
-   echo "             <td>Time/sec.</td>" >> $1
-   echo "             <td>GFlops</td>" >> $1
-   echo "             <td>Speed-up</td>" >> $1
-   echo "             <td>Info</td>" >> $1
-   echo "             <td>Artificial Zeros</td>" >> $1
-   echo "             <td>Time/sec.</td>" >> $1   
-   echo "             <td>GFlops</td>" >> $1
-   echo "             <td>Speed-up</td>" >> $1
-   echo "             <td>Info</td>" >> $1
-   echo "             <td>Artificial Zeros</td>" >> $1
-   echo "             <td>Time/sec.</td>" >> $1   
-   echo "             <td>GFlops</td>" >> $1
-   echo "             <td>Speed-up</td>" >> $1
-   echo "             <td>Info</td>" >> $1
-   echo "             <td>Artificial Zeros</td>" >> $1
-   echo "             <td>Time/sec.</td>" >> $1   
-   echo "             <td>GFlops</td>" >> $1
-   echo "             <td>Speed-up</td>" >> $1
-   echo "             <td>Info</td>" >> $1                        # Adaptive RgCSR format with the chunk size = 4   
-   echo "             <td>Artificial Zeros</td>" >> $1
-   echo "             <td>Time/sec.</td>" >> $1
-   echo "             <td>GFlops</td>" >> $1
-   echo "             <td>Speed-up</td>" >> $1
-   echo "             <td>Info</td>" >> $1
-   echo "             <td>Artificial Zeros</td>" >> $1
-   echo "             <td>Time/sec.</td>" >> $1   
-   echo "             <td>GFlops</td>" >> $1
-   echo "             <td>Speed-up</td>" >> $1
-   echo "             <td>Info</td>" >> $1
-   echo "             <td>Artificial Zeros</td>" >> $1
-   echo "             <td>Time/sec.</td>" >> $1   
-   echo "             <td>GFlops</td>" >> $1
-   echo "             <td>Speed-up</td>" >> $1
-   echo "             <td>Info</td>" >> $1
-   echo "             <td>Artificial Zeros</td>" >> $1
-   echo "             <td>Time/sec.</td>" >> $1   
-   echo "             <td>GFlops</td>" >> $1
-   echo "             <td>Speed-up</td>" >> $1
-   echo "             <td>Info</td>" >> $1                        # Adaptive RgCSR format with the chunk size = 8   
-   echo "             <td>Artificial Zeros</td>" >> $1
-   echo "             <td>Time/sec.</td>" >> $1
-   echo "             <td>GFlops</td>" >> $1
-   echo "             <td>Speed-up</td>" >> $1
-   echo "             <td>Info</td>" >> $1
-   echo "             <td>Artificial Zeros</td>" >> $1
-   echo "             <td>Time/sec.</td>" >> $1   
-   echo "             <td>GFlops</td>" >> $1
-   echo "             <td>Speed-up</td>" >> $1
-   echo "             <td>Info</td>" >> $1
-   echo "             <td>Artificial Zeros</td>" >> $1
-   echo "             <td>Time/sec.</td>" >> $1   
-   echo "             <td>GFlops</td>" >> $1
-   echo "             <td>Speed-up</td>" >> $1
-   echo "             <td>Info</td>" >> $1
-   echo "             <td>Artificial Zeros</td>" >> $1
-   echo "             <td>Time/sec.</td>" >> $1   
-   echo "             <td>GFlops</td>" >> $1
-   echo "             <td>Speed-up</td>" >> $1
-   echo "             <td>Info</td>" >> $1                        # Adaptive RgCSR format with the chunk size = 16   
-   echo "             <td>Artificial Zeros</td>" >> $1
-   echo "             <td>Time/sec.</td>" >> $1
-   echo "             <td>GFlops</td>" >> $1
-   echo "             <td>Speed-up</td>" >> $1
-   echo "             <td>Info</td>" >> $1
-   echo "             <td>Artificial Zeros</td>" >> $1
-   echo "             <td>Time/sec.</td>" >> $1   
-   echo "             <td>GFlops</td>" >> $1
-   echo "             <td>Speed-up</td>" >> $1
-   echo "             <td>Info</td>" >> $1
-   echo "             <td>Artificial Zeros</td>" >> $1
-   echo "             <td>Time/sec.</td>" >> $1   
-   echo "             <td>GFlops</td>" >> $1
-   echo "             <td>Speed-up</td>" >> $1
-   echo "             <td>Info</td>" >> $1
-   echo "             <td>Artificial Zeros</td>" >> $1
-   echo "             <td>Time/sec.</td>" >> $1   
-   echo "             <td>GFlops</td>" >> $1
-   echo "             <td>Speed-up</td>" >> $1
-   echo "             <td>Info</td>" >> $1                        # Adaptive RgCSR format with the chunk size = 32   
-   echo "             <td>Artificial Zeros</td>" >> $1
-   echo "             <td>Time/sec.</td>" >> $1
-   echo "             <td>GFlops</td>" >> $1
-   echo "             <td>Speed-up</td>" >> $1
-   echo "             <td>Info</td>" >> $1
-   echo "             <td>Artificial Zeros</td>" >> $1
-   echo "             <td>Time/sec.</td>" >> $1
-   echo "             <td>GFlops</td>" >> $1
-   echo "             <td>Speed-up</td>" >> $1
-   echo "             <td>Info</td>" >> $1
-   echo "             <td>Artificial Zeros</td>" >> $1
-   echo "             <td>Time/sec.</td>" >> $1   
-   echo "             <td>GFlops</td>" >> $1
-   echo "             <td>Speed-up</td>" >> $1
-   echo "             <td>Info</td>" >> $1
-   echo "             <td>Artificial Zeros</td>" >> $1
-   echo "             <td>Time/sec.</td>" >> $1   
-   echo "             <td>GFlops</td>" >> $1
-   echo "             <td>Speed-up</td>" >> $1   
-}
-
-write_closing()
-{
-   echo "      </table>" >> $1
-   echo "   </body>" >> $1
-   echo "</html>" >> $1
-}
-
-write_header sparse-matrix-benchmark-float.log.html
-write_header sparse-matrix-benchmark-double.log.html
-write_header sparse-matrix-benchmark-amd-float.log.html
-write_header sparse-matrix-benchmark-amd-double.log.html
-
-for link in $MM_MATRICES;
-do
-   echo "###############################################################################################"
-   matrix=matrices`echo $link | sed 's/ftp:\/\/math.nist.gov\/pub//'`
-   unzipped_matrix=`echo $matrix | sed 's/.gz//'`
-   if test ! -e $matrix;
-   then
-      echo "Matrix $matrix is missing !!! Run the script 'get-matrices' first."
-      #echo "Matrix $matrix is missing !!! Run the script 'get-matrices' first." >> sparse-matrix-benchmark.log            
-   else
-      gunzip -c ${matrix} > ${unzipped_matrix}      
-      echo "Checking with the matrix $unzipped_matrix in single precision ..."
-      export CUDA_PROFILE_LOG=$unzipped_matrix.float.log
-      if test x$DEBUG = xyes;
-      then
-         gdb --args ${SPARSE_MATRIX_BENCHMARK_DBG} --input-mtx-file $unzipped_matrix --log-file sparse-matrix-benchmark-float.log.html --stop-time $STOP_TIME --max-iterations $MAX_ITERATIONS --verbose 1
-      else
-         $SPARSE_MATRIX_BENCHMARK --input-mtx-file $unzipped_matrix --pdf-file $unzipped_matrix.pdf --log-file sparse-matrix-benchmark-float.log.html --stop-time $STOP_TIME --max-iterations $MAX_ITERATIONS --format-test $FORMAT_TEST --verbose 1
-      fi
-      #perl $PROCESS_CUDA_PROFILE $unzipped_matrix.float.log sparse-matrix-profiling-float.log          
-   fi
-done
-
-for link in $FLORIDA_MM_MATRICES;
-do
-   matrix=matrices`echo $link | sed 's/http:\/\/www.cise.ufl.edu\/research\/sparse//'`
-   if test ! -e $matrix;
-   then      
-      echo "Matrix $matrix is missing !!! Run the script 'get-matrices' first."
-      #echo "Matrix $matrix is missing !!! Run the script 'get-matrices' first." >> sparse-matrix-benchmark.log
-   else
-     DIRNAME=`dirname $matrix`
-     FILENAME=`basename $matrix`
-     cd $DIRNAME
-     tar zxvf $FILENAME
-     cd $IWD
-     SUBDIRNAME=`echo $FILENAME | sed 's/.tar.gz//'`
-     rm -f $DIRNAME/$SUBDIRNAME/*_b.mtx # these are usualy in array format
-     for file in $DIRNAME/$SUBDIRNAME/*.mtx;
-     do
-         echo "###############################################################################################"
-         echo "Checking with the matrix $file ..."
-         $SPARSE_MATRIX_BENCHMARK --input-file $file.float.bin.bz2 --input-mtx-file $file --pdf-file $file.pdf --log-file sparse-matrix-benchmark-float.log.html --stop-time $STOP_TIME --max-iterations $MAX_ITERATIONS --format-test $FORMAT_TEST --verbose 1                        
-     done
-   fi
-done
-
-write_closing sparse-matrix-benchmark-float.log.html
-write_closing sparse-matrix-benchmark-double.log.html
-write_closing sparse-matrix-benchmark-amd-float.log.html
-write_closing sparse-matrix-benchmark-amd-double.log.html   
-
diff --git a/src/Benchmarks/scripts/tnl-spmv-benchmark-make-tables-json.py b/src/Benchmarks/scripts/tnl-spmv-benchmark-make-tables-json.py
new file mode 100755
index 0000000000000000000000000000000000000000..4d77faffe10b6ae1448ec6f2b389f1b024c05394
--- /dev/null
+++ b/src/Benchmarks/scripts/tnl-spmv-benchmark-make-tables-json.py
@@ -0,0 +1,1046 @@
+#!/usr/bin/python3
+
+import os
+import json
+import pandas as pd
+from pandas.io.json import json_normalize
+import matplotlib.pyplot as plt
+import numpy as np
+import math
+
+# LaTeX fonts set-up
+
+#plt.rcParams.update({
+#   "text.usetex": True,
+#   "font.family": "sans-serif",
+#   "font.sans-serif": ["Helvetica"]})
+#
+# for Palatino and other serif fonts use:
+#plt.rcParams.update({
+#   "text.usetex": True,
+#   "font.family": "serif",
+#   "font.serif": ["Palatino"],
+#})
+
+
+####
+# A map of rgb points in your distribution
+# [distance, (r, g, b)]
+# distance is percentage from left edge
+# https://stackoverflow.com/questions/25668828/how-to-create-colour-gradient-in-python/50784012#50784012
+heatmap = [
+    [0.0,  (0.1, 0.1, 1.0)],
+ #  [0.20, (0, 0, .5)],
+ #  [0.40, (0, .5, 0)],
+    [0.40, (0.1, 1.0, 0.1)],
+#   [0.80, (.75, .75, 0)],
+#   [0.90, (1.0, .75, 0)],
+    [1.00, (1.0, 0.1, 0.1)],
+]
+
+def gaussian(x, a, b, c, d=0):
+    return a * math.exp(-(x - b)**2 / (2 * c**2)) + d
+
+def color_map(x, width=100, map=[], spread=1):
+    width = float(width)
+    r = sum([gaussian(x, p[1][0], p[0] * width, width/(spread*len(map))) for p in map])
+    g = sum([gaussian(x, p[1][1], p[0] * width, width/(spread*len(map))) for p in map])
+    b = sum([gaussian(x, p[1][2], p[0] * width, width/(spread*len(map))) for p in map])
+    return min(1.0, r), min(1.0, g), min(1.0, b)
+
+#for x in range(im.size[0]):
+#    r, g, b = pixel(x, width=im.size[0], map=heatmap)
+#    r, g, b = [int(256*v) for v in (r, g, b)]
+#    for y in range(im.size[1]):
+#        ld[x, y] = r, g, b
+
+
+####
+# Helper function
+def slugify(s):
+   s = str(s).strip().replace(' ', '_')
+   return re.sub(r'(?u)[^-\w.]', '', s)
+
+def latexFormatName( name ):
+   name = name.replace('<','')
+   name = name.replace('>','')
+   name = name.replace( 'Light  Automatic ', '')
+   #print( f'~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~{name}~~~')
+   if name == 'CSR':
+      return 'CSR on CPU'
+   if name == 'cusparse':
+      return 'cuSPARSE'
+   if 'SlicedEllpack' in name:
+      return name.replace( 'SlicedEllpack', 'Sliced Ellpack' )
+   if 'ChunkedEllpack' in name:
+      return name.replace( 'ChunkedEllpack', 'Chunked Ellpack' )
+   if 'BiEllpack' in name:
+      return name.replace( 'BiEllpack', 'Bisection Ellpack' )
+   if 'CSR Scalar' in name:
+      return name.replace( 'CSR Scalar', 'Scalar CSR' )
+   if 'CSR Vector' in name:
+      return name.replace( 'CSR Vector', 'Vector CSR' )
+   if 'CSR Light' in name:
+      return name.replace( 'CSR Light', 'Light CSR' )
+   if 'CSR Adaptive' in name:
+      return name.replace( 'CSR Adaptive', 'Adaptive CSR' )
+   return name
+
+####
+# Create multiindex for columns
+def get_multiindex( input_df, formats ):
+   level1 = [ 'Matrix name', 'rows', 'columns' ]
+   level2 = [ '',            '',     ''        ]
+   level3 = [ '',            '',     ''        ]
+   level4 = [ '',            '',     ''        ]
+   df_data = [[ ' ',' ',' ']]
+   for format in formats:
+      for device in ['CPU','GPU']:
+         for data in ['bandwidth', 'time', 'diff.max' ]: #,'time','speed-up','non-zeros','stddev','stddev/time','diff.max','diff.l2']:
+            level1.append( format )
+            level2.append( device )
+            level3.append( data )
+            level4.append( '' )
+            df_data[ 0 ].append( ' ' )
+      if not format in [ 'cusparse', 'CSR' ]:
+         for speedup in [ 'cusparse', 'CSR CPU']:
+            level1.append( format )
+            level2.append( 'GPU' )
+            level3.append( 'speed-up')
+            level4.append( speedup )
+            df_data[ 0 ].append( ' ' )
+      if 'Binary' in format:
+         level1.append( format )
+         level2.append( 'GPU' )
+         level3.append( 'speed-up')
+         level4.append( 'non-binary' )
+         df_data[ 0 ].append( ' ' )
+      if 'Symmetric' in format:
+         level1.append( format )
+         level2.append( 'GPU' )
+         level3.append( 'speed-up')
+         level4.append( 'non-symmetric' )
+         df_data[ 0 ].append( ' ' )
+      if format == 'CSR< Light > Automatic' or format == 'CSR< Light > Automatic Light':
+         level1.append( format )
+         level2.append( 'GPU' )
+         level3.append( 'speed-up')
+         level4.append( 'LightSpMV Vector' )
+         df_data[ 0 ].append( ' ' )
+      if format == 'TNL Best':
+         level1.append( format )
+         level2.append( 'GPU' )
+         level3.append( 'format')
+         level4.append( '' )
+         df_data[ 0 ].append( ' ' )
+
+   multiColumns = pd.MultiIndex.from_arrays([ level1, level2, level3, level4 ] )
+   return multiColumns, df_data
+
+####
+# Convert input table to better structured one
+def convert_data_frame( input_df, multicolumns, df_data, begin_idx = 0, end_idx = -1 ):
+   frames = []
+   in_idx = 0
+   out_idx = 0
+   #max_out_idx = max_rows
+   if end_idx == -1:
+      end_idx = len(input_df.index)
+   best_count = 0
+   while in_idx < len(input_df.index) and out_idx < end_idx:
+      matrixName = input_df.iloc[in_idx]['matrix name']
+      df_matrix = input_df.loc[input_df['matrix name'] == matrixName]
+      if out_idx >= begin_idx:
+         print( f'{out_idx} : {in_idx} / {len(input_df.index)} : {matrixName}' )
+      else:
+         print( f'{out_idx} : {in_idx} / {len(input_df.index)} : {matrixName} - SKIP' )
+      aux_df = pd.DataFrame( df_data, columns = multicolumns, index = [out_idx] )
+      best_bw = 0
+      for index,row in df_matrix.iterrows():
+         aux_df.iloc[0]['Matrix name'] = row['matrix name']
+         aux_df.iloc[0]['rows']        = row['rows']
+         aux_df.iloc[0]['columns']     = row['columns']
+         current_format = row['format']
+         current_device = row['device']
+         #print( current_format + " / " + current_device )
+         bw = pd.to_numeric(row['bandwidth'], errors='coerce')
+         time = pd.to_numeric(row['time'], errors='coerce')
+         diff_max = pd.to_numeric(row['CSR Diff.Max'], errors='coerce')
+         aux_df.iloc[0][(current_format,current_device,'bandwidth','')] = bw
+         aux_df.iloc[0][(current_format,current_device,'time','')] = time
+         aux_df.iloc[0][(current_format,current_device,'diff.max','')] = diff_max
+         if( current_device == 'GPU' and
+             not 'Binary' in current_format and
+             not 'Symmetric' in current_format and
+             not 'Legacy' in current_format and
+             not 'cusparse' in current_format and
+             not 'LightSpMV' in current_format and
+             not 'Hybrid' in current_format and
+             current_format != 'CSR< Light > Automatic' and
+             bw > best_bw ):
+            best_bw = bw
+            best_format = current_format
+         if current_format == 'cusparse':
+            cusparse_bw = bw
+         #aux_df.iloc[0][(current_format,current_device,'time')]        = row['time']
+         #aux_df.iloc[0][(current_format,current_device,'speed-up')]    = row['speedup']
+         #aux_df.iloc[0][(current_format,current_device,'non-zeros')]   = row['non-zeros']
+         #aux_df.iloc[0][(current_format,current_device,'stddev')]      = row['stddev']
+         #aux_df.iloc[0][(current_format,current_device,'stddev/time')] = row['stddev/time']
+         #aux_df.iloc[0][(current_format,current_device,'diff.max')]    = row['CSR Diff.Max']
+         #aux_df.iloc[0][(current_format,current_device,'diff.l2')]    = row['CSR Diff.L2']
+      aux_df.iloc[0][('TNL Best','GPU','bandwidth','')] = best_bw
+      if best_bw > cusparse_bw:
+         aux_df.iloc[0][('TNL Best','GPU','format','')] = best_format
+      else:
+         aux_df.iloc[0][('TNL Best','GPU','format','')] = 'cusparse'
+      best_count += 1
+      if out_idx >= begin_idx:
+         frames.append( aux_df )
+      out_idx = out_idx + 1
+      in_idx = in_idx + len(df_matrix.index)
+   result = pd.concat( frames )
+   return result
+
+####
+# Compute speed-up of particular formats compared to Cusparse on GPU and CSR on CPU
+def compute_cusparse_speedup( df, formats ):
+   for device in [ 'CPU', 'GPU' ]:
+      for format in formats:
+         if not format in [ 'cusparse', 'CSR' ]:
+            print( 'Adding speed-up for ', format )
+            try:
+               format_times_list = df[(format,device,'time')]
+            except:
+               continue
+            cusparse_times_list = df[('cusparse','GPU','time')]
+            csr_times_list = df[('CSR','CPU','time')]
+            cusparse_speedup_list = []
+            csr_speedup_list = []
+            for( format_time, cusparse_time, csr_time ) in zip( format_times_list, cusparse_times_list,csr_times_list ):
+               if( device == 'GPU' ):
+                  try:
+                     cusparse_speedup_list.append( cusparse_time / format_time  )
+                  except:
+                     cusparse_speedup_list.append(float('nan'))
+               try:
+                  csr_speedup_list.append( csr_time / format_time  )
+               except:
+                  csr_speedup_list.append(float('nan'))
+            if( device == 'GPU' ):
+               df[(format,'GPU','speed-up','cusparse')] = cusparse_speedup_list
+            df[(format,device,'speed-up','CSR CPU')] = csr_speedup_list
+
+####
+# Compute speedup of Light CSR
+def compute_csr_light_speedup( df, formats ):
+   for light in [ 'CSR< Light > Automatic', 'CSR< Light > Automatic Light']:
+      if light in formats:
+         csr_light_bdw_list = df[(light,'GPU','bandwidth')]
+         light_spmv_bdw_list = df[('LightSpMV Vector','GPU','bandwidth')]
+
+         csr_light_speedup_list = []
+         for ( csr_light_bdw, light_spmv_bdw ) in zip(csr_light_bdw_list,light_spmv_bdw_list):
+            try:
+               csr_light_speedup_list.append( csr_light_bdw / light_spmv_bdw  )
+            except:
+               csr_light_speedup_list.append(float('nan'))
+         df[(light,'GPU','speed-up','LightSpMV Vector')] = csr_light_speedup_list
+
+####
+# Compute speed-up of binary formats
+def compute_binary_speedup( df, formats ):
+   for format in formats:
+      if 'Binary' in format:
+         non_binary_format = format.replace( 'Binary ', '' )
+         print( f'Adding speed-up of {format} vs {non_binary_format}' )
+         format_bdw_list = df[(format,'GPU','bandwidth')]
+         non_binary_bdw_list = df[(non_binary_format,'GPU','bandwidth')]
+         binary_speedup_list = []
+         for ( format_bdw, non_binary_bdw ) in zip( format_bdw_list, non_binary_bdw_list ):
+            try:
+               binary_speedup_list.append( format_bdw / non_binary_bdw )
+            except:
+               binary_speedup_list.append( float('nan'))
+         df[(format,'GPU','speed-up','non-binary')] = binary_speedup_list
+
+####
+# Compute speed-up of symmetric formats
+def compute_symmetric_speedup( df, formats ):
+   for format in formats:
+      if 'Symmetric' in format:
+         non_symmetric_format = format.replace( 'Symmetric ', '' )
+         print( f'Adding speed-up of {format} vs {non_symmetric_format}' )
+         format_times_list = df[(format,'GPU','time')]
+         non_symmetric_times_list = df[(non_symmetric_format,'GPU','time')]
+
+         symmetric_speedup_list = []
+         for ( format_time, non_symmetric_time ) in zip( format_times_list, non_symmetric_times_list ):
+            try:
+               symmetric_speedup_list.append( non_symmetric_time / format_time  )
+            except:
+               symmetric_speedup_list.append(float('nan'))
+         df[(format,'GPU','speed-up','non-symmetric')] = symmetric_speedup_list
+
+def compute_speedup( df, formats ):
+   compute_cusparse_speedup( df, formats )
+   compute_csr_light_speedup( df, formats )
+   compute_binary_speedup( df, formats )
+   compute_symmetric_speedup( df, formats )
+
+###
+# Draw several profiles into one figure
+def draw_profiles( formats, profiles, xlabel, ylabel, filename, legend_loc='upper right', bar='none' ):
+   fig, axs = plt.subplots( 1, 1, figsize=(6,4) )
+   latexNames = []
+   size = 1
+   for format in formats:
+      t = np.arange(profiles[format].size )
+      axs.plot( t, profiles[format], '-o', ms=1, lw=1 )
+      size = len( profiles[format] )
+      latexNames.append( latexFormatName( format ) )
+   if bar != 'none':
+      #print( f'size = {size}' )
+      bar_data = np.full( size, 1 )
+      axs.plot( t, bar_data, '-', ms=1, lw=1.5 )
+      if bar != '':
+         latexNames.append( bar )
+
+   axs.legend( latexNames, loc=legend_loc )
+   axs.set_xlabel( xlabel )
+   axs.set_ylabel( ylabel )
+   axs.set_yscale( 'log' )
+   plt.rcParams.update({
+      "text.usetex": True,
+      "font.family": "sans-serif",
+      "font.sans-serif": ["Helvetica"]})
+   plt.savefig( filename )
+   plt.close(fig)
+
+
+####
+# Effective BW profile
+def effective_bw_profile( df, formats, head_size=10 ):
+   if not os.path.exists("BW-profile"):
+      os.mkdir("BW-profile")
+   profiles = {}
+   for format in formats:
+      print( f"Writing BW profile of {format}" )
+      fig, axs = plt.subplots( 1, 1, figsize=(6,4) )
+      t = np.arange(df[(format,'GPU','bandwidth')].size )
+      if format == 'CSR':
+         df.sort_values(by=[(format,'CPU','bandwidth')],inplace=True,ascending=False)
+         profiles[format] = df[(format,'CPU','bandwidth')].copy()
+         axs.plot( t, df[(format,'CPU','bandwidth')], '-o', ms=1, lw=1 )
+      else:
+         df.sort_values(by=[(format,'GPU','bandwidth')],inplace=True,ascending=False)
+         profiles[format] = df[(format,'GPU','bandwidth')].copy()
+         axs.plot( t, df[(format,'GPU','bandwidth')], '-o', ms=1, lw=1 )
+      axs.legend( [ latexFormatName(format), 'CSR on CPU' ], loc='upper right' )
+      axs.set_ylabel( 'Effective bandwidth in GB/sec' )
+      plt.rcParams.update({
+         "text.usetex": True,
+         "font.family": "sans-serif",
+         "font.sans-serif": ["Helvetica"]})
+      plt.savefig( f"BW-profile/{format}.pdf")
+      plt.close(fig)
+      fig, axs = plt.subplots( 1, 1, figsize=(6,4) )
+      axs.set_yscale( 'log' )
+      axs.plot( t, result[(format,'GPU','bandwidth')], '-o', ms=1, lw=1 )
+      axs.legend( [ latexFormatName(format), 'CSR on CPU' ], loc='lower left' )
+      axs.set_xlabel( f"Matrix number - sorted w.r.t. {latexFormatName(format)} performance" )
+      axs.set_ylabel( 'Effective bandwidth in GB/sec' )
+      plt.rcParams.update({
+         "text.usetex": True,
+         "font.family": "sans-serif",
+         "font.sans-serif": ["Helvetica"]})
+      plt.savefig( f"BW-profile/{format}-log.pdf")
+      plt.close(fig)
+      copy_df = df.copy()
+      for f in formats:
+         if not f in ['cusparse','CSR',format]:
+            copy_df.drop( labels=f, axis='columns', level=0, inplace=True )
+      copy_df.to_html( f"BW-profile/{format}.html" )
+
+   # Draw ellpack formats profiles
+   current_formats = []
+   xlabel = "Matrix number - sorted by particular formats effective bandwidth"
+   ylabel = "Effective bandwidth in GB/sec"
+   for format in formats:
+      if( ( 'Ellpack' in format and not 'Binary' in format and not 'Symmetric' in format and not 'Legacy' in format ) or
+          format == 'CSR' or
+          format == 'cusparse' ):
+         current_formats.append( format )
+   draw_profiles( current_formats, profiles, xlabel, ylabel, "ellpack-profiles-bw.pdf", 'lower left', "none" )
+
+   # Draw CSR formats profiles
+   current_formats.clear()
+   for format in formats:
+      if( ( 'CSR' in format and not 'Binary' in format and not 'Symmetric' in format and not 'Legacy' in format and not 'Hybrid' in format ) or
+          format == 'cusparse' ):
+         current_formats.append( format )
+   draw_profiles( current_formats, profiles, xlabel, ylabel, "csr-profiles-bw.pdf", 'lower left', 'none' )
+
+
+####
+# Comparison with Cusparse
+def cusparse_comparison( df, formats, head_size=10 ):
+   if not os.path.exists("Cusparse-bw"):
+      os.mkdir("Cusparse-bw")
+   ascend_df = df.copy()
+   df.sort_values(by=[('cusparse','GPU','bandwidth')],inplace=True,ascending=False)
+   ascend_df.sort_values(by=[('cusparse','GPU','bandwidth')],inplace=True,ascending=True)
+   for format in formats:
+      if not format in ['cusparse','CSR']:
+         print( f"Writing comparison of {format} and cuSPARSE" )
+         filtered_df = df.dropna( subset=[(format,'GPU','bandwidth','')] )
+         filtered_ascend_df = ascend_df.dropna( subset=[(format,'GPU','bandwidth','')] )
+         t = np.arange(filtered_df[(format,'GPU','bandwidth')].size )
+         fig, axs = plt.subplots( 2, 1 )
+         axs[0].plot( t, filtered_df[(format,'GPU','bandwidth')], '-o', ms=1, lw=1 )
+         axs[0].plot( t, filtered_df[('cusparse','GPU','bandwidth')], '-o', ms=1, lw=1 )
+         axs[0].legend( [ format, 'cuSPARSE' ], loc='upper right' )
+         axs[0].set_ylabel( 'Effective bandwidth in GB/sec' )
+         axs[1].set_yscale( 'log' )
+         axs[1].plot( t, filtered_df[(format,'GPU','bandwidth')], '-o', ms=1, lw=1 )
+         axs[1].plot( t, filtered_df[('cusparse','GPU','bandwidth')], '-o', ms=1, lw=1 )
+         axs[1].legend( [ latexFormatName(format), 'cuSPARSE' ], loc='lower left' )
+         axs[1].set_xlabel( 'Matrix number - sorted w.r.t. cuSPARSE performance' )
+         axs[1].set_ylabel( 'Effective bandwidth in GB/sec' )
+         plt.savefig( f"Cusparse-bw/{format}.pdf" )
+         plt.close(fig)
+         copy_df = df.copy()
+         for f in formats:
+            if not f in ['cusparse','CSR',format]:
+               copy_df.drop( labels=f, axis='columns', level=0, inplace=True )
+         copy_df.to_html( f"Cusparse-bw/{format}.html" )
+
+####
+# Comparison with CSR on CPU
+def csr_comparison( df, formats, head_size=10 ):
+   if not os.path.exists("CSR-bw"):
+      os.mkdir("CSR-bw")
+   for device in [ 'CPU', 'GPU' ]:
+      for format in formats:
+         if not format in ['cusparse','CSR']:
+            print( f"Writing comparison of {format} and CSR on CPU" )
+            try:
+               df.sort_values(by=[(format,device,'bandwidth')],inplace=True,ascending=False)
+            except:
+               continue
+            fig, axs = plt.subplots( 2, 1 )
+            t = np.arange(df[(format,device,'bandwidth')].size )
+            axs[0].plot( t, df[(format,device,'bandwidth')], '-o', ms=1, lw=1 )
+            axs[0].plot( t, df[('CSR','CPU','bandwidth')], '-o', ms=1, lw=1 )
+            axs[0].legend( [ latexFormatName(format), 'CSR on CPU' ], loc='upper right' )
+            axs[0].set_ylabel( 'Effective bandwidth in GB/sec' )
+            axs[1].set_yscale( 'log' )
+            axs[1].plot( t, result[(format,device,'bandwidth')], '-o', ms=1, lw=1 )
+            axs[1].plot( t, result[('CSR','CPU','bandwidth')], '-o', ms=1, lw=1 )
+            axs[1].legend( [ latexFormatName(format), 'CSR on CPU' ], loc='lower left' )
+            axs[1].set_xlabel( f"Matrix number - sorted w.r.t. {latexFormatName(format)} performance" )
+            axs[1].set_ylabel( 'Effective bandwidth in GB/sec' )
+            plt.rcParams.update({
+               "text.usetex": True,
+               "font.family": "sans-serif",
+               "font.sans-serif": ["Helvetica"]})
+            plt.savefig( f"CSR-bw/{format}-{device}.pdf")
+            plt.close(fig)
+            copy_df = df.copy()
+            for f in formats:
+               if not f in ['cusparse','CSR',format]:
+                  copy_df.drop( labels=f, axis='columns', level=0, inplace=True )
+            copy_df.to_html( f"CSR-bw/{format}-{device}.html" )
+
+####
+# Comparison of Legacy formats
+def legacy_formats_comparison( df, formats, head_size=10 ):
+   if not os.path.exists("Legacy-bw"):
+      os.mkdir("Legacy-bw")
+   for ref_format, legacy_format in [ ('Ellpack', 'Ellpack Legacy'),
+                                    ('SlicedEllpack', 'SlicedEllpack Legacy'),
+                                    ('ChunkedEllpack', 'ChunkedEllpack Legacy'),
+                                    ('BiEllpack', 'BiEllpack Legacy'),
+                                    ('CSR< Adaptive >', 'CSR Legacy Adaptive'),
+                                    ('CSR< Scalar >', 'CSR Legacy Scalar'),
+                                    ('CSR< Vector >', 'CSR Legacy Vector') ]:
+      if ref_format in formats and legacy_format in formats:
+         print( f"Writing comparison of {ref_format} and {legacy_format}" )
+         ascend_df = df.copy()
+         df.sort_values(by=[(ref_format,'GPU','bandwidth')],inplace=True,ascending=False)
+         ascend_df.sort_values(by=[(ref_format,'GPU','bandwidth')],inplace=True,ascending=True)
+         fig, axs = plt.subplots( 2, 1 )
+         t = np.arange(df[(ref_format,'GPU','bandwidth')].size )
+         axs[0].plot( t, df[(ref_format,'GPU','bandwidth')], '-o', ms=1, lw=1 )
+         axs[0].plot( t, df[(legacy_format,'GPU','bandwidth')], '-o', ms=1, lw=1 )
+         axs[0].legend( [ latexFormatName(ref_format), latexFormatName(legacy_format) ], loc='upper right' )
+         axs[0].set_ylabel( 'Effective bandwidth in GB/sec' )
+         axs[1].set_yscale( 'log' )
+         axs[1].plot( t, df[(ref_format,'GPU','bandwidth')], '-o', ms=1, lw=1 )
+         axs[1].plot( t, df[(legacy_format,'GPU','bandwidth')], '-o', ms=1, lw=1 )
+         axs[1].legend( [ latexFormatName(ref_format), latexFormatName(legacy_format) ], loc='lower left' )
+         axs[1].set_xlabel( f"Matrix number - sorted w.r.t. {latexFormatName(ref_format)}  performance" )
+         axs[1].set_ylabel( 'Effective bandwidth in GB/sec' )
+         plt.rcParams.update({
+            "text.usetex": True,
+            "font.family": "sans-serif",
+            "font.sans-serif": ["Helvetica"]})
+         plt.savefig( f"Legacy-bw/{ref_format}.pdf")
+         plt.close(fig)
+         copy_df = df.copy()
+         for f in formats:
+            if not f in ['cusparse','CSR',format]:
+               copy_df.drop( labels=f, axis='columns', level=0, inplace=True )
+         copy_df.to_html( f"Legacy-bw/{format}.html" )
+
+####
+# Comparison of speed-up w.r.t. CSR
+def csr_speedup_comparison( df, formats, head_size=10 ):
+   if not os.path.exists("CSR-speed-up"):
+      os.mkdir("CSR-speed-up")
+   for device in ['CPU', 'GPU']:
+      profiles = {}
+      for format in formats:
+         if not format in ['cusparse','CSR']:
+            print( f"Writing comparison of speed-up of {format} compared to CSR" )
+            df['tmp'] = df[(format, device,'bandwidth')]
+            filtered_df=df.dropna(subset=[('tmp','','','')])
+            try:
+               filtered_df.sort_values(by=[(format,device,'speed-up','CSR CPU')],inplace=True,ascending=False)
+            except:
+               continue
+            profiles[format] = filtered_df[(format,device,'speed-up','CSR CPU')].copy()
+            fig, axs = plt.subplots( 1, 1, figsize=(6,4) )
+            size = len(filtered_df[(format,device,'speed-up','CSR CPU')].index)
+            t = np.arange( size )
+            bar = np.full( size, 1 )
+            axs.plot( t, filtered_df[(format,device,'speed-up','CSR CPU')], '-o', ms=1, lw=1 )
+            axs.plot( t, bar, '-', ms=1, lw=1 )
+            axs.legend( [ latexFormatName(format), 'CSR CPU' ], loc='upper right' )
+            axs.set_ylabel( 'Speedup' )
+            axs.set_xlabel( f"Matrix number - sorted w.r.t. {latexFormatName(format)} speed-up" )
+            plt.rcParams.update({
+               "text.usetex": True,
+               "font.family": "sans-serif",
+               "font.sans-serif": ["Helvetica"]})
+            plt.savefig( f"CSR-speed-up/{format}.pdf")
+            plt.close(fig)
+
+            fig, axs = plt.subplots( 1, 1, figsize=(6,4) )
+            axs.set_yscale( 'log' )
+            axs.plot( t, filtered_df[(format,device,'speed-up','CSR CPU')], '-o', ms=1, lw=1 )
+            axs.plot( t, bar, '-', ms=1, lw=1 )
+            axs.legend( [ latexFormatName(format), 'CSR' ], loc='lower left' )
+            axs.set_xlabel( f"Matrix number - sorted w.r.t. {latexFormatName(format)} speed-up" )
+            axs.set_ylabel( 'Speedup' )
+            plt.savefig( f"CSR-speed-up/{format}-{device}-log.pdf")
+            plt.close(fig)
+            copy_df = df.copy()
+            for f in formats:
+               if not f in ['cusparse','CSR',format]:
+                  copy_df.drop( labels=f, axis='columns', level=0, inplace=True )
+            copy_df.to_html( f"CSR-speed-up/{format}-{device}.html" )
+
+
+####
+# Comparison of speed-up w.r.t. Cusparse
+def cusparse_speedup_comparison( df, formats, head_size=10 ):
+   if not os.path.exists("Cusparse-speed-up"):
+      os.mkdir("Cusparse-speed-up")
+   profiles = {}
+   for format in formats:
+      if not format in ['cusparse','CSR']:
+         print( f"Writing comparison of speed-up of {format} ({latexFormatName(format)}) compared to cuSPARSE" )
+         df['tmp'] = df[(format, 'GPU','bandwidth')]
+         filtered_df=df.dropna(subset=[('tmp','','','')])
+         filtered_df.sort_values(by=[(format,'GPU','speed-up','cusparse')],inplace=True,ascending=False)
+         profiles[format] = filtered_df[(format,'GPU','speed-up','cusparse')].copy()
+         fig, axs = plt.subplots( 1, 1, figsize=(6,4) )
+         size = len(filtered_df[(format,'GPU','speed-up','cusparse')].index)
+         t = np.arange( size )
+         bar = np.full( size, 1 )
+         axs.plot( t, filtered_df[(format,'GPU','speed-up','cusparse')], '-o', ms=1, lw=1 )
+         axs.plot( t, bar, '-', ms=1, lw=1 )
+         axs.legend( [ latexFormatName(format), 'cuSPARSE' ], loc='upper right' )
+         axs.set_ylabel( 'Speedup' )
+         axs.set_xlabel( f"Matrix number - sorted w.r.t. {latexFormatName(format)} speed-up" )
+         plt.rcParams.update({
+            "text.usetex": True,
+            "font.family": "sans-serif",
+            "font.sans-serif": ["Helvetica"]})
+         plt.savefig( f"Cusparse-speed-up/{format}.pdf")
+         plt.close(fig)
+
+         fig, axs = plt.subplots( 1, 1, figsize=(6,4) )
+         axs.set_yscale( 'log' )
+         axs.plot( t, filtered_df[(format,'GPU','speed-up','cusparse')], '-o', ms=1, lw=1 )
+         axs.plot( t, bar, '-', ms=1, lw=1 )
+         axs.legend( [ latexFormatName(format), 'cuSPARSE' ], loc='lower left' )
+         axs.set_xlabel( f"Matrix number - sorted w.r.t. {latexFormatName(format)} speed-up" )
+         axs.set_ylabel( 'Speedup' )
+         plt.savefig( f"Cusparse-speed-up/{format}-log.pdf")
+         plt.close(fig)
+         copy_df = df.copy()
+         for f in formats:
+            if not f in ['cusparse','CSR',format]:
+               copy_df.drop( labels=f, axis='columns', level=0, inplace=True )
+         copy_df.to_html( f"Cusparse-speed-up/{format}.html" )
+
+   # Draw Ellpack formats profiles
+   xlabel = "Matrix number - sorted by particular formats speedup compared to cuSPARSE"
+   ylabel = "Speedup"
+   current_formats = []
+   for format in formats:
+      if( 'Ellpack' in format and not 'Symmetric' in format and not 'Binary' in format and not 'Legacy' in format ):
+         current_formats.append( format )
+   draw_profiles( current_formats, profiles, xlabel, ylabel, "ellpack-profiles-cusparse-speedup.pdf", 'upper right', "cuSPARSE" )
+
+   current_formats.clear()
+   for format in formats:
+      if( 'Ellpack' in format and 'Symmetric' in format and not 'Binary' in format and not 'Legacy' in format ):
+         current_formats.append( format )
+   draw_profiles( current_formats, profiles, xlabel, ylabel, "symmetric-ellpack-profiles-cusparse-speedup.pdf", 'upper right', "cuSPARSE" )
+
+   current_formats.clear()
+   for format in formats:
+      if( 'Ellpack' in format and not 'Symmetric' in format and 'Binary' in format and not 'Legacy' in format ):
+         current_formats.append( format )
+   draw_profiles( current_formats, profiles, xlabel, ylabel, "binary-ellpack-profiles-cusparse-speedup.pdf", 'upper right', "cuSPARSE" )
+
+   current_formats.clear()
+   for format in formats:
+      if( 'Ellpack' in format and 'Symmetric' in format and 'Binary' in format and not 'Legacy' in format ):
+         current_formats.append( format )
+   draw_profiles( current_formats, profiles, xlabel, ylabel, "symmetric-binary-ellpack-profiles-cusparse-speedup.pdf", 'upper right', "cuSPARSE" )
+
+
+   # Draw CSR formats profiles
+   current_formats.clear()
+   for format in formats:
+      if( 'CSR' in format and not 'Symmetric' in format and not 'Binary' in format and not 'Legacy' in format and not 'Hybrid' in format and format != 'CSR' ):
+         current_formats.append( format )
+   draw_profiles( current_formats, profiles, xlabel, ylabel, "csr-profiles-cusparse-speedup.pdf", 'upper right', "cuSPARSE" )
+   current_formats.clear()
+   for format in formats:
+      if( 'CSR' in format and 'Symmetric' in format and not 'Binary' in format and not 'Legacy' in format and not 'Hybrid' in format and format != 'CSR' ):
+         current_formats.append( format )
+   draw_profiles( current_formats, profiles, xlabel, ylabel, "symmetric-csr-profiles-cusparse-speedup.pdf", 'upper right', "cuSPARSE" )
+   current_formats.clear()
+
+   for format in formats:
+      if( 'CSR' in format and not 'Symmetric' in format and 'Binary' in format and not 'Legacy' in format and not 'Hybrid' in format and format != 'CSR' ):
+         current_formats.append( format )
+   draw_profiles( current_formats, profiles, xlabel, ylabel, "binary-csr-profiles-cusparse-speedup.pdf", 'upper right', "cuSPARSE" )
+   current_formats.clear()
+
+   for format in formats:
+      if( 'CSR' in format and 'Symmetric' in format and 'Binary' in format and not 'Legacy' in format and not 'Hybrid' in format and format != 'CSR' ):
+         current_formats.append( format )
+   draw_profiles( current_formats, profiles, xlabel, ylabel, "-symmetric-binary-csr-profiles-cusparse-speedup.pdf", 'upper right', "cuSPARSE" )
+   current_formats.clear()
+
+####
+# Comparison of binary matrices
+def binary_matrices_comparison( df, formats, head_size = 10 ):
+   if not os.path.exists("Binary-speed-up"):
+      os.mkdir("Binary-speed-up")
+   for format in formats:
+      if 'Binary' in format:
+         non_binary_format = format.replace('Binary ','')
+         print( f"Writing comparison of speed-up of {format} vs {non_binary_format}" )
+         #df['tmp'] = df[(format, 'GPU','speed-up','non-binary')]
+         filtered_df=df.dropna(subset=[(format, 'GPU','speed-up','non-binary')]) #('tmp','','','')])
+         #print( f"{format} -> {filtered_df[(format,'GPU','speed-up','non-binary')]}" )
+         ascend_df = filtered_df.copy()
+         filtered_df.sort_values(by=[(format,'GPU','speed-up','non-binary')],inplace=True,ascending=False)
+         ascend_df.sort_values(by=[(format,'GPU','speed-up','non-binary')],inplace=True,ascending=True)
+         fig, axs = plt.subplots( 1, 1, figsize=(6,4) )
+         size = len(filtered_df[(format,'GPU','speed-up','non-binary')].index)
+         t = np.arange( size )
+         bar = np.full( size, 1 )
+         axs.plot( t, filtered_df[(format,'GPU','speed-up','non-binary')], '-o', ms=1, lw=1 )
+         axs.plot( t, bar, '-', ms=1, lw=1 )
+         axs.legend( [ latexFormatName(format), latexFormatName(non_binary_format) ], loc='upper right' )
+         axs.set_ylabel( 'Speedup' )
+         axs.set_xlabel( f"Matrix number - sorted w.r.t. {latexFormatName(format)} speed-up" )
+         plt.rcParams.update({
+            "text.usetex": True,
+            "font.family": "sans-serif",
+            "font.sans-serif": ["Helvetica"]})
+         # for Palatino and other serif fonts use:
+         #plt.rcParams.update({
+         #   "text.usetex": True,
+         #   "font.family": "serif",
+         #   "font.serif": ["Palatino"],
+         #})
+         plt.savefig( f"Binary-speed-up/{format}.pdf")
+         plt.close(fig)
+
+         fig, axs = plt.subplots( 1, 1, figsize=(6,4) )
+         axs.set_yscale( 'log' )
+         axs.plot( t, filtered_df[(format,'GPU','speed-up','non-binary')], '-o', ms=1, lw=1 )
+         axs.plot( t, bar, '-', ms=1, lw=1 )
+         axs.legend( [ latexFormatName(format), latexFormatName(non_binary_format) ], loc='upper right' )
+         axs.set_xlabel( f"Matrix number - sorted w.r.t. {latexFormatName(format)} speed-up" )
+         axs.set_ylabel( 'Speedup' )
+         plt.savefig( f"Binary-speed-up/{format}-log.pdf")
+         plt.close(fig)
+         #head_df = filtered_df.head( head_size )
+         #bottom_df = ascend_df.head( head_size )
+         copy_df = df.copy()
+         for f in formats:
+            if not f in ['cusparse','CSR',format,non_binary_format]:
+               #print( f"Droping {f}..." )
+               #head_df.drop( labels=f, axis='columns', level=0, inplace=True )
+               copy_df.drop( labels=f, axis='columns', level=0, inplace=True )
+         #head_df.to_html( f"Binary-speed-up/{format}-head.html" )
+         copy_df.to_html( f"Binary-speed-up/{format}.html" )
+
+####
+# Comparison of symmetric matrices
+def symmetric_matrices_comparison( df, formats, head_size = 10 ):
+   if not os.path.exists("Symmetric-speed-up"):
+      os.mkdir("Symmetric-speed-up")
+   for format in formats:
+      if 'Symmetric' in format:
+         non_symmetric_format = format.replace('Symmetric ','')
+         print( f"Writing comparison of speed-up of {format} vs {non_symmetric_format}" )
+         #df['tmp'] = df[(format, 'GPU','speed-up','non-symmetric')]
+         filtered_df=df.dropna(subset=[(format, 'GPU','speed-up','non-symmetric')]) #('tmp','','','')])
+         #ascend_df = filtered_df.copy()
+         #print( f"{format} -> {filtered_df[(format,'GPU','speed-up','non-symmetric')]}" )
+         filtered_df.sort_values(by=[(format,'GPU','speed-up','non-symmetric')],inplace=True,ascending=False)
+         #ascend_df.sort_values(by=[(format,'GPU','speed-up','non-symmetric')],inplace=True,ascending=True)
+
+         cusparse_filtered_df=df.dropna(subset=[(format, 'GPU','speed-up','cusparse')]) #('tmp','','','')])
+         cusparse_filtered_df.sort_values(by=[(format,'GPU','speed-up','cusparse')],inplace=True,ascending=False)
+
+         fig, axs = plt.subplots( 1, 1, figsize=(6,4) )
+         size = len(filtered_df[(format,'GPU','speed-up','non-symmetric')].index)
+         t = np.arange( size )
+         bar = np.full( size, 1 )
+         axs.plot( t, filtered_df[(format,'GPU','speed-up','non-symmetric')], '-o', ms=1, lw=1 )
+         axs.plot( t, bar, '-', ms=1, lw=1 )
+         axs.legend( [ latexFormatName(format), latexFormatName(non_symmetric_format) ], loc='upper right' )
+         axs.set_ylabel( 'Speedup' )
+         axs.set_xlabel( f"Matrix number - sorted w.r.t. {latexFormatName(format)} speed-up" )
+         plt.rcParams.update({
+            "text.usetex": True,
+            "font.family": "sans-serif",
+            "font.sans-serif": ["Helvetica"]})
+         plt.savefig( f"Symmetric-speed-up/{format}.pdf")
+         plt.close(fig)
+
+         fig, axs = plt.subplots( 1, 1, figsize=(6,4) )
+         axs.set_yscale( 'log' )
+         axs.plot( t, filtered_df[(format,'GPU','speed-up','non-symmetric')], '-o', ms=1, lw=1 )
+         axs.plot( t, bar, '-', ms=1, lw=1 )
+         axs.legend( [ latexFormatName(format), latexFormatName(non_symmetric_format) ], loc='lower left' )
+         axs.set_xlabel( f"Matrix number - sorted w.r.t. {latexFormatName(format)} speed-up" )
+         axs.set_ylabel( 'Speedup' )
+         plt.savefig( f"Symmetric-speed-up/{format}-log.pdf")
+         plt.close(fig)
+         #head_df = filtered_df.head( head_size )
+         #bottom_df = ascend_df.head( head_size )
+
+         size = len(cusparse_filtered_df[(format,'GPU','speed-up','cusparse')].index)
+         t = np.arange( size )
+         bar = np.full( size, 1 )
+         fig, axs = plt.subplots( 1, 1, figsize=(6,4) )
+         axs.plot( t, cusparse_filtered_df[(format,'GPU','speed-up','cusparse')], '-o', ms=1, lw=1 )
+         axs.plot( t, bar, '-', ms=1, lw=1 )
+         axs.legend( [ latexFormatName(format), 'cuSPARSE' ], loc='upper right' )
+         axs.set_ylabel( 'Speedup' )
+         axs.set_xlabel( f"Matrix number - sorted w.r.t. {latexFormatName(format)} speed-up" )
+         plt.rcParams.update({
+            "text.usetex": True,
+            "font.family": "sans-serif",
+            "font.sans-serif": ["Helvetica"]})
+         plt.savefig( f"Symmetric-speed-up/{format}-cusparse.pdf")
+         plt.close(fig)
+
+         fig, axs = plt.subplots( 1, 1, figsize=(6,4) )
+         axs.set_yscale( 'log' )
+         axs.plot( t, cusparse_filtered_df[(format,'GPU','speed-up','cusparse')], '-o', ms=1, lw=1 )
+         axs.plot( t, bar, '-', ms=1, lw=1 )
+         axs.legend( [ latexFormatName(format), 'cuSPARSE' ], loc='lower left' )
+         axs.set_xlabel( f"Matrix number - sorted w.r.t. {latexFormatName(format)} speed-up" )
+         axs.set_ylabel( 'Speedup' )
+         plt.savefig( f"Symmetric-speed-up/{format}-cusparse-log.pdf")
+         plt.close(fig)
+
+
+         copy_df = df.copy()
+         for f in formats:
+            if not f in ['cusparse','CSR',format,non_symmetric_format]:
+               #print( f"Droping {f}..." )
+               #head_df.drop( labels=f, axis='columns', level=0, inplace=True )
+               copy_df.drop( labels=f, axis='columns', level=0, inplace=True )
+         #head_df.to_html( f"Symmetric-speed-up/{format}-head.html" )
+         copy_df.sort_values(by=[(format,'GPU','speed-up','non-symmetric')],inplace=True,ascending=False)
+         copy_df.to_html( f"Symmetric-speed-up/{format}.html" )
+         #copy_df.sort_values(by=[(format,'GPU','speed-up','non-symmetric')],inplace=True,descending=True)
+         #copy_df.to_html( f"Symmetric-speed-up/{format}-sort.html" )
+
+####
+# Comparison of speed-up w.r.t. LightSpMV
+def csr_light_speedup_comparison( df, head_size=10 ):
+   format = 'CSR< Light > Automatic Light'
+   print( f"Writing comparison of speed-up of CSR Light compared to LightSPMV" )
+   df['tmp'] = df[(format, 'GPU','bandwidth')]
+   filtered_df=df.dropna(subset=[('tmp','','','')])
+   ascend_df = filtered_df.copy()
+   filtered_df.sort_values(by=[(format,'GPU','speed-up','LightSpMV Vector')],inplace=True,ascending=False)
+   ascend_df.sort_values(by=[(format,'GPU','speed-up','LightSpMV Vector')],inplace=True,ascending=True)
+   fig, axs = plt.subplots( 1, 1, figsize=(6,4) )
+   size = len(filtered_df[(format,'GPU','speed-up','LightSpMV Vector')].index)
+   t = np.arange( size )
+   bar = np.full( size, 1 )
+   axs.plot( t, filtered_df[(format,'GPU','speed-up','LightSpMV Vector')], '-o', ms=1, lw=1 )
+   axs.plot( t, bar, '-', ms=1, lw=1 )
+   axs.legend( [ latexFormatName(format), 'LightSpMV' ], loc='upper right' )
+   axs.set_ylabel( 'Speedup' )
+   axs.set_xlabel( f"Matrix number - sorted w.r.t. {latexFormatName(format)} speed-up" )
+   plt.rcParams.update({
+      "text.usetex": True,
+      "font.family": "sans-serif",
+      "font.sans-serif": ["Helvetica"]})
+   # for Palatino and other serif fonts use:
+   #plt.rcParams.update({
+   #   "text.usetex": True,
+   #   "font.family": "serif",
+   #   "font.serif": ["Palatino"],
+   #})
+   plt.savefig( f"LightSpMV-speed-up.pdf")
+   plt.close(fig)
+
+   fig, axs = plt.subplots( 1, 1, figsize=(6,4) )
+   axs.set_yscale( 'log' )
+   axs.plot( t, filtered_df[(format,'GPU','speed-up','LightSpMV Vector')], '-o', ms=1, lw=1 )
+   axs.plot( t, bar, '-', ms=1, lw=1 )
+   axs.legend( [ latexFormatName(format), 'LightSpMV' ], loc='lower left' )
+   axs.set_xlabel( f"Matrix number - sorted w.r.t. {latexFormatName(format)} speed-up" )
+   axs.set_ylabel( 'Speedup' )
+   plt.savefig( f"LightSpMV-speed-up-log.pdf")
+   plt.close(fig)
+   #head_df = filtered_df.head( head_size )
+   #bottom_df = ascend_df.head( head_size )
+   copy_df = df.copy()
+   for f in formats:
+      if not f in ['cusparse','CSR',format]:
+         #print( f"Droping {f}..." )
+         #head_df.drop( labels=f, axis='columns', level=0, inplace=True )
+         copy_df.drop( labels=f, axis='columns', level=0, inplace=True )
+   #head_df.to_html( f"LightSpMV-speed-up-head.html" )
+   copy_df.to_html( f"LightSpMV-speed-up-bottom.html" )
+
+def write_colormap( file, max_bw, size, x_position, y_position, standalone = False ):
+   if standalone:
+      file.write( '\\documentclass{standalone}\n' )
+      file.write( '\\usepackage[utf8]{inputenc}\n' )
+      file.write( '\\usepackage{tikz}\n' )
+      file.write( '\\begin{document}\n' )
+      file.write( '\\begin{tikzpicture}\n' )
+   i = 0
+   x = x_position
+   while i <= max_bw:
+      y = y_position + i / max_bw * size
+      r, g, b = color_map(i, max_bw, map=heatmap)
+      file.write( f'\\definecolor{{color_hm_{i}}}{{rgb}}{{ {r}, {g}, {b} }}; \n' )
+      file.write( f'\\filldraw[color_hm_{i}] ({x},{y}) circle (2pt); \n' )
+      i = i + 5
+   i = 0
+   while i <= max_bw:
+      y = y_position + i / max_bw * size
+      file.write( f'\\filldraw[black] ({x},{y}) circle (1pt) node[anchor=west] {{{i}}}; \n' )
+      i = i + 400
+
+   if standalone:
+      file.write( '\\end{tikzpicture}\n' )
+      file.write( '\\end{document}\n' )
+
+def write_performance_circle_latex_base( file_name ):
+   file = open( f'{file_name}-base.tex', 'w')
+   file.write( '\\documentclass{standalone}\n' )
+   file.write( '\\usepackage[utf8]{inputenc}\n' )
+   file.write( '\\usepackage{tikz}\n' )
+   file.write( '\\begin{document}\n' )
+   file.write( '\\begin{tikzpicture}\n' )
+   file.write( f'\\input{{{file_name}.tex}}\n' )
+   file.write( '\\end{tikzpicture}\n' )
+   file.write( '\\end{document}\n' )
+
+#####
+# Draw performance circle in tikz
+def write_performance_circle( df, formats, circle_formats, file_name, scale=1, with_color_map = False ):
+   write_performance_circle_latex_base( file_name )
+   file = open( f'{file_name}.tex', 'w')
+   formats_number = 0
+   for format in circle_formats:
+      if format in formats:
+         formats_number += 1
+
+   format_idx = 0
+   pos_x = 5 * scale
+   pos_y = 5 * scale
+   rad = 5 * scale
+   formats_pos_x = {}
+   formats_pos_y = {}
+   for format in circle_formats:
+      if format in formats:
+         format_angle = math.pi/2 - 2*math.pi/formats_number*format_idx - math.pi / formats_number
+         if format_angle < 0:
+            format_angle = 2*math.pi + format_angle
+         x = pos_x + rad*math.cos( format_angle )
+         y = pos_y + rad*math.sin( format_angle )
+         formats_pos_x[ format ] = x
+         formats_pos_y[ format ] = y
+         anchor = ''
+         if format_angle <= math.pi * 1/4  or format_angle > math.pi * 7/4:
+            anchor = 'west'
+         if format_angle <= math.pi * 3/4 and format_angle > math.pi * 1/4:
+            anchor = 'south'
+         if format_angle <= math.pi * 5/4 and format_angle > math.pi * 3/4:
+            anchor = 'east'
+         if format_angle <= math.pi * 7/4 and format_angle > math.pi * 5/4:
+            anchor = 'north'
+         #print( f'{format_angle} : {format} -> {anchor} \n' )
+         file.write( f'\\filldraw[black] ({x},{y}) circle (2pt) node[anchor={anchor}]{{{latexFormatName(format)}}}; \n' )
+         div_angle = format_angle + math.pi / formats_number
+         div_x = pos_x + rad*math.cos( div_angle )
+         div_y = pos_y + rad*math.sin( div_angle )
+         file.write( f'\\draw [dashed] ({div_x},{div_y}) -- ({pos_x},{pos_y}); \n')
+         format_idx += 1
+   formats_count = format_idx
+   line_idx=0
+   elim = 0
+   while line_idx < len(df.index):
+      #matrixName = df.iloc[line_idx]['Matrix name']
+      sum_bw = 0
+      formats_bw = {}
+      max_bw = 0
+      for format in circle_formats:
+         if format in formats:
+            format_bw = df.iloc[line_idx][(format,'GPU','bandwidth','')]
+            formats_bw[ format ] = format_bw
+            #print( f'{matrixName} {format} -> {format_bw}')
+            #if format_bw > max_bw:
+            sum_bw = sum_bw + format_bw
+            if format_bw > max_bw:
+               max_bw = format_bw
+      for format in circle_formats:
+         if format in formats:
+            formats_bw[ format ] = formats_bw[ format ] / sum_bw
+      format_pos_x = 0
+      format_pos_y = 0
+      for format in circle_formats:
+         if format in formats:
+            format_pos_x = format_pos_x + formats_pos_x[ format ] * formats_bw[ format ]
+            format_pos_y = format_pos_y + formats_pos_y[ format ] * formats_bw[ format ]
+      if( format_pos_x == format_pos_x  and format_pos_y == format_pos_y ):  # check for NaN
+         r, g, b = color_map(max_bw, 1200, map=heatmap)
+         file.write( f'\\definecolor{{color_{line_idx}}}{{rgb}}{{ {r}, {g}, {b} }} \n' )
+         file.write( f'\\filldraw[color_{line_idx},opacity=0.75] ({format_pos_x},{format_pos_y}) circle (1pt); \n' )
+      else:
+         elim = elim + 1
+      line_idx += 1
+   if with_color_map:
+      write_colormap( file, 1200, 5, 13*scale, 1.5*scale, standalone=False )
+   os.system( f'pdflatex {file_name}-base.tex' )
+   print( f'Eliminated formats: {elim}')
+
+####
+# Parse input file
+print( "Parsing input file...." )
+with open('sparse-matrix-benchmark.log') as f:
+    d = json.load(f)
+input_df = json_normalize( d, record_path=['results'] )
+
+# To inspect the raw parsed table, call input_df.to_html( "orig-pandas.html" ) here.
+formats = list(set( input_df['format'].values.tolist() )) # list of all formats in the benchmark results; set() makes their order arbitrary
+formats.remove('CSR< Light > Automatic') # list.remove raises ValueError when the name is not in the log
+formats.remove('Binary CSR< Light > Automatic')
+formats.remove('Symmetric CSR< Light > Automatic')
+formats.remove('Symmetric Binary CSR< Light > Automatic')
+formats.append('TNL Best') # extra name that does not come from the 'format' column of the log
+multicolumns, df_data = get_multiindex( input_df, formats )
+
+print( "Converting data..." )
+result = convert_data_frame( input_df, multicolumns, df_data, 0, 20000 )
+compute_speedup( result, formats )
+
+result.replace( to_replace=' ',value=np.nan,inplace=True) # cells holding a single space become NaN
+
+####
+# Make data analysis
+def processDf( df, formats, head_size = 10 ):
+   print( "Writting to HTML file..." )
+   df.to_html( f'output.html' )
+
+   # Generate tables and figures
+   effective_bw_profile( df, formats, head_size )
+   cusparse_comparison( df, formats, head_size )
+   csr_comparison( df, formats, head_size )
+   legacy_formats_comparison( df, formats, head_size )
+   csr_speedup_comparison( df, formats, head_size )
+   cusparse_speedup_comparison( df, formats, head_size )
+   binary_matrices_comparison( df, formats, head_size )
+   symmetric_matrices_comparison( df, formats, head_size )
+   csr_light_speedup_comparison( df, head_size )
+
+   best = df[('TNL Best','GPU','format')].tolist()
+   best_formats = list(set(best))
+   sum = 0
+   for format in formats:
+      if( not 'Binary' in format and
+          not 'Symmetric' in format and
+          not 'Legacy' in format and
+          not 'LightSpMV' in format and
+          not 'TNL Best' in format ):
+         cases = best.count(format)
+         print( f'{format} is best in {cases} cases.')
+         sum += cases
+   print( f'Total is {sum}.' )
+   print( f'Best formats {best_formats}.')
+   write_performance_circle( df, formats,
+         ['cusparse', 'Ellpack', 'SlicedEllpack', 'ChunkedEllpack', 'BiEllpack', 'CSR< Scalar >', 'CSR< Adaptive >', 'CSR< Vector >', 'CSR< Light > Automatic Light'],
+         'performance-graph' )
+
+   scale = 0.6
+   aux_df = df
+   aux_df.sort_values(by=[('SlicedEllpack','GPU','bandwidth')],inplace=True,ascending=True)
+   write_performance_circle( aux_df, formats, ['Ellpack', 'ChunkedEllpack', 'SlicedEllpack' ], 'performance-graph-ellpacks-1', scale, with_color_map = False )
+   write_performance_circle( aux_df, formats, ['BiEllpack', 'ChunkedEllpack', 'SlicedEllpack',  ], 'performance-graph-ellpacks-2', scale, with_color_map = True )
+   #write_performance_circle( df, formats, ['CSR< Scalar >', 'CSR< Adaptive >', 'CSR< Vector >', 'CSR< Light > Automatic Light'], 'performance-graph-csr-1' )
+   aux_df.sort_values(by=[('CSR< Light > Automatic Light','GPU','bandwidth')],inplace=True,ascending=True)
+   write_performance_circle( aux_df, formats, ['CSR< Scalar >', 'CSR< Vector >', 'CSR< Light > Automatic Light'], 'performance-graph-csr-1', scale, with_color_map = False )
+   write_performance_circle( aux_df, formats, ['CSR< Adaptive >', 'CSR< Vector >', 'CSR< Light > Automatic Light'], 'performance-graph-csr-2', scale, with_color_map = False )
+   aux_df.sort_values(by=[('cusparse','GPU','bandwidth')],inplace=True,ascending=True)
+   write_performance_circle( aux_df, formats, ['cusparse', 'SlicedEllpack', 'ChunkedEllpack' ], 'performance-graph-cusparse-ellpacks', scale, with_color_map = False )
+   write_performance_circle( aux_df, formats, ['cusparse', 'CSR< Vector >', 'CSR< Light > Automatic Light'], 'performance-graph-cusparse-csr-1', scale, with_color_map = False )
+   write_performance_circle( aux_df, formats, ['cusparse', 'CSR< Adaptive >', 'CSR< Light > Automatic Light'], 'performance-graph-cusparse-csr-2', scale, with_color_map = True )
+   write_performance_circle( aux_df, formats, ['cusparse', 'CSR< Scalar >', 'CSR< Light > Automatic Light'], 'performance-graph-cusparse-csr-3', scale, with_color_map = False )
+   write_performance_circle( aux_df, formats, ['cusparse', 'SlicedEllpack', 'CSR< Light > Automatic Light'], 'performance-graph-cusparse-csr-ellpack', scale, with_color_map = True )
+
+head_size = 25
+if not os.path.exists( 'general' ):
+   os.mkdir( 'general' )
+os.chdir( 'general' )
+processDf( result, formats, head_size )
+os.chdir( '..' )
+
+#for rows_count in [ 10, 100, 1000, 10000, 100000, 1000000, 10000000 ]:
+#   filtered_df = result[ result['rows'].astype('int32') <= rows_count ]
+#   if not os.path.exists(f'rows-le-{rows_count}'):
+#      os.mkdir( f'rows-le-{rows_count}')
+#   os.chdir( f'rows-le-{rows_count}')
+#   processDf( filtered_df, formats, head_size )
+#   os.chdir( '..' )
+
+#for rows_count in [ 10, 100, 1000, 10000, 100000, 1000000, 10000000 ]:
+#   filtered_df = result[ result['rows'].astype('int32') >= rows_count ]
+#   if not os.path.exists(f'rows-ge-{rows_count}'):
+#      os.mkdir( f'rows-ge-{rows_count}')
+#   os.chdir( f'rows-ge-{rows_count}')
+#   processDf( filtered_df, formats, head_size )
+#   os.chdir( '..' )
diff --git a/src/Benchmarks/scripts/tnl-spmv-benchmark-make-tables.py b/src/Benchmarks/scripts/tnl-spmv-benchmark-make-tables.py
index 8899dc9ebbc1b14a74a304e578b63c38a3bea3d2..3459643fd35d4f78e5b31c14625789481f123e97 100755
--- a/src/Benchmarks/scripts/tnl-spmv-benchmark-make-tables.py
+++ b/src/Benchmarks/scripts/tnl-spmv-benchmark-make-tables.py
@@ -154,8 +154,8 @@ Sort by comparison formats
 formats_comparison = defaultdict( list )
 for format in gpu_comparison_formats:
    df.sort_values(by=[f"{format} Bandwidth"],inplace=True,ascending=False)
-   formats_comparison[ format ] = df[format, "GPU", "bandwidth"].tolist();
-   formats_comparison[ gpu_comparison_formats[ format ] ] = df[gpu_comparison_formats[ format ], "GPU", "bandwidth"].tolist();
+   formats_comparison[ format ] = df[format, "GPU", "bandwidth"].tolist()
+   formats_comparison[ gpu_comparison_formats[ format ] ] = df[gpu_comparison_formats[ format ], "GPU", "bandwidth"].tolist()
 
 """
 Writting gnuplot source files
diff --git a/src/Python/pytnl/tnl/SparseMatrix.h b/src/Python/pytnl/tnl/SparseMatrix.h
index 1dc375f98938ce1abc68661305667815cf2315fc..aa0ea33941f0c1179b3f482b1a2ee4e1a4537cf0 100644
--- a/src/Python/pytnl/tnl/SparseMatrix.h
+++ b/src/Python/pytnl/tnl/SparseMatrix.h
@@ -67,7 +67,7 @@ struct export_CSR< Segments, typename TNL::enable_if_type< decltype(Segments{}.g
    static void e( Scope & s )
    {
       s
-         .def("getOffsets", []( const Segments& segments ) -> const typename Segments::OffsetsHolder& {
+         .def("getOffsets", []( const Segments& segments ) -> const typename Segments::OffsetsContainer& {
                   return segments.getOffsets();
             }, py::return_value_policy::reference_internal)
       ;
diff --git a/src/TNL/Algorithms/Segments/BiEllpack.h b/src/TNL/Algorithms/Segments/BiEllpack.h
index 3a5a7c2026a8efd69d54cfd472807a039f85d43e..3830d8e14190388e4f0656c92afcd206498dcda6 100644
--- a/src/TNL/Algorithms/Segments/BiEllpack.h
+++ b/src/TNL/Algorithms/Segments/BiEllpack.h
@@ -22,163 +22,179 @@ namespace TNL
       namespace Segments
       {
 
-         template <typename Device,
-                   typename Index,
-                   typename IndexAllocator = typename Allocators::Default<Device>::template Allocator<Index>,
-                   ElementsOrganization Organization = Algorithms::Segments::DefaultElementsOrganization<Device>::getOrganization(),
-                   int WarpSize = 32>
-         class BiEllpack
-         {
-         public:
-            using DeviceType = Device;
-            using IndexType = std::remove_const_t<Index>;
-            using OffsetsHolder = Containers::Vector< IndexType, DeviceType, IndexType, IndexAllocator>;
-            static constexpr ElementsOrganization getOrganization() { return Organization; }
-            using ViewType = BiEllpackView< Device, Index, Organization, WarpSize >;
-            template <typename Device_, typename Index_>
-            using ViewTemplate = BiEllpackView<Device_, Index_, Organization, WarpSize >;
-            using ConstViewType = typename ViewType::ConstViewType;
-            using SegmentViewType = typename ViewType::SegmentViewType;
+template <typename Device,
+            typename Index,
+            typename IndexAllocator = typename Allocators::Default<Device>::template Allocator<Index>,
+            ElementsOrganization Organization = Algorithms::Segments::DefaultElementsOrganization<Device>::getOrganization(),
+            int WarpSize = 32>
+class BiEllpack
+{
+   public:
+      using DeviceType = Device;
+      using IndexType = std::remove_const_t<Index>;
+      using OffsetsContainer = Containers::Vector< IndexType, DeviceType, IndexType, IndexAllocator>;
+      static constexpr ElementsOrganization getOrganization() { return Organization; }
+      using ViewType = BiEllpackView< Device, Index, Organization, WarpSize >;
+      template <typename Device_, typename Index_>
+      using ViewTemplate = BiEllpackView<Device_, Index_, Organization, WarpSize >;
+      using ConstViewType = typename ViewType::ConstViewType;
+      using SegmentViewType = typename ViewType::SegmentViewType;
+
+      static constexpr bool havePadding() { return true; };
+
+      BiEllpack() = default;
 
-            static constexpr bool havePadding() { return true; };
+      template< typename SizesContainer >
+      BiEllpack( const SizesContainer& sizes );
 
-            BiEllpack() = default;
+      template< typename ListIndex >
+      BiEllpack( const std::initializer_list< ListIndex >& segmentsSizes );
 
-            BiEllpack(const Containers::Vector<IndexType, DeviceType, IndexType> &sizes);
 
-            BiEllpack(const BiEllpack &segments);
+      BiEllpack(const BiEllpack &segments);
 
-            BiEllpack(const BiEllpack &&segments);
+      BiEllpack(const BiEllpack &&segments);
 
-            static String getSerializationType();
+      static String getSerializationType();
 
-            static String getSegmentsType();
+      static String getSegmentsType();
 
-            ViewType getView();
+      ViewType getView();
 
-            const ConstViewType getConstView() const;
+      const ConstViewType getConstView() const;
 
-            /**
+      /**
        * \brief Number of segments.
        */
-            __cuda_callable__
-                IndexType
-                getSegmentsCount() const;
+      __cuda_callable__
+            IndexType
+            getSegmentsCount() const;
 
-            /**
+      /**
        * \brief Set sizes of particular segments.
        */
-            template <typename SizesHolder = OffsetsHolder>
-            void setSegmentsSizes(const SizesHolder &sizes);
+      template <typename SizesHolder = OffsetsContainer>
+      void setSegmentsSizes(const SizesHolder &sizes);
 
-            void reset();
+      void reset();
 
-            IndexType getSegmentSize(const IndexType segmentIdx) const;
+      IndexType getSegmentSize(const IndexType segmentIdx) const;
 
-            /**
+      /**
        * \brief Number segments.
        */
-            __cuda_callable__
-                IndexType
-                getSize() const;
+      __cuda_callable__
+            IndexType
+            getSize() const;
 
-            __cuda_callable__
-                IndexType
-                getStorageSize() const;
+      __cuda_callable__
+            IndexType
+            getStorageSize() const;
 
-            __cuda_callable__
-                IndexType
-                getGlobalIndex(const IndexType segmentIdx, const IndexType localIdx) const;
+      __cuda_callable__
+            IndexType
+            getGlobalIndex(const IndexType segmentIdx, const IndexType localIdx) const;
 
-            __cuda_callable__
-                SegmentViewType
-                getSegmentView(const IndexType segmentIdx) const;
+      __cuda_callable__
+            SegmentViewType
+            getSegmentView(const IndexType segmentIdx) const;
 
-            /***
+      /***
        * \brief Go over all segments and for each segment element call
        * function 'f' with arguments 'args'. The return type of 'f' is bool.
        * When its true, the for-loop continues. Once 'f' returns false, the for-loop
        * is terminated.
        */
-            template <typename Function>
-            void forElements(IndexType first, IndexType last, Function &&f) const;
+      template< typename Function >
+      void forElements( IndexType first, IndexType last, Function&& f ) const;
 
-            template <typename Function>
-            void forAllElements(Function &&f) const;
+      template <typename Function>
+      void forAllElements(Function&& f ) const;
 
-            template <typename Function>
-            void forSegments(IndexType begin, IndexType end, Function &&f) const;
+      template <typename Function>
+      void forSegments(IndexType begin, IndexType end, Function&& f ) const;
 
-            template <typename Function>
-            void forEachSegment(Function &&f) const;
+      template <typename Function>
+      void forAllSegments( Function&& f ) const;
 
-            /***
+      /***
        * \brief Go over all segments and perform a reduction in each of them.
        */
-            template <typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args>
-            void segmentsReduction(IndexType first, IndexType last, Fetch &fetch, const Reduction &reduction, ResultKeeper &keeper, const Real &zero, Args... args) const;
+      template <typename Fetch, typename Reduction, typename ResultKeeper, typename Real >
+      void reduceSegments(IndexType first, IndexType last, Fetch &fetch, const Reduction &reduction, ResultKeeper &keeper, const Real &zero ) const;
 
-            template <typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args>
-            void allReduction(Fetch &fetch, const Reduction &reduction, ResultKeeper &keeper, const Real &zero, Args... args) const;
+      template <typename Fetch, typename Reduction, typename ResultKeeper, typename Real >
+      void reduceAllSegments(Fetch &fetch, const Reduction &reduction, ResultKeeper &keeper, const Real &zero ) const;
 
-            BiEllpack &operator=(const BiEllpack &source) = default;
+      BiEllpack &operator=(const BiEllpack &source) = default;
 
-            template <typename Device_, typename Index_, typename IndexAllocator_, ElementsOrganization Organization_>
-            BiEllpack &operator=(const BiEllpack<Device_, Index_, IndexAllocator_, Organization_, WarpSize> &source);
+      template <typename Device_, typename Index_, typename IndexAllocator_, ElementsOrganization Organization_>
+      BiEllpack &operator=(const BiEllpack<Device_, Index_, IndexAllocator_, Organization_, WarpSize> &source);
 
-            void save(File &file) const;
+      void save(File &file) const;
 
-            void load(File &file);
+      void load(File &file);
 
-            void printStructure(std::ostream &str) const;
+      template< typename Fetch >
+      SegmentsPrinter< BiEllpack, Fetch > print( Fetch&& fetch ) const;
 
-            // TODO: nvcc needs this public because of lambda function used inside
-            template <typename SizesHolder = OffsetsHolder>
-            void performRowBubbleSort(const SizesHolder &segmentsSize);
+      void printStructure(std::ostream &str) const;
 
-            // TODO: the same as  above
-            template <typename SizesHolder = OffsetsHolder>
-            void computeColumnSizes(const SizesHolder &segmentsSizes);
+      // TODO: nvcc needs this public because of lambda function used inside
+      template <typename SizesHolder = OffsetsContainer>
+      void performRowBubbleSort(const SizesHolder &segmentsSize);
 
-         protected:
-            static constexpr int getWarpSize() { return WarpSize; };
+      // TODO: the same as  above
+      template <typename SizesHolder = OffsetsContainer>
+      void computeColumnSizes(const SizesHolder &segmentsSizes);
 
-            static constexpr int getLogWarpSize() { return std::log2(WarpSize); };
+   protected:
+      static constexpr int getWarpSize() { return WarpSize; };
 
-            template <typename SizesHolder = OffsetsHolder>
-            void verifyRowPerm(const SizesHolder &segmentsSizes);
+      static constexpr int getLogWarpSize() { return std::log2(WarpSize); };
 
-            template <typename SizesHolder = OffsetsHolder>
-            void verifyRowLengths(const SizesHolder &segmentsSizes);
+      template <typename SizesHolder = OffsetsContainer>
+      void verifyRowPerm(const SizesHolder &segmentsSizes);
 
-            IndexType getStripLength(const IndexType stripIdx) const;
+      template <typename SizesHolder = OffsetsContainer>
+      void verifyRowLengths(const SizesHolder &segmentsSizes);
 
-            IndexType getGroupLength(const IndexType strip, const IndexType group) const;
+      IndexType getStripLength(const IndexType stripIdx) const;
 
-            IndexType size = 0, storageSize = 0;
+      IndexType getGroupLength(const IndexType strip, const IndexType group) const;
 
-            IndexType virtualRows = 0;
+      IndexType size = 0, storageSize = 0;
 
-            OffsetsHolder rowPermArray;
+      IndexType virtualRows = 0;
 
-            OffsetsHolder groupPointers;
+      OffsetsContainer rowPermArray;
 
-            // TODO: Replace later
-            __cuda_callable__ Index power(const IndexType number, const IndexType exponent) const
-            {
-               if (exponent >= 0)
-               {
-                  IndexType result = 1;
-                  for (IndexType i = 0; i < exponent; i++)
-                     result *= number;
-                  return result;
-               }
-               return 0;
-            };
+      OffsetsContainer groupPointers;
+
+      // TODO: Replace later
+      __cuda_callable__ Index power(const IndexType number, const IndexType exponent) const
+      {
+         if (exponent >= 0)
+         {
+            IndexType result = 1;
+            for (IndexType i = 0; i < exponent; i++)
+               result *= number;
+            return result;
+         }
+         return 0;
+      };
+
+      template <typename Device_, typename Index_, typename IndexAllocator_, ElementsOrganization Organization_, int WarpSize_>
+      friend class BiEllpack;
+};
+
+template <typename Device,
+          typename Index,
+          typename IndexAllocator,
+          ElementsOrganization Organization,
+          int WarpSize >
+std::ostream& operator<<( std::ostream& str, const BiEllpack< Device, Index, IndexAllocator, Organization, WarpSize >& segments ) { return printSegments( segments, str ); }
 
-            template <typename Device_, typename Index_, typename IndexAllocator_, ElementsOrganization Organization_, int WarpSize_>
-            friend class BiEllpack;
-         };
 
       } // namespace Segments
    }    // namespace Algorithms
diff --git a/src/TNL/Algorithms/Segments/BiEllpack.hpp b/src/TNL/Algorithms/Segments/BiEllpack.hpp
index 53a3eb905c16f994bf6b3ced08df2ea48e127804..ddb3c0342404b868ead912eea9e7cae7c795b479 100644
--- a/src/TNL/Algorithms/Segments/BiEllpack.hpp
+++ b/src/TNL/Algorithms/Segments/BiEllpack.hpp
@@ -26,10 +26,23 @@ template< typename Device,
           typename IndexAllocator,
           ElementsOrganization Organization,
           int WarpSize >
+   template< typename SizesContainer >
 BiEllpack< Device, Index, IndexAllocator, Organization, WarpSize >::
-BiEllpack( const Containers::Vector< IndexType, DeviceType, IndexType >& sizes )
+BiEllpack( const SizesContainer& segmentsSizes )
 {
-   this->setSegmentsSizes( sizes );
+   this->setSegmentsSizes( segmentsSizes );
+}
+
+template< typename Device,
+          typename Index,
+          typename IndexAllocator,
+          ElementsOrganization Organization,
+          int WarpSize >
+   template< typename ListIndex >
+BiEllpack< Device, Index, IndexAllocator, Organization, WarpSize >::
+BiEllpack( const std::initializer_list< ListIndex >& segmentsSizes )
+{
+   this->setSegmentsSizes( Containers::Vector< IndexType, DeviceType, IndexType >( segmentsSizes ) );
 }
 
 template< typename Device,
@@ -71,6 +84,7 @@ String
 BiEllpack< Device, Index, IndexAllocator, Organization, WarpSize >::
 getSerializationType()
 {
+   // FIXME: the serialized data DEPEND on the Organization and WarpSize parameters, so it should be reflected in the serialization type
    return "BiEllpack< [any_device], " + TNL::getSerializationType< IndexType >() + " >";
 }
 
@@ -486,9 +500,9 @@ template< typename Device,
    template< typename Function >
 void
 BiEllpack< Device, Index, IndexAllocator, Organization, WarpSize >::
-forEachSegment( Function&& f ) const
+forAllSegments( Function&& f ) const
 {
-   this->getConstView().forEachSegment( f );
+   this->getConstView().forAllSegments( f );
 }
 
 
@@ -497,12 +511,12 @@ template< typename Device,
           typename IndexAllocator,
           ElementsOrganization Organization,
           int WarpSize >
-   template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args >
+   template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real >
 void
 BiEllpack< Device, Index, IndexAllocator, Organization, WarpSize >::
-segmentsReduction( IndexType first, IndexType last, Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const
+reduceSegments( IndexType first, IndexType last, Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero ) const
 {
-   this->getConstView().segmentsReduction( first, last, fetch, reduction, keeper, zero, args... );
+   this->getConstView().reduceSegments( first, last, fetch, reduction, keeper, zero );
 }
 
 template< typename Device,
@@ -510,12 +524,12 @@ template< typename Device,
           typename IndexAllocator,
           ElementsOrganization Organization,
           int WarpSize >
-   template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args >
+   template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real >
 void
 BiEllpack< Device, Index, IndexAllocator, Organization, WarpSize >::
-allReduction( Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const
+reduceAllSegments( Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero ) const
 {
-   this->segmentsReduction( 0, this->getSegmentsCount(), fetch, reduction, keeper, zero, args... );
+   this->reduceSegments( 0, this->getSegmentsCount(), fetch, reduction, keeper, zero );
 }
 
 template< typename Device,
@@ -568,6 +582,19 @@ load( File& file )
         >> this->groupPointers;
 }
 
+template< typename Device,
+          typename Index,
+          typename IndexAllocator,
+          ElementsOrganization Organization,
+          int WarpSize >
+      template< typename Fetch >
+auto
+BiEllpack< Device, Index, IndexAllocator, Organization, WarpSize >::
+print( Fetch&& fetch ) const -> SegmentsPrinter< BiEllpack, Fetch >
+{
+   return SegmentsPrinter< BiEllpack, Fetch >( *this, fetch );
+}
+
 template< typename Device,
           typename Index,
           typename IndexAllocator,
diff --git a/src/TNL/Algorithms/Segments/BiEllpackView.h b/src/TNL/Algorithms/Segments/BiEllpackView.h
index 50f69e3aa2266b65df3bc6f089e2f05477ea0ae3..f14282efb687c4e677faa81c015e6f9ab8576801 100644
--- a/src/TNL/Algorithms/Segments/BiEllpackView.h
+++ b/src/TNL/Algorithms/Segments/BiEllpackView.h
@@ -16,6 +16,7 @@
 #include <TNL/Algorithms/Segments/ElementsOrganization.h>
 #include <TNL/Algorithms/Segments/BiEllpackSegmentView.h>
 #include <TNL/Algorithms/Segments/detail/BiEllpack.h>
+#include <TNL/Algorithms/Segments/SegmentsPrinting.h>
 
 namespace TNL {
    namespace Algorithms {
@@ -32,7 +33,7 @@ class BiEllpackView
 
       using DeviceType = Device;
       using IndexType = std::remove_const_t< Index >;
-      using OffsetsView = typename Containers::VectorView< IndexType, DeviceType, IndexType >;
+      using OffsetsView = typename Containers::VectorView< Index, DeviceType, IndexType >;
       using ConstOffsetsView = typename OffsetsView::ConstViewType;
       using ViewType = BiEllpackView;
       template< typename Device_, typename Index_ >
@@ -121,16 +122,16 @@ class BiEllpackView
       void forSegments( IndexType begin, IndexType end, Function&& f ) const;
 
       template< typename Function >
-      void forEachSegment( Function&& f ) const;
+      void forAllSegments( Function&& f ) const;
 
       /***
        * \brief Go over all segments and perform a reduction in each of them.
        */
-      template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args >
-      void segmentsReduction( IndexType first, IndexType last, Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const;
+      template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real >
+      void reduceSegments( IndexType first, IndexType last, Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero ) const;
 
-      template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args >
-      void allReduction( Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const;
+      template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real >
+      void reduceAllSegments( Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero ) const;
 
       BiEllpackView& operator=( const BiEllpackView& view );
 
@@ -138,6 +139,9 @@ class BiEllpackView
 
       void load( File& file );
 
+      template< typename Fetch >
+      SegmentsPrinter< BiEllpackView, Fetch > print( Fetch&& fetch ) const;
+
       void printStructure( std::ostream& str ) const;
 
    protected:
@@ -159,33 +163,29 @@ class BiEllpackView
                 typename Reduction,
                 typename ResultKeeper,
                 typename Real,
-                int BlockDim,
-                typename... Args >
+                int BlockDim >
       __device__
-      void segmentsReductionKernelWithAllParameters( IndexType gridIdx,
+      void reduceSegmentsKernelWithAllParameters( IndexType gridIdx,
                                                      IndexType first,
                                                      IndexType last,
                                                      Fetch fetch,
                                                      Reduction reduction,
                                                      ResultKeeper keeper,
-                                                     Real zero,
-                                                     Args... args ) const;
+                                                     Real zero ) const;
 
       template< typename Fetch,
                 typename Reduction,
                 typename ResultKeeper,
                 typename Real_,
-                int BlockDim,
-                typename... Args >
+                int BlockDim >
       __device__
-      void segmentsReductionKernel( IndexType gridIdx,
+      void reduceSegmentsKernel( IndexType gridIdx,
                                     IndexType first,
                                     IndexType last,
                                     Fetch fetch,
                                     Reduction reduction,
                                     ResultKeeper keeper,
-                                    Real_ zero,
-                                    Args... args ) const;
+                                    Real_ zero ) const;
 
       template< typename View_,
                 typename Index_,
@@ -193,23 +193,29 @@ class BiEllpackView
                 typename Reduction_,
                 typename ResultKeeper_,
                 typename Real_,
-                int BlockDim,
-                typename... Args_ >
+                int BlockDim >
       friend __global__
-      void BiEllpackSegmentsReductionKernel( View_ chunkedEllpack,
+      void BiEllpackreduceSegmentsKernel( View_ chunkedEllpack,
                                              Index_ gridIdx,
                                              Index_ first,
                                              Index_ last,
                                              Fetch_ fetch,
                                              Reduction_ reduction,
                                              ResultKeeper_ keeper,
-                                             Real_ zero,
-                                             Args_... args );
+                                             Real_ zero );
 
       template< typename Index_, typename Fetch_, int BlockDim_, int WarpSize_, bool B_ >
-      friend struct detail::BiEllpackSegmentsReductionDispatcher;
+      friend struct detail::BiEllpackreduceSegmentsDispatcher;
 #endif
 };
+
+// Stream insertion for BiEllpackView; forwards to printSegments with the same argument order as the BiEllpack operator<<.
+template< typename Device, typename Index, ElementsOrganization Organization, int WarpSize >
+std::ostream&
+operator<<( std::ostream& str, const BiEllpackView< Device, Index, Organization, WarpSize >& segments )
+{ return printSegments( segments, str ); }
+
+
       } // namespace Segments
    }  // namespace Algorithms
 } // namespace TNL
diff --git a/src/TNL/Algorithms/Segments/BiEllpackView.hpp b/src/TNL/Algorithms/Segments/BiEllpackView.hpp
index 03131a0de193e9a926b11ace51cb9bbdd8a97e52..2014ae3dc983dbf025a51a971deaa9dca90d1ef4 100644
--- a/src/TNL/Algorithms/Segments/BiEllpackView.hpp
+++ b/src/TNL/Algorithms/Segments/BiEllpackView.hpp
@@ -97,6 +97,7 @@ String
 BiEllpackView< Device, Index, Organization, WarpSize >::
 getSerializationType()
 {
+   // FIXME: the serialized data DEPEND on the Organization and WarpSize parameters, so it should be reflected in the serialization type
    return "BiEllpack< [any_device], " + TNL::getSerializationType< IndexType >() + " >";
 }
 
@@ -275,9 +276,8 @@ forElements( IndexType first, IndexType last, Function&& f ) const
       const IndexType groupsCount = detail::BiEllpack< IndexType, DeviceType, Organization, getWarpSize() >::getActiveGroupsCountDirect( segmentsPermutationView, segmentIdx );
       IndexType groupHeight = getWarpSize();
       //printf( "segmentIdx = %d strip = %d firstGroupInStrip = %d rowStripPerm = %d groupsCount = %d \n", segmentIdx, strip, firstGroupInStrip, rowStripPerm, groupsCount );
-      bool compute( true );
       IndexType localIdx( 0 );
-      for( IndexType groupIdx = firstGroupInStrip; groupIdx < firstGroupInStrip + groupsCount && compute; groupIdx++ )
+      for( IndexType groupIdx = firstGroupInStrip; groupIdx < firstGroupInStrip + groupsCount; groupIdx++ )
       {
          IndexType groupOffset = groupPointersView[ groupIdx ];
          const IndexType groupSize = groupPointersView[ groupIdx + 1 ] - groupOffset;
@@ -289,14 +289,14 @@ forElements( IndexType first, IndexType last, Function&& f ) const
             {
                if( Organization == RowMajorOrder )
                {
-                  f( segmentIdx, localIdx, groupOffset + rowStripPerm * groupWidth + i, compute );
+                  f( segmentIdx, localIdx, groupOffset + rowStripPerm * groupWidth + i );
                }
                else
                {
                   /*printf( "segmentIdx = %d localIdx = %d globalIdx = %d groupIdx = %d groupSize = %d groupWidth = %d\n",
                      segmentIdx, localIdx, groupOffset + rowStripPerm + i * groupHeight,
                      groupIdx, groupSize, groupWidth );*/
-                  f( segmentIdx, localIdx, groupOffset + rowStripPerm + i * groupHeight, compute );
+                  f( segmentIdx, localIdx, groupOffset + rowStripPerm + i * groupHeight );
                }
                localIdx++;
             }
@@ -343,7 +343,7 @@ template< typename Device,
    template< typename Function >
 void
 BiEllpackView< Device, Index, Organization, WarpSize >::
-forEachSegment( Function&& f ) const
+forAllSegments( Function&& f ) const
 {
    this->forSegments( 0, this->getSegmentsCount(), f );
 }
@@ -352,10 +352,10 @@ template< typename Device,
           typename Index,
           ElementsOrganization Organization,
           int WarpSize >
-   template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args >
+   template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real >
 void
 BiEllpackView< Device, Index, Organization, WarpSize >::
-segmentsReduction( IndexType first, IndexType last, Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const
+reduceSegments( IndexType first, IndexType last, Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero ) const
 {
    using RealType = typename detail::FetchLambdaAdapter< Index, Fetch >::ReturnType;
    if( this->getStorageSize() == 0 )
@@ -425,9 +425,9 @@ segmentsReduction( IndexType first, IndexType last, Fetch& fetch, const Reductio
          dim3 cudaGridSize = Cuda::getMaxGridSize();
          if( gridIdx == cudaGrids - 1 )
             cudaGridSize.x = cudaBlocks % Cuda::getMaxGridSize();
-         detail::BiEllpackSegmentsReductionKernel< ViewType, IndexType, Fetch, Reduction, ResultKeeper, Real, BlockDim, Args...  >
+         detail::BiEllpackreduceSegmentsKernel< ViewType, IndexType, Fetch, Reduction, ResultKeeper, Real, BlockDim  >
             <<< cudaGridSize, cudaBlockSize, sharedMemory >>>
-            ( *this, gridIdx, first, last, fetch, reduction, keeper, zero, args... );
+            ( *this, gridIdx, first, last, fetch, reduction, keeper, zero );
          cudaThreadSynchronize();
          TNL_CHECK_CUDA_DEVICE;
       }
@@ -439,12 +439,12 @@ template< typename Device,
           typename Index,
           ElementsOrganization Organization,
           int WarpSize >
-   template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args >
+   template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real >
 void
 BiEllpackView< Device, Index, Organization, WarpSize >::
-allReduction( Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const
+reduceAllSegments( Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero ) const
 {
-   this->segmentsReduction( 0, this->getSegmentsCount(), fetch, reduction, keeper, zero, args... );
+   this->reduceSegments( 0, this->getSegmentsCount(), fetch, reduction, keeper, zero );
 }
 
 template< typename Device,
@@ -478,6 +478,18 @@ save( File& file ) const
         << this->groupPointers;
 }
 
+template< typename Device,
+          typename Index,
+          ElementsOrganization Organization,
+          int WarpSize >
+      template< typename Fetch >
+auto
+BiEllpackView< Device, Index, Organization, WarpSize >::
+print( Fetch&& fetch ) const -> SegmentsPrinter< BiEllpackView, Fetch >
+{
+   return SegmentsPrinter< BiEllpackView, Fetch >( *this, fetch );
+}
+
 template< typename Device,
           typename Index,
           ElementsOrganization Organization,
@@ -513,21 +525,19 @@ template< typename Device,
              typename Reduction,
              typename ResultKeeper,
              typename Real,
-             int BlockDim,
-             typename... Args >
+             int BlockDim >
 __device__
 void
 BiEllpackView< Device, Index, Organization, WarpSize >::
-segmentsReductionKernelWithAllParameters( IndexType gridIdx,
+reduceSegmentsKernelWithAllParameters( IndexType gridIdx,
                                           IndexType first,
                                           IndexType last,
                                           Fetch fetch,
                                           Reduction reduction,
                                           ResultKeeper keeper,
-                                          Real zero,
-                                          Args... args ) const
+                                          Real zero ) const
 {
-   using RealType = decltype( fetch( IndexType(), IndexType(), IndexType(), std::declval< bool& >(), args... ) );
+   using RealType = decltype( fetch( IndexType(), IndexType(), IndexType(), std::declval< bool& >() ) );
    const IndexType segmentIdx = ( gridIdx * Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x + first;
    if( segmentIdx >= last )
       return;
@@ -569,21 +579,19 @@ template< typename Device,
              typename Reduction,
              typename ResultKeeper,
              typename Real,
-             int BlockDim,
-             typename... Args >
+             int BlockDim >
 __device__
 void
 BiEllpackView< Device, Index, Organization, WarpSize >::
-segmentsReductionKernel( IndexType gridIdx,
+reduceSegmentsKernel( IndexType gridIdx,
                          IndexType first,
                          IndexType last,
                          Fetch fetch,
                          Reduction reduction,
                          ResultKeeper keeper,
-                         Real zero,
-                         Args... args ) const
+                         Real zero ) const
 {
-   using RealType = decltype( fetch( IndexType(), std::declval< bool& >(), args... ) );
+   using RealType = decltype( fetch( IndexType(), std::declval< bool& >() ) );
    Index segmentIdx = ( gridIdx * Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x + first;
 
    const IndexType strip = segmentIdx >> getLogWarpSize();
diff --git a/src/TNL/Algorithms/Segments/CSR.h b/src/TNL/Algorithms/Segments/CSR.h
index e63b4c8da49b15644ce7f2436d7c78de3a76ef71..27bdfe3e2c1b72b5c19c9a7e6611616dd3751945 100644
--- a/src/TNL/Algorithms/Segments/CSR.h
+++ b/src/TNL/Algorithms/Segments/CSR.h
@@ -21,6 +21,22 @@ namespace TNL {
    namespace Algorithms {
       namespace Segments {
 
+/**
+ * \brief Data structure for CSR segments format.
+ *
+ * See \ref TNL::Algorithms::Segments for more details about segments.
+ *
+ * \tparam Device is type of device where the segments will be operating.
+ * \tparam Index is type for indexing of the elements managed by the segments.
+ * \tparam Kernel is type of kernel used for parallel operations with segments.
+ *    It can be any of the following:
+ *    \ref TNL::Algorithms::Segments::CSRAdaptiveKernel,
+ *    \ref TNL::Algorithms::Segments::CSRHybridKernel,
+ *    \ref TNL::Algorithms::Segments::CSRScalarKernel,
+ *    \ref TNL::Algorithms::Segments::CSRVectorKernel
+ *
+ * \tparam IndexAllocator is allocator for supporting index containers.
+ */
 template< typename Device,
           typename Index,
           typename Kernel = CSRScalarKernel< Index, Device >,
@@ -29,122 +45,496 @@ class CSR
 {
    public:
 
+      /**
+       * \brief The device where the segments are operating.
+       */
       using DeviceType = Device;
+
+      /**
+       * \brief The type used for indexing of segments elements.
+       */
       using IndexType = std::remove_const_t< Index >;
+
+      /**
+       * \brief Type of kernel used for reduction operations.
+       */
       using KernelType = Kernel;
-      using OffsetsHolder = Containers::Vector< Index, DeviceType, IndexType, IndexAllocator >;
-      using SegmentsSizes = OffsetsHolder;
+
+      /**
+       * \brief Type of container storing offsets of particular rows.
+       */
+      using OffsetsContainer = Containers::Vector< Index, DeviceType, IndexType, IndexAllocator >;
+
+      /**
+       * \brief Templated view type.
+       *
+       * \tparam Device_ is alternative device type for the view.
+       * \tparam Index_ is alternative index type for the view.
+       */
       template< typename Device_, typename Index_ >
       using ViewTemplate = CSRView< Device_, Index_, KernelType >;
+
+      /**
+       * \brief Type of segments view.
+       */
       using ViewType = CSRView< Device, Index, KernelType >;
+
+      /**
+       * \brief Type of constant segments view.
+       */
       using ConstViewType = CSRView< Device, std::add_const_t< IndexType >, KernelType >;
+
+      /**
+       * \brief Accessor type for one particular segment.
+       */
       using SegmentViewType = SegmentView< IndexType, RowMajorOrder >;
 
-      static constexpr ElementsOrganization getOrganization() { return ColumnMajorOrder; }
+      /**
+       * \brief This function says that CSR format is always organised in row-major order.
+       */
+      static constexpr ElementsOrganization getOrganization() { return RowMajorOrder; }
 
+      /**
+       * \brief This function says that CSR format does not use padding elements.
+       */
       static constexpr bool havePadding() { return false; };
 
+      /**
+       * \brief Construct with no parameters to create empty segments.
+       */
       CSR();
 
-      CSR( const SegmentsSizes& sizes );
+      /**
+       * \brief Construct with segments sizes.
+       *
+       * The number of segments is given by the size of \e segmentsSizes. Particular elements
+       * of this container define sizes of particular segments.
+       *
+       * \tparam SizesContainer is a type of container for segments sizes.  It can be \ref TNL::Containers::Array or
+       *  \ref TNL::Containers::Vector for example.
+       * \param segmentsSizes is an instance of the container with the segments sizes.
+       *
+       * See the following example:
+       *
+       * \includelineno Algorithms/Segments/SegmentsExample_CSR_constructor_1.cpp
+       *
+       * The result looks as follows:
+       *
+       * \include SegmentsExample_CSR_constructor_1.out
+       */
+      template< typename SizesContainer >
+      CSR( const SizesContainer& segmentsSizes );
 
+      /**
+       * \brief Construct with segments sizes in initializer list.
+       *
+       * The number of segments is given by the size of \e segmentsSizes. Particular elements
+       * of this initializer list define sizes of particular segments.
+       *
+       * \tparam ListIndex is a type of indexes of the initializer list.
+       * \param segmentsSizes is an initializer list with the segments sizes.
+       *
+       * See the following example:
+       *
+       * \includelineno Algorithms/Segments/SegmentsExample_CSR_constructor_2.cpp
+       *
+       * The result looks as follows:
+       *
+       * \include SegmentsExample_CSR_constructor_2.out
+       */
+      template< typename ListIndex >
+      CSR( const std::initializer_list< ListIndex >& segmentsSizes );
+
+      /**
+       * \brief Copy constructor.
+       *
+       * \param segments are the source segments.
+       */
       CSR( const CSR& segments );
 
+      /**
+       * \brief Move constructor.
+       *
+       * \param segments  are the source segments.
+       */
       CSR( const CSR&& segments );
 
+      /**
+       * \brief Returns string with serialization type.
+       *
+       * The string has a form `Algorithms::Segments::CSR< IndexType,  [any_device], [any_kernel], [any_allocator] >`.
+       *
+       * \return \ref String with the serialization type.
+       *
+       * \par Example
+       * \include Algorithms/Segments/SegmentsExample_CSR_getSerializationType.cpp
+       * \par Output
+       * \include SegmentsExample_CSR_getSerializationType.out
+       */
       static String getSerializationType();
 
+      /**
+       * \brief Returns string with segments type.
+       *
+       * The string has a form `CSR< KernelType >`.
+       *
+       * \return \ref String with the segments type.
+       *
+       * \par Example
+       * \include Algorithms/Segments/SegmentsExample_CSR_getSegmentsType.cpp
+       * \par Output
+       * \include SegmentsExample_CSR_getSegmentsType.out
+       */
       static String getSegmentsType();
 
       /**
        * \brief Set sizes of particular segments.
+       *
+       * \tparam SizesContainer is a container with segments sizes. It can be \ref TNL::Containers::Array or
+       *  \ref TNL::Containers::Vector for example.
+       *
+       * \param segmentsSizes is an instance of the container with segments sizes.
        */
-      template< typename SizesHolder = OffsetsHolder >
-      void setSegmentsSizes( const SizesHolder& sizes );
+      template< typename SizesContainer >
+      void setSegmentsSizes( const SizesContainer& segmentsSizes );
 
+      /**
+       * \brief Reset the segments to empty states.
+       *
+       * It means that there is no segment in the CSR segments.
+       */
       void reset();
 
+      /**
+       * \brief Getter of a view object.
+       *
+       * \return View for this instance of CSR segments which can be used for example in
+       *  lambda functions running in GPU kernels.
+       */
       ViewType getView();
 
+      /**
+       * \brief Getter of a view object for constant instances.
+       *
+       * \return View for this instance of CSR segments which can be used for example in
+       *  lambda functions running in GPU kernels.
+       */
       const ConstViewType getConstView() const;
 
       /**
-       * \brief Number of segments.
+       * \brief Getter of number of segments.
+       *
+       * \return number of segments within this object.
        */
       __cuda_callable__
       IndexType getSegmentsCount() const;
 
-      /***
-       * \brief Returns size of the segment number \r segmentIdx
+      /**
+       * \brief Returns size of particular segment.
+       *
+       * \return size of the segment number \e segmentIdx.
        */
       __cuda_callable__
       IndexType getSegmentSize( const IndexType segmentIdx ) const;
 
-      /***
+      /**
        * \brief Returns number of elements managed by all segments.
+       *
+       * \return number of elements managed by all segments.
        */
       __cuda_callable__
       IndexType getSize() const;
 
-      /***
-       * \brief Returns number of elements that needs to be allocated.
+      /**
+       * \brief Returns number of elements that needs to be allocated by a container connected to this segments.
+       *
+       * \return size of container connected to this segments.
        */
       __cuda_callable__
       IndexType getStorageSize() const;
 
+      /**
+       * \brief Computes the global index of an element managed by the segments.
+       *
+       * The global index serves as a reference to the element in its container.
+       *
+       * \param segmentIdx is index of a segment with the element.
+       * \param localIdx is the local index of the element within the segment.
+       * \return global index of the element.
+       */
       __cuda_callable__
       IndexType getGlobalIndex( const Index segmentIdx, const Index localIdx ) const;
 
+      /**
+       * \brief Returns segment view (i.e. segment accessor) of segment with given index.
+       *
+       * \param segmentIdx is index of the requested segment.
+       * \return segment view of given segment.
+       *
+       * \par Example
+       * \include Algorithms/Segments/SegmentsExample_CSR_getSegmentView.cpp
+       * \par Output
+       * \include SegmentsExample_CSR_getSegmentView.out
+       */
       __cuda_callable__
       SegmentViewType getSegmentView( const IndexType segmentIdx ) const;
 
-      const OffsetsHolder& getOffsets() const;
+      /**
+       * \brief Returns reference on constant vector with row offsets used in the CSR format.
+       *
+       * \return reference on constant vector with row offsets used in the CSR format.
+       */
+      const OffsetsContainer& getOffsets() const;
 
-      OffsetsHolder& getOffsets();
+      /**
+       * \brief Returns reference on vector with row offsets used in the CSR format.
+       *
+       * \return reference on vector with row offsets used in the CSR format.
+       */
+      OffsetsContainer& getOffsets();
 
-      /***
-       * \brief Go over all segments and for each segment element call
-       * function 'f'. The return type of 'f' is bool.
-       * When its true, the for-loop continues. Once 'f' returns false, the for-loop
-       * is terminated.
+      /**
+       * \brief Iterate over all elements of given segments in parallel and call given lambda function.
+       *
+       * \tparam Function is a type of the lambda function to be performed on each element.
+       * \param begin defines beginning of an interval [ \e begin, \e end ) of segments on
+       *    elements of which we want to apply the lambda function.
+       * \param end defines end of an interval [ \e begin, \e end ) of segments on
+       *    elements of which we want to apply the lambda function.
+       * \param function is the lambda function to be applied on the elements of the segments.
+       *
+       * Declaration of the lambda function \e function is supposed to be
+       *
+       * ```
+       * auto f = [=] __cuda_callable__ ( IndexType segmentIdx, IndexType localIdx, IndexType globalIdx ) {...}
+       * ```
+       * where \e segmentIdx is index of the segment to which the given element belongs, \e localIdx is rank of the element
+       * within the segment and \e globalIdx is index of the element within the related container.
+       *
+       * \par Example
+       * \include Algorithms/Segments/SegmentsExample_CSR_forElements.cpp
+       * \par Output
+       * \include SegmentsExample_CSR_forElements.out
        */
       template< typename Function >
-      void forElements( IndexType begin, IndexType end, Function&& f ) const;
+      void forElements( IndexType begin, IndexType end, Function&& function ) const;
 
+      /**
+       * \brief Call \ref TNL::Algorithms::Segments::CSR::forElements for all elements of the segments.
+       *
+       * See \ref TNL::Algorithms::Segments::CSR::forElements for more details.
+       */
       template< typename Function >
-      void forAllElements( Function&& f ) const;
+      void forAllElements( Function&& function ) const;
 
+      /**
+       * \brief Iterate over all segments in parallel and call given lambda function.
+       *
+       * \tparam Function is a type of the lambda function to be performed on each segment.
+       * \param begin defines beginning of an interval [ \e begin, \e end ) of segments on
+       *    elements of which we want to apply the lambda function.
+       * \param end defines end of an interval [ \e begin, \e end ) of segments on
+       *    elements of which we want to apply the lambda function.
+       * \param function is the lambda function to be applied on the elements of the segments.
+       *
+       *  Declaration of the lambda function \e function is supposed to be
+       *
+       * ```
+       * auto f = [=] __cuda_callable__ ( const SegmentView& segment ) {...}
+       * ```
+       * where \e segment represents given segment (see \ref TNL::Algorithms::Segments::SegmentView).
+       * Its type is given by \ref SegmentViewType.
+       *
+       * \par Example
+       * \include Algorithms/Segments/SegmentsExample_CSR_forSegments.cpp
+       * \par Output
+       * \include SegmentsExample_CSR_forSegments.out
+       */
       template< typename Function >
-      void forSegments( IndexType begin, IndexType end, Function&& f ) const;
+      void forSegments( IndexType begin, IndexType end, Function&& function ) const;
 
+      /**
+       * \brief Call \ref TNL::Algorithms::Segments::CSR::forSegments for all segments.
+       *
+       * See \ref TNL::Algorithms::Segments::CSR::forSegments for more details.
+       */
       template< typename Function >
-      void forEachSegment( Function&& f ) const;
+      void forAllSegments( Function&& function ) const;
 
-      /***
-       * \brief Go over all segments and perform a reduction in each of them.
+      /**
+       * \brief Call \ref TNL::Algorithms::Segments::CSR::forSegments sequentially for particular segments.
+       *
+       * With this method, the given segments are processed sequentially one-by-one. This is useful, for example,
+       * for printing of segment-based data structures or for debugging purposes.
+       *
+       * \param begin defines beginning of an interval [ \e begin, \e end ) of segments on
+       *    elements of which we want to apply the lambda function.
+       * \param end defines end of an interval [ \e begin, \e end ) of segments on
+       *    elements of which we want to apply the lambda function.
+       * \param function is the lambda function to be applied on the elements of the segments.
+       *
+       * See \ref TNL::Algorithms::Segments::CSR::forSegments for more details.
+       *
+       * \par Example
+       * \include Algorithms/Segments/SegmentsExample_CSR_sequentialForSegments.cpp
+       * \par Output
+       * \include SegmentsExample_CSR_sequentialForSegments.out
+       */
+      template< typename Function >
+      void sequentialForSegments( IndexType begin, IndexType end, Function&& function ) const;
+
+      /**
+       * \brief Call \ref TNL::Algorithms::Segments::CSR::sequentialForSegments for all segments.
+       *
+       * See \ref TNL::Algorithms::Segments::CSR::sequentialForSegments for more details.
+       */
+      template< typename Function >
+      void sequentialForAllSegments( Function&& function ) const;
+
+      /**
+       * \brief Compute reduction in each segment.
+       *
+       * \tparam Fetch is type of lambda function for data fetching.
+       * \tparam Reduce is a reduction operation.
+       * \tparam Keep is lambda function for storing results from particular segments.
+       *
+       * \param begin defines beginning of an interval [ \e begin, \e end ) of segments in
+       *    which we want to perform the reduction.
+       * \param end defines end of an interval [ \e begin, \e end ) of segments in
+       *    which we want to perform the reduction.
+       * \param fetch is a lambda function for fetching of data. It is supposed to have one of the
+       *  following forms:
+       * 1. Full form
+       *  ```
+       *  auto fetch = [=] __cuda_callable__ ( IndexType segmentIdx, IndexType localIdx, IndexType globalIdx, bool& compute ) { ... }
+       *  ```
+       * 2. Brief form
+       * ```
+       * auto fetch = [=] __cuda_callable__ ( IndexType globalIdx, bool& compute ) { ... }
+       * ```
+       * where for both variants \e segmentIdx is segment index, \e localIdx is a rank of element in the segment, \e globalIdx is index of the element
+       * in related container and \e compute is a boolean variable which serves for stopping the reduction if it is set to \e false. It is, however,
+       * only a hint and the real behaviour depends on the type of kernel used for the reduction.
+       * Some kernels are optimized so that they can be significantly faster with the brief variant of the \e fetch lambda function.
+       * \param reduce is a lambda function representing the reduction operation. It is supposed to be defined as:
+       *
+       * ```
+       * auto reduce = [=] __cuda_callable__ ( const Value& a, const Value& b ) -> Value { ... }
+       * ```
+       *
+       * where \e a and \e b are values to be reduced and the lambda function returns result of the reduction.
+       * \param keep is a lambda function for saving results from particular segments. It is supposed to be defined as:
+       *
+       * ```
+       * auto keep = [=] __cuda_callable__ ( IndexType segmentIdx, const Value& value ) { ... }
+       * ```
+       *
+       * where \e segmentIdx is an index of the segment and \e value is the result of the reduction in given segment to be stored.
+       *
+       * \par Example
+       * \include Algorithms/Segments/SegmentsExample_CSR_reduceSegments.cpp
+       * \par Output
+       * \include SegmentsExample_CSR_reduceSegments.out
        */
-      template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args >
-      void segmentsReduction( IndexType first, IndexType last, Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const;
+      template< typename Fetch, typename Reduce, typename Keep, typename Value >
+      void reduceSegments( IndexType begin, IndexType end, Fetch& fetch, const Reduce& reduce, Keep& keep, const Value& zero ) const;
 
-      template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args >
-      void allReduction( Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const;
+      /**
+       * \brief Call \ref TNL::Algorithms::Segments::CSR::reduceSegments for all segments.
+       *
+       * See \ref TNL::Algorithms::Segments::CSR::reduceSegments for more details.
+       */
+      template< typename Fetch, typename Reduce, typename Keep, typename Value >
+      void reduceAllSegments( Fetch& fetch, const Reduce& reduce, Keep& keep, const Value& zero ) const;
 
-      CSR& operator=( const CSR& rhsSegments ) = default;
+      /**
+       * \brief Assignment operator.
+       *
+       * It makes a deep copy of the source segments.
+       *
+       * \param source are the CSR segments to be assigned.
+       * \return reference to this instance.
+       */
+      CSR& operator=( const CSR& source ) = default;
 
+      /**
+       * \brief Assignment operator with CSR segments with different template parameters.
+       *
+       * It makes a deep copy of the source segments.
+       *
+       * \tparam Device_ is device type of the source segments.
+       * \tparam Index_ is the index type of the source segments.
+       * \tparam Kernel_ is the kernel type of the source segments.
+       * \tparam IndexAllocator_ is the index allocator of the source segments.
+       * \param source is the source segments object.
+       * \return reference to this instance.
+       */
       template< typename Device_, typename Index_, typename Kernel_, typename IndexAllocator_ >
       CSR& operator=( const CSR< Device_, Index_, Kernel_, IndexAllocator_ >& source );
 
+      /**
+       * \brief Method for saving the segments to a file in a binary form.
+       *
+       * \param file is the target file.
+       */
       void save( File& file ) const;
 
+      /**
+       * \brief Method for loading the segments from a file in a binary form.
+       *
+       * \param file is the source file.
+       */
       void load( File& file );
 
+      /**
+       * \brief Return simple proxy object for insertion to output stream.
+       *
+       * The proxy object serves for wrapping segments with lambda function mediating access to data managed by the segments.
+       *
+       * \tparam Fetch is type of lambda function for data access.
+       * \param fetch is an instance of lambda function for data access. It is supposed to be defined as
+       *
+       * ```
+       * auto fetch = [=] __cuda_callable__ ( IndexType globalIdx ) -> ValueType { return data_view[ globalIdx ]; };
+       * ```
+       * \return Proxy object for insertion to output stream.
+       *
+       * \par Example
+       * \include Algorithms/Segments/SegmentsPrintingExample-2.cpp
+       * \par Output
+       * \include SegmentsPrintingExample-2.out
+       */
+      template< typename Fetch >
+      SegmentsPrinter< CSR, Fetch > print( Fetch&& fetch ) const;
+
+      KernelType& getKernel() { return kernel; }
+
+      const KernelType& getKernel() const { return kernel; }
+
    protected:
 
-      OffsetsHolder offsets;
+      OffsetsContainer offsets;
 
       KernelType kernel;
 };
 
+/**
+ * \brief Insertion operator of CSR segments to output stream.
+ *
+ * \tparam Device is the device type of the source segments.
+ * \tparam Index is the index type of the source segments.
+ * \tparam Kernel is kernel type of the source segments.
+ * \tparam IndexAllocator is the index allocator of the source segments.
+ * \param str is the output stream.
+ * \param segments are the source segments.
+ * \return reference to the output stream.
+ */
+template< typename Device,
+          typename Index,
+          typename Kernel,
+          typename IndexAllocator >
+std::ostream& operator<<( std::ostream& str, const CSR< Device, Index, Kernel, IndexAllocator >& segments ) { return printSegments( segments, str ); }
+
 template< typename Device,
           typename Index,
           typename IndexAllocator = typename Allocators::Default< Device >::template Allocator< Index > >
@@ -160,6 +550,11 @@ template< typename Device,
           typename IndexAllocator = typename Allocators::Default< Device >::template Allocator< Index > >
 using CSRHybrid = CSR< Device, Index, CSRHybridKernel< Index, Device >, IndexAllocator >;
 
+template< typename Device,
+          typename Index,
+          typename IndexAllocator = typename Allocators::Default< Device >::template Allocator< Index > >
+using CSRLight = CSR< Device, Index, CSRLightKernel< Index, Device >, IndexAllocator >;
+
 template< typename Device,
           typename Index,
           typename IndexAllocator = typename Allocators::Default< Device >::template Allocator< Index > >
@@ -170,7 +565,6 @@ template< typename Device,
           typename IndexAllocator = typename Allocators::Default< Device >::template Allocator< Index > >
 using CSRDefault = CSRScalar< Device, Index, IndexAllocator >;
 
-
       } // namespace Segments
    }  // namespace Algorithms
 } // namespace TNL
diff --git a/src/TNL/Algorithms/Segments/CSR.hpp b/src/TNL/Algorithms/Segments/CSR.hpp
index 44f9aa799cb3ce29a6f2b35ef8b78b9030b663ba..0bd2d33ca628db43a2e103d0f8d6128e424d73c9 100644
--- a/src/TNL/Algorithms/Segments/CSR.hpp
+++ b/src/TNL/Algorithms/Segments/CSR.hpp
@@ -33,12 +33,24 @@ template< typename Device,
           typename Index,
           typename Kernel,
           typename IndexAllocator >
+   template< typename SizesContainer >
 CSR< Device, Index, Kernel, IndexAllocator >::
-CSR( const SegmentsSizes& segmentsSizes )
+CSR( const SizesContainer& segmentsSizes )
 {
    this->setSegmentsSizes( segmentsSizes );
 }
 
+template< typename Device,
+          typename Index,
+          typename Kernel,
+          typename IndexAllocator >
+   template< typename ListIndex >
+CSR< Device, Index, Kernel, IndexAllocator >::
+CSR( const std::initializer_list< ListIndex >& segmentsSizes )
+{
+   this->setSegmentsSizes( Containers::Vector< IndexType, DeviceType, IndexType >( segmentsSizes ) );
+}
+
 template< typename Device,
           typename Index,
           typename Kernel,
@@ -67,7 +79,8 @@ CSR< Device, Index, Kernel, IndexAllocator >::
 getSerializationType()
 {
    return "CSR< [any_device], " +
-      TNL::getSerializationType< IndexType >() +
+      TNL::getSerializationType< IndexType >() + ", " +
+      // FIXME: the serialized data do not depend on the kernel type so it should not be in the serialization type
       TNL::getSerializationType< KernelType >() + " >";
 }
 
@@ -207,7 +220,7 @@ template< typename Device,
           typename IndexAllocator >
 auto
 CSR< Device, Index, Kernel, IndexAllocator >::
-getOffsets() const -> const OffsetsHolder&
+getOffsets() const -> const OffsetsContainer&
 {
    return this->offsets;
 }
@@ -218,7 +231,7 @@ template< typename Device,
           typename IndexAllocator >
 auto
 CSR< Device, Index, Kernel, IndexAllocator >::
-getOffsets() -> OffsetsHolder&
+getOffsets() -> OffsetsContainer&
 {
    return this->offsets;
 }
@@ -266,33 +279,57 @@ template< typename Device,
    template< typename Function >
 void
 CSR< Device, Index, Kernel, IndexAllocator >::
-forEachSegment( Function&& f ) const
+forAllSegments( Function&& f ) const
 {
-   this->getConstView().forEachSegment( f );
+   this->getConstView().forAllSegments( f );
 }
 
 template< typename Device,
           typename Index,
           typename Kernel,
           typename IndexAllocator >
-   template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args >
+   template< typename Function >
+void
+CSR< Device, Index, Kernel, IndexAllocator >::
+sequentialForSegments( IndexType begin, IndexType end, Function&& f ) const
+{
+   this->getConstView().sequentialForSegments( begin, end, f );
+}
+
+template< typename Device,
+          typename Index,
+          typename Kernel,
+          typename IndexAllocator >
+   template< typename Function >
 void
 CSR< Device, Index, Kernel, IndexAllocator >::
-segmentsReduction( IndexType first, IndexType last, Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const
+sequentialForAllSegments( Function&& f ) const
 {
-   this->getConstView().segmentsReduction( first, last, fetch, reduction, keeper, zero, args... );
+   this->getConstView().sequentialForAllSegments( f );
 }
 
 template< typename Device,
           typename Index,
           typename Kernel,
           typename IndexAllocator >
-   template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args >
+   template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real >
 void
 CSR< Device, Index, Kernel, IndexAllocator >::
-allReduction( Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const
+reduceSegments( IndexType first, IndexType last, Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero ) const
 {
-   this->segmentsReduction( 0, this->getSegmentsCount(), fetch, reduction, keeper, zero, args... );
+   this->getConstView().reduceSegments( first, last, fetch, reduction, keeper, zero );
+}
+
+template< typename Device,
+          typename Index,
+          typename Kernel,
+          typename IndexAllocator >
+   template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real >
+void
+CSR< Device, Index, Kernel, IndexAllocator >::
+reduceAllSegments( Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero ) const
+{
+   this->reduceSegments( 0, this->getSegmentsCount(), fetch, reduction, keeper, zero );
 }
 
 template< typename Device,
@@ -332,6 +369,18 @@ load( File& file )
    this->kernel.init( this->offsets );
 }
 
+template< typename Device,
+          typename Index,
+          typename Kernel,
+          typename IndexAllocator >
+      template< typename Fetch >
+auto
+CSR< Device, Index, Kernel, IndexAllocator >::
+print( Fetch&& fetch ) const -> SegmentsPrinter< CSR, Fetch >
+{
+   return SegmentsPrinter< CSR, Fetch >( *this, fetch );
+}
+
       } // namespace Segments
    }  // namespace Algorithms
 } // namespace TNL
diff --git a/src/TNL/Algorithms/Segments/CSRHybridKernel.hpp b/src/TNL/Algorithms/Segments/CSRHybridKernel.hpp
deleted file mode 100644
index 90505358e7246f350157d4ee90dfebd4b470c432..0000000000000000000000000000000000000000
--- a/src/TNL/Algorithms/Segments/CSRHybridKernel.hpp
+++ /dev/null
@@ -1,204 +0,0 @@
-/***************************************************************************
-                          CSRHybridKernel.hpp -  description
-                             -------------------
-    begin                : Jan 23, 2021 -> Joe Biden inauguration
-    copyright            : (C) 2021 by Tomas Oberhuber
-    email                : tomas.oberhuber@fjfi.cvut.cz
- ***************************************************************************/
-
-/* See Copyright Notice in tnl/Copyright */
-
-#pragma once
-
-#include <TNL/Assert.h>
-#include <TNL/Cuda/LaunchHelpers.h>
-#include <TNL/Containers/VectorView.h>
-#include <TNL/Algorithms/ParallelFor.h>
-#include <TNL/Algorithms/Segments/detail/LambdaAdapter.h>
-#include <TNL/Algorithms/Segments/CSRHybridKernel.h>
-
-namespace TNL {
-   namespace Algorithms {
-      namespace Segments {
-
-#ifdef HAVE_CUDA
-template< int ThreadsPerSegment,
-          typename Offsets,
-          typename Index,
-          typename Fetch,
-          typename Reduction,
-          typename ResultKeeper,
-          typename Real,
-          typename... Args >
-__global__
-void segmentsReductionCSRHybridKernel(
-    int gridIdx,
-    const Offsets offsets,
-    Index first,
-    Index last,
-    Fetch fetch,
-    const Reduction reduce,
-    ResultKeeper keep,
-    const Real zero,
-    Args... args )
-{
-    /***
-     * We map one warp to each segment
-     */
-    const Index segmentIdx =  TNL::Cuda::getGlobalThreadIdx( gridIdx ) / ThreadsPerSegment + first;
-    if( segmentIdx >= last )
-        return;
-
-    const int laneIdx = threadIdx.x & ( ThreadsPerSegment - 1 ); // & is cheaper than %
-    Index endIdx = offsets[ segmentIdx + 1] ;
-
-    Index localIdx( laneIdx );
-    Real aux = zero;
-    bool compute( true );
-    for( Index globalIdx = offsets[ segmentIdx ] + localIdx; globalIdx < endIdx; globalIdx += ThreadsPerSegment )
-    {
-      aux = reduce( aux, detail::FetchLambdaAdapter< Index, Fetch >::call( fetch, segmentIdx, localIdx, globalIdx, compute ) );
-      localIdx += TNL::Cuda::getWarpSize();
-    }
-
-    /****
-     * Reduction in each segment.
-     */
-    if( ThreadsPerSegment == 32 )
-        aux = reduce( aux, __shfl_down_sync( 0xFFFFFFFF, aux, 16 ) );
-    if( ThreadsPerSegment >= 16 )
-        aux = reduce( aux, __shfl_down_sync( 0xFFFFFFFF, aux,  8 ) );
-    if( ThreadsPerSegment >= 8 )
-        aux = reduce( aux, __shfl_down_sync( 0xFFFFFFFF, aux,  4 ) );
-    if( ThreadsPerSegment >= 4 )
-        aux = reduce( aux, __shfl_down_sync( 0xFFFFFFFF, aux,  2 ) );
-    if( ThreadsPerSegment >= 2 )
-        aux = reduce( aux, __shfl_down_sync( 0xFFFFFFFF, aux,  1 ) );
-
-    if( laneIdx == 0 )
-        keep( segmentIdx, aux );
-}
-#endif
-
-
-
-template< typename Index,
-          typename Device >
-    template< typename Offsets >
-void
-CSRHybridKernel< Index, Device >::
-init( const Offsets& offsets )
-{
-    const Index segmentsCount = offsets.getSize() - 1;
-    const Index elementsInSegment = std::ceil( ( double ) offsets.getElement( segmentsCount ) / ( double ) segmentsCount );
-    this->threadsPerSegment = TNL::min( std::pow( 2, std::ceil( std::log2( elementsInSegment ) ) ), TNL::Cuda::getWarpSize() );
-    TNL_ASSERT_GE( threadsPerSegment, 0, "" );
-    TNL_ASSERT_LE( threadsPerSegment, 32, "" );
-}
-
-template< typename Index,
-          typename Device >
-void
-CSRHybridKernel< Index, Device >::
-reset()
-{
-    this->threadsPerSegment = 0;
-}
-
-template< typename Index,
-          typename Device >
-auto
-CSRHybridKernel< Index, Device >::
-getView() -> ViewType
-{
-    return *this;
-}
-
-template< typename Index,
-          typename Device >
-TNL::String
-CSRHybridKernel< Index, Device >::
-getKernelType()
-{
-    return "Hybrid";
-}
-
-template< typename Index,
-          typename Device >
-auto
-CSRHybridKernel< Index, Device >::
-getConstView() const -> ConstViewType
-{
-    return *this;
-};
-
-
-template< typename Index,
-          typename Device >
-    template< typename OffsetsView,
-              typename Fetch,
-              typename Reduction,
-              typename ResultKeeper,
-              typename Real,
-              typename... Args >
-void
-CSRHybridKernel< Index, Device >::
-segmentsReduction( const OffsetsView& offsets,
-                         Index first,
-                         Index last,
-                         Fetch& fetch,
-                         const Reduction& reduction,
-                         ResultKeeper& keeper,
-                         const Real& zero,
-                         Args... args ) const
-{
-    TNL_ASSERT_GE( this->threadsPerSegment, 0, "" );
-    TNL_ASSERT_LE( this->threadsPerSegment, 32, "" );
-
-#ifdef HAVE_CUDA
-    const size_t threadsCount = this->threadsPerSegment * ( last - first );
-    dim3 blocksCount, gridsCount, blockSize( 256 );
-    TNL::Cuda::setupThreads( blockSize, blocksCount, gridsCount, threadsCount );
-    //std::cerr << " this->threadsPerSegment = " << this->threadsPerSegment << " offsets = " << offsets << std::endl;
-    for( unsigned int gridIdx = 0; gridIdx < gridsCount.x; gridIdx ++ )
-    {
-        dim3 gridSize;
-        TNL::Cuda::setupGrid( blocksCount, gridsCount, gridIdx, gridSize );
-        switch( this->threadsPerSegment )
-        {
-            case 0:      // this means zero/empty matrix
-                break;
-            case 1:
-                segmentsReductionCSRHybridKernel<  1, OffsetsView, Index, Fetch, Reduction, ResultKeeper, Real, Args... ><<< gridSize, blockSize >>>(
-                    gridIdx, offsets, first, last, fetch, reduction, keeper, zero, args... );
-                    break;
-            case 2:
-                segmentsReductionCSRHybridKernel<  2, OffsetsView, Index, Fetch, Reduction, ResultKeeper, Real, Args... ><<< gridSize, blockSize >>>(
-                    gridIdx, offsets, first, last, fetch, reduction, keeper, zero, args... );
-                    break;
-            case 4:
-                segmentsReductionCSRHybridKernel<  4, OffsetsView, Index, Fetch, Reduction, ResultKeeper, Real, Args... ><<< gridSize, blockSize >>>(
-                    gridIdx, offsets, first, last, fetch, reduction, keeper, zero, args... );
-                    break;
-            case 8:
-                segmentsReductionCSRHybridKernel<  8, OffsetsView, Index, Fetch, Reduction, ResultKeeper, Real, Args... ><<< gridSize, blockSize >>>(
-                    gridIdx, offsets, first, last, fetch, reduction, keeper, zero, args... );
-                    break;
-            case 16:
-                segmentsReductionCSRHybridKernel< 16, OffsetsView, Index, Fetch, Reduction, ResultKeeper, Real, Args... ><<< gridSize, blockSize >>>(
-                    gridIdx, offsets, first, last, fetch, reduction, keeper, zero, args... );
-                    break;
-            case 32:
-                segmentsReductionCSRHybridKernel< 32, OffsetsView, Index, Fetch, Reduction, ResultKeeper, Real, Args... ><<< gridSize, blockSize >>>(
-                    gridIdx, offsets, first, last, fetch, reduction, keeper, zero, args... );
-                    break;
-            default:
-                throw std::runtime_error( std::string( "Wrong value of threadsPerSegment: " ) + std::to_string( this->threadsPerSegment ) );
-        }
-    }
-#endif
-}
-
-      } // namespace Segments
-   }  // namespace Algorithms
-} // namespace TNL
diff --git a/src/TNL/Algorithms/Segments/CSRScalarKernel.hpp b/src/TNL/Algorithms/Segments/CSRScalarKernel.hpp
deleted file mode 100644
index dd05fee201cd6360c65b1fd4311c5f9616a88c6f..0000000000000000000000000000000000000000
--- a/src/TNL/Algorithms/Segments/CSRScalarKernel.hpp
+++ /dev/null
@@ -1,121 +0,0 @@
-/***************************************************************************
-                          CSRScalarKernel.h -  description
-                             -------------------
-    begin                : Jan 23, 2021 -> Joe Biden inauguration
-    copyright            : (C) 2021 by Tomas Oberhuber
-    email                : tomas.oberhuber@fjfi.cvut.cz
- ***************************************************************************/
-
-/* See Copyright Notice in tnl/Copyright */
-
-#pragma once
-
-#include <TNL/Assert.h>
-#include <TNL/Cuda/LaunchHelpers.h>
-#include <TNL/Containers/VectorView.h>
-#include <TNL/Algorithms/ParallelFor.h>
-#include <TNL/Algorithms/Segments/CSRScalarKernel.h>
-#include <TNL/Algorithms/Segments/detail/LambdaAdapter.h>
-
-namespace TNL {
-   namespace Algorithms {
-      namespace Segments {
-
-template< typename Index,
-          typename Device >
-    template< typename Offsets >
-void
-CSRScalarKernel< Index, Device >::
-init( const Offsets& offsets )
-{
-}
-
-template< typename Index,
-          typename Device >
-void
-CSRScalarKernel< Index, Device >::
-reset()
-{
-}
-
-template< typename Index,
-          typename Device >
-auto
-CSRScalarKernel< Index, Device >::
-getView() -> ViewType
-{
-    return *this;
-}
-
-template< typename Index,
-          typename Device >
-auto
-CSRScalarKernel< Index, Device >::
-getConstView() const -> ConstViewType
-{
-    return *this;
-};
-
-template< typename Index,
-          typename Device >
-TNL::String
-CSRScalarKernel< Index, Device >::
-getKernelType()
-{
-    return "Scalar";
-}
-
-template< typename Index,
-          typename Device >
-    template< typename OffsetsView,
-              typename Fetch,
-              typename Reduction,
-              typename ResultKeeper,
-              typename Real,
-              typename... Args >
-void
-CSRScalarKernel< Index, Device >::
-segmentsReduction( const OffsetsView& offsets,
-                   Index first,
-                   Index last,
-                   Fetch& fetch,
-                   const Reduction& reduction,
-                   ResultKeeper& keeper,
-                   const Real& zero,
-                   Args... args )
-{
-    auto l = [=] __cuda_callable__ ( const IndexType segmentIdx, Args... args ) mutable {
-        const IndexType begin = offsets[ segmentIdx ];
-        const IndexType end = offsets[ segmentIdx + 1 ];
-        Real aux( zero );
-        IndexType localIdx( 0 );
-        bool compute( true );
-        for( IndexType globalIdx = begin; globalIdx < end && compute; globalIdx++  )
-            aux = reduction( aux, detail::FetchLambdaAdapter< IndexType, Fetch >::call( fetch, segmentIdx, localIdx++, globalIdx, compute ) );
-        keeper( segmentIdx, aux );
-    };
-
-     if( std::is_same< DeviceType, TNL::Devices::Host >::value )
-    {
-#ifdef HAVE_OPENMP
-        #pragma omp parallel for firstprivate( l ) schedule( dynamic, 100 ), if( Devices::Host::isOMPEnabled() )
-#endif
-        for( Index segmentIdx = first; segmentIdx < last; segmentIdx ++ )
-            l( segmentIdx, args... );
-        /*{
-            const IndexType begin = offsets[ segmentIdx ];
-            const IndexType end = offsets[ segmentIdx + 1 ];
-            Real aux( zero );
-            IndexType localIdx( 0 );
-            bool compute( true );
-            for( IndexType globalIdx = begin; globalIdx < end && compute; globalIdx++  )
-                aux = reduction( aux, detail::FetchLambdaAdapter< IndexType, Fetch >::call( fetch, segmentIdx, localIdx++, globalIdx, compute ) );
-            keeper( segmentIdx, aux );
-        }*/
-    }
-    else
-        Algorithms::ParallelFor< Device >::exec( first, last, l, args... );
-}
-      } // namespace Segments
-   }  // namespace Algorithms
-} // namespace TNL
diff --git a/src/TNL/Algorithms/Segments/CSRView.h b/src/TNL/Algorithms/Segments/CSRView.h
index cd9e44a2a323f1185363cdb46dadfb69a74d8566..b593dc46772a4f86e23db44abfb3529df9bed328 100644
--- a/src/TNL/Algorithms/Segments/CSRView.h
+++ b/src/TNL/Algorithms/Segments/CSRView.h
@@ -14,10 +14,12 @@
 
 #include <TNL/Containers/Vector.h>
 #include <TNL/Algorithms/Segments/SegmentView.h>
-#include <TNL/Algorithms/Segments/CSRScalarKernel.h>
-#include <TNL/Algorithms/Segments/CSRVectorKernel.h>
-#include <TNL/Algorithms/Segments/CSRHybridKernel.h>
-#include <TNL/Algorithms/Segments/CSRAdaptiveKernel.h>
+#include <TNL/Algorithms/Segments/Kernels/CSRScalarKernel.h>
+#include <TNL/Algorithms/Segments/Kernels/CSRVectorKernel.h>
+#include <TNL/Algorithms/Segments/Kernels/CSRHybridKernel.h>
+#include <TNL/Algorithms/Segments/Kernels/CSRLightKernel.h>
+#include <TNL/Algorithms/Segments/Kernels/CSRAdaptiveKernel.h>
+#include <TNL/Algorithms/Segments/SegmentsPrinting.h>
 
 namespace TNL {
    namespace Algorithms {
@@ -51,13 +53,13 @@ class CSRView
       CSRView( const OffsetsView& offsets, const KernelView& kernel );
 
       __cuda_callable__
-      CSRView( const OffsetsView&& offsets, const KernelView&& kernel );
+      CSRView( OffsetsView&& offsets, KernelView&& kernel );
 
       __cuda_callable__
       CSRView( const CSRView& csr_view );
 
       __cuda_callable__
-      CSRView( const CSRView&& csr_view );
+      CSRView( CSRView&& csr_view );
 
       static String getSerializationType();
 
@@ -115,16 +117,22 @@ class CSRView
       void forSegments( IndexType begin, IndexType end, Function&& f ) const;
 
       template< typename Function >
-      void forEachSegment( Function&& f ) const;
+      void forAllSegments( Function&& f ) const;
+
+      template< typename Function >
+      void sequentialForSegments( IndexType begin, IndexType end, Function&& f ) const;
+
+      template< typename Function >
+      void sequentialForAllSegments( Function&& f ) const;
 
       /***
        * \brief Go over all segments and perform a reduction in each of them.
        */
-      template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args >
-      void segmentsReduction( IndexType first, IndexType last, Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const;
+      template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real >
+      void reduceSegments( IndexType first, IndexType last, Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero ) const;
 
-      template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args >
-      void allReduction( Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const;
+      template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real >
+      void reduceAllSegments( Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero ) const;
 
       CSRView& operator=( const CSRView& view );
 
@@ -132,6 +140,13 @@ class CSRView
 
       void load( File& file );
 
+      template< typename Fetch >
+      SegmentsPrinter< CSRView, Fetch > print( Fetch&& fetch ) const;
+
+      KernelType& getKernel() { return kernel; }
+
+      const KernelType& getKernel() const { return kernel; }
+
    protected:
 
       OffsetsView offsets;
@@ -139,6 +154,12 @@ class CSRView
       KernelView kernel;
 };
 
+// Insertion operator of a CSR segments view to an output stream; passes ( segments, str ) like the CSR overload in CSR.h.
+template< typename Device,
+          typename Index,
+          typename Kernel >
+std::ostream& operator<<( std::ostream& str, const CSRView< Device, Index, Kernel >& segments ) { return printSegments( segments, str ); }
+
 template< typename Device,
           typename Index >
 using CSRViewScalar = CSRView< Device, Index, CSRScalarKernel< Index, Device > >;
@@ -147,9 +168,14 @@ template< typename Device,
           typename Index >
 using CSRViewVector = CSRView< Device, Index, CSRVectorKernel< Index, Device > >;
 
+template< typename Device,
+          typename Index,
+          int ThreadsInBlock = 256 >
+using CSRViewHybrid = CSRView< Device, Index, CSRHybridKernel< Index, Device, ThreadsInBlock > >;
+
 template< typename Device,
           typename Index >
-using CSRViewHybrid = CSRView< Device, Index, CSRHybridKernel< Index, Device > >;
+using CSRViewLight = CSRView< Device, Index, CSRLightKernel< Index, Device > >;
 
 template< typename Device,
           typename Index >
diff --git a/src/TNL/Algorithms/Segments/CSRView.hpp b/src/TNL/Algorithms/Segments/CSRView.hpp
index 8c9f1e78944698cc7bc4e74bb123915f0ac41730..7aac457afac9d16110e32f22821d060aa118138c 100644
--- a/src/TNL/Algorithms/Segments/CSRView.hpp
+++ b/src/TNL/Algorithms/Segments/CSRView.hpp
@@ -46,8 +46,8 @@ template< typename Device,
           typename Kernel >
 __cuda_callable__
 CSRView< Device, Index, Kernel >::
-CSRView( const OffsetsView&& offsets_view,
-         const KernelView&& kernel_view )
+CSRView( OffsetsView&& offsets_view,
+         KernelView&& kernel_view )
    : offsets( std::move( offsets_view ) ), kernel( std::move( kernel_view ) )
 {
 }
@@ -67,7 +67,7 @@ template< typename Device,
           typename Kernel >
 __cuda_callable__
 CSRView< Device, Index, Kernel >::
-CSRView( const CSRView&& csr_view )
+CSRView( CSRView&& csr_view )
    : offsets( std::move( csr_view.offsets ) ), kernel( std::move( csr_view.kernel ) )
 {
 }
@@ -80,7 +80,8 @@ CSRView< Device, Index, Kernel >::
 getSerializationType()
 {
    return "CSR< [any_device], " +
-      TNL::getSerializationType< IndexType >() +
+      TNL::getSerializationType< IndexType >() + ", " +
+      // FIXME: the serialized data do not depend on the kernel type, so it should not be in the serialization type
       TNL::getSerializationType< KernelType >() + " >";
 }
 
@@ -193,9 +194,8 @@ forElements( IndexType begin, IndexType end, Function&& f ) const
       const IndexType begin = offsetsView[ segmentIdx ];
       const IndexType end = offsetsView[ segmentIdx + 1 ];
       IndexType localIdx( 0 );
-      bool compute( true );
-      for( IndexType globalIdx = begin; globalIdx < end && compute; globalIdx++  )
-         f( segmentIdx, localIdx++, globalIdx, compute );
+      for( IndexType globalIdx = begin; globalIdx < end; globalIdx++  )
+         f( segmentIdx, localIdx++, globalIdx );
    };
    Algorithms::ParallelFor< Device >::exec( begin, end, l );
 }
@@ -233,7 +233,7 @@ template< typename Device,
    template< typename Function >
 void
 CSRView< Device, Index, Kernel >::
-forEachSegment( Function&& f ) const
+forAllSegments( Function&& f ) const
 {
    this->forSegments( 0, this->getSegmentsCount(), f );
 }
@@ -241,26 +241,49 @@ forEachSegment( Function&& f ) const
 template< typename Device,
           typename Index,
           typename Kernel >
-   template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args >
+   template< typename Function >
+void
+CSRView< Device, Index, Kernel >::
+sequentialForSegments( IndexType begin, IndexType end, Function&& function ) const
+{
+   for( IndexType i = begin; i < end; i++ )
+      forSegments( i, i + 1, function );
+}
+
+template< typename Device,
+          typename Index,
+          typename Kernel >
+   template< typename Function >
 void
 CSRView< Device, Index, Kernel >::
-segmentsReduction( IndexType first, IndexType last, Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const
+sequentialForAllSegments( Function&& f ) const
+{
+   this->sequentialForSegments( 0, this->getSegmentsCount(), f );
+}
+
+template< typename Device,
+          typename Index,
+          typename Kernel >
+   template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real >
+void
+CSRView< Device, Index, Kernel >::
+reduceSegments( IndexType first, IndexType last, Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero ) const
 {
    if( std::is_same< DeviceType, TNL::Devices::Host >::value )
-      TNL::Algorithms::Segments::CSRScalarKernel< IndexType, DeviceType >::segmentsReduction( offsets, first, last, fetch, reduction, keeper, zero, args... );
+      TNL::Algorithms::Segments::CSRScalarKernel< IndexType, DeviceType >::reduceSegments( offsets, first, last, fetch, reduction, keeper, zero );
    else
-      kernel.segmentsReduction( offsets, first, last, fetch, reduction, keeper, zero, args... );
+      kernel.reduceSegments( offsets, first, last, fetch, reduction, keeper, zero );
 }
 
 template< typename Device,
           typename Index,
           typename Kernel >
-   template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args >
+   template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real >
 void
 CSRView< Device, Index, Kernel >::
-allReduction( Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const
+reduceAllSegments( Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero ) const
 {
-   this->segmentsReduction( 0, this->getSegmentsCount(), fetch, reduction, keeper, zero, args... );
+   this->reduceSegments( 0, this->getSegmentsCount(), fetch, reduction, keeper, zero );
 }
 
 template< typename Device,
@@ -296,6 +319,18 @@ load( File& file )
    this->kernel.init( this->offsets );
 }
 
+template< typename Device,
+          typename Index,
+          typename Kernel >
+      template< typename Fetch >
+auto
+CSRView< Device, Index, Kernel >::
+print( Fetch&& fetch ) const -> SegmentsPrinter< CSRView, Fetch >
+{
+   return SegmentsPrinter< CSRView, Fetch >( *this, fetch );
+}
+
+
       } // namespace Segments
    }  // namespace Containers
 } // namespace TNL
diff --git a/src/TNL/Algorithms/Segments/ChunkedEllpack.h b/src/TNL/Algorithms/Segments/ChunkedEllpack.h
index 5abb93b5a0ad2dd6027c46940c4b73cea6b0a227..1d4f9fabcdc53293d5240f14554209fab4902525 100644
--- a/src/TNL/Algorithms/Segments/ChunkedEllpack.h
+++ b/src/TNL/Algorithms/Segments/ChunkedEllpack.h
@@ -14,6 +14,7 @@
 #include <TNL/Containers/Vector.h>
 #include <TNL/Algorithms/Segments/ChunkedEllpackView.h>
 #include <TNL/Algorithms/Segments/SegmentView.h>
+#include <TNL/Algorithms/Segments/SegmentsPrinting.h>
 
 namespace TNL {
    namespace Algorithms {
@@ -29,12 +30,12 @@ class ChunkedEllpack
 
       using DeviceType = Device;
       using IndexType = std::remove_const_t< Index >;
-      using OffsetsHolder = Containers::Vector< Index, DeviceType, IndexType, IndexAllocator >;
+      using OffsetsContainer = Containers::Vector< Index, DeviceType, IndexType, IndexAllocator >;
       static constexpr ElementsOrganization getOrganization() { return Organization; }
       using ViewType = ChunkedEllpackView< Device, Index, Organization >;
       template< typename Device_, typename Index_ >
       using ViewTemplate = ChunkedEllpackView< Device_, Index_, Organization >;
-      using ConstViewType = ChunkedEllpackView< Device, std::add_const_t< IndexType >, Organization >;
+      using ConstViewType = typename ViewType::ConstViewType;
       using SegmentViewType = typename ViewType::SegmentViewType;
       using ChunkedEllpackSliceInfoType = typename ViewType::ChunkedEllpackSliceInfoType; // detail::ChunkedEllpackSliceInfo< IndexType >;
       //TODO: using ChunkedEllpackSliceInfoAllocator = typename IndexAllocatorType::retype< ChunkedEllpackSliceInfoType >;
@@ -45,7 +46,11 @@ class ChunkedEllpack
 
       ChunkedEllpack() = default;
 
-      ChunkedEllpack( const Containers::Vector< IndexType, DeviceType, IndexType >& sizes );
+      template< typename SizesContainer >
+      ChunkedEllpack( const SizesContainer& sizes );
+
+      template< typename ListIndex >
+      ChunkedEllpack( const std::initializer_list< ListIndex >& segmentsSizes );
 
       ChunkedEllpack( const ChunkedEllpack& segments );
 
@@ -68,7 +73,7 @@ class ChunkedEllpack
       /**
        * \brief Set sizes of particular segments.
        */
-      template< typename SizesHolder = OffsetsHolder >
+      template< typename SizesHolder = OffsetsContainer >
       void setSegmentsSizes( const SizesHolder& sizes );
 
       void reset();
@@ -106,16 +111,16 @@ class ChunkedEllpack
       void forSegments( IndexType begin, IndexType end, Function&& f ) const;
 
       template< typename Function >
-      void forEachSegment( Function&& f ) const;
+      void forAllSegments( Function&& f ) const;
 
       /***
        * \brief Go over all segments and perform a reduction in each of them.
        */
-      template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args >
-      void segmentsReduction( IndexType first, IndexType last, Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const;
+      template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real >
+      void reduceSegments( IndexType first, IndexType last, Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero ) const;
 
-      template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args >
-      void allReduction( Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const;
+      template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real >
+      void reduceAllSegments( Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero ) const;
 
       ChunkedEllpack& operator=( const ChunkedEllpack& source ) = default;
 
@@ -126,6 +131,9 @@ class ChunkedEllpack
 
       void load( File& file );
 
+      template< typename Fetch >
+      SegmentsPrinter< ChunkedEllpack, Fetch > print( Fetch&& fetch ) const;
+
       void printStructure( std::ostream& str ); // TODO const;
 
    protected:
@@ -146,19 +154,19 @@ class ChunkedEllpack
        * For each segment, this keeps index of the slice which contains the
        * segment.
        */
-      OffsetsHolder rowToSliceMapping;
+      OffsetsContainer rowToSliceMapping;
 
       /**
        * For each row, this keeps index of the first chunk within a slice.
        */
-      OffsetsHolder rowToChunkMapping;
+      OffsetsContainer rowToChunkMapping;
 
-      OffsetsHolder chunksToSegmentsMapping;
+      OffsetsContainer chunksToSegmentsMapping;
 
       /**
        * Keeps index of the first segment index.
        */
-      OffsetsHolder rowPointers;
+      OffsetsContainer rowPointers;
 
       ChunkedEllpackSliceInfoContainer slices;
 
@@ -168,6 +176,13 @@ class ChunkedEllpack
       friend class ChunkedEllpack;
 };
 
+template <typename Device,
+          typename Index,
+          typename IndexAllocator,
+          ElementsOrganization Organization >
+std::ostream& operator<<( std::ostream& str, const ChunkedEllpack< Device, Index, IndexAllocator, Organization >& segments ) { return printSegments( segments, str ); }
+
+
       } // namespace Segments
    }  // namespace Algorithms
 } // namespace TNL
diff --git a/src/TNL/Algorithms/Segments/ChunkedEllpack.hpp b/src/TNL/Algorithms/Segments/ChunkedEllpack.hpp
index b4f60047bb128069d7c477ec43217cb6d2c8cc95..6218a451ce0e5e05581768fcc6c6fe12a90d0c60 100644
--- a/src/TNL/Algorithms/Segments/ChunkedEllpack.hpp
+++ b/src/TNL/Algorithms/Segments/ChunkedEllpack.hpp
@@ -24,10 +24,22 @@ template< typename Device,
           typename Index,
           typename IndexAllocator,
           ElementsOrganization Organization >
+   template< typename SizesContainer >
 ChunkedEllpack< Device, Index, IndexAllocator, Organization >::
-ChunkedEllpack( const Containers::Vector< IndexType, DeviceType, IndexType >& sizes )
+ChunkedEllpack( const SizesContainer& segmentsSizes )
 {
-   this->setSegmentsSizes( sizes );
+   this->setSegmentsSizes( segmentsSizes );
+}
+
+template< typename Device,
+          typename Index,
+          typename IndexAllocator,
+          ElementsOrganization Organization >
+   template< typename ListIndex >
+ChunkedEllpack< Device, Index, IndexAllocator, Organization >::
+ChunkedEllpack( const std::initializer_list< ListIndex >& segmentsSizes )
+{
+   this->setSegmentsSizes( Containers::Vector< IndexType, DeviceType, IndexType >( segmentsSizes ) );
 }
 
 template< typename Device,
@@ -76,6 +88,7 @@ String
 ChunkedEllpack< Device, Index, IndexAllocator, Organization >::
 getSerializationType()
 {
+   // FIXME: the serialized data DEPEND on the Organization parameter, so it should be reflected in the serialization type
    return "ChunkedEllpack< [any_device], " + TNL::getSerializationType< IndexType >() + " >";
 }
 
@@ -337,9 +350,9 @@ auto ChunkedEllpack< Device, Index, IndexAllocator, Organization >::
 getSegmentSize( const IndexType segmentIdx ) const -> IndexType
 {
    return detail::ChunkedEllpack< IndexType, DeviceType, Organization >::getSegmentSize(
-      rowToSliceMapping.getView(),
-      slices.getView(),
-      rowToChunkMapping.getView(),
+      rowToSliceMapping.getConstView(),
+      slices.getConstView(),
+      rowToChunkMapping.getConstView(),
       segmentIdx );
 }
 
@@ -431,33 +444,33 @@ template< typename Device,
    template< typename Function >
 void
 ChunkedEllpack< Device, Index, IndexAllocator, Organization >::
-forEachSegment( Function&& f ) const
+forAllSegments( Function&& f ) const
 {
-   this->getConstView().forEachSegment( f );
+   this->getConstView().forAllSegments( f );
 }
 
 template< typename Device,
           typename Index,
           typename IndexAllocator,
           ElementsOrganization Organization >
-   template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args >
+   template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real >
 void
 ChunkedEllpack< Device, Index, IndexAllocator, Organization >::
-segmentsReduction( IndexType first, IndexType last, Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const
+reduceSegments( IndexType first, IndexType last, Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero ) const
 {
-   this->getConstView().segmentsReduction( first, last, fetch, reduction, keeper, zero, args... );
+   this->getConstView().reduceSegments( first, last, fetch, reduction, keeper, zero );
 }
 
 template< typename Device,
           typename Index,
           typename IndexAllocator,
           ElementsOrganization Organization >
-   template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args >
+   template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real >
 void
 ChunkedEllpack< Device, Index, IndexAllocator, Organization >::
-allReduction( Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const
+reduceAllSegments( Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero ) const
 {
-   this->segmentsReduction( 0, this->getSegmentsCount(), fetch, reduction, keeper, zero, args... );
+   this->reduceSegments( 0, this->getSegmentsCount(), fetch, reduction, keeper, zero );
 }
 
 template< typename Device,
@@ -522,6 +535,18 @@ load( File& file )
    file.load( &this->numberOfSlices );
 }
 
+template< typename Device,
+          typename Index,
+          typename IndexAllocator,
+          ElementsOrganization Organization >
+      template< typename Fetch >
+auto
+ChunkedEllpack< Device, Index, IndexAllocator, Organization >::
+print( Fetch&& fetch ) const -> SegmentsPrinter< ChunkedEllpack, Fetch >
+{
+   return SegmentsPrinter< ChunkedEllpack, Fetch >( *this, fetch );
+}
+
 template< typename Device,
           typename Index,
           typename IndexAllocator,
diff --git a/src/TNL/Algorithms/Segments/ChunkedEllpackView.h b/src/TNL/Algorithms/Segments/ChunkedEllpackView.h
index f7211c21625fc147eb6db63a44298110b633e3aa..0ed8ed413fd8a7992f4428e7405459db3953c618 100644
--- a/src/TNL/Algorithms/Segments/ChunkedEllpackView.h
+++ b/src/TNL/Algorithms/Segments/ChunkedEllpackView.h
@@ -17,6 +17,7 @@
 #include <TNL/Algorithms/Segments/ElementsOrganization.h>
 #include <TNL/Algorithms/Segments/ChunkedEllpackSegmentView.h>
 #include <TNL/Algorithms/Segments/detail/ChunkedEllpack.h>
+#include <TNL/Algorithms/Segments/SegmentsPrinting.h>
 
 namespace TNL {
    namespace Algorithms {
@@ -32,7 +33,7 @@ class ChunkedEllpackView
 
       using DeviceType = Device;
       using IndexType = std::remove_const_t< Index >;
-      using OffsetsView = typename Containers::VectorView< IndexType, DeviceType, IndexType >;
+      using OffsetsView = typename Containers::VectorView< Index, DeviceType, IndexType >;
       using ConstOffsetsView = typename OffsetsView::ConstViewType;
       using ViewType = ChunkedEllpackView;
       template< typename Device_, typename Index_ >
@@ -41,7 +42,7 @@ class ChunkedEllpackView
       using SegmentViewType = ChunkedEllpackSegmentView< IndexType, Organization >;
       using ChunkedEllpackSliceInfoType = detail::ChunkedEllpackSliceInfo< IndexType >;
       using ChunkedEllpackSliceInfoAllocator = typename Allocators::Default< Device >::template Allocator< ChunkedEllpackSliceInfoType >;
-      using ChunkedEllpackSliceInfoContainer = Containers::Array< ChunkedEllpackSliceInfoType, DeviceType, IndexType, ChunkedEllpackSliceInfoAllocator >;
+      using ChunkedEllpackSliceInfoContainer = Containers::Array< typename TNL::copy_const< ChunkedEllpackSliceInfoType >::template from< Index >::type, DeviceType, IndexType, ChunkedEllpackSliceInfoAllocator >;
       using ChunkedEllpackSliceInfoContainerView = typename ChunkedEllpackSliceInfoContainer::ViewType;
 
       static constexpr bool havePadding() { return true; };
@@ -135,21 +136,24 @@ class ChunkedEllpackView
       void forSegments( IndexType begin, IndexType end, Function&& f ) const;
 
       template< typename Function >
-      void forEachSegment( Function&& f ) const;
+      void forAllSegments( Function&& f ) const;
 
       /***
        * \brief Go over all segments and perform a reduction in each of them.
        */
-      template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args >
-      void segmentsReduction( IndexType first, IndexType last, Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const;
+      template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real >
+      void reduceSegments( IndexType first, IndexType last, Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero ) const;
 
-      template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args >
-      void allReduction( Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const;
+      template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real >
+      void reduceAllSegments( Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero ) const;
 
       ChunkedEllpackView& operator=( const ChunkedEllpackView& view );
 
       void save( File& file ) const;
 
+      template< typename Fetch >
+      SegmentsPrinter< ChunkedEllpackView, Fetch > print( Fetch&& fetch ) const;
+
       void printStructure( std::ostream& str ) const;
 
    protected:
@@ -158,32 +162,28 @@ class ChunkedEllpackView
       template< typename Fetch,
                 typename Reduction,
                 typename ResultKeeper,
-                typename Real,
-                typename... Args >
+                typename Real >
       __device__
-      void segmentsReductionKernelWithAllParameters( IndexType gridIdx,
+      void reduceSegmentsKernelWithAllParameters( IndexType gridIdx,
                                                      IndexType first,
                                                      IndexType last,
                                                      Fetch fetch,
                                                      Reduction reduction,
                                                      ResultKeeper keeper,
-                                                     Real zero,
-                                                     Args... args ) const;
+                                                     Real zero ) const;
 
       template< typename Fetch,
                 typename Reduction,
                 typename ResultKeeper,
-                typename Real,
-                typename... Args >
+                typename Real >
       __device__
-      void segmentsReductionKernel( IndexType gridIdx,
+      void reduceSegmentsKernel( IndexType gridIdx,
                                     IndexType first,
                                     IndexType last,
                                     Fetch fetch,
                                     Reduction reduction,
                                     ResultKeeper keeper,
-                                    Real zero,
-                                    Args... args ) const;
+                                    Real zero ) const;
 #endif
 
       IndexType size = 0, storageSize = 0, numberOfSlices = 0;
@@ -216,21 +216,19 @@ class ChunkedEllpackView
                 typename Fetch_,
                 typename Reduction_,
                 typename ResultKeeper_,
-                typename Real_,
-                typename... Args_ >
+                typename Real_ >
       friend __global__
-      void ChunkedEllpackSegmentsReductionKernel( View_ chunkedEllpack,
+      void ChunkedEllpackreduceSegmentsKernel( View_ chunkedEllpack,
                                                   Index_ gridIdx,
                                                   Index_ first,
                                                   Index_ last,
                                                   Fetch_ fetch,
                                                   Reduction_ reduction,
                                                   ResultKeeper_ keeper,
-                                                  Real_ zero,
-                                                  Args_... args );
+                                                  Real_ zero );
 
       template< typename Index_, typename Fetch_, bool B_ >
-      friend struct detail::ChunkedEllpackSegmentsReductionDispatcher;
+      friend struct detail::ChunkedEllpackreduceSegmentsDispatcher;
 #endif
 };
       } // namespace Segments
diff --git a/src/TNL/Algorithms/Segments/ChunkedEllpackView.hpp b/src/TNL/Algorithms/Segments/ChunkedEllpackView.hpp
index 26e8fd0f75d64d2c70619c8ef79fe2a16bafcc14..6133a843844b089bc60a16da7181cc8149c14c2a 100644
--- a/src/TNL/Algorithms/Segments/ChunkedEllpackView.hpp
+++ b/src/TNL/Algorithms/Segments/ChunkedEllpackView.hpp
@@ -122,6 +122,7 @@ String
 ChunkedEllpackView< Device, Index, Organization >::
 getSerializationType()
 {
+   // FIXME: the serialized data DEPEND on the Organization parameter, so it should be reflected in the serialization type
    return "ChunkedEllpack< [any_device], " + TNL::getSerializationType< IndexType >() + " >";
 }
 
@@ -323,14 +324,13 @@ forElements( IndexType first, IndexType last, Function&& f ) const
       const IndexType chunkSize = slices[ sliceIdx ].chunkSize;
 
       const IndexType segmentSize = segmentChunksCount * chunkSize;
-      bool compute( true );
       if( Organization == RowMajorOrder )
       {
          IndexType begin = sliceOffset + firstChunkOfSegment * chunkSize;
          IndexType end = begin + segmentSize;
          IndexType localIdx( 0 );
-         for( IndexType j = begin; j < end && compute; j++ )
-            f( segmentIdx, localIdx++, j, compute );
+         for( IndexType j = begin; j < end; j++ )
+            f( segmentIdx, localIdx++, j );
       }
       else
       {
@@ -339,9 +339,9 @@ forElements( IndexType first, IndexType last, Function&& f ) const
          {
             IndexType begin = sliceOffset + firstChunkOfSegment + chunkIdx;
             IndexType end = begin + chunksInSlice * chunkSize;
-            for( IndexType j = begin; j < end && compute; j += chunksInSlice )
+            for( IndexType j = begin; j < end; j += chunksInSlice )
             {
-               f( segmentIdx, localIdx++, j, compute );
+               f( segmentIdx, localIdx++, j );
             }
          }
       }
@@ -384,7 +384,7 @@ template< typename Device,
    template< typename Function >
 void
 ChunkedEllpackView< Device, Index, Organization >::
-forEachSegment( Function&& f ) const
+forAllSegments( Function&& f ) const
 {
    this->forSegments( 0, this->getSegmentsCount(), f );
 }
@@ -393,15 +393,15 @@ forEachSegment( Function&& f ) const
 template< typename Device,
           typename Index,
           ElementsOrganization Organization >
-   template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args >
+   template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real >
 void
 ChunkedEllpackView< Device, Index, Organization >::
-segmentsReduction( IndexType first, IndexType last, Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const
+reduceSegments( IndexType first, IndexType last, Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero ) const
 {
    using RealType = typename detail::FetchLambdaAdapter< Index, Fetch >::ReturnType;
    if( std::is_same< DeviceType, Devices::Host >::value )
    {
-      //segmentsReductionKernel( 0, first, last, fetch, reduction, keeper, zero, args... );
+      //reduceSegmentsKernel( 0, first, last, fetch, reduction, keeper, zero );
       //return;
 
       for( IndexType segmentIdx = first; segmentIdx < last; segmentIdx++ )
@@ -456,9 +456,9 @@ segmentsReduction( IndexType first, IndexType last, Fetch& fetch, const Reductio
       {
          if( gridIdx == cudaGrids - 1 )
             cudaGridSize.x = cudaBlocks % Cuda::getMaxGridSize();
-         detail::ChunkedEllpackSegmentsReductionKernel< ViewType, IndexType, Fetch, Reduction, ResultKeeper, Real, Args...  >
+         detail::ChunkedEllpackreduceSegmentsKernel< ViewType, IndexType, Fetch, Reduction, ResultKeeper, Real  >
             <<< cudaGridSize, cudaBlockSize, sharedMemory  >>>
-            ( *this, gridIdx, first, last, fetch, reduction, keeper, zero, args... );
+            ( *this, gridIdx, first, last, fetch, reduction, keeper, zero );
       }
 #endif
    }
@@ -467,12 +467,12 @@ segmentsReduction( IndexType first, IndexType last, Fetch& fetch, const Reductio
 template< typename Device,
           typename Index,
           ElementsOrganization Organization >
-   template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args >
+   template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real >
 void
 ChunkedEllpackView< Device, Index, Organization >::
-allReduction( Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const
+reduceAllSegments( Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero ) const
 {
-   this->segmentsReduction( 0, this->getSegmentsCount(), fetch, reduction, keeper, zero, args... );
+   this->reduceSegments( 0, this->getSegmentsCount(), fetch, reduction, keeper, zero );
 }
 
 template< typename Device,
@@ -514,6 +514,18 @@ save( File& file ) const
    file.save( &this->numberOfSlices );
 }
 
+template< typename Device,
+          typename Index,
+          ElementsOrganization Organization >
+      template< typename Fetch >
+auto
+ChunkedEllpackView< Device, Index, Organization >::
+print( Fetch&& fetch ) const -> SegmentsPrinter< ChunkedEllpackView, Fetch >
+{
+   return SegmentsPrinter< ChunkedEllpackView, Fetch >( *this, fetch );
+}
+
+
 template< typename Device,
           typename Index,
           ElementsOrganization Organization >
@@ -543,21 +555,19 @@ template< typename Device,
    template< typename Fetch,
              typename Reduction,
              typename ResultKeeper,
-             typename Real,
-             typename... Args >
+             typename Real >
 __device__
 void
 ChunkedEllpackView< Device, Index, Organization >::
-segmentsReductionKernelWithAllParameters( IndexType gridIdx,
+reduceSegmentsKernelWithAllParameters( IndexType gridIdx,
                                           IndexType first,
                                           IndexType last,
                                           Fetch fetch,
                                           Reduction reduction,
                                           ResultKeeper keeper,
-                                          Real zero,
-                                          Args... args ) const
+                                          Real zero ) const
 {
-   using RealType = decltype( fetch( IndexType(), IndexType(), IndexType(), std::declval< bool& >(), args... ) );
+   using RealType = decltype( fetch( IndexType(), IndexType(), IndexType(), std::declval< bool& >() ) );
 
    const IndexType firstSlice = rowToSliceMapping[ first ];
    const IndexType lastSlice = rowToSliceMapping[ last - 1 ];
@@ -621,21 +631,19 @@ template< typename Device,
    template< typename Fetch,
              typename Reduction,
              typename ResultKeeper,
-             typename Real,
-             typename... Args >
+             typename Real >
 __device__
 void
 ChunkedEllpackView< Device, Index, Organization >::
-segmentsReductionKernel( IndexType gridIdx,
+reduceSegmentsKernel( IndexType gridIdx,
                          IndexType first,
                          IndexType last,
                          Fetch fetch,
                          Reduction reduction,
                          ResultKeeper keeper,
-                         Real zero,
-                         Args... args ) const
+                         Real zero ) const
 {
-   using RealType = decltype( fetch( IndexType(), std::declval< bool& >(), args... ) );
+   using RealType = decltype( fetch( IndexType(), std::declval< bool& >() ) );
 
    const IndexType firstSlice = rowToSliceMapping[ first ];
    const IndexType lastSlice = rowToSliceMapping[ last - 1 ];
diff --git a/src/TNL/Algorithms/Segments/Ellpack.h b/src/TNL/Algorithms/Segments/Ellpack.h
index c88ba6a1d01d4c6e49a6c2b6af2850bd14a8f3f0..c363d30003a0aa74d02d464e4096889741e37296 100644
--- a/src/TNL/Algorithms/Segments/Ellpack.h
+++ b/src/TNL/Algorithms/Segments/Ellpack.h
@@ -31,8 +31,8 @@ class Ellpack
       using IndexType = std::remove_const_t< Index >;
       static constexpr int getAlignment() { return Alignment; }
       static constexpr ElementsOrganization getOrganization() { return Organization; }
-      using OffsetsHolder = Containers::Vector< IndexType, DeviceType, IndexType >;
-      using SegmentsSizes = OffsetsHolder;
+      using OffsetsContainer = Containers::Vector< IndexType, DeviceType, IndexType >;
+      using SegmentsSizes = OffsetsContainer;
       template< typename Device_, typename Index_ >
       using ViewTemplate = EllpackView< Device_, Index_, Organization, Alignment >;
       using ViewType = EllpackView< Device, Index, Organization, Alignment >;
@@ -43,7 +43,11 @@ class Ellpack
 
       Ellpack();
 
-      Ellpack( const SegmentsSizes& sizes );
+      template< typename SizesContainer >
+      Ellpack( const SizesContainer& sizes );
+
+      template< typename ListIndex >
+      Ellpack( const std::initializer_list< ListIndex >& segmentsSizes );
 
       Ellpack( const IndexType segmentsCount, const IndexType segmentSize );
 
@@ -62,7 +66,7 @@ class Ellpack
       /**
        * \brief Set sizes of particular segments.
        */
-      template< typename SizesHolder = OffsetsHolder >
+      template< typename SizesHolder = OffsetsContainer >
       void setSegmentsSizes( const SizesHolder& sizes );
 
       void setSegmentsSizes( const IndexType segmentsCount, const IndexType segmentSize );
@@ -106,16 +110,16 @@ class Ellpack
       void forSegments( IndexType begin, IndexType end, Function&& f ) const;
 
       template< typename Function >
-      void forEachSegment( Function&& f ) const;
+      void forAllSegments( Function&& f ) const;
 
       /***
        * \brief Go over all segments and perform a reduction in each of them.
        */
-      template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args >
-      void segmentsReduction( IndexType first, IndexType last, Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const;
+      template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real >
+      void reduceSegments( IndexType first, IndexType last, Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero ) const;
 
-      template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args >
-      void allReduction( Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const;
+      template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real >
+      void reduceAllSegments( Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero ) const;
 
       Ellpack& operator=( const Ellpack& source ) = default;
 
@@ -126,11 +130,21 @@ class Ellpack
 
       void load( File& file );
 
+      template< typename Fetch >
+      SegmentsPrinter< Ellpack, Fetch > print( Fetch&& fetch ) const;
+
    protected:
 
       IndexType segmentSize, size, alignedSize;
 };
 
+// Stream insertion for Ellpack segments; the output is produced by printSegments() and its result is returned.
+template< typename Device, typename Index, typename IndexAllocator, ElementsOrganization Organization, int Alignment >
+std::ostream& operator<<( std::ostream& str, const Ellpack< Device, Index, IndexAllocator, Organization, Alignment >& segments )
+{
+   return printSegments( segments, str );
+}
+
       } // namespace Segments
    }  // namespace Algorithms
 } // namespace TNL
diff --git a/src/TNL/Algorithms/Segments/Ellpack.hpp b/src/TNL/Algorithms/Segments/Ellpack.hpp
index 124e3dfc2cabb13d7533e8269c27e3ce85ca50df..baef4d16133a450cdd9120ef799f0f842ebdd240 100644
--- a/src/TNL/Algorithms/Segments/Ellpack.hpp
+++ b/src/TNL/Algorithms/Segments/Ellpack.hpp
@@ -35,13 +35,27 @@ template< typename Device,
           typename IndexAllocator,
           ElementsOrganization Organization,
           int Alignment >
+   template< typename SizesContainer >
 Ellpack< Device, Index, IndexAllocator, Organization, Alignment >::
-Ellpack( const SegmentsSizes& segmentsSizes )
+Ellpack( const SizesContainer& segmentsSizes )
    : segmentSize( 0 ), size( 0 ), alignedSize( 0 )
 {
    this->setSegmentsSizes( segmentsSizes );
 }
 
+// Constructs the segments from a brace-enclosed list of segment sizes. The list is
+// first copied into a Containers::Vector which is then passed to setSegmentsSizes().
+template< typename Device, typename Index, typename IndexAllocator, ElementsOrganization Organization, int Alignment >
+   template< typename ListIndex >
+Ellpack< Device, Index, IndexAllocator, Organization, Alignment >::
+Ellpack( const std::initializer_list< ListIndex >& segmentsSizes )
+   : segmentSize( 0 ), size( 0 ), alignedSize( 0 )
+{
+   using SizesVector = Containers::Vector< IndexType, DeviceType, IndexType >;
+   const SizesVector sizes( segmentsSizes );
+   this->setSegmentsSizes( sizes );
+}
+
 template< typename Device,
           typename Index,
           typename IndexAllocator,
@@ -85,6 +99,7 @@ String
 Ellpack< Device, Index, IndexAllocator, Organization, Alignment >::
 getSerializationType()
 {
+   // FIXME: the serialized data DEPEND on the Organization and Alignment parameters, so it should be reflected in the serialization type
    return "Ellpack< [any_device], " + TNL::getSerializationType< IndexType >() + " >";
 }
 
@@ -294,9 +309,9 @@ template< typename Device,
    template< typename Function >
 void
 Ellpack< Device, Index, IndexAllocator, Organization, Alignment >::
-forEachSegment( Function&& f ) const
+forAllSegments( Function&& f ) const
 {
-   this->getConstView().forEachSegment( f );
+   this->getConstView().forAllSegments( f );
 }
 
 template< typename Device,
@@ -304,12 +319,12 @@ template< typename Device,
           typename IndexAllocator,
           ElementsOrganization Organization,
           int Alignment >
-   template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args >
+   template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real >
 void
 Ellpack< Device, Index, IndexAllocator, Organization, Alignment >::
-segmentsReduction( IndexType first, IndexType last, Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const
+reduceSegments( IndexType first, IndexType last, Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero ) const
 {
-   this->getConstView().segmentsReduction( first, last, fetch, reduction, keeper, zero, args... );
+   this->getConstView().reduceSegments( first, last, fetch, reduction, keeper, zero );
 }
 
 template< typename Device,
@@ -317,12 +332,12 @@ template< typename Device,
           typename IndexAllocator,
           ElementsOrganization Organization,
           int Alignment >
-   template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args >
+   template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real >
 void
 Ellpack< Device, Index, IndexAllocator, Organization, Alignment >::
-allReduction( Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const
+reduceAllSegments( Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero ) const
 {
-   this->segmentsReduction( 0, this->getSegmentsCount(), fetch, reduction, keeper, zero, args... );
+   this->reduceSegments( 0, this->getSegmentsCount(), fetch, reduction, keeper, zero );
 }
 
 template< typename Device,
@@ -369,6 +384,19 @@ load( File& file )
    file.load( &alignedSize );
 }
 
+// Returns a SegmentsPrinter constructed from these segments and the given
+// fetch functor. The printer type is parametrized by the segments type and
+// by the deduced type of the functor.
+template< typename Device, typename Index, typename IndexAllocator, ElementsOrganization Organization, int Alignment >
+   template< typename Fetch >
+auto
+Ellpack< Device, Index, IndexAllocator, Organization, Alignment >::
+print( Fetch&& fetch ) const -> SegmentsPrinter< Ellpack, Fetch >
+{
+   using PrinterType = SegmentsPrinter< Ellpack, Fetch >;
+   return PrinterType( *this, fetch );
+}
+
       } // namespace Segments
    }  // namespace Containers
 } // namespace TNL
diff --git a/src/TNL/Algorithms/Segments/EllpackView.h b/src/TNL/Algorithms/Segments/EllpackView.h
index 77d0d8b7b239262cc8a50ea947fd3e5fbf93a00a..1a14db3384e3f74b9e20df3ed3f3f1fe83a07eb1 100644
--- a/src/TNL/Algorithms/Segments/EllpackView.h
+++ b/src/TNL/Algorithms/Segments/EllpackView.h
@@ -15,12 +15,15 @@
 #include <TNL/Containers/Vector.h>
 #include <TNL/Algorithms/Segments/SegmentView.h>
 #include <TNL/Algorithms/Segments/ElementsOrganization.h>
+#include <TNL/Algorithms/Segments/SegmentsPrinting.h>
 
 
 namespace TNL {
    namespace Algorithms {
       namespace Segments {
 
+enum EllpackKernelType { Scalar, Vector, Vector2, Vector4, Vector8, Vector16 };
+
 template< typename Device,
           typename Index,
           ElementsOrganization Organization = Segments::DefaultElementsOrganization< Device >::getOrganization(),
@@ -32,9 +35,9 @@ class EllpackView
       using DeviceType = Device;
       using IndexType = std::remove_const_t< Index >;
       static constexpr int getAlignment() { return Alignment; }
-      static constexpr bool getOrganization() { return Organization; }
-      using OffsetsHolder = Containers::Vector< IndexType, DeviceType, IndexType >;
-      using SegmentsSizes = OffsetsHolder;
+      static constexpr ElementsOrganization getOrganization() { return Organization; }
+      using OffsetsContainer = Containers::Vector< IndexType, DeviceType, IndexType >;
+      using SegmentsSizes = OffsetsContainer;
       template< typename Device_, typename Index_ >
       using ViewTemplate = EllpackView< Device_, Index_, Organization, Alignment >;
       using ViewType = EllpackView;
@@ -105,16 +108,16 @@ class EllpackView
       void forSegments( IndexType begin, IndexType end, Function&& f ) const;
 
       template< typename Function >
-      void forEachSegment( Function&& f ) const;
+      void forAllSegments( Function&& f ) const;
 
       /***
        * \brief Go over all segments and perform a reduction in each of them.
        */
-      template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args >
-      void segmentsReduction( IndexType first, IndexType last, Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const;
+      template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real >
+      void reduceSegments( IndexType first, IndexType last, Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero ) const;
 
-      template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args >
-      void allReduction( Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const;
+      template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real >
+      void reduceAllSegments( Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero ) const;
 
       EllpackView& operator=( const EllpackView& view );
 
@@ -122,11 +125,20 @@ class EllpackView
 
       void load( File& file );
 
+      template< typename Fetch >
+      SegmentsPrinter< EllpackView, Fetch > print( Fetch&& fetch ) const;
+
    protected:
 
       IndexType segmentSize, segmentsCount, alignedSize;
 };
 
+// Stream insertion for EllpackView. The arguments are passed to printSegments() in the
+// ( segments, stream ) order, the same order the operator<< of Ellpack uses.
+template< typename Device, typename Index, ElementsOrganization Organization, int Alignment >
+std::ostream& operator<<( std::ostream& str, const EllpackView< Device, Index, Organization, Alignment >& ellpack )
+{ return printSegments( ellpack, str ); }
+
       } // namespace Segments
    }  // namespace Algorithms
 } // namespace TNL
diff --git a/src/TNL/Algorithms/Segments/EllpackView.hpp b/src/TNL/Algorithms/Segments/EllpackView.hpp
index 6215f4ef971be08c63c23af0e985dd368a7d3e6f..b5311d7939e1d49d826b6ba76bffb11af23aff0f 100644
--- a/src/TNL/Algorithms/Segments/EllpackView.hpp
+++ b/src/TNL/Algorithms/Segments/EllpackView.hpp
@@ -19,6 +19,124 @@ namespace TNL {
    namespace Algorithms {
       namespace Segments {
 
+#ifdef HAVE_CUDA
+/**
+ * \brief CUDA kernel reducing row-major Ellpack segments, one warp per segment; variant for
+ * fetch lambdas called as fetch( segmentIdx, localIdx, globalIdx, compute ). A lane visits every
+ * 32nd element of its segment and the local index passed to fetch equals globalIdx - begin.
+ */
+template< typename Index, typename Fetch, typename Reduction, typename ResultKeeper, typename Real >
+__global__ void
+EllpackCudaReductionKernelFull( Index first, Index last, Fetch fetch, const Reduction reduction, ResultKeeper keep, const Real zero, Index segmentSize )
+{
+   const int warpSize = 32;
+   const int gridID = 0;
+   const Index segmentIdx = first + ((gridID * TNL::Cuda::getMaxGridXSize() ) + (blockIdx.x * blockDim.x) + threadIdx.x) / warpSize;
+   if (segmentIdx >= last)
+      return;
+
+   Real result = zero;
+   const Index laneID = threadIdx.x & 31; // & is cheaper than %
+   const Index begin = segmentIdx * segmentSize;
+   const Index end = begin + segmentSize;
+
+   /* Partial result of this lane; localIdx advances together with the global index */
+   bool compute( true );
+   for( Index i = begin + laneID, localIdx = laneID; i < end; i += warpSize, localIdx += warpSize )
+      result = reduction( result, fetch( segmentIdx, localIdx, i, compute ) );
+
+   /* Combine the partial results of all 32 lanes by a shuffle-down tree */
+   result = reduction( result, __shfl_down_sync(0xFFFFFFFF, result, 16 ) );
+   result = reduction( result, __shfl_down_sync(0xFFFFFFFF, result,  8 ) );
+   result = reduction( result, __shfl_down_sync(0xFFFFFFFF, result,  4 ) );
+   result = reduction( result, __shfl_down_sync(0xFFFFFFFF, result,  2 ) );
+   result = reduction( result, __shfl_down_sync(0xFFFFFFFF, result,  1 ) );
+   /* Lane 0 ends up with the reduced value and stores it */
+   if( laneID == 0 )
+      keep( segmentIdx, result );
+}
+
+/**
+ * \brief CUDA kernel reducing row-major Ellpack segments, one warp per segment.
+ *
+ * Variant for fetch lambdas called as fetch( globalIdx, compute ). The result of each
+ * segment in [ first, last ) is handed over as keep( segmentIdx, result ).
+ */
+template< typename Index, typename Fetch, typename Reduction, typename ResultKeeper, typename Real >
+__global__ void
+EllpackCudaReductionKernelCompact( Index first, Index last, Fetch fetch, const Reduction reduction, ResultKeeper keep, const Real zero, Index segmentSize )
+{
+   const int warpSize = 32;
+   const int gridID = 0;
+   const Index segmentIdx = first + ((gridID * TNL::Cuda::getMaxGridXSize() ) + (blockIdx.x * blockDim.x) + threadIdx.x) / warpSize;
+   if( segmentIdx >= last )
+      return;
+
+   // Lane of this thread within its warp and the element range of the segment.
+   const Index lane = threadIdx.x & 31;
+   const Index segmentBegin = segmentIdx * segmentSize;
+   const Index segmentEnd = segmentBegin + segmentSize;
+
+   // Every lane accumulates each 32nd element of the segment.
+   Real partial = zero;
+   bool compute( true );
+   for( Index globalIdx = segmentBegin + lane; globalIdx < segmentEnd; globalIdx += warpSize )
+      partial = reduction( partial, fetch( globalIdx, compute ) );
+
+   // Shuffle-down tree over the warp: offsets 16, 8, 4, 2, 1.
+   for( int offset = 16; offset > 0; offset /= 2 )
+      partial = reduction( partial, __shfl_down_sync( 0xFFFFFFFF, partial, offset ) );
+
+   // Only lane 0 stores the result for the segment.
+   if( lane == 0 )
+      keep( segmentIdx, partial );
+}
+#endif
+
+/**
+ * \brief Launches EllpackCudaReductionKernelFull with one warp per segment; no-op without HAVE_CUDA.
+ */
+template< typename Index, typename Fetch, typename Reduction, typename ResultKeeper, typename Real,
+          bool FullFetch = detail::CheckFetchLambda< Index, Fetch >::hasAllParameters() >
+struct EllpackCudaReductionDispatcher
+{
+   static void
+   exec( Index first, Index last, Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Index segmentSize )
+   {
+   #ifdef HAVE_CUDA
+      if( last <= first )   // empty range: a grid of zero blocks is an invalid launch configuration
+         return;
+      const Index threadsCount = ( last - first ) * 32;  // 32 threads (one warp) per segment
+      const Index blocksCount = Cuda::getNumberOfBlocks( threadsCount, 256 );
+      dim3 blockSize( 256 );
+      dim3 gridSize( blocksCount );
+      EllpackCudaReductionKernelFull<<< gridSize, blockSize >>>( first, last, fetch, reduction, keeper, zero, segmentSize );
+      cudaDeviceSynchronize();
+   #endif
+   }
+};
+
+/**
+ * \brief Specialization launching EllpackCudaReductionKernelCompact (FullFetch == false); no-op without HAVE_CUDA.
+ */
+template< typename Index, typename Fetch, typename Reduction, typename ResultKeeper, typename Real >
+struct EllpackCudaReductionDispatcher< Index, Fetch, Reduction, ResultKeeper, Real, false >
+{
+   static void
+   exec( Index first, Index last, Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Index segmentSize )
+   {
+   #ifdef HAVE_CUDA
+      if( last <= first )   // empty range: a grid of zero blocks is an invalid launch configuration
+         return;
+      const Index threadsCount = ( last - first ) * 32;  // 32 threads (one warp) per segment
+      const Index blocksCount = Cuda::getNumberOfBlocks( threadsCount, 256 );
+      dim3 blockSize( 256 );
+      dim3 gridSize( blocksCount );
+      EllpackCudaReductionKernelCompact<<< gridSize, blockSize >>>( first, last, fetch, reduction, keeper, zero, segmentSize );
+      cudaDeviceSynchronize();
+   #endif
+   }
+};
 
 template< typename Device,
           typename Index,
@@ -87,6 +205,7 @@ String
 EllpackView< Device, Index, Organization, Alignment >::
 getSerializationType()
 {
+   // FIXME: the serialized data DEPEND on the Organization and Alignment parameters, so it should be reflected in the serialization type
    return "Ellpack< [any_device], " + TNL::getSerializationType< IndexType >() + " >";
 }
 
@@ -110,7 +229,7 @@ typename EllpackView< Device, Index, Organization, Alignment >::ViewType
 EllpackView< Device, Index, Organization, Alignment >::
 getView()
 {
-   return ViewType( segmentSize, segmentsCount, alignedSize );
+   return ViewType( segmentsCount, segmentSize, alignedSize );
 }
 
 template< typename Device,
@@ -207,9 +326,8 @@ forElements( IndexType first, IndexType last, Function&& f ) const
          const IndexType begin = segmentIdx * segmentSize;
          const IndexType end = begin + segmentSize;
          IndexType localIdx( 0 );
-         bool compute( true );
-         for( IndexType globalIdx = begin; globalIdx < end && compute; globalIdx++  )
-            f( segmentIdx, localIdx++, globalIdx, compute );
+         for( IndexType globalIdx = begin; globalIdx < end; globalIdx++  )
+            f( segmentIdx, localIdx++, globalIdx );
       };
       Algorithms::ParallelFor< Device >::exec( first, last, l );
    }
@@ -221,9 +339,8 @@ forElements( IndexType first, IndexType last, Function&& f ) const
          const IndexType begin = segmentIdx;
          const IndexType end = storageSize;
          IndexType localIdx( 0 );
-         bool compute( true );
-         for( IndexType globalIdx = begin; globalIdx < end && compute; globalIdx += alignedSize )
-            f( segmentIdx, localIdx++, globalIdx, compute );
+         for( IndexType globalIdx = begin; globalIdx < end; globalIdx += alignedSize )
+            f( segmentIdx, localIdx++, globalIdx );
       };
       Algorithms::ParallelFor< Device >::exec( first, last, l );
    }
@@ -262,7 +379,7 @@ template< typename Device,
           int Alignment >
    template< typename Function >
 void EllpackView< Device, Index, Organization, Alignment >::
-forEachSegment( Function&& f ) const
+forAllSegments( Function&& f ) const
 {
    this->forSegments( 0, this->getSegmentsCount(), f );
 }
@@ -271,32 +388,37 @@ template< typename Device,
           typename Index,
           ElementsOrganization Organization,
           int Alignment >
-   template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args >
+   template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real >
 void EllpackView< Device, Index, Organization, Alignment >::
-segmentsReduction( IndexType first, IndexType last, Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const
+reduceSegments( IndexType first, IndexType last, Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero ) const
 {
-   //using RealType = decltype( fetch( IndexType(), IndexType(), IndexType(), std::declval< bool& >(), args... ) );
+   //using RealType = decltype( fetch( IndexType(), IndexType(), IndexType(), std::declval< bool& >() ) );
    using RealType = typename detail::FetchLambdaAdapter< Index, Fetch >::ReturnType;
    if( Organization == RowMajorOrder )
    {
-      const IndexType segmentSize = this->segmentSize;
-      auto l = [=] __cuda_callable__ ( const IndexType segmentIdx, Args... args ) mutable {
-         const IndexType begin = segmentIdx * segmentSize;
-         const IndexType end = begin + segmentSize;
-         RealType aux( zero );
-         IndexType localIdx( 0 );
-         bool compute( true );
-         for( IndexType j = begin; j < end && compute; j++  )
-            aux = reduction( aux, detail::FetchLambdaAdapter< IndexType, Fetch >::call( fetch, segmentIdx, localIdx++, j, compute ) );
-         keeper( segmentIdx, aux );
-      };
-      Algorithms::ParallelFor< Device >::exec( first, last, l, args... );
+      if( std::is_same< Device, Devices::Cuda >::value )
+         EllpackCudaReductionDispatcher< IndexType, Fetch, Reduction, ResultKeeper, Real>::exec( first, last, fetch, reduction, keeper, zero, segmentSize );
+      else
+      {
+         const IndexType segmentSize = this->segmentSize;
+         auto l = [=] __cuda_callable__ ( const IndexType segmentIdx ) mutable {
+            const IndexType begin = segmentIdx * segmentSize;
+            const IndexType end = begin + segmentSize;
+            Real aux( zero );
+            IndexType localIdx( 0 );
+            bool compute( true );
+            for( IndexType j = begin; j < end && compute; j++  )
+               aux = reduction( aux, detail::FetchLambdaAdapter< IndexType, Fetch >::call( fetch, segmentIdx, localIdx++, j, compute ) );
+            keeper( segmentIdx, aux );
+         };
+         Algorithms::ParallelFor< Device >::exec( first, last, l );
+      }
    }
    else
    {
       const IndexType storageSize = this->getStorageSize();
       const IndexType alignedSize = this->alignedSize;
-      auto l = [=] __cuda_callable__ ( const IndexType segmentIdx, Args... args ) mutable {
+      auto l = [=] __cuda_callable__ ( const IndexType segmentIdx ) mutable {
          const IndexType begin = segmentIdx;
          const IndexType end = storageSize;
          RealType aux( zero );
@@ -306,7 +428,7 @@ segmentsReduction( IndexType first, IndexType last, Fetch& fetch, const Reductio
             aux = reduction( aux, detail::FetchLambdaAdapter< IndexType, Fetch >::call( fetch, segmentIdx, localIdx++, j, compute ) );
          keeper( segmentIdx, aux );
       };
-      Algorithms::ParallelFor< Device >::exec( first, last, l, args... );
+      Algorithms::ParallelFor< Device >::exec( first, last, l );
    }
 }
 
@@ -314,11 +436,11 @@ template< typename Device,
           typename Index,
           ElementsOrganization Organization,
           int Alignment >
-   template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args >
+   template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real >
 void EllpackView< Device, Index, Organization, Alignment >::
-allReduction( Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const
+reduceAllSegments( Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero ) const
 {
-   this->segmentsReduction( 0, this->getSegmentsCount(), fetch, reduction, keeper, zero, args... );
+   this->reduceSegments( 0, this->getSegmentsCount(), fetch, reduction, keeper, zero );
 }
 
 template< typename Device,
@@ -359,6 +481,18 @@ load( File& file )
    file.load( &alignedSize );
 }
 
+// Returns a SegmentsPrinter constructed from this view and the given fetch
+// functor; the printer type is parametrized by EllpackView and the functor type.
+template< typename Device, typename Index, ElementsOrganization Organization, int Alignment >
+   template< typename Fetch >
+auto
+EllpackView< Device, Index, Organization, Alignment >::
+print( Fetch&& fetch ) const -> SegmentsPrinter< EllpackView, Fetch >
+{
+   using PrinterType = SegmentsPrinter< EllpackView, Fetch >;
+   return PrinterType( *this, fetch );
+}
+
       } // namespace Segments
    }  // namespace Algorithms
 } // namespace TNL
diff --git a/src/TNL/Algorithms/Segments/CSRAdaptiveKernel.h b/src/TNL/Algorithms/Segments/Kernels/CSRAdaptiveKernel.h
similarity index 90%
rename from src/TNL/Algorithms/Segments/CSRAdaptiveKernel.h
rename to src/TNL/Algorithms/Segments/Kernels/CSRAdaptiveKernel.h
index 640120f86cba0b515829587954df770cc7d0c01c..53a59d2297fc7b595df5b7182a173879f4669e21 100644
--- a/src/TNL/Algorithms/Segments/CSRAdaptiveKernel.h
+++ b/src/TNL/Algorithms/Segments/Kernels/CSRAdaptiveKernel.h
@@ -15,9 +15,9 @@
 #include <TNL/Containers/VectorView.h>
 #include <TNL/Algorithms/ParallelFor.h>
 #include <TNL/Algorithms/Segments/detail/LambdaAdapter.h>
-#include <TNL/Algorithms/Segments/CSRScalarKernel.h>
-#include <TNL/Algorithms/Segments/CSRAdaptiveKernelView.h>
-#include <TNL/Algorithms/Segments/detail/CSRAdaptiveKernelBlockDescriptor.h>
+#include <TNL/Algorithms/Segments/Kernels/CSRScalarKernel.h>
+#include <TNL/Algorithms/Segments/Kernels/CSRAdaptiveKernelView.h>
+#include <TNL/Algorithms/Segments/Kernels/details/CSRAdaptiveKernelBlockDescriptor.h>
 
 namespace TNL {
    namespace Algorithms {
@@ -39,7 +39,7 @@ template< int CudaBlockSize,
           typename Real,
           typename... Args >
 __global__ void
-segmentsReductionCSRAdaptiveKernel( BlocksView blocks,
+reduceSegmentsCSRAdaptiveKernel( BlocksView blocks,
                                     int gridIdx,
                                     Offsets offsets,
                                     Index first,
@@ -84,7 +84,7 @@ struct CSRAdaptiveKernel
               typename ResultKeeper,
               typename Real,
               typename... Args >
-   void segmentsReduction( const OffsetsView& offsets,
+   void reduceSegments( const OffsetsView& offsets,
                         Index first,
                         Index last,
                         Fetch& fetch,
@@ -117,4 +117,4 @@ struct CSRAdaptiveKernel
    }  // namespace Algorithms
 } // namespace TNL
 
-#include <TNL/Algorithms/Segments/CSRAdaptiveKernel.hpp>
+#include <TNL/Algorithms/Segments/Kernels/CSRAdaptiveKernel.hpp>
diff --git a/src/TNL/Algorithms/Segments/CSRAdaptiveKernel.hpp b/src/TNL/Algorithms/Segments/Kernels/CSRAdaptiveKernel.hpp
similarity index 95%
rename from src/TNL/Algorithms/Segments/CSRAdaptiveKernel.hpp
rename to src/TNL/Algorithms/Segments/Kernels/CSRAdaptiveKernel.hpp
index a510ac395687bcb5057a6c397fe6e5031a9f5c58..de72daf77d2dc2e643b964ea9365a1c494bc5e80 100644
--- a/src/TNL/Algorithms/Segments/CSRAdaptiveKernel.hpp
+++ b/src/TNL/Algorithms/Segments/Kernels/CSRAdaptiveKernel.hpp
@@ -15,8 +15,8 @@
 #include <TNL/Containers/VectorView.h>
 #include <TNL/Algorithms/ParallelFor.h>
 #include <TNL/Algorithms/Segments/detail/LambdaAdapter.h>
-#include <TNL/Algorithms/Segments/CSRScalarKernel.h>
-#include <TNL/Algorithms/Segments/detail/CSRAdaptiveKernelBlockDescriptor.h>
+#include <TNL/Algorithms/Segments/Kernels/CSRScalarKernel.h>
+#include <TNL/Algorithms/Segments/Kernels/details/CSRAdaptiveKernelBlockDescriptor.h>
 
 namespace TNL {
    namespace Algorithms {
@@ -100,7 +100,7 @@ template< typename Index,
                typename... Args >
 void
 CSRAdaptiveKernel< Index, Device >::
-segmentsReduction( const OffsetsView& offsets,
+reduceSegments( const OffsetsView& offsets,
                    Index first,
                    Index last,
                    Fetch& fetch,
@@ -109,7 +109,7 @@ segmentsReduction( const OffsetsView& offsets,
                    const Real& zero,
                    Args... args ) const
 {
-   view.segmentsReduction( offsets, first, last, fetch, reduction, keeper, zero, args... );
+   view.reduceSegments( offsets, first, last, fetch, reduction, keeper, zero, args... );
 }
 
 template< typename Index,
diff --git a/src/TNL/Algorithms/Segments/CSRAdaptiveKernelView.h b/src/TNL/Algorithms/Segments/Kernels/CSRAdaptiveKernelView.h
similarity index 88%
rename from src/TNL/Algorithms/Segments/CSRAdaptiveKernelView.h
rename to src/TNL/Algorithms/Segments/Kernels/CSRAdaptiveKernelView.h
index 9de407051b52609f124ffb5d07a6c0a4a364ea79..f7521d558eb0191f4677adc070c5e9652d7a0502 100644
--- a/src/TNL/Algorithms/Segments/CSRAdaptiveKernelView.h
+++ b/src/TNL/Algorithms/Segments/Kernels/CSRAdaptiveKernelView.h
@@ -11,8 +11,8 @@
 #pragma once
 
 #include <TNL/Containers/Vector.h>
-#include <TNL/Algorithms/Segments/detail/CSRAdaptiveKernelBlockDescriptor.h>
-#include <TNL/Algorithms/Segments/detail/CSRAdaptiveKernelParameters.h>
+#include <TNL/Algorithms/Segments/Kernels/details/CSRAdaptiveKernelBlockDescriptor.h>
+#include <TNL/Algorithms/Segments/Kernels/details/CSRAdaptiveKernelParameters.h>
 
 namespace TNL {
    namespace Algorithms {
@@ -49,7 +49,7 @@ struct CSRAdaptiveKernelView
              typename ResultKeeper,
              typename Real,
              typename... Args >
-   void segmentsReduction( const OffsetsView& offsets,
+   void reduceSegments( const OffsetsView& offsets,
                         Index first,
                         Index last,
                         Fetch& fetch,
@@ -70,4 +70,4 @@ struct CSRAdaptiveKernelView
    }  // namespace Algorithms
 } // namespace TNL
 
-#include <TNL/Algorithms/Segments/CSRAdaptiveKernelView.hpp>
+#include <TNL/Algorithms/Segments/Kernels/CSRAdaptiveKernelView.hpp>
diff --git a/src/TNL/Algorithms/Segments/CSRAdaptiveKernelView.hpp b/src/TNL/Algorithms/Segments/Kernels/CSRAdaptiveKernelView.hpp
similarity index 91%
rename from src/TNL/Algorithms/Segments/CSRAdaptiveKernelView.hpp
rename to src/TNL/Algorithms/Segments/Kernels/CSRAdaptiveKernelView.hpp
index 4f15608579de2c4d831111c4b0283a5a349b465e..9c495fd70ad15b17db8219c69cc0638b0c42d7fe 100644
--- a/src/TNL/Algorithms/Segments/CSRAdaptiveKernelView.hpp
+++ b/src/TNL/Algorithms/Segments/Kernels/CSRAdaptiveKernelView.hpp
@@ -15,10 +15,10 @@
 #include <TNL/Containers/VectorView.h>
 #include <TNL/Algorithms/ParallelFor.h>
 #include <TNL/Algorithms/Segments/detail/LambdaAdapter.h>
-#include <TNL/Algorithms/Segments/CSRScalarKernel.h>
-#include <TNL/Algorithms/Segments/CSRAdaptiveKernelView.h>
-#include <TNL/Algorithms/Segments/detail/CSRAdaptiveKernelBlockDescriptor.h>
-#include <TNL/Algorithms/Segments/detail/CSRAdaptiveKernelParameters.h>
+#include <TNL/Algorithms/Segments/Kernels/CSRScalarKernel.h>
+#include <TNL/Algorithms/Segments/Kernels/CSRAdaptiveKernelView.h>
+#include <TNL/Algorithms/Segments/Kernels/details/CSRAdaptiveKernelBlockDescriptor.h>
+#include <TNL/Algorithms/Segments/Kernels/details/CSRAdaptiveKernelParameters.h>
 
 namespace TNL {
    namespace Algorithms {
@@ -35,7 +35,7 @@ template< typename BlocksView,
           typename Real,
           typename... Args >
 __global__ void
-segmentsReductionCSRAdaptiveKernel( BlocksView blocks,
+reduceSegmentsCSRAdaptiveKernel( BlocksView blocks,
                                     int gridIdx,
                                     Offsets offsets,
                                     Index first,
@@ -56,7 +56,7 @@ segmentsReductionCSRAdaptiveKernel( BlocksView blocks,
    __shared__ Real multivectorShared[ CudaBlockSize / WarpSize ];
    //__shared__ BlockType sharedBlocks[ WarpsCount ];
 
-   const Index index = ( ( gridIdx * TNL::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x ) + threadIdx.x;
+   const Index index = ( ( gridIdx * TNL::Cuda::getMaxGridXSize() + blockIdx.x ) * blockDim.x ) + threadIdx.x;
    const Index blockIdx = index / WarpSize;
    if( blockIdx >= blocks.getSize() - 1 )
       return;
@@ -183,14 +183,14 @@ template< typename Index,
           bool DispatchScalarCSR =
             detail::CheckFetchLambda< Index, Fetch >::hasAllParameters() ||
             std::is_same< Device, Devices::Host >::value >
-struct CSRAdaptiveKernelSegmentsReductionDispatcher;
+struct CSRAdaptiveKernelreduceSegmentsDispatcher;
 
 template< typename Index,
           typename Device,
           typename Fetch,
           typename Reduction,
           typename ResultKeeper >
-struct CSRAdaptiveKernelSegmentsReductionDispatcher< Index, Device, Fetch, Reduction, ResultKeeper, true >
+struct CSRAdaptiveKernelreduceSegmentsDispatcher< Index, Device, Fetch, Reduction, ResultKeeper, true >
 {
 
    template< typename BlocksView,
@@ -208,7 +208,7 @@ struct CSRAdaptiveKernelSegmentsReductionDispatcher< Index, Device, Fetch, Reduc
                        Args... args)
    {
       TNL::Algorithms::Segments::CSRScalarKernel< Index, Device >::
-         segmentsReduction( offsets, first, last, fetch, reduction, keeper, zero, args... );
+         reduceSegments( offsets, first, last, fetch, reduction, keeper, zero, args... );
    }
 };
 
@@ -217,7 +217,7 @@ template< typename Index,
           typename Fetch,
           typename Reduction,
           typename ResultKeeper >
-struct CSRAdaptiveKernelSegmentsReductionDispatcher< Index, Device, Fetch, Reduction, ResultKeeper, false >
+struct CSRAdaptiveKernelreduceSegmentsDispatcher< Index, Device, Fetch, Reduction, ResultKeeper, false >
 {
    template< typename BlocksView,
              typename Offsets,
@@ -238,7 +238,7 @@ struct CSRAdaptiveKernelSegmentsReductionDispatcher< Index, Device, Fetch, Reduc
       Index blocksCount;
 
       const Index threads = detail::CSRAdaptiveKernelParameters< sizeof( Real ) >::CudaBlockSize();
-      constexpr size_t maxGridSize = TNL::Cuda::getMaxGridSize();
+      constexpr size_t maxGridSize = TNL::Cuda::getMaxGridXSize();
 
       // Fill blocks
       size_t neededThreads = blocks.getSize() * TNL::Cuda::getWarpSize(); // one warp per block
@@ -256,7 +256,7 @@ struct CSRAdaptiveKernelSegmentsReductionDispatcher< Index, Device, Fetch, Reduc
             neededThreads -= maxGridSize * threads;
          }
 
-         segmentsReductionCSRAdaptiveKernel<
+         reduceSegmentsCSRAdaptiveKernel<
                BlocksView,
                Offsets,
                Index, Fetch, Reduction, ResultKeeper, Real, Args... >
@@ -322,7 +322,7 @@ template< typename Index,
                typename... Args >
 void
 CSRAdaptiveKernelView< Index, Device >::
-segmentsReduction( const OffsetsView& offsets,
+reduceSegments( const OffsetsView& offsets,
                    Index first,
                    Index last,
                    Fetch& fetch,
@@ -336,11 +336,11 @@ segmentsReduction( const OffsetsView& offsets,
    if( detail::CheckFetchLambda< Index, Fetch >::hasAllParameters() || valueSizeLog >= MaxValueSizeLog )
    {
       TNL::Algorithms::Segments::CSRScalarKernel< Index, Device >::
-         segmentsReduction( offsets, first, last, fetch, reduction, keeper, zero, args... );
+         reduceSegments( offsets, first, last, fetch, reduction, keeper, zero, args... );
       return;
    }
 
-   CSRAdaptiveKernelSegmentsReductionDispatcher< Index, Device, Fetch, Reduction, ResultKeeper  >::template
+   CSRAdaptiveKernelreduceSegmentsDispatcher< Index, Device, Fetch, Reduction, ResultKeeper  >::template
       reduce< BlocksView, OffsetsView, Real, Args... >( offsets, this->blocksArray[ valueSizeLog ], first, last, fetch, reduction, keeper, zero, args... );
 }
 
diff --git a/src/TNL/Algorithms/Segments/CSRHybridKernel.h b/src/TNL/Algorithms/Segments/Kernels/CSRHybridKernel.h
similarity index 79%
rename from src/TNL/Algorithms/Segments/CSRHybridKernel.h
rename to src/TNL/Algorithms/Segments/Kernels/CSRHybridKernel.h
index d3e48be1eeb7ab0fe386de91f7541292329cb406..c3271d7762fe130e2ae06521cbffd139e2ce5da3 100644
--- a/src/TNL/Algorithms/Segments/CSRHybridKernel.h
+++ b/src/TNL/Algorithms/Segments/Kernels/CSRHybridKernel.h
@@ -21,13 +21,14 @@ namespace TNL {
       namespace Segments {
 
 template< typename Index,
-          typename Device >
+          typename Device,
+          int ThreadsInBlock = 128 >
 struct CSRHybridKernel
 {
    using IndexType = Index;
    using DeviceType = Device;
-   using ViewType = CSRHybridKernel< Index, Device >;
-   using ConstViewType = CSRHybridKernel< Index, Device >;
+   using ViewType = CSRHybridKernel< Index, Device, ThreadsInBlock >;
+   using ConstViewType = CSRHybridKernel< Index, Device, ThreadsInBlock >;
 
    template< typename Offsets >
    void init( const Offsets& offsets );
@@ -44,16 +45,14 @@ struct CSRHybridKernel
              typename Fetch,
              typename Reduction,
              typename ResultKeeper,
-             typename Real,
-             typename... Args >
-   void segmentsReduction( const OffsetsView& offsets,
+             typename Real >
+   void reduceSegments( const OffsetsView& offsets,
                                   Index first,
                                   Index last,
                                   Fetch& fetch,
                                   const Reduction& reduction,
                                   ResultKeeper& keeper,
-                                  const Real& zero,
-                                  Args... args ) const;
+                                  const Real& zero ) const;
 
    protected:
       int threadsPerSegment;
@@ -63,4 +62,4 @@ struct CSRHybridKernel
    }  // namespace Algorithms
 } // namespace TNL
 
-#include <TNL/Algorithms/Segments/CSRHybridKernel.hpp>
+#include <TNL/Algorithms/Segments/Kernels/CSRHybridKernel.hpp>
diff --git a/src/TNL/Algorithms/Segments/Kernels/CSRHybridKernel.hpp b/src/TNL/Algorithms/Segments/Kernels/CSRHybridKernel.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..68198f995f4a8daecd67b8e05881432aacc4372d
--- /dev/null
+++ b/src/TNL/Algorithms/Segments/Kernels/CSRHybridKernel.hpp
@@ -0,0 +1,305 @@
+/***************************************************************************
+                          CSRHybridKernel.hpp -  description
+                             -------------------
+    begin                : Jan 23, 2021 -> Joe Biden inauguration
+    copyright            : (C) 2021 by Tomas Oberhuber
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+#pragma once
+
+#include <TNL/Assert.h>
+#include <TNL/Cuda/LaunchHelpers.h>
+#include <TNL/Containers/VectorView.h>
+#include <TNL/Algorithms/ParallelFor.h>
+#include <TNL/Algorithms/Segments/detail/LambdaAdapter.h>
+#include <TNL/Algorithms/Segments/Kernels/CSRHybridKernel.h>
+
+namespace TNL {
+   namespace Algorithms {
+      namespace Segments {
+
+#ifdef HAVE_CUDA
+// CUDA kernel reducing the segments [ first, last ) with ThreadsPerSegment consecutive threads
+// per segment. ThreadsPerSegment must be a power of two not larger than 32: the lane index is
+// obtained by masking and the partial results are merged by shuffles within one warp. Every
+// thread strides over the elements of its segment and lane 0 of each group passes the reduced
+// value to keep( segmentIdx, value ). gridIdx is forwarded to TNL::Cuda::getGlobalThreadIdx().
+template< int ThreadsPerSegment, typename Offsets, typename Index,
+          typename Fetch, typename Reduction, typename ResultKeeper, typename Real >
+__global__
+void reduceSegmentsCSRHybridVectorKernel(
+    int gridIdx,
+    const Offsets offsets,
+    Index first,
+    Index last,
+    Fetch fetch,
+    const Reduction reduce,
+    ResultKeeper keep,
+    const Real zero )
+{
+    const Index segmentIdx = TNL::Cuda::getGlobalThreadIdx( gridIdx ) / ThreadsPerSegment + first;
+    if( segmentIdx >= last )
+        return;
+
+    const int laneIdx = threadIdx.x & ( ThreadsPerSegment - 1 ); // & is cheaper than %
+    Index endIdx = offsets[ segmentIdx + 1 ];
+    // localIdx is the position inside the segment, globalIdx the position in the whole structure.
+    Index localIdx( laneIdx );
+    Real aux = zero;
+    bool compute( true );
+    for( Index globalIdx = offsets[ segmentIdx ] + localIdx; globalIdx < endIdx; globalIdx += ThreadsPerSegment )
+    {
+      aux = reduce( aux, detail::FetchLambdaAdapter< Index, Fetch >::call( fetch, segmentIdx, localIdx, globalIdx, compute ) );
+      localIdx += ThreadsPerSegment;   // same stride as globalIdx, so both keep addressing the same element
+    }
+
+    /****
+     * Reduction in each segment; only the steps needed for ThreadsPerSegment lanes are executed.
+     */
+    if( ThreadsPerSegment == 32 )
+        aux = reduce( aux, __shfl_down_sync( 0xFFFFFFFF, aux, 16 ) );
+    if( ThreadsPerSegment >= 16 )
+        aux = reduce( aux, __shfl_down_sync( 0xFFFFFFFF, aux,  8 ) );
+    if( ThreadsPerSegment >= 8 )
+        aux = reduce( aux, __shfl_down_sync( 0xFFFFFFFF, aux,  4 ) );
+    if( ThreadsPerSegment >= 4 )
+        aux = reduce( aux, __shfl_down_sync( 0xFFFFFFFF, aux,  2 ) );
+    if( ThreadsPerSegment >= 2 )
+        aux = reduce( aux, __shfl_down_sync( 0xFFFFFFFF, aux,  1 ) );
+
+    if( laneIdx == 0 )
+        keep( segmentIdx, aux );
+}
+
+// CUDA kernel reducing the segments [ first, last ) when one segment is served by several whole
+// warps (ThreadsPerSegment is a multiple of the warp size, BlockSize is the block size used for
+// the launch). Each thread strides over the elements of its segment, every warp merges its
+// partial results with shuffles and stores one value in shared memory, and finally the first
+// lanes of warp 0 merge the per-warp values of each segment of the block and call keep().
+// gridIdx is forwarded to TNL::Cuda::getGlobalThreadIdx().
+template< int BlockSize, int ThreadsPerSegment, typename Offsets, typename Index,
+          typename Fetch, typename Reduction, typename ResultKeeper, typename Real >
+__global__
+void reduceSegmentsCSRHybridMultivectorKernel(
+    int gridIdx,
+    const Offsets offsets,
+    Index first,
+    Index last,
+    Fetch fetch,
+    const Reduction reduce,
+    ResultKeeper keep,
+    const Real zero )
+{
+    const Index segmentIdx =  TNL::Cuda::getGlobalThreadIdx( gridIdx ) / ThreadsPerSegment + first;
+    if( segmentIdx >= last )
+        return;
+    // NOTE(review): threads returning above never reach the barriers below - confirm this is fine on all targets.
+    __shared__ Real shared[ 32 ];  // covers every index touched by lanes 0-15 of warp 0 below (at most 15 + 16)
+    if( threadIdx.x < 32 )
+        shared[ threadIdx.x ] = zero;
+    __syncthreads();               // finish the zeroing before any warp stores its partial result
+    const int laneIdx = threadIdx.x & ( ThreadsPerSegment - 1 ); // & is cheaper than %
+    const int inWarpLaneIdx = threadIdx.x & ( TNL::Cuda::getWarpSize() - 1 ); // & is cheaper than %
+    const Index beginIdx = offsets[ segmentIdx ];
+    const Index endIdx   = offsets[ segmentIdx + 1 ] ;
+
+    Real result = zero;
+    bool compute( true );
+    Index localIdx = laneIdx;
+    for( Index globalIdx = beginIdx + laneIdx; globalIdx < endIdx && compute; globalIdx += ThreadsPerSegment )
+    {
+       result = reduce( result, detail::FetchLambdaAdapter< Index, Fetch >::call( fetch, segmentIdx, localIdx, globalIdx, compute ) );
+       localIdx += ThreadsPerSegment;
+    }
+    result = reduce( result, __shfl_down_sync( 0xFFFFFFFF, result, 16 ) );  // reduce(), not +=: any reduction may be used
+    result = reduce( result, __shfl_down_sync( 0xFFFFFFFF, result,  8 ) );
+    result = reduce( result, __shfl_down_sync( 0xFFFFFFFF, result,  4 ) );
+    result = reduce( result, __shfl_down_sync( 0xFFFFFFFF, result,  2 ) );
+    result = reduce( result, __shfl_down_sync( 0xFFFFFFFF, result,  1 ) );
+
+    const Index warpIdx = threadIdx.x / TNL::Cuda::getWarpSize();
+    if( inWarpLaneIdx == 0 )
+        shared[ warpIdx ] = result;
+
+    __syncthreads();
+    // Reduction in shared
+    if( warpIdx == 0 && inWarpLaneIdx < 16 )
+    {
+        // The warps of one segment occupy consecutive slots of shared; fold them into the first slot.
+        constexpr int warpsPerSegment = ThreadsPerSegment / TNL::Cuda::getWarpSize();
+        if( warpsPerSegment >= 32 )
+        {
+            shared[ inWarpLaneIdx ] =  reduce( shared[ inWarpLaneIdx ], shared[ inWarpLaneIdx + 16 ] );
+            __syncwarp();
+        }
+        if( warpsPerSegment >= 16 )
+        {
+            shared[ inWarpLaneIdx ] =  reduce( shared[ inWarpLaneIdx ], shared[ inWarpLaneIdx +  8 ] );
+            __syncwarp();
+        }
+        if( warpsPerSegment >= 8 )
+        {
+            shared[ inWarpLaneIdx ] =  reduce( shared[ inWarpLaneIdx ], shared[ inWarpLaneIdx +  4 ] );
+            __syncwarp();
+        }
+        if( warpsPerSegment >= 4 )
+        {
+            shared[ inWarpLaneIdx ] =  reduce( shared[ inWarpLaneIdx ], shared[ inWarpLaneIdx +  2 ] );
+            __syncwarp();
+        }
+        if( warpsPerSegment >= 2 )
+        {
+            shared[ inWarpLaneIdx ] =  reduce( shared[ inWarpLaneIdx ], shared[ inWarpLaneIdx +  1 ] );
+            __syncwarp();
+        }
+        constexpr int segmentsCount = BlockSize / ThreadsPerSegment;
+        if( inWarpLaneIdx < segmentsCount && segmentIdx + inWarpLaneIdx < last )
+        {
+            // Lane i publishes the i-th segment of this block; its value sits in the first slot of that segment.
+            keep( segmentIdx + inWarpLaneIdx, shared[ inWarpLaneIdx * ThreadsPerSegment / 32 ] );
+        }
+    }
+}
+#endif
+
+
+
+// Derives threadsPerSegment from the average segment length: the next power of two of the
+// average, capped by ThreadsInBlock. offsets holds one more entry than there are segments.
+template< typename Index, typename Device, int ThreadsInBlock >
+    template< typename Offsets >
+void
+CSRHybridKernel< Index, Device, ThreadsInBlock >::init( const Offsets& offsets )
+{
+    const Index segmentsCount = offsets.getSize() - 1;
+    // Without segments the average is 0/0 = NaN and converting NaN to Index is undefined, so use 0.
+    const Index elementsInSegment = segmentsCount > 0 ? std::ceil( ( double ) offsets.getElement( segmentsCount ) / ( double ) segmentsCount ) : 0;
+    this->threadsPerSegment = TNL::min( std::pow( 2, std::ceil( std::log2( elementsInSegment ) ) ), ThreadsInBlock ); // 0 elements give 0 threads
+    TNL_ASSERT_GE( threadsPerSegment, 0, "" );
+    TNL_ASSERT_LE( threadsPerSegment, ThreadsInBlock, "" );
+}
+
+// Forgets the configuration computed by init(): threadsPerSegment goes back to 0, the value
+// for which reduceSegments() launches no kernel.
+template< typename Index, typename Device, int ThreadsInBlock >
+void
+CSRHybridKernel< Index, Device, ThreadsInBlock >::reset()
+{
+    // The class has no base class, so the member can be named without this->.
+    threadsPerSegment = 0;
+}
+
+// Returns a view of the kernel. ViewType is declared as the kernel type itself, so the
+// returned object is a copy of *this.
+template< typename Index, typename Device, int ThreadsInBlock >
+auto
+CSRHybridKernel< Index, Device, ThreadsInBlock >::getView() -> ViewType
+{
+    ViewType view( *this );
+    return view;
+}
+
+// Name of the kernel as text: "Hybrid " followed by the ThreadsInBlock template argument.
+template< typename Index, typename Device, int ThreadsInBlock >
+TNL::String
+CSRHybridKernel< Index, Device, ThreadsInBlock >::getKernelType()
+{
+    // Convert the compile-time block size first, then prepend the fixed label.
+    const auto blockSizeText = TNL::convertToString( ThreadsInBlock );
+    return "Hybrid " + blockSizeText;
+}
+
+// Returns the constant view of the kernel. ConstViewType is declared as the kernel type
+// itself, so the returned object is a copy of *this.
+template< typename Index, typename Device, int ThreadsInBlock >
+auto
+CSRHybridKernel< Index, Device, ThreadsInBlock >::
+getConstView() const -> ConstViewType
+{
+    return *this;
+}
+
+
+// Launches the CUDA reduction of the segments [ first, last ). The kernel is chosen by
+// threadsPerSegment: values up to 32 use reduceSegmentsCSRHybridVectorKernel, the values 64,
+// 128 and 256 use reduceSegmentsCSRHybridMultivectorKernel, 0 launches nothing and any other
+// value throws std::runtime_error. All blocks have ThreadsInBlock threads. Without HAVE_CUDA
+// the function only evaluates the two assertions.
+// offsets, fetch, reduction, keeper and zero are handed to the kernels by value.
+template< typename Index, typename Device, int ThreadsInBlock >
+    template< typename OffsetsView, typename Fetch, typename Reduction, typename ResultKeeper, typename Real >
+void
+CSRHybridKernel< Index, Device, ThreadsInBlock >::
+reduceSegments( const OffsetsView& offsets,
+                         Index first,
+                         Index last,
+                         Fetch& fetch,
+                         const Reduction& reduction,
+                         ResultKeeper& keeper,
+                         const Real& zero ) const
+{
+    TNL_ASSERT_GE( this->threadsPerSegment, 0, "" );
+    TNL_ASSERT_LE( this->threadsPerSegment, ThreadsInBlock, "" );
+
+#ifdef HAVE_CUDA
+    const size_t threadsCount = static_cast< size_t >( this->threadsPerSegment ) * ( last - first ); // widen first: int * Index may overflow
+    dim3 blocksCount, gridsCount, blockSize( ThreadsInBlock );
+    TNL::Cuda::setupThreads( blockSize, blocksCount, gridsCount, threadsCount );
+    // A launch that does not fit into one grid is split; gridIdx tells the kernels which part they process.
+    for( unsigned int gridIdx = 0; gridIdx < gridsCount.x; gridIdx ++ )
+    {
+        dim3 gridSize;
+        TNL::Cuda::setupGrid( blocksCount, gridsCount, gridIdx, gridSize );
+        switch( this->threadsPerSegment )
+        {
+            case 0:      // this means zero/empty matrix
+                break;
+            case 1:
+                reduceSegmentsCSRHybridVectorKernel<  1, OffsetsView, Index, Fetch, Reduction, ResultKeeper, Real ><<< gridSize, blockSize >>>(
+                    gridIdx, offsets, first, last, fetch, reduction, keeper, zero );
+                break;
+            case 2:
+                reduceSegmentsCSRHybridVectorKernel<  2, OffsetsView, Index, Fetch, Reduction, ResultKeeper, Real ><<< gridSize, blockSize >>>(
+                    gridIdx, offsets, first, last, fetch, reduction, keeper, zero );
+                break;
+            case 4:
+                reduceSegmentsCSRHybridVectorKernel<  4, OffsetsView, Index, Fetch, Reduction, ResultKeeper, Real ><<< gridSize, blockSize >>>(
+                    gridIdx, offsets, first, last, fetch, reduction, keeper, zero );
+                break;
+            case 8:
+                reduceSegmentsCSRHybridVectorKernel<  8, OffsetsView, Index, Fetch, Reduction, ResultKeeper, Real ><<< gridSize, blockSize >>>(
+                    gridIdx, offsets, first, last, fetch, reduction, keeper, zero );
+                break;
+            case 16:
+                reduceSegmentsCSRHybridVectorKernel< 16, OffsetsView, Index, Fetch, Reduction, ResultKeeper, Real ><<< gridSize, blockSize >>>(
+                    gridIdx, offsets, first, last, fetch, reduction, keeper, zero );
+                break;
+            case 32:
+                reduceSegmentsCSRHybridVectorKernel< 32, OffsetsView, Index, Fetch, Reduction, ResultKeeper, Real ><<< gridSize, blockSize >>>(
+                    gridIdx, offsets, first, last, fetch, reduction, keeper, zero );
+                break;
+            case 64:
+                reduceSegmentsCSRHybridMultivectorKernel< ThreadsInBlock,  64, OffsetsView, Index, Fetch, Reduction, ResultKeeper, Real ><<< gridSize, blockSize >>>(
+                    gridIdx, offsets, first, last, fetch, reduction, keeper, zero );
+                break;
+            case 128:
+                reduceSegmentsCSRHybridMultivectorKernel< ThreadsInBlock, 128, OffsetsView, Index, Fetch, Reduction, ResultKeeper, Real ><<< gridSize, blockSize >>>(
+                    gridIdx, offsets, first, last, fetch, reduction, keeper, zero );
+                break;
+            case 256:
+                reduceSegmentsCSRHybridMultivectorKernel< ThreadsInBlock, 256, OffsetsView, Index, Fetch, Reduction, ResultKeeper, Real ><<< gridSize, blockSize >>>(
+                    gridIdx, offsets, first, last, fetch, reduction, keeper, zero );
+                break;
+            default:
+                throw std::runtime_error( std::string( "Wrong value of threadsPerSegment: " ) + std::to_string( this->threadsPerSegment ) );
+        }
+    }
+#endif
+}
+
+      } // namespace Segments
+   }  // namespace Algorithms
+} // namespace TNL
diff --git a/src/TNL/Algorithms/Segments/Kernels/CSRLightKernel.h b/src/TNL/Algorithms/Segments/Kernels/CSRLightKernel.h
new file mode 100644
index 0000000000000000000000000000000000000000..be5fc1331b22119384a9f390ae2deb04815230c3
--- /dev/null
+++ b/src/TNL/Algorithms/Segments/Kernels/CSRLightKernel.h
@@ -0,0 +1,80 @@
+/***************************************************************************
+                          CSRLightKernel.h -  description
+                             -------------------
+    begin                : Jun 9, 2021
+    copyright            : (C) 2021 by Tomas Oberhuber
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+#pragma once
+
+#include <TNL/Assert.h>
+#include <TNL/Cuda/LaunchHelpers.h>
+#include <TNL/Containers/VectorView.h>
+#include <TNL/Algorithms/ParallelFor.h>
+#include <TNL/Algorithms/Segments/detail/LambdaAdapter.h>
+
+namespace TNL {
+   namespace Algorithms {
+      namespace Segments {
+
+enum LightCSRSThreadsMapping { LightCSRConstantThreads, CSRLightAutomaticThreads, CSRLightAutomaticThreadsLightSpMV }; // NOTE(review): the meaning of each mode is defined by the implementation in CSRLightKernel.hpp
+// "Light" CSR kernel. It offers the same members as CSRHybridKernel (init, reset, views, kernel type, reduceSegments) plus a configurable threads mapping.
+template< typename Index,
+          typename Device >
+struct CSRLightKernel
+{
+   using IndexType = Index;
+   using DeviceType = Device;
+   using ViewType = CSRLightKernel< Index, Device >;
+   using ConstViewType = CSRLightKernel< Index, Device >;
+   // Both view types are the kernel type itself.
+   template< typename Offsets >
+   void init( const Offsets& offsets );
+
+   void reset();
+
+   ViewType getView();
+
+   ConstViewType getConstView() const;
+
+   static TNL::String getKernelType();
+   // NOTE(review): presumably a textual description of the current mapping/threads setting - confirm in the .hpp.
+   TNL::String getSetup() const;
+   // Reduces the segments given by first and last; the parameters mirror CSRHybridKernel::reduceSegments.
+   template< typename OffsetsView,
+             typename Fetch,
+             typename Reduction,
+             typename ResultKeeper,
+             typename Real >
+   void reduceSegments( const OffsetsView& offsets,
+                        Index first,
+                        Index last,
+                        Fetch& fetch,
+                        const Reduction& reduction,
+                        ResultKeeper& keeper,
+                        const Real& zero ) const;
+
+   // Accessors of the two configuration members declared in the protected section.
+   void setThreadsMapping( LightCSRSThreadsMapping mapping );
+
+   LightCSRSThreadsMapping getThreadsMapping() const;
+
+   void setThreadsPerSegment( int threadsPerSegment );
+
+   int getThreadsPerSegment() const;
+
+   protected:
+      // A default-constructed kernel uses the automatic threads mapping ...
+      LightCSRSThreadsMapping mapping = CSRLightAutomaticThreads;
+      // ... and 32 threads per segment.
+      int threadsPerSegment = 32;
+};
+
+      } // namespace Segments
+   }  // namespace Algorithms
+} // namespace TNL
+
+#include <TNL/Algorithms/Segments/Kernels/CSRLightKernel.hpp>
diff --git a/src/TNL/Algorithms/Segments/Kernels/CSRLightKernel.hpp b/src/TNL/Algorithms/Segments/Kernels/CSRLightKernel.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..93d3e2800e9d6b6b9277306458ea50c0c897a4e0
--- /dev/null
+++ b/src/TNL/Algorithms/Segments/Kernels/CSRLightKernel.hpp
@@ -0,0 +1,663 @@
+/***************************************************************************
+                          CSRLightKernel.hpp -  description
+                             -------------------
+    begin                : Jun 9, 2021
+    copyright            : (C) 2021 by Tomas Oberhuber
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+#pragma once
+
+#include <TNL/Assert.h>
+#include <TNL/Cuda/LaunchHelpers.h>
+#include <TNL/Containers/VectorView.h>
+#include <TNL/Algorithms/ParallelFor.h>
+#include <TNL/Algorithms/Segments/detail/LambdaAdapter.h>
+#include <TNL/Algorithms/Segments/Kernels/CSRLightKernel.h>
+
+namespace TNL {
+   namespace Algorithms {
+      namespace Segments {
+
+#ifdef HAVE_CUDA
+// CUDA kernel reducing the segments [ first, last ) with two threads per segment. The two
+// threads alternate over the elements of the segment (fetch receives the global element index
+// and the compute flag), their partial results are merged by one warp shuffle and the even
+// thread of the pair passes the result to keep(). gridID is the index of the launched grid.
+template< typename Real, typename Index, typename OffsetsView,
+          typename Fetch, typename Reduce, typename Keep >
+__global__
+void SpMVCSRLight2( OffsetsView offsets,
+                    const Index first,
+                    const Index last,
+                    Fetch fetch,
+                    Reduce reduce,
+                    Keep keep,
+                    const Real zero,
+                    const Index gridID )
+{
+   // Assuming each earlier grid ran getMaxGridXSize() full blocks: count blocks first, then scale by blockDim.x (in size_t).
+   const Index segmentIdx = first + static_cast< Index >( ( ( static_cast< size_t >( gridID ) * TNL::Cuda::getMaxGridXSize() + blockIdx.x ) * blockDim.x + threadIdx.x ) / 2 );
+   if( segmentIdx >= last )
+      return;
+
+   const Index inGroupID = threadIdx.x & 1; // & is cheaper than %
+   const Index maxID = offsets[ segmentIdx + 1 ];
+
+   Real result = zero;
+   bool compute = true;
+   for( Index i = offsets[ segmentIdx ] + inGroupID; i < maxID; i += 2 )
+      result = reduce( result, fetch( i, compute ) );
+
+   /* Parallel reduction */
+   result = reduce( result, __shfl_down_sync( 0xFFFFFFFF, result, 1 ) );
+
+   /* Write result */
+   if( inGroupID == 0 )
+      keep( segmentIdx, result );
+}
+
+// CUDA kernel reducing the segments [ first, last ) with four threads per segment. The four
+// threads stride over the elements of the segment (fetch receives the global element index
+// and the compute flag), their partial results are merged by warp shuffles and the first
+// thread of the group passes the result to keep(). gridID is the index of the launched grid.
+template< typename Real, typename Index, typename OffsetsView,
+          typename Fetch, typename Reduce, typename Keep >
+__global__
+void SpMVCSRLight4( OffsetsView offsets,
+                    const Index first,
+                    const Index last,
+                    Fetch fetch,
+                    Reduce reduce,
+                    Keep keep,
+                    const Real zero,
+                    const Index gridID )
+{
+   // Assuming each earlier grid ran getMaxGridXSize() full blocks: count blocks first, then scale by blockDim.x (in size_t).
+   const Index segmentIdx = first + static_cast< Index >( ( ( static_cast< size_t >( gridID ) * TNL::Cuda::getMaxGridXSize() + blockIdx.x ) * blockDim.x + threadIdx.x ) / 4 );
+   if( segmentIdx >= last )
+      return;
+
+   const Index inGroupID = threadIdx.x & 3; // & is cheaper than %
+   const Index maxID = offsets[ segmentIdx + 1 ];
+
+   Real result = zero;
+   bool compute = true;
+   for( Index i = offsets[ segmentIdx ] + inGroupID; i < maxID; i += 4 )
+      result = reduce( result, fetch( i, compute ) );
+
+   /* Parallel reduction */
+   result = reduce( result, __shfl_down_sync( 0xFFFFFFFF, result, 2 ) );
+   result = reduce( result, __shfl_down_sync( 0xFFFFFFFF, result, 1 ) );
+
+   /* Write result */
+   if( inGroupID == 0 )
+      keep( segmentIdx, result );
+
+}
+
+// CUDA kernel reducing the segments [ first, last ) with eight threads per segment. The eight
+// threads stride over the elements of the segment (fetch receives the global element index
+// and the compute flag), their partial results are merged by warp shuffles and the first
+// thread of the group passes the result to keep(). gridID is the index of the launched grid.
+template< typename Real, typename Index, typename OffsetsView,
+          typename Fetch, typename Reduce, typename Keep >
+__global__
+void SpMVCSRLight8( OffsetsView offsets,
+                    const Index first,
+                    const Index last,
+                    Fetch fetch,
+                    Reduce reduce,
+                    Keep keep,
+                    const Real zero,
+                    const Index gridID )
+{
+   // Assuming each earlier grid ran getMaxGridXSize() full blocks: count blocks first, then scale by blockDim.x (in size_t).
+   const Index segmentIdx = first + static_cast< Index >( ( ( static_cast< size_t >( gridID ) * TNL::Cuda::getMaxGridXSize() + blockIdx.x ) * blockDim.x + threadIdx.x ) / 8 );
+   if( segmentIdx >= last )
+      return;
+
+   // Position of this thread inside its group of eight, and the end of the segment.
+   const Index inGroupID = threadIdx.x & 7; // & is cheaper than %
+   const Index maxID = offsets[ segmentIdx + 1 ];
+
+   Real result = zero;
+   bool compute = true;
+   for( Index i = offsets[ segmentIdx ] + inGroupID; i < maxID; i += 8 )
+      result = reduce( result, fetch( i, compute ) );
+
+   /* Parallel reduction */
+   result = reduce( result, __shfl_down_sync( 0xFFFFFFFF, result, 4 ) );
+   result = reduce( result, __shfl_down_sync( 0xFFFFFFFF, result, 2 ) );
+   result = reduce( result, __shfl_down_sync( 0xFFFFFFFF, result, 1 ) );
+
+   /* Write result */
+   if( inGroupID == 0 )
+      keep( segmentIdx, result );
+}
+
+// CUDA kernel reducing the segments [ first, last ) with sixteen threads per segment. The
+// threads stride over the elements of the segment (fetch receives the global element index
+// and the compute flag), their partial results are merged by warp shuffles and the first
+// thread of the group passes the result to keep(). gridID is the index of the launched grid.
+template< typename Real, typename Index, typename OffsetsView,
+          typename Fetch, typename Reduce, typename Keep >
+__global__
+void SpMVCSRLight16( OffsetsView offsets,
+                     const Index first,
+                     const Index last,
+                     Fetch fetch,
+                     Reduce reduce,
+                     Keep keep,
+                     const Real zero,
+                     const Index gridID )
+{
+   // Assuming each earlier grid ran getMaxGridXSize() full blocks: count blocks first, then scale by blockDim.x (in size_t).
+   const Index segmentIdx = first + static_cast< Index >( ( ( static_cast< size_t >( gridID ) * TNL::Cuda::getMaxGridXSize() + blockIdx.x ) * blockDim.x + threadIdx.x ) / 16 );
+   if( segmentIdx >= last )
+      return;
+
+   // Position of this thread inside its group of sixteen, and the end of the segment.
+   const Index inGroupID = threadIdx.x & 15; // & is cheaper than %
+   const Index maxID = offsets[ segmentIdx + 1 ];
+
+   Real result = zero;
+   bool compute = true;
+   for( Index i = offsets[ segmentIdx ] + inGroupID; i < maxID; i += 16 )
+      result = reduce( result, fetch( i, compute ) );
+
+   /* Parallel reduction */
+   result = reduce( result, __shfl_down_sync( 0xFFFFFFFF, result, 8 ) );
+   result = reduce( result, __shfl_down_sync( 0xFFFFFFFF, result, 4 ) );
+   result = reduce( result, __shfl_down_sync( 0xFFFFFFFF, result, 2 ) );
+   result = reduce( result, __shfl_down_sync( 0xFFFFFFFF, result, 1 ) );
+
+   /* Write result */
+   if( inGroupID == 0 )
+      keep( segmentIdx, result );
+}
+
+/*template< typename Real,
+          typename Index,
+          typename OffsetsView,
+          typename Fetch,
+          typename Reduce,
+          typename Keep >
+__global__
+void SpMVCSRVector( OffsetsView offsets,
+                    const Index first,
+                    const Index last,
+                    Fetch fetch,
+                    Reduce reduce,
+                    Keep keep,
+                    const Real zero,
+                    const Index gridID )
+{
+   const int warpSize = 32;
+   const Index warpID = first + ((gridID * TNL::Cuda::getMaxGridXSize() ) + (blockIdx.x * blockDim.x) + threadIdx.x) / warpSize;
+   if (warpID >= last)
+      return;
+
+   Real result = zero;
+   const Index laneID = threadIdx.x & 31; // & is cheaper than %
+   Index endID = offsets[warpID + 1];
+
+   // Calculate result
+   bool compute = true;
+   for (Index i = offsets[warpID] + laneID; i < endID; i += warpSize)
+      result = reduce( result, fetch( i, compute ) );
+
+   // Reduction
+   result = reduce( result, __shfl_down_sync(0xFFFFFFFF, result, 16 ) );
+   result = reduce( result, __shfl_down_sync(0xFFFFFFFF, result,  8 ) );
+   result = reduce( result, __shfl_down_sync(0xFFFFFFFF, result,  4 ) );
+   result = reduce( result, __shfl_down_sync(0xFFFFFFFF, result,  2 ) );
+   result = reduce( result, __shfl_down_sync(0xFFFFFFFF, result,  1 ) );
+   // Write result
+   if( laneID == 0 )
+      keep( warpID, result );
+}*/
+
+// CUDA kernel reducing the segments [ first, last ) with ThreadsPerSegment threads per segment.
+// ThreadsPerSegment must be a power of two not larger than 32: the lane index is obtained by
+// masking and the partial results are merged by shuffles within one warp. fetch receives the
+// global element index and the compute flag; lane 0 of each group passes the result to keep().
+// gridID is the index of the launched grid.
+template< int ThreadsPerSegment, typename Real, typename Index, typename OffsetsView,
+          typename Fetch, typename Reduce, typename Keep >
+__global__
+void SpMVCSRVector( OffsetsView offsets,
+                    const Index first,
+                    const Index last,
+                    Fetch fetch,
+                    Reduce reduce,
+                    Keep keep,
+                    const Real zero,
+                    const Index gridID )
+{
+   // The launcher gives every earlier grid getMaxGridXSize() blocks: count blocks first, then scale by blockDim.x (in size_t).
+   const Index segmentIdx = first + static_cast< Index >( ( ( static_cast< size_t >( gridID ) * TNL::Cuda::getMaxGridXSize() + blockIdx.x ) * blockDim.x + threadIdx.x ) / ThreadsPerSegment );
+   if( segmentIdx >= last )
+      return;
+
+   Real result = zero;
+   const Index laneID = threadIdx.x & ( ThreadsPerSegment - 1 ); // & is cheaper than %
+   Index endID = offsets[ segmentIdx + 1 ];
+
+   // Calculate result
+   bool compute = true;
+   for( Index i = offsets[ segmentIdx ] + laneID; i < endID; i += ThreadsPerSegment )
+      result = reduce( result, fetch( i, compute ) );
+
+   // Reduction
+   if( ThreadsPerSegment > 16 )
+   {
+      result = reduce( result, __shfl_down_sync(0xFFFFFFFF, result, 16 ) );
+      result = reduce( result, __shfl_down_sync(0xFFFFFFFF, result,  8 ) );
+      result = reduce( result, __shfl_down_sync(0xFFFFFFFF, result,  4 ) );
+      result = reduce( result, __shfl_down_sync(0xFFFFFFFF, result,  2 ) );
+      result = reduce( result, __shfl_down_sync(0xFFFFFFFF, result,  1 ) );
+   } else if( ThreadsPerSegment > 8 ) {
+      result = reduce( result, __shfl_down_sync(0xFFFFFFFF, result,  8 ) );
+      result = reduce( result, __shfl_down_sync(0xFFFFFFFF, result,  4 ) );
+      result = reduce( result, __shfl_down_sync(0xFFFFFFFF, result,  2 ) );
+      result = reduce( result, __shfl_down_sync(0xFFFFFFFF, result,  1 ) );
+   } else if( ThreadsPerSegment > 4 ) {
+      result = reduce( result, __shfl_down_sync(0xFFFFFFFF, result,  4 ) );
+      result = reduce( result, __shfl_down_sync(0xFFFFFFFF, result,  2 ) );
+      result = reduce( result, __shfl_down_sync(0xFFFFFFFF, result,  1 ) );
+   } else if( ThreadsPerSegment > 2 ) {
+      result = reduce( result, __shfl_down_sync(0xFFFFFFFF, result,  2 ) );
+      result = reduce( result, __shfl_down_sync(0xFFFFFFFF, result,  1 ) );
+   } else if( ThreadsPerSegment > 1 )
+      result = reduce( result, __shfl_down_sync(0xFFFFFFFF, result,  1 ) );
+
+   // Store result
+   if( laneID == 0 )
+      keep( segmentIdx, result );
+}
+
+
+// CUDA kernel reducing the segments [ first, last ) when one segment is served by several whole
+// warps (ThreadsPerSegment is a multiple of the warp size, BlockSize is the block size used for
+// the launch). Each thread strides over the elements of its segment, every warp merges its
+// partial results with shuffles and stores one value in shared memory, and finally the first
+// lanes of warp 0 merge the per-warp values of each segment of the block and call keep().
+// gridIdx is forwarded to TNL::Cuda::getGlobalThreadIdx().
+template< int BlockSize, int ThreadsPerSegment, typename Offsets, typename Index,
+          typename Fetch, typename Reduction, typename ResultKeeper, typename Real >
+__global__
+void reduceSegmentsCSRLightMultivectorKernel(
+    int gridIdx,
+    const Offsets offsets,
+    Index first,
+    Index last,
+    Fetch fetch,
+    const Reduction reduce,
+    ResultKeeper keep,
+    const Real zero )
+{
+    const Index segmentIdx =  TNL::Cuda::getGlobalThreadIdx( gridIdx ) / ThreadsPerSegment + first;
+    if( segmentIdx >= last )
+        return;
+    // NOTE(review): threads returning above never reach the barriers below - confirm this is fine on all targets.
+    __shared__ Real shared[ 32 ];  // covers every index touched by lanes 0-15 of warp 0 below (at most 15 + 16)
+    if( threadIdx.x < 32 )
+        shared[ threadIdx.x ] = zero;
+    __syncthreads();               // finish the zeroing before any warp stores its partial result
+    const int laneIdx = threadIdx.x & ( ThreadsPerSegment - 1 ); // & is cheaper than %
+    const int inWarpLaneIdx = threadIdx.x & ( TNL::Cuda::getWarpSize() - 1 ); // & is cheaper than %
+    const Index beginIdx = offsets[ segmentIdx ];
+    const Index endIdx   = offsets[ segmentIdx + 1 ] ;
+
+    Real result = zero;
+    bool compute( true );
+    Index localIdx = laneIdx;
+    for( Index globalIdx = beginIdx + laneIdx; globalIdx < endIdx && compute; globalIdx += ThreadsPerSegment )
+    {
+       result = reduce( result, detail::FetchLambdaAdapter< Index, Fetch >::call( fetch, segmentIdx, localIdx, globalIdx, compute ) );
+       localIdx += ThreadsPerSegment;
+    }
+    result = reduce( result, __shfl_down_sync( 0xFFFFFFFF, result, 16 ) );  // reduce(), not +=: any reduction may be used
+    result = reduce( result, __shfl_down_sync( 0xFFFFFFFF, result,  8 ) );
+    result = reduce( result, __shfl_down_sync( 0xFFFFFFFF, result,  4 ) );
+    result = reduce( result, __shfl_down_sync( 0xFFFFFFFF, result,  2 ) );
+    result = reduce( result, __shfl_down_sync( 0xFFFFFFFF, result,  1 ) );
+
+    const Index warpIdx = threadIdx.x / TNL::Cuda::getWarpSize();
+    if( inWarpLaneIdx == 0 )
+        shared[ warpIdx ] = result;
+
+    __syncthreads();
+    // Reduction in shared
+    if( warpIdx == 0 && inWarpLaneIdx < 16 )
+    {
+        // The warps of one segment occupy consecutive slots of shared; fold them into the first slot.
+        constexpr int warpsPerSegment = ThreadsPerSegment / TNL::Cuda::getWarpSize();
+        if( warpsPerSegment >= 32 )
+        {
+            shared[ inWarpLaneIdx ] =  reduce( shared[ inWarpLaneIdx ], shared[ inWarpLaneIdx + 16 ] );
+            __syncwarp();
+        }
+        if( warpsPerSegment >= 16 )
+        {
+            shared[ inWarpLaneIdx ] =  reduce( shared[ inWarpLaneIdx ], shared[ inWarpLaneIdx +  8 ] );
+            __syncwarp();
+        }
+        if( warpsPerSegment >= 8 )
+        {
+            shared[ inWarpLaneIdx ] =  reduce( shared[ inWarpLaneIdx ], shared[ inWarpLaneIdx +  4 ] );
+            __syncwarp();
+        }
+        if( warpsPerSegment >= 4 )
+        {
+            shared[ inWarpLaneIdx ] =  reduce( shared[ inWarpLaneIdx ], shared[ inWarpLaneIdx +  2 ] );
+            __syncwarp();
+        }
+        if( warpsPerSegment >= 2 )
+        {
+            shared[ inWarpLaneIdx ] =  reduce( shared[ inWarpLaneIdx ], shared[ inWarpLaneIdx +  1 ] );
+            __syncwarp();
+        }
+        constexpr int segmentsCount = BlockSize / ThreadsPerSegment;
+        if( inWarpLaneIdx < segmentsCount && segmentIdx + inWarpLaneIdx < last )
+        {
+            // Lane i publishes the i-th segment of this block; its value sits in the first slot of that segment.
+            keep( segmentIdx + inWarpLaneIdx, shared[ inWarpLaneIdx * ThreadsPerSegment / 32 ] );
+        }
+    }
+}
+
+#endif
+// Selects the implementation of the segment reduction for CSRLightKernel. DispatchScalarCSR
+// defaults to true when detail::CheckFetchLambda reports that the fetch lambda has all
+// parameters, or when Device is Devices::Host; the value picks one of the two partial
+// specializations that follow this declaration.
+template< typename Index, typename Device, typename Fetch, typename Reduce, typename Keep,
+          bool DispatchScalarCSR =
+            detail::CheckFetchLambda< Index, Fetch >::hasAllParameters() ||
+            std::is_same< Device, Devices::Host >::value >
+struct CSRLightKernelreduceSegmentsDispatcher;
+
+// Specialization chosen when DispatchScalarCSR is true: the reduction is not run by the
+// Light CUDA kernels but forwarded unchanged to CSRScalarKernel.
+template< typename Index, typename Device, typename Fetch, typename Reduction, typename ResultKeeper >
+struct CSRLightKernelreduceSegmentsDispatcher< Index, Device, Fetch, Reduction, ResultKeeper, true >
+{
+   // Forwards offsets, the segment range first..last, the lambdas and the initial value zero to
+   // CSRScalarKernel::reduceSegments(). threadsPerSegment is accepted only so that both
+   // specializations can be called with the same argument list; it is not used here.
+   template< typename Offsets, typename Real >
+   static void reduce( const Offsets& offsets,
+                       Index first,
+                       Index last,
+                       Fetch& fetch,
+                       const Reduction& reduce,
+                       ResultKeeper& keep,
+                       const Real& zero,
+                       const Index threadsPerSegment )
+   {
+      // The alias keeps the qualified name of the scalar kernel readable.
+      using ScalarKernel = TNL::Algorithms::Segments::CSRScalarKernel< Index, Device >;
+      ScalarKernel::reduceSegments( offsets, first, last, fetch, reduce, keep, zero );
+   }
+};
+
+/**
+ * \brief Dispatcher specialization for DispatchScalarCSR == false.
+ *
+ * It launches the CUDA kernels of the light CSR kernel. When HAVE_CUDA is not
+ * defined, the method reduce has an empty body.
+ */
+template< typename Index,
+          typename Device,
+          typename Fetch,
+          typename Reduce,
+          typename Keep >
+struct CSRLightKernelreduceSegmentsDispatcher< Index, Device, Fetch, Reduce, Keep, false >
+{
+   /**
+    * \brief Launches the reduction of the segments with indexes in [ first, last ).
+    *
+    * \param offsets holds the segments offsets, it is passed to the kernels as is.
+    * \param first is the index of the first segment of the range.
+    * \param last is the index of the segment following the range.
+    * \param fetch is the lambda passed to the kernels for fetching the data.
+    * \param reduce is the lambda passed to the kernels for the reduction.
+    * \param keep is the lambda passed to the kernels for storing the results.
+    * \param zero is passed to the kernels, presumably as the neutral element of
+    *        the reduction - confirm in the kernels.
+    * \param threadsPerSegment selects the kernel: 1, 2, 4, 8, 16 and 32 launch
+    *        SpMVCSRVector, 64 and any value >= 128 launch
+    *        reduceSegmentsCSRLightMultivectorKernel, other values launch nothing.
+    */
+   template< typename OffsetsView,
+             typename Real >
+   static void reduce( const OffsetsView& offsets,
+                       Index first,
+                       Index last,
+                       Fetch& fetch,
+                       const Reduce& reduce,
+                       Keep& keep,
+                       const Real& zero,
+                       const Index threadsPerSegment )
+   {
+#ifdef HAVE_CUDA
+      // An empty or inverted range must not reach the unsigned arithmetic below.
+      if( last <= first )
+         return;
+
+      const size_t threads = 128;
+      const size_t maxThreadsPerGrid = TNL::Cuda::getMaxGridXSize() * threads;
+      Index blocks = 0;
+
+      // Evaluate the product in size_t, the Index type could overflow for large ranges.
+      size_t neededThreads = static_cast< size_t >( threadsPerSegment ) * static_cast< size_t >( last - first );
+
+      // Each pass launches one grid; more grids are used when the threads do not
+      // fit into the maximal grid size. The grid index is passed to the kernels.
+      for( Index grid = 0; neededThreads != 0; ++grid )
+      {
+         if( maxThreadsPerGrid >= neededThreads )
+         {
+            blocks = roundUpDivision( neededThreads, threads );
+            neededThreads = 0;
+         }
+         else
+         {
+            blocks = TNL::Cuda::getMaxGridXSize();
+            neededThreads -= maxThreadsPerGrid;
+         }
+
+         // The conditions are mutually exclusive, at most one kernel is launched per pass.
+         if( threadsPerSegment == 1 )
+            SpMVCSRVector< 1, Real, Index, OffsetsView, Fetch, Reduce, Keep ><<< blocks, threads >>>(
+               offsets, first, last, fetch, reduce, keep, zero, grid );
+         else if( threadsPerSegment == 2 )
+            SpMVCSRVector< 2, Real, Index, OffsetsView, Fetch, Reduce, Keep ><<< blocks, threads >>>(
+               offsets, first, last, fetch, reduce, keep, zero, grid );
+         else if( threadsPerSegment == 4 )
+            SpMVCSRVector< 4, Real, Index, OffsetsView, Fetch, Reduce, Keep ><<< blocks, threads >>>(
+               offsets, first, last, fetch, reduce, keep, zero, grid );
+         else if( threadsPerSegment == 8 )
+            SpMVCSRVector< 8, Real, Index, OffsetsView, Fetch, Reduce, Keep ><<< blocks, threads >>>(
+               offsets, first, last, fetch, reduce, keep, zero, grid );
+         else if( threadsPerSegment == 16 )
+            SpMVCSRVector< 16, Real, Index, OffsetsView, Fetch, Reduce, Keep ><<< blocks, threads >>>(
+               offsets, first, last, fetch, reduce, keep, zero, grid );
+         else if( threadsPerSegment == 32 )
+            SpMVCSRVector< 32, Real, Index, OffsetsView, Fetch, Reduce, Keep ><<< blocks, threads >>>(
+               offsets, first, last, fetch, reduce, keep, zero, grid );
+         else if( threadsPerSegment == 64 )
+         { // Execute CSR MultiVector
+            reduceSegmentsCSRLightMultivectorKernel< 128, 64 ><<<blocks, threads>>>(
+                     grid, offsets, first, last, fetch, reduce, keep, zero );
+         }
+         else if( threadsPerSegment >= 128 )
+         { // Execute CSR MultiVector
+            reduceSegmentsCSRLightMultivectorKernel< 128, 128 ><<<blocks, threads>>>(
+                     grid, offsets, first, last, fetch, reduce, keep, zero );
+         }
+      }
+#endif
+   }
+};
+
+
+/**
+ * \brief Sets up the number of threads per segment for the given offsets.
+ *
+ * For both automatic mappings the number of threads is derived from the average
+ * number of elements per segment (rounded up): at most 2, 4, 8 or 16 elements
+ * give 2, 4, 8 or 16 threads, anything larger gives 32 threads. With any other
+ * mapping, or when there is no segment, the current number of threads per
+ * segment is left untouched.
+ *
+ * \param offsets are the segments offsets; the last one is read with getElement
+ *        and taken as the total number of elements.
+ */
+template< typename Index,
+          typename Device >
+    template< typename Offsets >
+void
+CSRLightKernel< Index, Device >::
+init( const Offsets& offsets )
+{
+   const Index segmentsCount = offsets.getSize() - 1;
+   const auto threadsMapping = this->getThreadsMapping();
+
+   // Both automatic mappings share one heuristic. Without segments there is no
+   // average to compute and the division below would be a division by zero.
+   if( segmentsCount > 0 &&
+       ( threadsMapping == CSRLightAutomaticThreads ||
+         threadsMapping == CSRLightAutomaticThreadsLightSpMV ) )
+   {
+      // average number of elements per segment, rounded up
+      const Index elementsInSegment = roundUpDivision( offsets.getElement( segmentsCount ), segmentsCount );
+      if( elementsInSegment <= 2 )
+         this->threadsPerSegment = 2;
+      else if( elementsInSegment <= 4 )
+         this->threadsPerSegment = 4;
+      else if( elementsInSegment <= 8 )
+         this->threadsPerSegment = 8;
+      else if( elementsInSegment <= 16 )
+         this->threadsPerSegment = 16;
+      else
+         this->threadsPerSegment = 32; // CSR Vector
+   }
+
+   // The automatic choice never exceeds 32 threads per segment, the same limit
+   // is enforced by setThreadsPerSegment().
+   TNL_ASSERT_GE( this->threadsPerSegment, 0, "" );
+   TNL_ASSERT_LE( this->threadsPerSegment, 33, "" );
+}
+
+template< typename Index,
+          typename Device >
+void
+CSRLightKernel< Index, Device >::
+reset()
+{
+   this->threadsPerSegment = 0;   // with 0 threads per segment the CUDA dispatcher computes zero needed threads and launches no kernel
+}
+
+template< typename Index,
+          typename Device >
+auto
+CSRLightKernel< Index, Device >::
+getView() -> ViewType
+{
+    return *this;   // the returned ViewType is constructed from this kernel
+}
+
+template< typename Index,
+          typename Device >
+TNL::String
+CSRLightKernel< Index, Device >::
+getKernelType()
+{
+    return "Light";   // name of this kernel; CSRScalarKernel::getKernelType() returns "Scalar"
+}
+
+template< typename Index,
+          typename Device >
+auto
+CSRLightKernel< Index, Device >::
+getConstView() const -> ConstViewType
+{
+    return *this;   // the returned ConstViewType is constructed from this kernel
+}
+
+
+template< typename Index,
+          typename Device >
+    template< typename OffsetsView,
+              typename Fetch,
+              typename Reduce,
+              typename Keep,
+              typename Real >
+void
+CSRLightKernel< Index, Device >::
+reduceSegments( const OffsetsView& offsets,
+                Index first,
+                Index last,
+                Fetch& fetch,
+                const Reduce& reduce,
+                Keep& keep,
+                const Real& zero ) const
+{
+   TNL_ASSERT_GE( this->threadsPerSegment, 0, "" );
+   TNL_ASSERT_LE( this->threadsPerSegment, 33, "" );   // NOTE(review): the dispatcher also has branches for 64 and >= 128 threads, which this bound rejects - confirm the intent
+   CSRLightKernelreduceSegmentsDispatcher< Index, Device, Fetch, Reduce, Keep >::reduce(   // scalar fallback or CUDA kernels, selected at compile time
+      offsets, first, last, fetch, reduce, keep, zero, this->threadsPerSegment );
+}
+
+// Stores the mapping which init() consults when it chooses the number of threads per segment.
+template< typename Index, typename Device >
+void
+CSRLightKernel< Index, Device >::
+setThreadsMapping( LightCSRSThreadsMapping mapping )
+{
+   this->mapping = mapping;
+}
+
+template< typename Index,
+          typename Device >
+LightCSRSThreadsMapping
+CSRLightKernel< Index, Device >::
+getThreadsMapping() const
+{
+   return this->mapping;   // the value stored by setThreadsMapping()
+}
+
+/**
+ * \brief Sets the number of threads per segment explicitly.
+ * \param threadsPerSegment must be 1, 2, 4, 8, 16 or 32, otherwise
+ *        std::runtime_error is thrown and the stored value stays unchanged.
+ */
+template< typename Index, typename Device >
+void
+CSRLightKernel< Index, Device >::setThreadsPerSegment( int threadsPerSegment )
+{
+   const bool inRange = threadsPerSegment >= 1 && threadsPerSegment <= 32;
+   // inside the range, exactly the powers of two have a single bit set
+   if( ! inRange || ( threadsPerSegment & ( threadsPerSegment - 1 ) ) != 0 )
+      throw std::runtime_error( "Number of threads per segment must be power of 2 - 1, 2, ... 32." );
+   this->threadsPerSegment = threadsPerSegment;
+}
+
+template< typename Index,
+          typename Device >
+int
+CSRLightKernel< Index, Device >::
+getThreadsPerSegment() const
+{
+   return this->threadsPerSegment;   // set by init(), reset() or setThreadsPerSegment()
+}
+
+
+      } // namespace Segments
+   }  // namespace Algorithms
+} // namespace TNL
diff --git a/src/TNL/Algorithms/Segments/CSRScalarKernel.h b/src/TNL/Algorithms/Segments/Kernels/CSRScalarKernel.h
similarity index 93%
rename from src/TNL/Algorithms/Segments/CSRScalarKernel.h
rename to src/TNL/Algorithms/Segments/Kernels/CSRScalarKernel.h
index c767083193c59abc770f38a8bb52abb3c4ac06a0..f0c8accd3037e4f690249da9e254a1a0cafec6a3 100644
--- a/src/TNL/Algorithms/Segments/CSRScalarKernel.h
+++ b/src/TNL/Algorithms/Segments/Kernels/CSRScalarKernel.h
@@ -46,7 +46,7 @@ struct CSRScalarKernel
               typename ResultKeeper,
               typename Real,
               typename... Args >
-    static void segmentsReduction( const OffsetsView& offsets,
+    static void reduceSegments( const OffsetsView& offsets,
                                Index first,
                                Index last,
                                Fetch& fetch,
@@ -60,4 +60,4 @@ struct CSRScalarKernel
    }  // namespace Algorithms
 } // namespace TNL
 
-#include <TNL/Algorithms/Segments/CSRScalarKernel.hpp>
\ No newline at end of file
+#include <TNL/Algorithms/Segments/Kernels/CSRScalarKernel.hpp>
diff --git a/src/TNL/Algorithms/Segments/Kernels/CSRScalarKernel.hpp b/src/TNL/Algorithms/Segments/Kernels/CSRScalarKernel.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..e901acfb99529d12a488b13722f6611ade7c30f1
--- /dev/null
+++ b/src/TNL/Algorithms/Segments/Kernels/CSRScalarKernel.hpp
@@ -0,0 +1,226 @@
+/***************************************************************************
+                          CSRScalarKernel.hpp -  description
+                             -------------------
+    begin                : Jan 23, 2021 -> Joe Biden inauguration
+    copyright            : (C) 2021 by Tomas Oberhuber
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+#pragma once
+
+#include <TNL/Assert.h>
+#include <TNL/Cuda/LaunchHelpers.h>
+#include <TNL/Containers/VectorView.h>
+#include <TNL/Algorithms/ParallelFor.h>
+#include <TNL/Algorithms/Segments/Kernels/CSRScalarKernel.h>
+#include <TNL/Algorithms/Segments/detail/LambdaAdapter.h>
+
+namespace TNL {
+   namespace Algorithms {
+      namespace Segments {
+
+template< typename Index,
+          typename Device,
+          typename Fetch,
+          typename Reduce,
+          typename Keep,
+          bool DispatchScalarCSR = detail::CheckFetchLambda< Index, Fetch >::hasAllParameters() >   // true: fetch( segmentIdx, localIdx, globalIdx, compute ), false: fetch( globalIdx, compute )
+struct CSRScalarKernelreduceSegmentsDispatcher;   // declaration only, see the two partial specializations below
+
+template< typename Index,
+          typename Device,
+          typename Fetch,
+          typename Reduction,
+          typename ResultKeeper >
+struct CSRScalarKernelreduceSegmentsDispatcher< Index, Device, Fetch, Reduction, ResultKeeper, true >
+{
+   // Variant for a fetch lambda called as fetch( segmentIdx, localIdx, globalIdx, compute ).
+   template< typename Offsets,
+             typename Real >
+   static void reduce( const Offsets& offsets,
+                       Index first,
+                       Index last,
+                       Fetch& fetch,
+                       const Reduction& reduction,
+                       ResultKeeper& keep,
+                       const Real& zero )
+   {
+      auto l = [=] __cuda_callable__ ( const Index segmentIdx ) mutable {   // reduces one whole segment and stores the result with keep
+         const Index begin = offsets[ segmentIdx ];
+         const Index end = offsets[ segmentIdx + 1 ];
+         Real aux( zero );
+         Index localIdx( 0 );   // rank of the element within the segment, incremented with every fetch
+         bool compute( true );   // handed to fetch; the loop condition re-checks it before every element
+         for( Index globalIdx = begin; globalIdx < end && compute; globalIdx++  )
+             aux = reduction( aux, fetch( segmentIdx, localIdx++, globalIdx, compute ) );
+         keep( segmentIdx, aux );
+      };
+      // Sequential: plain loop; Host: the same loop, with OpenMP when HAVE_OPENMP is defined; other devices: ParallelFor.
+      if( std::is_same< Device, TNL::Devices::Sequential >::value )
+      {
+         for( Index segmentIdx = first; segmentIdx < last; segmentIdx ++ )
+            l( segmentIdx );
+      }
+      else if( std::is_same< Device, TNL::Devices::Host >::value )
+      {
+#ifdef HAVE_OPENMP
+        #pragma omp parallel for firstprivate( l ) schedule( dynamic, 100 ), if( Devices::Host::isOMPEnabled() )
+#endif
+         for( Index segmentIdx = first; segmentIdx < last; segmentIdx ++ )
+            l( segmentIdx );
+      }
+      else
+         Algorithms::ParallelFor< Device >::exec( first, last, l );
+   }
+};
+
+template< typename Index,
+          typename Device,
+          typename Fetch,
+          typename Reduce,
+          typename Keep >
+struct CSRScalarKernelreduceSegmentsDispatcher< Index, Device, Fetch, Reduce, Keep, false >   // variant for a fetch lambda called as fetch( globalIdx, compute )
+{
+   template< typename OffsetsView,
+             typename Real >
+   static void reduce( const OffsetsView& offsets,
+                       Index first,
+                       Index last,
+                       Fetch& fetch,
+                       const Reduce& reduction,
+                       Keep& keep,
+                       const Real& zero )
+   {
+      auto l = [=] __cuda_callable__ ( const Index segmentIdx ) mutable {   // reduces one whole segment and stores the result with keep
+         const Index begin = offsets[ segmentIdx ];
+         const Index end = offsets[ segmentIdx + 1 ];
+         Real aux( zero );
+         bool compute( true );   // handed to fetch; the loop condition re-checks it before every element
+         for( Index globalIdx = begin; globalIdx < end && compute; globalIdx++  )
+             aux = reduction( aux, fetch( globalIdx, compute ) );
+         keep( segmentIdx, aux );
+      };
+      // Sequential: plain loop; Host: the same loop, with OpenMP when HAVE_OPENMP is defined; other devices: ParallelFor.
+      if( std::is_same< Device, TNL::Devices::Sequential >::value )
+      {
+         for( Index segmentIdx = first; segmentIdx < last; segmentIdx ++ )
+            l( segmentIdx );
+      }
+      else if( std::is_same< Device, TNL::Devices::Host >::value )
+      {
+#ifdef HAVE_OPENMP
+        #pragma omp parallel for firstprivate( l ) schedule( dynamic, 100 ), if( Devices::Host::isOMPEnabled() )
+#endif
+         for( Index segmentIdx = first; segmentIdx < last; segmentIdx ++ )
+            l( segmentIdx );
+      }
+      else
+         Algorithms::ParallelFor< Device >::exec( first, last, l );
+
+   }
+};
+
+
+template< typename Index,
+          typename Device >
+    template< typename Offsets >
+void
+CSRScalarKernel< Index, Device >::
+init( const Offsets& offsets )
+{   // intentionally empty: nothing is derived from the offsets here
+}
+
+template< typename Index,
+          typename Device >
+void
+CSRScalarKernel< Index, Device >::
+reset()
+{   // intentionally empty: init() stores nothing that would have to be reset
+}
+
+template< typename Index,
+          typename Device >
+auto
+CSRScalarKernel< Index, Device >::
+getView() -> ViewType
+{
+    return *this;   // the returned ViewType is constructed from this kernel
+}
+
+template< typename Index,
+          typename Device >
+auto
+CSRScalarKernel< Index, Device >::
+getConstView() const -> ConstViewType
+{
+    return *this;   // the returned ConstViewType is constructed from this kernel
+}
+
+template< typename Index,
+          typename Device >
+TNL::String
+CSRScalarKernel< Index, Device >::
+getKernelType()
+{
+    return "Scalar";   // name of this kernel
+}
+
+template< typename Index,
+          typename Device >
+    template< typename OffsetsView,
+              typename Fetch,
+              typename Reduction,
+              typename ResultKeeper,
+              typename Real,
+              typename... Args >
+void
+CSRScalarKernel< Index, Device >::
+reduceSegments( const OffsetsView& offsets,
+                   Index first,
+                   Index last,
+                   Fetch& fetch,
+                   const Reduction& reduction,
+                   ResultKeeper& keeper,
+                   const Real& zero,
+                   Args... args )   // accepted, but not forwarded to the dispatcher
+{
+   CSRScalarKernelreduceSegmentsDispatcher< Index, Device, Fetch, Reduction, ResultKeeper >::reduce(   // the variant is selected at compile time by the signature of fetch
+      offsets, first, last, fetch, reduction, keeper, zero );
+   /* NOTE(review): previous single-lambda implementation, no longer compiled; superseded by the dispatcher call above and a candidate for removal.
+    auto l = [=] __cuda_callable__ ( const IndexType segmentIdx, Args... args ) mutable {
+        const IndexType begin = offsets[ segmentIdx ];
+        const IndexType end = offsets[ segmentIdx + 1 ];
+        Real aux( zero );
+        IndexType localIdx( 0 );
+        bool compute( true );
+        for( IndexType globalIdx = begin; globalIdx < end && compute; globalIdx++  )
+            aux = reduction( aux, detail::FetchLambdaAdapter< IndexType, Fetch >::call( fetch, segmentIdx, localIdx++, globalIdx, compute ) );
+        keeper( segmentIdx, aux );
+    };
+
+     if( std::is_same< DeviceType, TNL::Devices::Host >::value )
+    {
+#ifdef HAVE_OPENMP
+        #pragma omp parallel for firstprivate( l ) schedule( dynamic, 100 ), if( Devices::Host::isOMPEnabled() )
+#endif
+        for( Index segmentIdx = first; segmentIdx < last; segmentIdx ++ )
+            l( segmentIdx, args... );
+        {
+            const IndexType begin = offsets[ segmentIdx ];
+            const IndexType end = offsets[ segmentIdx + 1 ];
+            Real aux( zero );
+            IndexType localIdx( 0 );
+            bool compute( true );
+            for( IndexType globalIdx = begin; globalIdx < end && compute; globalIdx++  )
+                aux = reduction( aux, detail::FetchLambdaAdapter< IndexType, Fetch >::call( fetch, segmentIdx, localIdx++, globalIdx, compute ) );
+            keeper( segmentIdx, aux );
+        }
+    }
+    else
+        Algorithms::ParallelFor< Device >::exec( first, last, l, args... );*/
+}
+      } // namespace Segments
+   }  // namespace Algorithms
+} // namespace TNL
diff --git a/src/TNL/Algorithms/Segments/CSRVectorKernel.h b/src/TNL/Algorithms/Segments/Kernels/CSRVectorKernel.h
similarity index 93%
rename from src/TNL/Algorithms/Segments/CSRVectorKernel.h
rename to src/TNL/Algorithms/Segments/Kernels/CSRVectorKernel.h
index 074f15c5a35c8c096c52134bc7f6fbb5dd536bec..0654b5ef6f048c19df9ba7d40b65ebe3e7ed5ad8 100644
--- a/src/TNL/Algorithms/Segments/CSRVectorKernel.h
+++ b/src/TNL/Algorithms/Segments/Kernels/CSRVectorKernel.h
@@ -46,7 +46,7 @@ struct CSRVectorKernel
              typename ResultKeeper,
              typename Real,
              typename... Args >
-   static void segmentsReduction( const OffsetsView& offsets,
+   static void reduceSegments( const OffsetsView& offsets,
                                   Index first,
                                   Index last,
                                   Fetch& fetch,
@@ -60,4 +60,4 @@ struct CSRVectorKernel
    }  // namespace Algorithms
 } // namespace TNL
 
-#include <TNL/Algorithms/Segments/CSRVectorKernel.hpp>
+#include <TNL/Algorithms/Segments/Kernels/CSRVectorKernel.hpp>
diff --git a/src/TNL/Algorithms/Segments/CSRVectorKernel.hpp b/src/TNL/Algorithms/Segments/Kernels/CSRVectorKernel.hpp
similarity index 94%
rename from src/TNL/Algorithms/Segments/CSRVectorKernel.hpp
rename to src/TNL/Algorithms/Segments/Kernels/CSRVectorKernel.hpp
index 847d1c355f1d775259b3291744bcae287144d7e4..cf7d80af65b8bbefcbba0490b5658ae905fc9e6d 100644
--- a/src/TNL/Algorithms/Segments/CSRVectorKernel.hpp
+++ b/src/TNL/Algorithms/Segments/Kernels/CSRVectorKernel.hpp
@@ -15,7 +15,7 @@
 #include <TNL/Containers/VectorView.h>
 #include <TNL/Algorithms/ParallelFor.h>
 #include <TNL/Algorithms/Segments/detail/LambdaAdapter.h>
-#include <TNL/Algorithms/Segments/CSRVectorKernel.h>
+#include <TNL/Algorithms/Segments/Kernels/CSRVectorKernel.h>
 
 namespace TNL {
    namespace Algorithms {
@@ -30,7 +30,7 @@ template< typename Offsets,
           typename Real,
           typename... Args >
 __global__
-void segmentsReductionCSRKernelVector(
+void reduceSegmentsCSRKernelVector(
     int gridIdx,
     const Offsets offsets,
     Index first,
@@ -130,7 +130,7 @@ template< typename Index,
               typename... Args >
 void
 CSRVectorKernel< Index, Device >::
-segmentsReduction( const OffsetsView& offsets,
+reduceSegments( const OffsetsView& offsets,
                          Index first,
                          Index last,
                          Fetch& fetch,
@@ -149,7 +149,7 @@ segmentsReduction( const OffsetsView& offsets,
     {
         dim3 gridSize;
         TNL::Cuda::setupGrid( blocksCount, gridsCount, gridIdx, gridSize );
-        segmentsReductionCSRKernelVector< OffsetsView, IndexType, Fetch, Reduction, ResultKeeper, Real, Args... >
+        reduceSegmentsCSRKernelVector< OffsetsView, IndexType, Fetch, Reduction, ResultKeeper, Real, Args... >
         <<< gridSize, blockSize >>>(
             gridIdx.x, offsets, first, last, fetch, reduction, keeper, zero, args... );
     };
diff --git a/src/TNL/Algorithms/Segments/Kernels/details/CSRAdaptiveKernelBlockDescriptor.h b/src/TNL/Algorithms/Segments/Kernels/details/CSRAdaptiveKernelBlockDescriptor.h
new file mode 100644
index 0000000000000000000000000000000000000000..83faa105d198be0a0d5e97cd3d550a085cce2818
--- /dev/null
+++ b/src/TNL/Algorithms/Segments/Kernels/details/CSRAdaptiveKernelBlockDescriptor.h
@@ -0,0 +1,251 @@
+/***************************************************************************
+                          CSRAdaptiveKernelBlockDescriptor.h -  description
+                             -------------------
+    begin                : Jan 25, 2021 -> Joe Biden inauguration
+    copyright            : (C) 2021 by Tomas Oberhuber
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+#pragma once
+
+namespace TNL {
+   namespace Algorithms {
+      namespace Segments {
+         namespace detail {
+
+enum class Type {   // kind of a block described by CSRAdaptiveKernelBlockDescriptor
+   /* LONG must stay 0: in the union variant of the descriptor the type byte overlaps index[1] and a non-zero value would overwrite it. */
+   LONG = 0,
+   STREAM = 1,
+   VECTOR = 2
+};
+
+//#define CSR_ADAPTIVE_UNION
+
+#ifdef CSR_ADAPTIVE_UNION
+template< typename Index >
+union CSRAdaptiveKernelBlockDescriptor   // packed variant, compiled only when CSR_ADAPTIVE_UNION is defined
+{
+   CSRAdaptiveKernelBlockDescriptor(Index row, Type type = Type::VECTOR, Index index = 0, uint8_t warpsCount = 0) noexcept   // warpsCount is not stored by this variant
+   {
+      this->index[0] = row;
+      this->index[1] = index;
+      this->byte[sizeof(Index) == 4 ? 7 : 15] = (uint8_t)type;   // NOTE(review): stores the raw enum value, while getType() tests bits 6 and 7 - only LONG (0) is read back correctly
+   }
+
+   CSRAdaptiveKernelBlockDescriptor(Index row, Type type, Index nextRow, Index maxID, Index minID) noexcept
+   {
+      this->index[0] = row;
+      this->index[1] = 0;   // clears the storage shared with the fields written below
+      this->twobytes[sizeof(Index) == 4 ? 2 : 4] = maxID - minID;
+
+      if (type == Type::STREAM)
+         this->twobytes[sizeof(Index) == 4 ? 3 : 5] = nextRow - row;
+
+      if (type == Type::STREAM)
+         this->byte[sizeof(Index) == 4 ? 7 : 15] |= 0b1000000;   // bit 6 of the last byte marks STREAM
+      else if (type == Type::VECTOR)
+         this->byte[sizeof(Index) == 4 ? 7 : 15] |= 0b10000000;   // bit 7 of the last byte marks VECTOR
+   }
+
+   CSRAdaptiveKernelBlockDescriptor() = default;
+
+   __cuda_callable__ Type getType() const
+   {
+      if( byte[ sizeof( Index ) == 4 ? 7 : 15 ] & 0b1000000 )
+         return Type::STREAM;
+      if( byte[ sizeof( Index ) == 4 ? 7 : 15 ] & 0b10000000 )
+         return Type::VECTOR;
+      return Type::LONG;   // neither flag bit is set
+   }
+
+   __cuda_callable__ const Index& getFirstSegment() const
+   {
+      return index[ 0 ];
+   }
+
+   /***
+    * \brief Returns number of elements covered by the block.
+    */
+   __cuda_callable__ const Index getSize() const
+   {
+      return twobytes[ sizeof(Index) == 4 ? 2 : 4 ];   // maxID - minID stored by the second constructor
+   }
+
+   /***
+    * \brief Returns number of segments covered by the block.
+    */
+   __cuda_callable__ const Index getSegmentsInBlock() const
+   {
+      return ( twobytes[ sizeof( Index ) == 4 ? 3 : 5 ] & 0x3FFF );   // presumably the mask drops the two type bits sharing this field for 4-byte Index on little-endian - confirm
+   }
+
+   __cuda_callable__ uint8_t getWarpIdx() const
+   {
+      return index[ 1 ];   // converted to uint8_t on return
+   }
+
+   __cuda_callable__ uint8_t getWarpsCount() const
+   {
+      return 1;   // this variant stores no warps count
+   }
+
+   void print( std::ostream& str ) const
+   {
+      Type type = this->getType();
+      str << "Type: ";
+      switch( type )
+      {
+         case Type::STREAM:
+            str << " Stream ";
+            break;
+         case Type::VECTOR:
+            str << " Vector ";
+            break;
+         case Type::LONG:
+            str << " Long ";
+            break;
+      }
+      str << " first segment: " << getFirstSegment();
+      str << " block end: " << getSize();
+      str << " index in warp: " << index[ 1 ];
+   }
+   Index index[2]; // index[0] is row pointer, index[1] is index in warp
+   uint8_t byte[sizeof(Index) == 4 ? 8 : 16]; // byte[7/15] is type specificator
+   uint16_t twobytes[sizeof(Index) == 4 ? 4 : 8]; //twobytes[2/4] is maxID - minID
+                                                //twobytes[3/5] is nextRow - row
+};
+#else
+
+/**
+ * \brief Descriptor of one block of segments processed by the adaptive CSR kernel.
+ *
+ * This is the variant with explicit members, it is compiled when
+ * CSR_ADAPTIVE_UNION is not defined.
+ *
+ * \tparam Index is the type of the indexes and sizes stored in the descriptor.
+ */
+template< typename Index >
+struct CSRAdaptiveKernelBlockDescriptor
+{
+   /**
+    * \brief Creates a descriptor from the first segment, the type and the warp data.
+    *
+    * The block size and the number of segments in the block are set to zero so
+    * that the getters and print() never read an uninitialized member.
+    */
+   CSRAdaptiveKernelBlockDescriptor( Index firstSegmentIdx,
+                                     Type type = Type::VECTOR,
+                                     uint8_t warpIdx = 0,
+                                     uint8_t warpsCount = 0 ) noexcept
+   : type( ( uint8_t ) type ),
+     firstSegmentIdx( firstSegmentIdx ),
+     blockSize( 0 ),
+     segmentsInBlock( 0 ),
+     warpIdx( warpIdx ),
+     warpsCount( warpsCount )
+   {}
+
+   /**
+    * \brief Creates a descriptor of a block covering the segments
+    *        [ firstSegmentIdx, lastSegmentIdx ) and the elements [ begin, end ).
+    *
+    * Only the differences are stored: end - begin as the block size and
+    * lastSegmentIdx - firstSegmentIdx as the number of segments in the block.
+    * The warp index and the warps count are set to zero.
+    */
+   CSRAdaptiveKernelBlockDescriptor( Index firstSegmentIdx,
+                                     Type type,
+                                     Index lastSegmentIdx,
+                                     Index end,
+                                     Index begin ) noexcept
+   : type( ( uint8_t ) type ),
+     firstSegmentIdx( firstSegmentIdx ),
+     blockSize( end - begin ),
+     segmentsInBlock( lastSegmentIdx - firstSegmentIdx ),
+     warpIdx( 0 ),
+     warpsCount( 0 )
+   {}
+
+   /** \brief Defaulted constructor, it does not assign any member. */
+   CSRAdaptiveKernelBlockDescriptor() = default;
+
+   /** \brief Returns the type of the block. */
+   __cuda_callable__ Type getType() const
+   {
+      return ( Type ) this->type;
+   }
+
+   /** \brief Returns the index of the first segment of the block. */
+   __cuda_callable__ const Index& getFirstSegment() const
+   {
+      return this->firstSegmentIdx;
+   }
+
+   /** \brief Returns number of elements covered by the block. */
+   __cuda_callable__ const Index getSize() const
+   {
+      return this->blockSize;
+   }
+
+   /** \brief Returns number of segments covered by the block. */
+   __cuda_callable__ const Index getSegmentsInBlock() const
+   {
+      return this->segmentsInBlock;
+   }
+
+   /** \brief Returns the warp index stored by the constructor. */
+   __cuda_callable__ uint8_t getWarpIdx() const
+   {
+      return this->warpIdx;
+   }
+
+   /** \brief Returns the warps count stored by the constructor. */
+   __cuda_callable__ uint8_t getWarpsCount() const
+   {
+      return this->warpsCount;
+   }
+
+   /**
+    * \brief Writes the type, the first segment, the size and the warp index to \e str.
+    */
+   void print( std::ostream& str ) const
+   {
+      str << "Type: ";
+      switch( this->getType() )
+      {
+         case Type::STREAM:
+            str << " Stream ";
+            break;
+         case Type::VECTOR:
+            str << " Vector ";
+            break;
+         case Type::LONG:
+            str << " Long ";
+            break;
+      }
+      str << " first segment: " << this->getFirstSegment();
+      str << " block end: " << this->getSize();
+      // uint8_t is streamed as a character, so the warp index is printed as a number
+      str << " index in warp: " << static_cast< int >( this->getWarpIdx() );
+   }
+
+   uint8_t type;
+   Index firstSegmentIdx, blockSize, segmentsInBlock;
+   uint8_t warpIdx, warpsCount;
+};
+
+#endif
+
+template< typename Index >
+std::ostream& operator<< ( std::ostream& str, const CSRAdaptiveKernelBlockDescriptor< Index >& block )
+{
+   block.print( str );   // the formatting is implemented by the descriptor itself
+   return str;           // the stream is returned so that the calls can be chained
+}
+         } // namespace detail
+      } // namespace Segments
+   }  // namespace Algorithms
+} // namespace TNL
diff --git a/src/TNL/Algorithms/Segments/Kernels/details/CSRAdaptiveKernelParameters.h b/src/TNL/Algorithms/Segments/Kernels/details/CSRAdaptiveKernelParameters.h
new file mode 100644
index 0000000000000000000000000000000000000000..4af0197d26030e538cb9009daf97770b213c2640
--- /dev/null
+++ b/src/TNL/Algorithms/Segments/Kernels/details/CSRAdaptiveKernelParameters.h
@@ -0,0 +1,112 @@
+/***************************************************************************
+                          CSRAdaptiveKernelParameters.h -  description
+                             -------------------
+    begin                : Jan 25, 2021 -> Joe Biden inauguration
+    copyright            : (C) 2021 by Tomas Oberhuber
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+#pragma once
+
+namespace TNL {
+   namespace Algorithms {
+      namespace Segments {
+         namespace detail {
+
+// This can be used for tuning the number of CUDA threads per block depending on the size of Value
+// TODO: Perform some tests
+static constexpr int CSRAdaptiveKernelParametersCudaBlockSizes[] = { 256, 256, 256, 256, 256, 256 };
+
+template< int SizeOfValue = 1,
+          int StreamedSharedMemory_ = 24576 >
+struct CSRAdaptiveKernelParameters
+{
+   static constexpr int MaxValueSizeLog = 6;   // SizeOfValueLog must stay below this bound; CSRAdaptiveKernelParametersCudaBlockSizes has 6 entries
+
+   static constexpr int getSizeValueLogConstexpr( const int i );   // defined below the class
+
+   static constexpr int getSizeOfValue() { return SizeOfValue; };
+
+   static constexpr int SizeOfValueLog = getSizeValueLogConstexpr( SizeOfValue );   // used as the index into CSRAdaptiveKernelParametersCudaBlockSizes
+
+   static_assert( SizeOfValueLog < MaxValueSizeLog, "Parameter SizeOfValue is too large." );
+
+   /**
+    * \brief Computes number of CUDA threads per block depending on Value type.
+    *
+    * \return CUDA block size.
+    */
+   static constexpr int CudaBlockSize() { return CSRAdaptiveKernelParametersCudaBlockSizes[ SizeOfValueLog ]; };
+   //{ return SizeOfValue == 8 ? 128 : 256; };
+
+   /**
+    * \brief Returns amount of shared memory dedicated for stream CSR kernel.
+    *
+    * \return Stream shared memory.
+    */
+   static constexpr size_t StreamedSharedMemory() { return StreamedSharedMemory_; };
+
+   /**
+    * \brief Number of elements fitting into streamed shared memory.
+    */
+   static constexpr size_t StreamedSharedElementsCount() { return StreamedSharedMemory() / SizeOfValue; };
+
+   /**
+    * \brief Computes number of warps in one CUDA block.
+    */
+   static constexpr size_t WarpsCount() { return CudaBlockSize() / Cuda::getWarpSize(); };
+
+   /**
+    * \brief Computes number of elements to be streamed into the shared memory.
+    *
+    * \return Number of elements to be streamed into the shared memory.
+    */
+   static constexpr size_t StreamedSharedElementsPerWarp() { return StreamedSharedElementsCount() / WarpsCount(); };
+
+   /**
+    * \brief Returns maximum number of elements per warp for vector and hybrid kernel.
+    *
+    * \return Maximum number of elements per warp for vector and hybrid kernel.
+    */
+   static constexpr int MaxVectorElementsPerWarp() { return 384; };
+
+   /**
+    * \brief Returns maximum number of elements per warp for adaptive kernel.
+    *
+    * \return Maximum number of elements per warp for adaptive kernel.
+    */
+   static constexpr int MaxAdaptiveElementsPerWarp() { return 512; };
+
+   static int getSizeValueLog( const int i )   // run-time counterpart of getSizeValueLogConstexpr
+   {
+      if( i ==  1 ) return 0;
+      if( i ==  2 ) return 1;
+      if( i <=  4 ) return 2;
+      if( i <=  8 ) return 3;
+      if( i <= 16 ) return 4;
+      return 5;   // NOTE(review): getSizeValueLogConstexpr returns 6 for i > 32, this function returns 5 - confirm which one is intended
+   }
+};
+
+
+// Maps a value size to an index of CSRAdaptiveKernelParametersCudaBlockSizes; sizes above 32 give 6, which the static_assert in the class rejects.
+template< int SizeOfValue, int StreamedSharedMemory_ >
+constexpr int
+CSRAdaptiveKernelParameters< SizeOfValue, StreamedSharedMemory_ >::
+getSizeValueLogConstexpr( const int i )
+{
+   if( i ==  1 ) return 0;
+   if( i ==  2 ) return 1;
+   if( i <=  4 ) return 2;
+   if( i <=  8 ) return 3;
+   if( i <= 16 ) return 4;
+   if( i <= 32 ) return 5;
+   return 6;
+}
+
+         } // namespace detail
+      } // namespace Segments
+   }  // namespace Algorithms
+} // namespace TNL
diff --git a/src/TNL/Algorithms/Segments/SegmentElement.h b/src/TNL/Algorithms/Segments/SegmentElement.h
new file mode 100644
index 0000000000000000000000000000000000000000..71f78cdd37e3e1b0f019079ac4735c7c8f3ca50b
--- /dev/null
+++ b/src/TNL/Algorithms/Segments/SegmentElement.h
@@ -0,0 +1,84 @@
+/***************************************************************************
+                          SegmentElement.h -  description
+                             -------------------
+    begin                : Apr 5, 2021
+    copyright            : (C) 2021 by Tomas Oberhuber
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+#pragma once
+
+#include <ostream>
+
+#include <TNL/Cuda/CudaCallable.h>
+
+namespace TNL {
+   namespace Algorithms {
+      namespace Segments {
+
+/**
+ * \brief Simple structure representing one element of a segment.
+ *
+ * \tparam Index is type used for indexing of the elements.
+ */
+template< typename Index >
+class SegmentElement
+{
+   public:
+
+      /**
+       * \brief Type used for indexing of the elements.
+       */
+      using IndexType = Index;
+
+      /**
+       * \brief Constructor of the segment element with all parameters.
+       * The indexes are copied, so the arguments need not outlive the element.
+       * \param segmentIdx is an index of the parent segment.
+       * \param localIdx is a rank of the element in the segment.
+       * \param globalIdx is an index of the element in the related container.
+       */
+      __cuda_callable__
+      SegmentElement( const IndexType& segmentIdx,
+                      const IndexType& localIdx,
+                      const IndexType globalIdx )
+      : segmentIdx( segmentIdx ), localIdx( localIdx ), globalIdx( globalIdx ) {};
+
+      /**
+       * \brief Returns index of the parent segment.
+       *
+       * \return index of the parent segment.
+       */
+      __cuda_callable__
+      const IndexType& segmentIndex() const { return segmentIdx; };
+
+      /**
+       * \brief Returns rank of the element in the segment.
+       *
+       * \return rank of the element in the segment.
+       */
+      __cuda_callable__
+      const IndexType& localIndex() const { return localIdx; };
+
+      /**
+       * \brief Returns index of the element in the related container.
+       *
+       * \return index of the element in the related container.
+       */
+      __cuda_callable__
+      const IndexType& globalIndex() const { return globalIdx; };
+
+   protected:
+      // Stored by value: reference members would dangle once the source iterator or view is gone.
+      const IndexType segmentIdx;
+
+      const IndexType localIdx;
+
+      const IndexType globalIdx;
+};
+
+      } // namespace Segments
+   } // namespace Algorithms
+} // namespace TNL
diff --git a/src/TNL/Algorithms/Segments/SegmentView.h b/src/TNL/Algorithms/Segments/SegmentView.h
index ecf1c95f6e387bfac7e2623374d55f8dc1055415..aac6e0a9408844ea07133779dc75f73113047d0f 100644
--- a/src/TNL/Algorithms/Segments/SegmentView.h
+++ b/src/TNL/Algorithms/Segments/SegmentView.h
@@ -11,22 +11,54 @@
 #pragma once
 
 #include <TNL/Algorithms/Segments/ElementsOrganization.h>
+#include <TNL/Algorithms/Segments/SegmentViewIterator.h>
 
 namespace TNL {
    namespace Algorithms {
       namespace Segments {
 
+/**
+ * \brief Data structure for accessing particular segment.
+ *
+ * \tparam Index is type for indexing elements in related segments.
+ *
+ * See the template specializations \ref TNL::Algorithms::Segments::SegmentView< Index, ColumnMajorOrder >
+ *  and \ref TNL::Algorithms::Segments::SegmentView< Index, RowMajorOrder > for column-major
+ * and row-major elements organization respectively. They have equivalent interface.
+ */
 template< typename Index,
           ElementsOrganization Organization >
 class SegmentView;
 
+
+/**
+ * \brief Data structure for accessing particular segment.
+ *
+ * \tparam Index is type for indexing elements in related segments.
+ */
 template< typename Index >
 class SegmentView< Index, ColumnMajorOrder >
 {
    public:
 
+      /**
+       * \brief Type for indexing elements in related segments.
+       */
       using IndexType = Index;
 
+      /**
+       * \brief Type of iterator for iterating over elements of the segment.
+       */
+      using IteratorType = SegmentViewIterator< SegmentView >;
+
+      /**
+       * \brief Constructor with all parameters.
+       *
+       * \param segmentIdx is an index of segment the segment view will point to.
+       * \param offset is an offset of the segment in the parent segments.
+       * \param size is a size of the segment.
+       * \param step is stepping between neighbouring elements in the segment.
+       */
       __cuda_callable__
       SegmentView( const IndexType segmentIdx,
                    const IndexType offset,
@@ -34,16 +66,32 @@ class SegmentView< Index, ColumnMajorOrder >
                    const IndexType step )
       : segmentIdx( segmentIdx ), segmentOffset( offset ), segmentSize( size ), step( step ){};
 
+      /**
+       * \brief Copy constructor.
+       *
+       * \param view is the source view.
+       */
       __cuda_callable__
       SegmentView( const SegmentView& view )
       : segmentIdx( view.segmentIdx ), segmentOffset( view.segmentOffset ), segmentSize( view.segmentSize ), step( view.step ){};
 
+      /**
+       * \brief Get the size of the segment, i.e. number of elements in the segment.
+       *
+       * \return number of elements in the segment.
+       */
       __cuda_callable__
       const IndexType& getSize() const
       {
          return this->segmentSize;
       };
 
+      /**
+       * \brief Get global index of an element with rank \e localIndex in the segment.
+       *
+       * \param localIndex is the rank of the element in the segment.
+       * \return global index of the element.
+       */
       __cuda_callable__
       IndexType getGlobalIndex( const IndexType localIndex ) const
       {
@@ -51,12 +99,49 @@ class SegmentView< Index, ColumnMajorOrder >
          return segmentOffset + localIndex * step;
       };
 
+      /**
+       * \brief Get index of the segment.
+       *
+       * \return index of the segment.
+       */
       __cuda_callable__
       const IndexType& getSegmentIndex() const
       {
          return this->segmentIdx;
       };
 
+      /**
+       * \brief Gives an iterator positioned at the first element of the segment.
+       *
+       * \return iterator with local index zero, bound to this view.
+       */
+      __cuda_callable__
+      IteratorType begin() const { return IteratorType( *this, IndexType( 0 ) ); }
+
+      /**
+       * \brief Gives an iterator positioned one past the last element of the segment.
+       *
+       * \return iterator whose local index equals the segment size, bound to this view.
+       */
+      __cuda_callable__
+      IteratorType end() const { return IteratorType( *this, this->segmentSize ); }
+
+      /**
+       * \brief Constant counterpart of \ref begin.
+       *
+       * \return constant iterator with local index zero, bound to this view.
+       */
+      __cuda_callable__
+      const IteratorType cbegin() const { return this->begin(); }
+
+      /**
+       * \brief Constant counterpart of \ref end.
+       *
+       * \return constant iterator whose local index equals the segment size, bound to this view.
+       */
+      __cuda_callable__
+      const IteratorType cend() const { return this->end(); }
+
       protected:
 
          IndexType segmentIdx, segmentOffset, segmentSize, step;
@@ -67,8 +152,24 @@ class SegmentView< Index, RowMajorOrder >
 {
    public:
 
+      /**
+       * \brief Type for indexing elements in related segments.
+       */
       using IndexType = Index;
 
+      /**
+       * \brief Type of iterator for iterating over elements of the segment.
+       */
+      using IteratorType = SegmentViewIterator< SegmentView >;
+
+      /**
+       * \brief Constructor with all parameters.
+       *
+       * \param segmentIdx is an index of segment the segment view will point to.
+       * \param offset is an offset of the segment in the parent segments.
+       * \param size is a size of the segment.
+       * \param step is stepping between neighbouring elements in the segment.
+       */
       __cuda_callable__
       SegmentView( const IndexType segmentIdx,
                    const IndexType offset,
@@ -76,12 +177,32 @@ class SegmentView< Index, RowMajorOrder >
                    const IndexType step = 1 ) // For compatibility with previous specialization
       : segmentIdx( segmentIdx ), segmentOffset( offset ), segmentSize( size ){};
 
+      /**
+       * \brief Copy constructor duplicating the segment index, offset and size.
+       * \param view is the view to copy from.
+       */
+      __cuda_callable__
+      SegmentView( const SegmentView& view )
+      : segmentIdx( view.segmentIdx ), segmentOffset( view.segmentOffset ),
+        segmentSize( view.segmentSize ) {}
+
+      /**
+       * \brief Get the size of the segment, i.e. number of elements in the segment.
+       *
+       * \return number of elements in the segment.
+       */
       __cuda_callable__
       const IndexType& getSize() const
       {
          return this->segmentSize;
       };
 
+      /**
+       * \brief Get global index of an element with rank \e localIndex in the segment.
+       *
+       * \param localIndex is the rank of the element in the segment.
+       * \return global index of the element.
+       */
       __cuda_callable__
       IndexType getGlobalIndex( const IndexType localIndex ) const
       {
@@ -89,12 +210,49 @@ class SegmentView< Index, RowMajorOrder >
          return segmentOffset + localIndex;
       };
 
+      /**
+       * \brief Get index of the segment.
+       *
+       * \return index of the segment.
+       */
       __cuda_callable__
       const IndexType& getSegmentIndex() const
       {
          return this->segmentIdx;
       };
 
+      /**
+       * \brief Gives an iterator positioned at the first element of the segment.
+       *
+       * \return iterator with local index zero, bound to this view.
+       */
+      __cuda_callable__
+      IteratorType begin() const { return IteratorType( *this, IndexType( 0 ) ); }
+
+      /**
+       * \brief Gives an iterator positioned one past the last element of the segment.
+       *
+       * \return iterator whose local index equals the segment size, bound to this view.
+       */
+      __cuda_callable__
+      IteratorType end() const { return IteratorType( *this, this->segmentSize ); }
+
+      /**
+       * \brief Constant counterpart of \ref begin.
+       *
+       * \return constant iterator with local index zero, bound to this view.
+       */
+      __cuda_callable__
+      const IteratorType cbegin() const { return this->begin(); }
+
+      /**
+       * \brief Constant counterpart of \ref end.
+       *
+       * \return constant iterator whose local index equals the segment size, bound to this view.
+       */
+      __cuda_callable__
+      const IteratorType cend() const { return this->end(); }
+
       protected:
 
          IndexType segmentIdx, segmentOffset, segmentSize;
diff --git a/src/TNL/Algorithms/Segments/SegmentViewIterator.h b/src/TNL/Algorithms/Segments/SegmentViewIterator.h
new file mode 100644
index 0000000000000000000000000000000000000000..a0e78883268a0b77dcc0069ed9ceecd36d43fbaf
--- /dev/null
+++ b/src/TNL/Algorithms/Segments/SegmentViewIterator.h
@@ -0,0 +1,107 @@
+ /***************************************************************************
+                          SegmentViewIterator.h -  description
+                             -------------------
+    begin                : Apr 5, 2021
+    copyright            : (C) 2021 by Tomas Oberhuber
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+#pragma once
+
+#include <ostream>
+
+#include <TNL/Cuda/CudaCallable.h>
+#include <TNL/Algorithms/Segments/SegmentElement.h>
+
+namespace TNL {
+   namespace Algorithms {
+      namespace Segments {
+
+/**
+ * \brief Iterator for iterating over elements of a segment.
+ *
+ * The iterator can be used even in GPU kernels.
+ * It keeps a reference to the segment view, so the view must outlive the iterator.
+ * \tparam SegmentView is a type of related segment view.
+ */
+template< typename SegmentView >
+class SegmentViewIterator
+{
+   public:
+
+      /**
+       * \brief Type of the segment view the iterator walks over.
+       */
+      using SegmentViewType = SegmentView;
+
+      /**
+       * \brief The type used for indexing of the segment elements.
+       */
+      using IndexType = typename SegmentViewType::IndexType;
+
+      /**
+       * \brief The type of the segment element returned when dereferencing.
+       */
+      using SegmentElementType = SegmentElement< IndexType >;
+      /// \brief Creates an iterator over \e segmentView (held by reference) positioned at \e localIdx.
+      __cuda_callable__
+      SegmentViewIterator( const SegmentViewType& segmentView,
+                           const IndexType& localIdx );
+
+      /**
+       * \brief Comparison of two segment view iterators.
+       *
+       * \param other is another segment view iterator.
+       * \return \e true if both iterators refer to the same segment view object and have the same local index, \e false otherwise.
+       */
+      __cuda_callable__
+      bool operator==( const SegmentViewIterator& other ) const;
+
+      /**
+       * \brief Negated comparison of two segment view iterators.
+       *
+       * \param other is another segment view iterator.
+       * \return \e false if both iterators refer to the same segment view object and have the same local index, \e true otherwise.
+       */
+      __cuda_callable__
+      bool operator!=( const SegmentViewIterator& other ) const;
+
+      /**
+       * \brief Operator for incrementing the iterator, i.e. moving to the next element.
+       * The local index is not moved past the segment size.
+       * \return reference to this iterator.
+       */
+      __cuda_callable__
+      SegmentViewIterator& operator++();
+
+      /**
+       * \brief Operator for decrementing the iterator, i.e. moving to the previous element.
+       * The local index is not moved below zero.
+       * \return reference to this iterator.
+       */
+      __cuda_callable__
+      SegmentViewIterator& operator--();
+
+      /**
+       * \brief Operator for dereferencing the iterator.
+       *
+       * It returns structure \ref SegmentElementType which represents one element of a segment.
+       * \return segment element the iterator points to.
+       */
+      __cuda_callable__
+      const SegmentElementType operator*() const;
+
+   protected:
+
+      const SegmentViewType& segmentView;   // view being iterated; not owned
+
+      IndexType localIdx = 0;   // rank of the current element within the segment
+};
+
+      } // namespace Segments
+   } // namespace Algorithms
+} // namespace TNL
+
+#include <TNL/Algorithms/Segments/SegmentViewIterator.hpp>
diff --git a/src/TNL/Algorithms/Segments/SegmentViewIterator.hpp b/src/TNL/Algorithms/Segments/SegmentViewIterator.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..47154da99eb26f18d6c567cb13154e8d5ccb07bb
--- /dev/null
+++ b/src/TNL/Algorithms/Segments/SegmentViewIterator.hpp
@@ -0,0 +1,83 @@
+/***************************************************************************
+                          SegmentViewIterator.hpp -  description
+                             -------------------
+    begin                : Apr 5, 2021
+    copyright            : (C) 2021 by Tomas Oberhuber
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+#pragma once
+
+#include <TNL/Algorithms/Segments/SegmentView.h>
+#include <TNL/Assert.h>
+
+namespace TNL {
+   namespace Algorithms {
+      namespace Segments {
+
+template< typename SegmentView >
+__cuda_callable__
+SegmentViewIterator< SegmentView >::
+SegmentViewIterator( const SegmentViewType& segmentView, const IndexType& localIdx )
+// The view is bound by reference (not copied); the start position is copied.
+: segmentView( segmentView ),
+  localIdx( localIdx )
+{}
+
+template< typename SegmentView >
+__cuda_callable__ bool
+SegmentViewIterator< SegmentView >::
+operator==( const SegmentViewIterator& other ) const
+{
+   // Equal only when both iterators refer to the very same view object
+   // (identity, not value) and stand at the same local position.
+   const bool sameView = ( &this->segmentView == &other.segmentView );
+   return sameView && this->localIdx == other.localIdx;
+}
+
+template< typename SegmentView >
+__cuda_callable__ bool
+SegmentViewIterator< SegmentView >::
+operator!=( const SegmentViewIterator& other ) const
+{
+   return ! this->operator==( other );   // plain negation of operator==
+}
+
+template< typename SegmentView >
+__cuda_callable__ auto
+SegmentViewIterator< SegmentView >::
+operator++() -> SegmentViewIterator&
+{
+   // Saturating step forward: an iterator already at the end stays there.
+   if( this->localIdx < this->segmentView.getSize() )
+      ++this->localIdx;
+   return *this;
+}
+
+template< typename SegmentView >
+__cuda_callable__ auto
+SegmentViewIterator< SegmentView >::
+operator--() -> SegmentViewIterator&
+{
+   // Saturating step backward: an iterator already at position zero stays there.
+   if( this->localIdx > 0 )
+      --this->localIdx;
+   return *this;
+}
+
+template< typename SegmentView >
+__cuda_callable__ auto
+SegmentViewIterator< SegmentView >::
+operator*() const -> const SegmentElementType
+{
+   // Bundle the segment index, the local rank and the matching global index.
+   const IndexType globalIdx = this->segmentView.getGlobalIndex( this->localIdx );
+   return SegmentElementType(
+      this->segmentView.getSegmentIndex(), this->localIdx, globalIdx );
+}
+
+      } // namespace Segments
+   } // namespace Algorithms
+} // namespace TNL
diff --git a/src/TNL/Algorithms/Segments/SegmentsPrinting.h b/src/TNL/Algorithms/Segments/SegmentsPrinting.h
new file mode 100644
index 0000000000000000000000000000000000000000..491f059fa6b66c300f68d37f06a25772ddfd2b78
--- /dev/null
+++ b/src/TNL/Algorithms/Segments/SegmentsPrinting.h
@@ -0,0 +1,139 @@
+/***************************************************************************
+                          SegmentsPrinting.h -  description
+                             -------------------
+    begin                : Apr 1, 2021
+    copyright            : (C) 2021 by Tomas Oberhuber
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+#pragma once
+
+#include <iostream>
+#include <TNL/Containers/Array.h>
+
+namespace TNL {
+   namespace Algorithms {
+      namespace Segments {
+
+/**
+ * \brief Print segments sizes, i.e. the segments setup.
+ *
+ * \tparam Segments is type of segments.
+ * \param segments is an instance of segments.
+ * \param str is output stream.
+ * \return reference to the output stream.
+ *
+ * \par Example
+ * \include Algorithms/Segments/SegmentsPrintingExample-1.cpp
+ * \par Output
+ * \include SegmentsPrintingExample-1.out
+ */
+template< typename Segments >
+std::ostream& printSegments( const Segments& segments, std::ostream& str )
+{
+   using IndexType = typename Segments::IndexType;
+   // Format: " [ s0, s1, ..., sN ] " -- a comma follows every size except the last.
+
+   const auto segmentsCount = segments.getSegmentsCount();
+   str << " [";
+   for( IndexType segmentIdx = 0; segmentIdx < segmentsCount; segmentIdx++ )
+   {
+      auto segmentSize = segments.getSegmentSize( segmentIdx );
+      str << " " << segmentSize;
+      if( segmentIdx < segmentsCount - 1 )
+         str << ",";
+   }
+   str << " ] ";
+   return str;
+}
+
+/// This is to prevent from appearing in Doxygen documentation.
+/// \cond HIDDEN_CLASS
+template< typename Segments,
+          typename Fetch >
+struct SegmentsPrinter
+{
+   SegmentsPrinter( const Segments& segments, Fetch&& fetch )
+   : segments( segments ), fetch( fetch ) {}
+
+   // Writes every segment on its own line as "Seg. <idx>: [ v0, v1, ... ] ".
+   std::ostream& print( std::ostream& str ) const
+   {
+      using IndexType = typename Segments::IndexType;
+      using DeviceType = typename Segments::DeviceType;
+      using ValueType = decltype( fetch( IndexType() ) );
+
+      TNL::Containers::Array< ValueType, DeviceType, IndexType > aux( 1 );
+      auto view = segments.getConstView();
+      auto fetch_ = this->fetch;  // local copy, so the [=] lambda captures the functor itself and not `this`
+      for( IndexType segmentIdx = 0; segmentIdx < segments.getSegmentsCount(); segmentIdx++ )
+      {
+         str << "Seg. " << segmentIdx << ": [ ";
+         auto segmentSize = segments.getSegmentSize( segmentIdx );
+         for( IndexType localIdx = 0; localIdx < segmentSize; localIdx++ )
+         {
+            // The lambda parameter must match the array's element type, hence ValueType&.
+            aux.forAllElements( [=] __cuda_callable__ ( IndexType elementIdx, ValueType& v ) mutable {
+               v = fetch_( view.getGlobalIndex( segmentIdx, localIdx ) );
+            } );
+            auto value = aux.getElement( 0 );
+            str << value;
+            if( localIdx < segmentSize - 1 )
+               str << ", ";
+         }
+         str << " ] " << std::endl;
+      }
+      return str;
+   }
+
+   protected:
+
+   const Segments& segments;
+
+   Fetch fetch;
+};
+
+/*template< typename Segments,
+          typename Fetch >
+std::ostream& operator<<( std::ostream& str, const SegmentsPrinter< Segments, Fetch >& printer )
+{
+   return printer.print( str );
+}*/
+
+// Writes each segment as "Seg. <idx>: [ v0, v1, ... ] ", one per line, where the
+// values are fetch( globalIdx ) evaluated through a one-element array on DeviceType.
+template< typename Segments,
+          typename Fetch >
+std::ostream& printSegments( const Segments& segments, Fetch&& fetch, std::ostream& str )
+{
+   using IndexType = typename Segments::IndexType;
+   using DeviceType = typename Segments::DeviceType;
+   using ValueType = decltype( fetch( IndexType() ) );
+
+   TNL::Containers::Array< ValueType, DeviceType, IndexType > aux( 1 );
+   auto view = segments.getConstView();
+   for( IndexType segmentIdx = 0; segmentIdx < segments.getSegmentsCount(); segmentIdx++ )
+   {
+      str << "Seg. " << segmentIdx << ": [ ";
+      auto segmentSize = segments.getSegmentSize( segmentIdx );
+      for( IndexType localIdx = 0; localIdx < segmentSize; localIdx++ )
+      {
+         // The lambda parameter must match the array's element type, hence ValueType&.
+         aux.forAllElements( [=] __cuda_callable__ ( IndexType elementIdx, ValueType& v ) mutable {
+            v = fetch( view.getGlobalIndex( segmentIdx, localIdx ) );
+         } );
+         auto value = aux.getElement( 0 );
+         str << value;
+         if( localIdx < segmentSize - 1 )
+            str << ", ";
+      }
+      str << " ] " << std::endl;
+   }
+   return str;
+}
+/// \endcond
+
+      } // namespace Segments
+   } // namespace Algorithms
+} // namespace TNL
diff --git a/src/TNL/Algorithms/Segments/SlicedEllpack.h b/src/TNL/Algorithms/Segments/SlicedEllpack.h
index 942306c7516259e33a7ad6fcf7ebe5c83a376e44..974087e4b8ca8b8e8936b96838a6ae4b1ff8575e 100644
--- a/src/TNL/Algorithms/Segments/SlicedEllpack.h
+++ b/src/TNL/Algorithms/Segments/SlicedEllpack.h
@@ -30,7 +30,7 @@ class SlicedEllpack
 
       using DeviceType = Device;
       using IndexType = std::remove_const_t< Index >;
-      using OffsetsHolder = Containers::Vector< Index, DeviceType, IndexType, IndexAllocator >;
+      using OffsetsContainer = Containers::Vector< Index, DeviceType, IndexType, IndexAllocator >;
       static constexpr int getSliceSize() { return SliceSize; }
       static constexpr ElementsOrganization getOrganization() { return Organization; }
       using ViewType = SlicedEllpackView< Device, Index, Organization, SliceSize >;
@@ -43,7 +43,11 @@ class SlicedEllpack
 
       SlicedEllpack();
 
-      SlicedEllpack( const Containers::Vector< IndexType, DeviceType, IndexType >& sizes );
+      template< typename SizesContainer >
+      SlicedEllpack( const SizesContainer& sizes );
+
+      template< typename ListIndex >
+      SlicedEllpack( const std::initializer_list< ListIndex >& segmentsSizes );
 
       SlicedEllpack( const SlicedEllpack& segments );
 
@@ -60,7 +64,7 @@ class SlicedEllpack
       /**
        * \brief Set sizes of particular segments.
        */
-      template< typename SizesHolder = OffsetsHolder >
+      template< typename SizesHolder = OffsetsContainer >
       void setSegmentsSizes( const SizesHolder& sizes );
 
       void reset();
@@ -103,16 +107,16 @@ class SlicedEllpack
       void forSegments( IndexType begin, IndexType end, Function&& f ) const;
 
       template< typename Function >
-      void forEachSegment( Function&& f ) const;
+      void forAllSegments( Function&& f ) const;
 
       /***
        * \brief Go over all segments and perform a reduction in each of them.
        */
-      template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args >
-      void segmentsReduction( IndexType first, IndexType last, Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const;
+      template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real >
+      void reduceSegments( IndexType first, IndexType last, Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero ) const;
 
-      template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args >
-      void allReduction( Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const;
+      template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real >
+      void reduceAllSegments( Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero ) const;
 
       SlicedEllpack& operator=( const SlicedEllpack& source ) = default;
 
@@ -123,13 +127,23 @@ class SlicedEllpack
 
       void load( File& file );
 
+      template< typename Fetch >
+      SegmentsPrinter< SlicedEllpack, Fetch > print( Fetch&& fetch ) const;
+
    protected:
 
       IndexType size, alignedSize, segmentsCount;
 
-      OffsetsHolder sliceOffsets, sliceSegmentSizes;
+      OffsetsContainer sliceOffsets, sliceSegmentSizes;
 };
 
+/// Streams the segments setup (sizes of all segments) via printSegments.
+template< typename Device, typename Index, typename IndexAllocator,
+          ElementsOrganization Organization, int SliceSize >
+std::ostream& operator<<( std::ostream& str,
+                          const SlicedEllpack< Device, Index, IndexAllocator, Organization, SliceSize >& segments )
+{ return printSegments( segments, str ); }  // printSegments takes ( segments, stream )
+
       } // namespace Segements
    }  // namespace Algorithms
 } // namespace TNL
diff --git a/src/TNL/Algorithms/Segments/SlicedEllpack.hpp b/src/TNL/Algorithms/Segments/SlicedEllpack.hpp
index 4482cd567b704fcc5fdde99a33aa14be0491dabe..b1e0a21f37bb18f1fcc0cad57b9ab2a5b64d9fb2 100644
--- a/src/TNL/Algorithms/Segments/SlicedEllpack.hpp
+++ b/src/TNL/Algorithms/Segments/SlicedEllpack.hpp
@@ -37,11 +37,25 @@ template< typename Device,
           typename IndexAllocator,
           ElementsOrganization Organization,
           int SliceSize >
+   template< typename SizesContainer >
 SlicedEllpack< Device, Index, IndexAllocator, Organization, SliceSize >::
-SlicedEllpack( const Containers::Vector< IndexType, DeviceType, IndexType >& sizes )
+SlicedEllpack( const SizesContainer& segmentsSizes )
    : size( 0 ), alignedSize( 0 ), segmentsCount( 0 )
 {
-   this->setSegmentsSizes( sizes );
+   this->setSegmentsSizes( segmentsSizes );
+}
+
+template< typename Device,
+          typename Index,
+          typename IndexAllocator,
+          ElementsOrganization Organization, int SliceSize >
+   template< typename ListIndex >
+SlicedEllpack< Device, Index, IndexAllocator, Organization, SliceSize >::
+SlicedEllpack( const std::initializer_list< ListIndex >& segmentsSizes )
+   : size( 0 ), alignedSize( 0 ), segmentsCount( 0 )
+{  // Materialize the list as a vector first; setSegmentsSizes does the actual setup.
+   const Containers::Vector< IndexType, DeviceType, IndexType > sizesVector( segmentsSizes );
+   this->setSegmentsSizes( sizesVector );
+}
 
 template< typename Device,
@@ -79,6 +93,7 @@ String
 SlicedEllpack< Device, Index, IndexAllocator, Organization, SliceSize >::
 getSerializationType()
 {
+   // FIXME: the serialized data DEPEND on the Organization and SliceSize parameters, so they should be reflected in the serialization type
    return "SlicedEllpack< [any_device], " + TNL::getSerializationType< IndexType >() + " >";
 }
 
@@ -152,8 +167,9 @@ setSegmentsSizes( const SizesHolder& sizes )
       slices_view[ i ] = res * SliceSize;
       slice_segment_size_view[ i ] = res;
    };
-   ellpack.allReduction( fetch, reduce, keep, std::numeric_limits< IndexType >::min() );
-   inplaceExclusiveScan( this->sliceOffsets );
+   ellpack.reduceAllSegments( fetch, reduce, keep, std::numeric_limits< IndexType >::min() );
+   Algorithms::inplaceExclusiveScan( this->sliceOffsets );
+   //this->sliceOffsets.template exclusiveScan< Algorithms::detail::ScanType::Exclusive >();
    this->size = sum( sizes );
    this->alignedSize = this->sliceOffsets.getElement( slicesCount );
 }
@@ -328,9 +344,9 @@ template< typename Device,
    template< typename Function >
 void
 SlicedEllpack< Device, Index, IndexAllocator, Organization, SliceSize >::
-forEachSegment( Function&& f ) const
+forAllSegments( Function&& f ) const
 {
-   this->getConstView().forEachSegment( f );
+   this->getConstView().forAllSegments( f );
 }
 
 template< typename Device,
@@ -338,12 +354,12 @@ template< typename Device,
           typename IndexAllocator,
           ElementsOrganization Organization,
           int SliceSize >
-   template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args >
+   template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real >
 void
 SlicedEllpack< Device, Index, IndexAllocator, Organization, SliceSize >::
-segmentsReduction( IndexType first, IndexType last, Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const
+reduceSegments( IndexType first, IndexType last, Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero ) const
 {
-   this->getConstView().segmentsReduction( first, last, fetch, reduction, keeper, zero, args... );
+   this->getConstView().reduceSegments( first, last, fetch, reduction, keeper, zero );
 }
 
 template< typename Device,
@@ -351,12 +367,12 @@ template< typename Device,
           typename IndexAllocator,
           ElementsOrganization Organization,
           int SliceSize >
-   template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args >
+   template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real >
 void
 SlicedEllpack< Device, Index, IndexAllocator, Organization, SliceSize >::
-allReduction( Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const
+reduceAllSegments( Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero ) const
 {
-   this->segmentsReduction( 0, this->getSegmentsCount(), fetch, reduction, keeper, zero, args... );
+   this->reduceSegments( 0, this->getSegmentsCount(), fetch, reduction, keeper, zero );
 }
 
 template< typename Device,
@@ -409,6 +425,19 @@ load( File& file )
    file >> this->sliceSegmentSizes;
 }
 
+// Returns a SegmentsPrinter bound to these segments and to the given fetch functor.
+template< typename Device,
+          typename Index,
+          typename IndexAllocator,
+          ElementsOrganization Organization, int SliceSize >
+      template< typename Fetch >
+auto
+SlicedEllpack< Device, Index, IndexAllocator, Organization, SliceSize >::
+print( Fetch&& fetch ) const -> SegmentsPrinter< SlicedEllpack, Fetch >
+{  // Forward with the deduced value category: the printer takes Fetch&&, which a named rvalue cannot bind to.
+   return SegmentsPrinter< SlicedEllpack, Fetch >( *this, static_cast< Fetch&& >( fetch ) );
+}
+
       } // namespace Segments
    }  // namespace Algorithms
 } // namespace TNL
diff --git a/src/TNL/Algorithms/Segments/SlicedEllpackView.h b/src/TNL/Algorithms/Segments/SlicedEllpackView.h
index 2955ee3515960b6861bcd238d2134302dc28357e..0df58aec66b6fbc04fe120bbfe8e00536897b20c 100644
--- a/src/TNL/Algorithms/Segments/SlicedEllpackView.h
+++ b/src/TNL/Algorithms/Segments/SlicedEllpackView.h
@@ -15,6 +15,7 @@
 #include <TNL/Containers/Vector.h>
 #include <TNL/Algorithms/Segments/ElementsOrganization.h>
 #include <TNL/Algorithms/Segments/SegmentView.h>
+#include <TNL/Algorithms/Segments/SegmentsPrinting.h>
 
 namespace TNL {
    namespace Algorithms {
@@ -32,7 +33,7 @@ class SlicedEllpackView
       using IndexType = std::remove_const_t< Index >;
       using OffsetsView = typename Containers::VectorView< Index, DeviceType, IndexType >;
       static constexpr int getSliceSize() { return SliceSize; }
-      static constexpr bool getOrganization() { return Organization; }
+      static constexpr ElementsOrganization getOrganization() { return Organization; }
       template< typename Device_, typename Index_ >
       using ViewTemplate = SlicedEllpackView< Device_, Index_, Organization, SliceSize >;
       using ViewType = SlicedEllpackView;
@@ -104,16 +105,16 @@ class SlicedEllpackView
       void forSegments( IndexType begin, IndexType end, Function&& f ) const;
 
       template< typename Function >
-      void forEachSegment( Function&& f ) const;
+      void forAllSegments( Function&& f ) const;
 
       /***
        * \brief Go over all segments and perform a reduction in each of them.
        */
-      template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args >
-      void segmentsReduction( IndexType first, IndexType last, Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const;
+      template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real >
+      void reduceSegments( IndexType first, IndexType last, Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero ) const;
 
-      template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args >
-      void allReduction( Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const;
+      template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real >
+      void reduceAllSegments( Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero ) const;
 
       SlicedEllpackView& operator=( const SlicedEllpackView& view );
 
@@ -121,6 +122,9 @@ class SlicedEllpackView
 
       void load( File& file );
 
+      template< typename Fetch >
+      SegmentsPrinter< SlicedEllpackView, Fetch > print( Fetch&& fetch ) const;
+
    protected:
 
       IndexType size, alignedSize, segmentsCount;
@@ -128,6 +132,12 @@ class SlicedEllpackView
       OffsetsView sliceOffsets, sliceSegmentSizes;
 };
 
+template <typename Device,
+          typename Index,
+          ElementsOrganization Organization,
+          int SliceSize >
+std::ostream& operator<<( std::ostream& str, const SlicedEllpackView< Device, Index, Organization, SliceSize >& segments ) { return printSegments( str, segments ); }
+
       } // namespace Segements
    }  // namespace Algorithms
 } // namespace TNL
diff --git a/src/TNL/Algorithms/Segments/SlicedEllpackView.hpp b/src/TNL/Algorithms/Segments/SlicedEllpackView.hpp
index 94bebca13412a0d16d7f4548495c8b075f93e51c..80700367c3bed8ac190cbe7f04d3a4de73a6fc68 100644
--- a/src/TNL/Algorithms/Segments/SlicedEllpackView.hpp
+++ b/src/TNL/Algorithms/Segments/SlicedEllpackView.hpp
@@ -83,6 +83,7 @@ String
 SlicedEllpackView< Device, Index, Organization, SliceSize >::
 getSerializationType()
 {
+   // FIXME: the serialized data DEPEND on the Organization and Alignment parameters, so it should be reflected in the serialization type
    return "SlicedEllpack< [any_device], " + TNL::getSerializationType< IndexType >() + " >";
 }
 
@@ -242,15 +243,14 @@ forElements( IndexType first, IndexType last, Function&& f ) const
          const IndexType begin = sliceOffsets_view[ sliceIdx ] + segmentInSliceIdx * segmentSize;
          const IndexType end = begin + segmentSize;
          IndexType localIdx( 0 );
-         bool compute( true );
-         for( IndexType globalIdx = begin; globalIdx < end && compute; globalIdx++  )
+         for( IndexType globalIdx = begin; globalIdx < end; globalIdx++  )
          {
             // The following is a workaround of a bug in nvcc 11.2
 #if CUDART_VERSION == 11020
-             f( segmentIdx, localIdx, globalIdx, compute );
+             f( segmentIdx, localIdx, globalIdx );
              localIdx++;
 #else
-             f( segmentIdx, localIdx++, globalIdx, compute );
+             f( segmentIdx, localIdx++, globalIdx );
 #endif
          }
       };
@@ -265,15 +265,14 @@ forElements( IndexType first, IndexType last, Function&& f ) const
          const IndexType begin = sliceOffsets_view[ sliceIdx ] + segmentInSliceIdx;
          const IndexType end = sliceOffsets_view[ sliceIdx + 1 ];
          IndexType localIdx( 0 );
-         bool compute( true );
-         for( IndexType globalIdx = begin; globalIdx < end && compute; globalIdx += SliceSize )
+         for( IndexType globalIdx = begin; globalIdx < end; globalIdx += SliceSize )
          {
             // The following is a workaround of a bug in nvcc 11.2
 #if CUDART_VERSION == 11020
-            f( segmentIdx, localIdx, globalIdx, compute );
+            f( segmentIdx, localIdx, globalIdx );
             localIdx++;
 #else
-            f( segmentIdx, localIdx++, globalIdx, compute );
+            f( segmentIdx, localIdx++, globalIdx );
 #endif
          }
       };
@@ -317,7 +316,7 @@ template< typename Device,
    template< typename Function >
 void
 SlicedEllpackView< Device, Index, Organization, SliceSize >::
-forEachSegment( Function&& f ) const
+forAllSegments( Function&& f ) const
 {
    this->forSegments( 0, this->getSegmentsCount(), f );
 }
@@ -326,18 +325,18 @@ template< typename Device,
           typename Index,
           ElementsOrganization Organization,
           int SliceSize >
-   template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args >
+   template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real >
 void
 SlicedEllpackView< Device, Index, Organization, SliceSize >::
-segmentsReduction( IndexType first, IndexType last, Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const
+reduceSegments( IndexType first, IndexType last, Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero ) const
 {
    using RealType = typename detail::FetchLambdaAdapter< Index, Fetch >::ReturnType;
-   //using RealType = decltype( fetch( IndexType(), IndexType(), IndexType(), std::declval< bool& >(), args... ) );
+   //using RealType = decltype( fetch( IndexType(), IndexType(), IndexType(), std::declval< bool& >() ) );
    const auto sliceSegmentSizes_view = this->sliceSegmentSizes.getConstView();
    const auto sliceOffsets_view = this->sliceOffsets.getConstView();
    if( Organization == RowMajorOrder )
    {
-      auto l = [=] __cuda_callable__ ( const IndexType segmentIdx, Args... args ) mutable {
+      auto l = [=] __cuda_callable__ ( const IndexType segmentIdx ) mutable {
          const IndexType sliceIdx = segmentIdx / SliceSize;
          const IndexType segmentInSliceIdx = segmentIdx % SliceSize;
          const IndexType segmentSize = sliceSegmentSizes_view[ sliceIdx ];
@@ -350,11 +349,11 @@ segmentsReduction( IndexType first, IndexType last, Fetch& fetch, const Reductio
             aux = reduction( aux, detail::FetchLambdaAdapter< IndexType, Fetch >::call( fetch, segmentIdx, localIdx++, globalIdx, compute ) );
          keeper( segmentIdx, aux );
       };
-      Algorithms::ParallelFor< Device >::exec( first, last, l, args... );
+      Algorithms::ParallelFor< Device >::exec( first, last, l );
    }
    else
    {
-      auto l = [=] __cuda_callable__ ( const IndexType segmentIdx, Args... args ) mutable {
+      auto l = [=] __cuda_callable__ ( const IndexType segmentIdx ) mutable {
          const IndexType sliceIdx = segmentIdx / SliceSize;
          const IndexType segmentInSliceIdx = segmentIdx % SliceSize;
          //const IndexType segmentSize = sliceSegmentSizes_view[ sliceIdx ];
@@ -367,7 +366,7 @@ segmentsReduction( IndexType first, IndexType last, Fetch& fetch, const Reductio
             aux = reduction( aux, detail::FetchLambdaAdapter< IndexType, Fetch >::call( fetch, segmentIdx, localIdx++, globalIdx, compute ) );
          keeper( segmentIdx, aux );
       };
-      Algorithms::ParallelFor< Device >::exec( first, last, l, args... );
+      Algorithms::ParallelFor< Device >::exec( first, last, l );
    }
 }
 
@@ -375,12 +374,12 @@ template< typename Device,
           typename Index,
           ElementsOrganization Organization,
           int SliceSize >
-   template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args >
+   template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real >
 void
 SlicedEllpackView< Device, Index, Organization, SliceSize >::
-allReduction( Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const
+reduceAllSegments( Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero ) const
 {
-   this->segmentsReduction( 0, this->getSegmentsCount(), fetch, reduction, keeper, zero, args... );
+   this->reduceSegments( 0, this->getSegmentsCount(), fetch, reduction, keeper, zero );
 }
 
 template< typename Device,
@@ -429,6 +428,18 @@ load( File& file )
    file >> this->sliceSegmentSizes;
 }
 
+template< typename Device,
+          typename Index,
+          ElementsOrganization Organization,
+          int SliceSize >
+      template< typename Fetch >
+auto
+SlicedEllpackView< Device, Index, Organization, SliceSize >::
+print( Fetch&& fetch ) const -> SegmentsPrinter< SlicedEllpackView, Fetch >
+{
+   return SegmentsPrinter< SlicedEllpackView, Fetch >( *this, fetch );
+}
+
       } // namespace Segments
    }  // namespace Algorithms
 } // namespace TNL
diff --git a/src/TNL/Algorithms/Segments/_NamespaceDoxy.h b/src/TNL/Algorithms/Segments/_NamespaceDoxy.h
new file mode 100644
index 0000000000000000000000000000000000000000..6a9bab68a8ade5b4b3940c5a36a806c9c7a01e64
--- /dev/null
+++ b/src/TNL/Algorithms/Segments/_NamespaceDoxy.h
@@ -0,0 +1,142 @@
+/***************************************************************************
+                          _NamespaceDoxy.h -  description
+                             -------------------
+    begin                : Apr 1, 2021
+    copyright            : (C) 2021 by Tomas Oberhuber
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+#pragma once
+
+namespace TNL {
+   namespace Algorithms {
+/**
+ * \brief Namespace holding segments data structures.
+ *
+ * *Segments* represent a data structure for manipulation with several local arrays (denoted also as segments)
+ having different size in general. All the local arrays are supposed to be allocated in one contiguous global array.
+ The data structure segments offers mapping between indexes of particular local arrays and indexes
+ of the global array. In addition, one can perform parallel operations like for or flexible reduction on particular
+ local arrays.
+
+ A typical example for use of *segments* is implementation of sparse matrices. Sparse matrix like the following
+ \f[
+  \left(
+  \begin{array}{ccccc}
+   1  &  0  &  2  &  0  &  0 \\
+    0  &  0  &  5  &  0  &  0 \\
+    3  &  4  &  7  &  9  &  0 \\
+    0  &  0  &  0  &  0  & 12 \\
+   0  &  0  & 15  & 17  & 20
+  \end{array}
+  \right)
+ \f]
+ is usually first compressed which means that the zero elements are omitted to get the following "matrix":
+
+ \f[
+ \begin{array}{ccccc}
+    1  &   2  \\
+    5   \\
+    3  &   4  &  7 &  9   \\
+    12 \\
+    15 & 17  & 20
+ \end{array}
+ \f]
+ We have to store the column index of each matrix element as well in a "matrix" like this:
+ \f[
+ \begin{array}{ccccc}
+    0  &   2  \\
+    2   \\
+    0  &   1  &  2 &  3   \\
+    4 \\
+    2 & 3  & 4
+ \end{array}
+ \f]
+
+ Such "matrices" can be stored in memory in a row-wise manner in one contiguous array for performance reasons. The first "matrix" (i.e. values of the matrix elements)
+ would be stored as follows
+
+ \f[
+    \begin{array}{|cc|c|cccc|c|ccc|} 1 & 2 &  5 & 3 & 4 & 7 & 9 & 12 & 15 & 17 & 20 \end{array}
+ \f]
+
+and the second one (i.e. column indexes of the matrix values) as follows
+
+\f[
+    \begin{array}{|cc|c|cccc|c|ccc|} 0 & 2 & 2 & 0 & 1 & 2 & 3 & 4 & 2 & 3 & 4 \end{array}
+ \f]
+
+What we see above is so called [CSR sparse matrix format](https://en.wikipedia.org/wiki/Sparse_matrix#Compressed_sparse_row_(CSR,_CRS_or_Yale_format)).
+It is the most popular format for storage of sparse matrices designed for high performance. However, it may not be the most efficient format for storage
+of sparse matrices on GPUs. Therefore many other formats have been developed to get better performance. These formats often have different layout
+of the matrix elements in the memory. They have to deal especially with two difficulties:
+
+1. Efficient storage of matrix elements in the memory to fulfill the requirements of coalesced memory accesses on GPUs or good spatial locality
+ for efficient use of caches on CPUs.
+2. Efficient mapping of GPU threads to different matrix rows.
+
+Necessity of working with this kind of data structure is not limited only to sparse matrices. We could name at least few others:
+
+1. Efficient storage of [graphs](https://en.wikipedia.org/wiki/Graph_(discrete_mathematics)) - one segment represents one graph node,
+   the elements in one segment are indexes of its neighbors.
+2. [Unstructured numerical meshes](https://en.wikipedia.org/wiki/Types_of_mesh) - unstructured numerical mesh is a graph in fact.
+3. [Particle in cell method](https://en.wikipedia.org/wiki/Particle-in-cell) - one segment represents one cell, the elements in one segment
+   are indexes of the particles.
+4. [K-means clustering](https://en.wikipedia.org/wiki/K-means_clustering) - one segment represents one cluster, the elements represent vectors
+   belonging to given cluster.
+5. [Hashing](https://arxiv.org/abs/1907.02900) - segments are particular rows of the hash table, elements in segments correspond to colliding
+   hashed elements.
+
+In general, segments can be used for problems that somehow correspond to a 2D data structure where each row can have different size and we need
+to perform miscellaneous operations within the rows. The name *segments* comes from segmented parallel reduction or
+[segmented scan (prefix-sum)](https://en.wikipedia.org/wiki/Segmented_scan).
+
+The following example demonstrates the essence of *segments* in TNL:
+
+\includelineno Algorithms/Segments/SegmentsExample_General.cpp
+
+We demonstrate two formats of segments - \ref TNL::Algorithms::Segments::CSR and \ref TNL::Algorithms::Segments::Ellpack running on both CPU and GPU
+(lines 58-76). For each of them, we call function `SegmentsExample` which first creates given segments (line 18). The segments are defined by the sizes of
+particular segments.
+
+Next we allocate array with data related to the segments (line 24). The number of elements managed by the segments is given by
+\ref TNL::Algorithms::Segments::CSR::getStorageSize and \ref TNL::Algorithms::Segments::Ellpack::getStorageSize respectively.
+
+Next we setup the segments elements (lines 29-33) by calling \ref TNL::Algorithms::Segments::CSR::forAllElements
+(and \ref TNL::Algorithms::Segments::Ellpack::forAllElements respectively) which iterates over all elements of the segments
+in parallel and performs the given lambda function. The lambda function receives index of the segment (`segmentIdx`),
+index of the element within the segment (`localIdx`), index of the element within the array `data` and a reference to boolean (`compute`) which serves as a
+hint for interrupting the iteration over the elements of given segment when it is set to `false`. The value of the elements having the local index smaller or equal
+to the segment index is set to the value of the segment index. It creates, in fact, a lower triangular matrix whose elements have values equal to the row index.
+
+Next we use a function \ref TNL::Algorithms::Segments::printSegments to print the content of the segments (lines 38-39). To do this we have to provide a lambda function
+`fetch` (line 38) which returns value of elements with given global index.
+
+Finally we show how to compute sum of all elements in each segment. Firstly, we create vector into which we will store the sums (line 44) and get its view (line 45).
+The size of the vector is given by the number of the segments which can be obtained by the means of the method \ref TNL::Algorithms::Segments::CSR::getSegmentsCount
+(and \ref TNL::Algorithms::Segments::Ellpack::getSegmentsCount respectively). The sums are computed using the method \ref TNL::Algorithms::Segments::CSR::reduceAllSegments
+(and \ref TNL::Algorithms::Segments::Ellpack::reduceAllSegments respectively) which works the same way as the flexible parallel reduction (\ref TNL::Algorithms::Reduction).
+It requires lambda functions `fetch` for reading the data related to particular elements of the segments, function `reduce` which is \ref std::plus in this case and a
+function `keep` to store the result of sums in particular segments.
+
+The result looks as follows:
+
+\include SegmentsExample_General.out
+
+Note that the Ellpack format manages more elements than we asked for. It is because some formats use padding elements for more efficient memory accesses. The padding
+elements are available to the user as well and so we must ensure that we work only with those elements we want to. This is the reason why we use the if statement on the
+line 31 when setting up the values of the elements in segments. The padding elements can be used in case when we later need more elements than we requested. However,
+the segments data structure does not allow any resizing of the segments. One can change the sizes of the segments, however, the access to the originally managed data
+is becoming invalid at that moment.
+
+*/
+
+
+
+      namespace Segments {
+
+      } // namespace Segments
+   }  // namespace Algorithms
+} // namespace TNL
diff --git a/src/TNL/Algorithms/Segments/detail/BiEllpack.h b/src/TNL/Algorithms/Segments/detail/BiEllpack.h
index a45e16d779273fb63aed38ddecedce5c2989a657..f5f51f020ae624b96643f39b3e8b414cee1c1325 100644
--- a/src/TNL/Algorithms/Segments/detail/BiEllpack.h
+++ b/src/TNL/Algorithms/Segments/detail/BiEllpack.h
@@ -31,10 +31,10 @@ class BiEllpack
       using DeviceType = Device;
       using IndexType = Index;
       static constexpr bool getOrganization() { return Organization; }
-      using OffsetsHolder = Containers::Vector< IndexType, DeviceType, IndexType >;
-      using OffsetsHolderView = typename OffsetsHolder::ViewType;
+      using OffsetsContainer = Containers::Vector< IndexType, DeviceType, IndexType >;
+      using OffsetsHolderView = typename OffsetsContainer::ConstViewType;
       using ConstOffsetsHolderView = typename OffsetsHolderView::ConstViewType;
-      using SegmentsSizes = OffsetsHolder;
+      using SegmentsSizes = OffsetsContainer;
       using SegmentViewType = BiEllpackSegmentView< IndexType, Organization >;
 
       static constexpr int getWarpSize() { return WarpSize; };
@@ -293,10 +293,10 @@ template< typename Index,
           int BlockDim = 256,
           int WarpSize = 32,
           bool HasAllParameters = detail::CheckFetchLambda< Index, Fetch >::hasAllParameters() >
-struct BiEllpackSegmentsReductionDispatcher{};
+struct BiEllpackreduceSegmentsDispatcher{};
 
 template< typename Index, typename Fetch, int BlockDim, int WarpSize >
-struct BiEllpackSegmentsReductionDispatcher< Index, Fetch, BlockDim, WarpSize, true >
+struct BiEllpackreduceSegmentsDispatcher< Index, Fetch, BlockDim, WarpSize, true >
 {
    template< typename View,
              typename Reduction,
@@ -314,12 +314,12 @@ struct BiEllpackSegmentsReductionDispatcher< Index, Fetch, BlockDim, WarpSize, t
                      Real zero,
                      Args... args )
    {
-      biEllpack.template segmentsReductionKernelWithAllParameters< Fetch, Reduction, ResultKeeper, Real, BlockDim, Args... >( gridIdx, first, last, fetch, reduction, keeper, zero, args... );
+      biEllpack.template reduceSegmentsKernelWithAllParameters< Fetch, Reduction, ResultKeeper, Real, BlockDim, Args... >( gridIdx, first, last, fetch, reduction, keeper, zero, args... );
    }
 };
 
 template< typename Index, typename Fetch, int BlockDim, int WarpSize >
-struct BiEllpackSegmentsReductionDispatcher< Index, Fetch, BlockDim, WarpSize, false >
+struct BiEllpackreduceSegmentsDispatcher< Index, Fetch, BlockDim, WarpSize, false >
 {
    template< typename View,
              typename Reduction,
@@ -337,7 +337,7 @@ struct BiEllpackSegmentsReductionDispatcher< Index, Fetch, BlockDim, WarpSize, f
                      Real zero,
                      Args... args )
    {
-      biEllpack.template segmentsReductionKernel< Fetch, Reduction, ResultKeeper, Real, BlockDim, Args... >( gridIdx, first, last, fetch, reduction, keeper, zero, args... );
+      biEllpack.template reduceSegmentsKernel< Fetch, Reduction, ResultKeeper, Real, BlockDim, Args... >( gridIdx, first, last, fetch, reduction, keeper, zero, args... );
    }
 };
 
@@ -350,7 +350,7 @@ template< typename View,
           int BlockDim,
           typename... Args >
 __global__
-void BiEllpackSegmentsReductionKernel( View biEllpack,
+void BiEllpackreduceSegmentsKernel( View biEllpack,
                                        Index gridIdx,
                                        Index first,
                                        Index last,
@@ -360,7 +360,7 @@ void BiEllpackSegmentsReductionKernel( View biEllpack,
                                        Real zero,
                                        Args... args )
 {
-   BiEllpackSegmentsReductionDispatcher< Index, Fetch, BlockDim >::exec( biEllpack, gridIdx, first, last, fetch, reduction, keeper, zero, args... );
+   BiEllpackreduceSegmentsDispatcher< Index, Fetch, BlockDim >::exec( biEllpack, gridIdx, first, last, fetch, reduction, keeper, zero, args... );
 }
 #endif
 
diff --git a/src/TNL/Algorithms/Segments/detail/CSR.h b/src/TNL/Algorithms/Segments/detail/CSR.h
index e43a97b671757586449ee76b811872e142209252..b6ce3b6e45428aa6a5ddb15311536ca4810097e7 100644
--- a/src/TNL/Algorithms/Segments/detail/CSR.h
+++ b/src/TNL/Algorithms/Segments/detail/CSR.h
@@ -105,10 +105,10 @@ class CSR
        * \brief Go over all segments and perform a reduction in each of them.
        */
       template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args >
-      void segmentsReduction( IndexType first, IndexType last, Fetch& fetch, Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const;
+      void reduceSegments( IndexType first, IndexType last, Fetch& fetch, Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const;
 
       template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args >
-      void allReduction( Fetch& fetch, Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const;
+      void reduceAllSegments( Fetch& fetch, Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const;
 };
          } // namespace detail
       } // namespace Segments
diff --git a/src/TNL/Algorithms/Segments/detail/CSRAdaptiveKernelParameters.h b/src/TNL/Algorithms/Segments/detail/CSRAdaptiveKernelParameters.h
index f11668c2d348371917e04518124f2bea846f8fae..4af0197d26030e538cb9009daf97770b213c2640 100644
--- a/src/TNL/Algorithms/Segments/detail/CSRAdaptiveKernelParameters.h
+++ b/src/TNL/Algorithms/Segments/detail/CSRAdaptiveKernelParameters.h
@@ -93,7 +93,7 @@ struct CSRAdaptiveKernelParameters
 
 template< int SizeOfValue,
           int StreamedSharedMemory_ >
-constexpr int 
+constexpr int
 CSRAdaptiveKernelParameters< SizeOfValue, StreamedSharedMemory_ >::
 getSizeValueLogConstexpr( const int i )
 {
diff --git a/src/TNL/Algorithms/Segments/detail/ChunkedEllpack.h b/src/TNL/Algorithms/Segments/detail/ChunkedEllpack.h
index 5f47b0cafc9ac77e4b5d7257b74ad05b040e3651..ed6163f3fac3251d08971cd49c63743331f6402d 100644
--- a/src/TNL/Algorithms/Segments/detail/ChunkedEllpack.h
+++ b/src/TNL/Algorithms/Segments/detail/ChunkedEllpack.h
@@ -62,13 +62,13 @@ class ChunkedEllpack
       using DeviceType = Device;
       using IndexType = Index;
       static constexpr ElementsOrganization getOrganization() { return Organization; }
-      using OffsetsHolder = Containers::Vector< IndexType, DeviceType, IndexType >;
-      using OffsetsHolderView = typename OffsetsHolder::ViewType;
-      using SegmentsSizes = OffsetsHolder;
+      using OffsetsContainer = Containers::Vector< IndexType, DeviceType, IndexType >;
+      using OffsetsHolderView = typename OffsetsContainer::ConstViewType;
+      using SegmentsSizes = OffsetsContainer;
       using ChunkedEllpackSliceInfoType = detail::ChunkedEllpackSliceInfo< IndexType >;
       using ChunkedEllpackSliceInfoAllocator = typename Allocators::Default< Device >::template Allocator< ChunkedEllpackSliceInfoType >;
       using ChunkedEllpackSliceInfoContainer = Containers::Array< ChunkedEllpackSliceInfoType, DeviceType, IndexType, ChunkedEllpackSliceInfoAllocator >;
-      using ChunkedEllpackSliceInfoContainerView = typename ChunkedEllpackSliceInfoContainer::ViewType;
+      using ChunkedEllpackSliceInfoContainerView = typename ChunkedEllpackSliceInfoContainer::ConstViewType;
       using SegmentViewType = ChunkedEllpackSegmentView< IndexType, Organization >;
 
       __cuda_callable__ static
@@ -234,10 +234,10 @@ class ChunkedEllpack
 template< typename Index,
           typename Fetch,
           bool HasAllParameters = detail::CheckFetchLambda< Index, Fetch >::hasAllParameters() >
-struct ChunkedEllpackSegmentsReductionDispatcher{};
+struct ChunkedEllpackreduceSegmentsDispatcher{};
 
 template< typename Index, typename Fetch >
-struct ChunkedEllpackSegmentsReductionDispatcher< Index, Fetch, true >
+struct ChunkedEllpackreduceSegmentsDispatcher< Index, Fetch, true >
 {
    template< typename View,
              typename Reduction,
@@ -255,12 +255,12 @@ struct ChunkedEllpackSegmentsReductionDispatcher< Index, Fetch, true >
                      Real zero,
                      Args... args )
    {
-      chunkedEllpack.segmentsReductionKernelWithAllParameters( gridIdx, first, last, fetch, reduction, keeper, zero, args... );
+      chunkedEllpack.reduceSegmentsKernelWithAllParameters( gridIdx, first, last, fetch, reduction, keeper, zero, args... );
    }
 };
 
 template< typename Index, typename Fetch >
-struct ChunkedEllpackSegmentsReductionDispatcher< Index, Fetch, false >
+struct ChunkedEllpackreduceSegmentsDispatcher< Index, Fetch, false >
 {
    template< typename View,
              typename Reduction,
@@ -278,7 +278,7 @@ struct ChunkedEllpackSegmentsReductionDispatcher< Index, Fetch, false >
                      Real zero,
                      Args... args )
    {
-      chunkedEllpack.segmentsReductionKernel( gridIdx, first, last, fetch, reduction, keeper, zero, args... );
+      chunkedEllpack.reduceSegmentsKernel( gridIdx, first, last, fetch, reduction, keeper, zero, args... );
    }
 };
 
@@ -290,7 +290,7 @@ template< typename View,
           typename Real,
           typename... Args >
 __global__
-void ChunkedEllpackSegmentsReductionKernel( View chunkedEllpack,
+void ChunkedEllpackreduceSegmentsKernel( View chunkedEllpack,
                                             Index gridIdx,
                                             Index first,
                                             Index last,
@@ -300,7 +300,7 @@ void ChunkedEllpackSegmentsReductionKernel( View chunkedEllpack,
                                             Real zero,
                                             Args... args )
 {
-   ChunkedEllpackSegmentsReductionDispatcher< Index, Fetch >::exec( chunkedEllpack, gridIdx, first, last, fetch, reduction, keeper, zero, args... );
+   ChunkedEllpackreduceSegmentsDispatcher< Index, Fetch >::exec( chunkedEllpack, gridIdx, first, last, fetch, reduction, keeper, zero, args... );
 }
 #endif
 
diff --git a/src/TNL/Algorithms/SequentialFor.h b/src/TNL/Algorithms/SequentialFor.h
new file mode 100644
index 0000000000000000000000000000000000000000..ea783ca33dbfb875c15b55be12fe3c0aa2480c0e
--- /dev/null
+++ b/src/TNL/Algorithms/SequentialFor.h
@@ -0,0 +1,54 @@
+/***************************************************************************
+                          SequentialFor.h  -  description
+                             -------------------
+    begin                : Apr 5, 2021
+    copyright            : (C) 2021 by Tomas Oberhuber et al.
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+#pragma once
+
+#include <TNL/Algorithms/ParallelFor.h>
+
+
+namespace TNL {
+   namespace Algorithms {
+
+/**
+ * \brief Wrapper to ParallelFor which makes it run sequentially.
+ *
+ *  It is helpful for debugging or just sequential for loops on GPUs.
+ */
+template< typename Device = Devices::Sequential >
+struct SequentialFor
+{
+   /**
+    * \brief Static method for execution of the loop.
+    *
+    * \tparam Index defines the type of indexes over which the loop iterates.
+    * \tparam Function is the type of function to be called in each iteration.
+    *
+    * \param start the for-loop iterates over index interval [start, end).
+    * \param end the for-loop iterates over index interval [start, end).
+    * \param f is the function to be called in each iteration
+    *
+    * \par Example
+    * \include Algorithms/SequentialForExample.cpp
+    * \par Output
+    * \include SequentialForExample.out
+    *
+    */
+   template< typename Index,
+             typename Function >
+   static void exec( Index start, Index end, Function f )
+   {
+      for( Index i = start; i < end; i++ )
+         ParallelFor< Device >::exec( i, i + 1, f );
+   }
+};
+
+
+   } // namespace Algorithms
+} // namespace TNL
\ No newline at end of file
diff --git a/src/TNL/Containers/ArrayView.hpp b/src/TNL/Containers/ArrayView.hpp
index cd0a4c537b758401e9e5e0dc6679a822807f4f90..1562508a8bf81a30faf59d566a8f34c302b5d051 100644
--- a/src/TNL/Containers/ArrayView.hpp
+++ b/src/TNL/Containers/ArrayView.hpp
@@ -49,7 +49,7 @@ void
 ArrayView< Value, Device, Index >::
 bind( ValueType* data, IndexType size )
 {
-   TNL_ASSERT_GE( size, 0, "ArrayView size was initialized with a negative size." );
+   TNL_ASSERT_GE( size, ( IndexType ) 0, "ArrayView size was initialized with a negative size." );
    TNL_ASSERT_TRUE( (data == nullptr && size == 0) || (data != nullptr && size > 0),
                     "ArrayView was initialized with a positive address and zero size or zero address and positive size." );
 
diff --git a/src/TNL/Containers/detail/ArrayAssignment.h b/src/TNL/Containers/detail/ArrayAssignment.h
index c6ac6cb1c354ee7c0950b55c632cc30ab4819b86..ee1487c769d854b9ce0e0b24245fc4c389c27b77 100644
--- a/src/TNL/Containers/detail/ArrayAssignment.h
+++ b/src/TNL/Containers/detail/ArrayAssignment.h
@@ -38,7 +38,7 @@ struct ArrayAssignment< Array, T, true >
 
    static void assign( Array& a, const T& t )
    {
-      TNL_ASSERT_EQ( a.getSize(), t.getSize(), "The sizes of the arrays must be equal." );
+      TNL_ASSERT_EQ( a.getSize(), ( decltype( a.getSize() ) ) t.getSize(), "The sizes of the arrays must be equal." );
       // skip assignment of empty arrays
       if( a.getSize() == 0 )
          return;
diff --git a/src/TNL/Cuda/LaunchHelpers.h b/src/TNL/Cuda/LaunchHelpers.h
index a9e8bc1683111b75312e5e334391a0192be22b2f..278f23da5e75527b91e727caec236bc42257c022 100644
--- a/src/TNL/Cuda/LaunchHelpers.h
+++ b/src/TNL/Cuda/LaunchHelpers.h
@@ -22,6 +22,21 @@ inline constexpr std::size_t getMaxGridSize()
    return 65535;
 }
 
+inline constexpr size_t getMaxGridXSize()
+{
+   return 2147483647; // 2^31 - 1; the former value 65535 was left commented out here (NOTE(review): presumably the CUDA grid x-dimension limit - confirm)
+}
+
+inline constexpr size_t getMaxGridYSize()
+{
+   return 65535;
+}
+
+inline constexpr size_t getMaxGridZSize()
+{
+   return 65535;
+}
+
 inline constexpr int getMaxBlockSize()
 {
    return 1024;
diff --git a/src/TNL/Matrices/DenseMatrix.h b/src/TNL/Matrices/DenseMatrix.h
index a65c12d80bcb0475198714b5adf597d45e7a926c..3d0e2d62da85d102c502a5770cbe3fc2be013408 100644
--- a/src/TNL/Matrices/DenseMatrix.h
+++ b/src/TNL/Matrices/DenseMatrix.h
@@ -457,10 +457,12 @@ class DenseMatrix : public Matrix< Real, Device, Index, RealAllocator >
        *
        * \tparam Function is type of lambda function that will operate on matrix elements.
        *    It is should have form like
-       *  `function( IndexType rowIdx, IndexType columnIdx, IndexType columnIdx_, const RealType& value, bool& compute )`.
+       *
+       * ```
+       * auto function = [=] __cuda_callable__ ( IndexType rowIdx, IndexType columnIdx, IndexType columnIdx_, const RealType& value ) { ... };
+       * ```
+       *
        *  The column index repeats twice only for compatibility with sparse matrices.
-       *  If the 'compute' variable is set to false the iteration over the row can
-       *  be interrupted.
        *
        * \param begin defines beginning of the range [begin,end) of rows to be processed.
        * \param end defines ending of the range [begin,end) of rows to be processed.
@@ -479,10 +481,12 @@ class DenseMatrix : public Matrix< Real, Device, Index, RealAllocator >
        *
        * \tparam Function is type of lambda function that will operate on matrix elements.
        *    It is should have form like
-       *  `function( IndexType rowIdx, IndexType columnIdx, IndexType columnIdx_, RealType& value, bool& compute )`.
+       *
+       * ```
+       * auto function = [=] __cuda_callable__ ( IndexType rowIdx, IndexType columnIdx, IndexType columnIdx_, RealType& value ) { ... };
+       * ```
+       *
        *  The column index repeats twice only for compatibility with sparse matrices.
-       *  If the 'compute' variable is set to false the iteration over the row can
-       *  be interrupted.
        *
        * \param begin defines beginning of the range [begin,end) of rows to be processed.
        * \param end defines ending of the range [begin,end) of rows to be processed.
@@ -541,10 +545,10 @@ class DenseMatrix : public Matrix< Real, Device, Index, RealAllocator >
        * \param function is an instance of the lambda function to be called for each row.
        *
        * ```
-       * auto function = [] __cuda_callable__ ( RowViewType& row ) mutable { ... };
+       * auto function = [] __cuda_callable__ ( RowView& row ) mutable { ... };
        * ```
        *
-       * \e RowViewType represents matrix row - see \ref TNL::Matrices::DenseMatrix::RowViewType.
+       * \e RowView represents matrix row - see \ref TNL::Matrices::DenseMatrix::RowView.
        *
        * \par Example
        * \include Matrices/DenseMatrix/DenseMatrixExample_forRows.cpp
@@ -567,10 +571,10 @@ class DenseMatrix : public Matrix< Real, Device, Index, RealAllocator >
        * \param function is an instance of the lambda function to be called for each row.
        *
        * ```
-       * auto function = [] __cuda_callable__ ( RowViewType& row ) { ... };
+       * auto function = [] __cuda_callable__ ( RowView& row ) { ... };
        * ```
        *
-       * \e RowViewType represents matrix row - see \ref TNL::Matrices::DenseMatrix::RowViewType.
+       * \e RowView represents matrix row - see \ref TNL::Matrices::DenseMatrix::RowView.
        *
        * \par Example
        * \include Matrices/DenseMatrix/DenseMatrixExample_forRows.cpp
@@ -591,10 +595,10 @@ class DenseMatrix : public Matrix< Real, Device, Index, RealAllocator >
        * \param function is an instance of the lambda function to be called for each row.
        *
        * ```
-       * auto function = [] __cuda_callable__ ( RowViewType& row ) mutable { ... };
+       * auto function = [] __cuda_callable__ ( RowView& row ) mutable { ... };
        * ```
        *
-       * \e RowViewType represents matrix row - see \ref TNL::Matrices::DenseMatrix::RowViewType.
+       * \e RowView represents matrix row - see \ref TNL::Matrices::DenseMatrix::RowView.
        *
        * \par Example
        * \include Matrices/DenseMatrix/DenseMatrixExample_forRows.cpp
@@ -615,10 +619,10 @@ class DenseMatrix : public Matrix< Real, Device, Index, RealAllocator >
        * \param function is an instance of the lambda function to be called for each row.
        *
        * ```
-       * auto function = [] __cuda_callable__ ( RowViewType& row ) { ... };
+       * auto function = [] __cuda_callable__ ( RowView& row ) { ... };
        * ```
        *
-       * \e RowViewType represents matrix row - see \ref TNL::Matrices::DenseMatrix::RowViewType.
+       * \e RowView represents matrix row - see \ref TNL::Matrices::DenseMatrix::RowView.
        *
        * \par Example
        * \include Matrices/DenseMatrix/DenseMatrixExample_forRows.cpp
@@ -633,10 +637,12 @@ class DenseMatrix : public Matrix< Real, Device, Index, RealAllocator >
        *
        * \tparam Function is type of lambda function that will operate on matrix elements.
        *    It is should have form like
-       *  `function( IndexType rowIdx, IndexType columnIdx, IndexType columnIdx_, const RealType& value, bool& compute )`.
-       *  The column index repeats twice only for compatibility with sparse matrices.
-       *  If the 'compute' variable is set to false the iteration over the row can
-       *  be interrupted.
+       *
+       * ```
+       * auto function = [] __cuda_callable__ ( RowView& row ) { ... };
+       * ```
+       *
+       * \e RowView represents matrix row - see \ref TNL::Matrices::DenseMatrix::RowView.
        *
        * \param begin defines beginning of the range [begin,end) of rows to be processed.
        * \param end defines ending of the range [begin,end) of rows to be processed.
@@ -650,10 +656,12 @@ class DenseMatrix : public Matrix< Real, Device, Index, RealAllocator >
        *
        * \tparam Function is type of lambda function that will operate on matrix elements.
        *    It is should have form like
-       *  `function( IndexType rowIdx, IndexType columnIdx, IndexType columnIdx_, RealType& value, bool& compute )`.
-       *  The column index repeats twice only for compatibility with sparse matrices.
-       *  If the 'compute' variable is set to false the iteration over the row can
-       *  be interrupted.
+       *
+       * ```
+       * auto function = [] __cuda_callable__ ( RowView& row ) { ... };
+       * ```
+       *
+       * \e RowView represents matrix row - see \ref TNL::Matrices::DenseMatrix::RowView.
        *
        * \param begin defines beginning of the range [begin,end) of rows to be processed.
        * \param end defines ending of the range [begin,end) of rows to be processed.
@@ -688,12 +696,25 @@ class DenseMatrix : public Matrix< Real, Device, Index, RealAllocator >
        * \brief Method for performing general reduction on matrix rows.
        *
        * \tparam Fetch is a type of lambda function for data fetch declared as
-       *          `fetch( IndexType rowIdx, IndexType columnIdx, RealType elementValue ) -> FetchValue`.
-       *          The return type of this lambda can be any non void.
+       *
+       * ```
+       * auto fetch = [=] __cuda_callable__ ( IndexType rowIdx, IndexType columnIdx, RealType elementValue ) -> FetchValue { ... };
+       * ```
+       *
+       * The return type of this lambda can be any non void.
        * \tparam Reduce is a type of lambda function for reduction declared as
-       *          `reduce( const FetchValue& v1, const FetchValue& v2 ) -> FetchValue`.
+       *
+       * ```
+       * auto reduce = [=] __cuda_callable__ ( const FetchValue& v1, const FetchValue& v2 ) -> FetchValue { ... };
+       * ```
+       *
        * \tparam Keep is a type of lambda function for storing results of reduction in each row.
-       *          It is declared as `keep( const IndexType rowIdx, const double& value )`.
+       *  It is declared as
+       *
+       * ```
+       * auto keep = [=] __cuda_callable__ ( const IndexType rowIdx, const double& value ) { ... };
+       * ```
+       *
        * \tparam FetchValue is type returned by the Fetch lambda function.
        *
        * \param begin defines beginning of the range [begin,end) of rows to be processed.
@@ -717,12 +738,25 @@ class DenseMatrix : public Matrix< Real, Device, Index, RealAllocator >
        * \brief Method for performing general reduction on matrix rows for constant instances.
        *
        * \tparam Fetch is a type of lambda function for data fetch declared as
-       *          `fetch( IndexType rowIdx, IndexType columnIdx, RealType elementValue ) -> FetchValue`.
-       *          The return type of this lambda can be any non void.
+       *
+       * ```
+       * auto fetch = [=] __cuda_callable__ ( IndexType rowIdx, IndexType columnIdx, RealType elementValue ) -> FetchValue { ... };
+       * ```
+       *
+       * The return type of this lambda can be any non void.
        * \tparam Reduce is a type of lambda function for reduction declared as
-       *          `reduce( const FetchValue& v1, const FetchValue& v2 ) -> FetchValue`.
+       *
+       * ```
+       * auto reduce = [=] __cuda_callable__ ( const FetchValue& v1, const FetchValue& v2 ) -> FetchValue { ... };
+       * ```
+       *
        * \tparam Keep is a type of lambda function for storing results of reduction in each row.
-       *          It is declared as `keep( const IndexType rowIdx, const double& value )`.
+       *          It is declared as
+       *
+       * ```
+       * auto keep = [=] __cuda_callable__ ( const IndexType rowIdx, const double& value ) { ... };
+       * ```
+       *
        * \tparam FetchValue is type returned by the Fetch lambda function.
        *
        * \param begin defines beginning of the range [begin,end) of rows to be processed.
@@ -746,12 +780,24 @@ class DenseMatrix : public Matrix< Real, Device, Index, RealAllocator >
        * \brief Method for performing general reduction on ALL matrix rows.
        *
        * \tparam Fetch is a type of lambda function for data fetch declared as
-       *          `fetch( IndexType rowIdx, IndexType columnIdx, RealType elementValue ) -> FetchValue`.
-       *          The return type of this lambda can be any non void.
+       *
+       * ```
+       * auto fetch = [=] __cuda_callable__ ( IndexType rowIdx, IndexType columnIdx, RealType elementValue ) -> FetchValue { ... };
+       * ```
+       *
+       * The return type of this lambda can be any non void.
        * \tparam Reduce is a type of lambda function for reduction declared as
-       *          `reduce( const FetchValue& v1, const FetchValue& v2 ) -> FetchValue`.
+       *
+       * ```
+       * auto reduce = [=] __cuda_callable__ ( const FetchValue& v1, const FetchValue& v2 ) -> FetchValue { ... };
+       * ```
+       *
        * \tparam Keep is a type of lambda function for storing results of reduction in each row.
-       *          It is declared as `keep( const IndexType rowIdx, const double& value )`.
+       *          It is declared as
+       * ```
+       * auto keep = [=] __cuda_callable__ ( const IndexType rowIdx, const double& value ) { ... };
+       * ```
+       *
        * \tparam FetchValue is type returned by the Fetch lambda function.
        *
        * \param fetch is an instance of lambda function for data fetch.
@@ -773,12 +819,25 @@ class DenseMatrix : public Matrix< Real, Device, Index, RealAllocator >
        * \brief Method for performing general reduction on ALL matrix rows for constant instances.
        *
        * \tparam Fetch is a type of lambda function for data fetch declared as
-       *          `fetch( IndexType rowIdx, IndexType columnIdx, RealType elementValue ) -> FetchValue`.
+       *
+       * ```
+       * auto fetch = [=] __cuda_callable__ ( IndexType rowIdx, IndexType columnIdx, RealType elementValue ) -> FetchValue { ... };
+       * ```
+       *
        *          The return type of this lambda can be any non void.
        * \tparam Reduce is a type of lambda function for reduction declared as
-       *          `reduce( const FetchValue& v1, const FetchValue& v2 ) -> FetchValue`.
+       *
+       * ```
+       * auto reduce = [=] __cuda_callable__ ( const FetchValue& v1, const FetchValue& v2 ) -> FetchValue { ... };
+       * ```
+       *
        * \tparam Keep is a type of lambda function for storing results of reduction in each row.
-       *          It is declared as `keep( const IndexType rowIdx, const double& value )`.
+       *          It is declared as
+       *
+       * ```
+       * auto keep = [=] __cuda_callable__ ( const IndexType rowIdx, const double& value ) { ... };
+       * ```
+       *
        * \tparam FetchValue is type returned by the Fetch lambda function.
        *
        * \param fetch is an instance of lambda function for data fetch.
@@ -801,7 +860,9 @@ class DenseMatrix : public Matrix< Real, Device, Index, RealAllocator >
        *
        * More precisely, it computes:
        *
-       * `outVector = matrixMultiplicator * ( *this ) * inVector + outVectorMultiplicator * outVector`
+       * ```
+       * outVector = matrixMultiplicator * ( *this ) * inVector + outVectorMultiplicator * outVector
+       * ```
        *
        * \tparam InVector is type of input vector.  It can be \ref Vector,
        *     \ref VectorView, \ref Array, \ref ArraView or similar container.
@@ -913,7 +974,7 @@ class DenseMatrix : public Matrix< Real, Device, Index, RealAllocator >
        * \return \e true if the RHS matrix is equal, \e false otherwise.
        */
       template< typename Real_, typename Device_, typename Index_, typename RealAllocator_ >
-      bool operator==( const DenseMatrix< Real_, Device_, Index_, Organization >& matrix ) const;
+      bool operator==( const DenseMatrix< Real_, Device_, Index_, Organization, RealAllocator_ >& matrix ) const;
 
       /**
        * \brief Comparison operator with another dense matrix.
@@ -922,7 +983,43 @@ class DenseMatrix : public Matrix< Real, Device, Index, RealAllocator >
        * \return \e false if the RHS matrix is equal, \e true otherwise.
        */
       template< typename Real_, typename Device_, typename Index_, typename RealAllocator_ >
-      bool operator!=( const DenseMatrix< Real_, Device_, Index_, Organization >& matrix ) const;
+      bool operator!=( const DenseMatrix< Real_, Device_, Index_, Organization, RealAllocator_ >& matrix ) const;
+
+      /**
+       * \brief Comparison operator with another dense matrix view.
+       *
+       * \param matrix is the right-hand side matrix view.
+       * \return \e true if the RHS matrix view is equal, \e false otherwise.
+       */
+      template< typename Real_, typename Device_, typename Index_ >
+      bool operator==( const DenseMatrixView< Real_, Device_, Index_, Organization >& matrix ) const;
+
+      /**
+       * \brief Comparison operator with another dense matrix view.
+       *
+       * \param matrix is the right-hand side matrix view.
+       * \return \e false if the RHS matrix view is equal, \e true otherwise.
+       */
+      template< typename Real_, typename Device_, typename Index_ >
+      bool operator!=( const DenseMatrixView< Real_, Device_, Index_, Organization >& matrix ) const;
+
+      /**
+       * \brief Comparison operator with another arbitrary matrix type.
+       *
+       * \param m is the right-hand side matrix.
+       * \return \e true if the RHS matrix is equal, \e false otherwise.
+       */
+      template< typename Matrix >
+      bool operator==( const Matrix& m ) const;
+
+      /**
+       * \brief Comparison operator with another arbitrary matrix type.
+       *
+       * \param m is the right-hand side matrix.
+       * \return \e false if the RHS matrix is equal, \e true otherwise.
+       */
+      template< typename Matrix >
+      bool operator!=( const Matrix& m ) const;
 
       /**
        * \brief Method for saving the matrix to the file with given filename.
@@ -984,6 +1081,33 @@ template< typename Real,
           typename RealAllocator >
 std::ostream& operator<< ( std::ostream& str, const DenseMatrix< Real, Device, Index, Organization, RealAllocator >& matrix );
 
+/**
+ * \brief Comparison operator of a dense matrix view with a dense matrix.
+ *
+ * \param leftMatrix is the left-hand side matrix view.
+ * \param rightMatrix is the right-hand side matrix.
+ * \return \e true if both matrices are equal, \e false otherwise.
+ */
+template< typename Real, typename Device, typename Index,
+          typename Real_, typename Device_, typename Index_,
+          ElementsOrganization Organization, typename RealAllocator >
+bool operator==( const DenseMatrixView< Real, Device, Index, Organization >& leftMatrix,
+                 const DenseMatrix< Real_, Device_, Index_, Organization, RealAllocator >& rightMatrix );
+
+/**
+ * \brief Comparison operator of a dense matrix view with a dense matrix.
+ *
+ * \param leftMatrix is the left-hand side matrix view.
+ * \param rightMatrix is the right-hand side matrix.
+ * \return \e false if both matrices are equal, \e true otherwise.
+ */
+template< typename Real, typename Device, typename Index,
+          typename Real_, typename Device_, typename Index_,
+          ElementsOrganization Organization, typename RealAllocator >
+bool operator!=( const DenseMatrixView< Real, Device, Index, Organization >& leftMatrix,
+                 const DenseMatrix< Real_, Device_, Index_, Organization, RealAllocator >& rightMatrix );
+
+
 } // namespace Matrices
 } // namespace TNL
 
diff --git a/src/TNL/Matrices/DenseMatrix.hpp b/src/TNL/Matrices/DenseMatrix.hpp
index 16b844cda7eebc28412b164e33bb58393b578648..a42421aa7a661fba0ec7fde144a335322af6f157 100644
--- a/src/TNL/Matrices/DenseMatrix.hpp
+++ b/src/TNL/Matrices/DenseMatrix.hpp
@@ -162,7 +162,7 @@ setDimensions( const IndexType rows,
 {
    Matrix< Real, Device, Index, RealAllocator >::setDimensions( rows, columns );
    this->segments.setSegmentsSizes( rows, columns );
-   this->values.setSize( rows * columns );
+   this->values.setSize( this->segments.getStorageSize() );
    this->values = 0.0;
    this->view = this->getView();
 }
@@ -1139,7 +1139,7 @@ operator=( const DenseMatrixView< RHSReal, RHSDevice, RHSIndex, RHSOrganization
    auto this_view = this->view;
    if( std::is_same< DeviceType, RHSDeviceType >::value )
    {
-      auto f = [=] __cuda_callable__ ( RHSIndexType rowIdx, RHSIndexType localIdx, RHSIndexType columnIdx, const RHSRealType& value, bool& compute ) mutable {
+      auto f = [=] __cuda_callable__ ( RHSIndexType rowIdx, RHSIndexType localIdx, RHSIndexType columnIdx, const RHSRealType& value ) mutable {
          this_view( rowIdx, columnIdx ) = value;
       };
       matrix.forAllElements( f );
@@ -1162,7 +1162,7 @@ operator=( const DenseMatrixView< RHSReal, RHSDevice, RHSIndex, RHSOrganization
 
          ////
          // Copy matrix elements into buffer
-         auto f1 = [=] __cuda_callable__ ( RHSIndexType rowIdx, RHSIndexType localIdx, RHSIndexType columnIdx, const RHSRealType& value, bool& compute ) mutable {
+         auto f1 = [=] __cuda_callable__ ( RHSIndexType rowIdx, RHSIndexType localIdx, RHSIndexType columnIdx, const RHSRealType& value ) mutable {
             const IndexType bufferIdx = ( rowIdx - baseRow ) * maxRowLength + columnIdx;
             matrixValuesBuffer_view[ bufferIdx ] = value;
          };
@@ -1214,7 +1214,7 @@ operator=( const RHSMatrix& matrix )
    if( std::is_same< DeviceType, RHSDeviceType >::value )
    {
       const auto segments_view = this->segments.getView();
-      auto f = [=] __cuda_callable__ ( RHSIndexType rowIdx, RHSIndexType localIdx_, RHSIndexType columnIdx, const RHSRealType& value, bool& compute ) mutable {
+      auto f = [=] __cuda_callable__ ( RHSIndexType rowIdx, RHSIndexType localIdx_, RHSIndexType columnIdx, const RHSRealType& value ) mutable {
          if( value != 0.0 && columnIdx != padding_index )
             values_view[ segments_view.getGlobalIndex( rowIdx, columnIdx ) ] = value;
       };
@@ -1244,7 +1244,7 @@ operator=( const RHSMatrix& matrix )
 
          ////
          // Copy matrix elements into buffer
-         auto f1 = [=] __cuda_callable__ ( RHSIndexType rowIdx, RHSIndexType localIdx, RHSIndexType columnIndex, const RHSRealType& value, bool& compute ) mutable {
+         auto f1 = [=] __cuda_callable__ ( RHSIndexType rowIdx, RHSIndexType localIdx, RHSIndexType columnIndex, const RHSRealType& value ) mutable {
             if( columnIndex != padding_index )
             {
                const IndexType bufferIdx = ( rowIdx - baseRow ) * maxRowLength + localIdx;
@@ -1284,7 +1284,7 @@ template< typename Real,
    template< typename Real_, typename Device_, typename Index_, typename RealAllocator_ >
 bool
 DenseMatrix< Real, Device, Index, Organization, RealAllocator >::
-operator==( const DenseMatrix< Real_, Device_, Index_, Organization >& matrix ) const
+operator==( const DenseMatrix< Real_, Device_, Index_, Organization, RealAllocator_ >& matrix ) const
 {
    return( this->getRows() == matrix.getRows() &&
            this->getColumns() == matrix.getColumns() &&
@@ -1299,11 +1299,65 @@ template< typename Real,
    template< typename Real_, typename Device_, typename Index_, typename RealAllocator_ >
 bool
 DenseMatrix< Real, Device, Index, Organization, RealAllocator >::
-operator!=( const DenseMatrix< Real_, Device_, Index_, Organization >& matrix ) const
+operator!=( const DenseMatrix< Real_, Device_, Index_, Organization, RealAllocator_ >& matrix ) const
 {
    return ! ( *this == matrix );
 }
 
+template< typename Real,
+          typename Device,
+          typename Index,
+          ElementsOrganization Organization,
+          typename RealAllocator >
+   template< typename Real_, typename Device_, typename Index_ >
+bool
+DenseMatrix< Real, Device, Index, Organization, RealAllocator >::
+operator==( const DenseMatrixView< Real_, Device_, Index_, Organization >& matrix ) const
+{
+   return( this->getRows() == matrix.getRows() &&
+           this->getColumns() == matrix.getColumns() &&
+           this->getValues() == matrix.getValues() );
+}
+
+template< typename Real,
+          typename Device,
+          typename Index,
+          ElementsOrganization Organization,
+          typename RealAllocator >
+   template< typename Real_, typename Device_, typename Index_ >
+bool
+DenseMatrix< Real, Device, Index, Organization, RealAllocator >::
+operator!=( const DenseMatrixView< Real_, Device_, Index_, Organization >& matrix ) const
+{
+   return ! ( *this == matrix );
+}
+
+template< typename Real,
+          typename Device,
+          typename Index,
+          ElementsOrganization Organization,
+          typename RealAllocator >
+   template< typename Matrix >
+bool
+DenseMatrix< Real, Device, Index, Organization, RealAllocator >::
+operator==( const Matrix& m ) const
+{
+   return ( this->view == m );
+}
+
+template< typename Real,
+          typename Device,
+          typename Index,
+          ElementsOrganization Organization,
+          typename RealAllocator >
+   template< typename Matrix >
+bool
+DenseMatrix< Real, Device, Index, Organization, RealAllocator >::
+operator!=( const Matrix& m ) const
+{
+   return ( this->view != m );
+}
+
 template< typename Real,
           typename Device,
           typename Index,
@@ -1380,5 +1434,23 @@ std::ostream& operator<< ( std::ostream& str, const DenseMatrix< Real, Device, I
    return str;
 }
 
+template< typename Real, typename Device, typename Index,
+          typename Real_, typename Device_, typename Index_,
+          ElementsOrganization Organization, typename RealAllocator >
+bool operator==( const DenseMatrixView< Real, Device, Index, Organization >& leftMatrix,
+                 const DenseMatrix< Real_, Device_, Index_, Organization, RealAllocator >& rightMatrix )
+{
+   return rightMatrix == leftMatrix;
+}
+
+template< typename Real, typename Device, typename Index,
+          typename Real_, typename Device_, typename Index_,
+          ElementsOrganization Organization, typename RealAllocator >
+bool operator!=( const DenseMatrixView< Real, Device, Index, Organization >& leftMatrix,
+                 const DenseMatrix< Real_, Device_, Index_, Organization, RealAllocator >& rightMatrix )
+{
+   return rightMatrix != leftMatrix;
+}
+
 } // namespace Matrices
 } // namespace TNL
diff --git a/src/TNL/Matrices/DenseMatrixElement.h b/src/TNL/Matrices/DenseMatrixElement.h
index e35235fd9ed6cbb8a752dec1d024d26cda938790..c9dcc3c86ea699607621b8f31a685fc989281e70 100644
--- a/src/TNL/Matrices/DenseMatrixElement.h
+++ b/src/TNL/Matrices/DenseMatrixElement.h
@@ -17,17 +17,36 @@
 namespace TNL {
 namespace Matrices {
 
-
+/**
+ * \brief Accessor for dense matrix elements.
+ *
+ * \tparam Real is a type of matrix elements values.
+ * \tparam Index is a type of matrix elements column indexes.
+ */
 template< typename Real,
           typename Index >
 class DenseMatrixElement
 {
    public:
 
+      /**
+       * \brief Type of matrix elements values.
+       */
       using RealType = Real;
 
+      /**
+       * \brief Type of matrix elements column indexes.
+       */
       using IndexType = Index;
 
+      /**
+       * \brief Constructor.
+       *
+       * \param value is matrix element value.
+       * \param rowIdx is row index of the matrix element.
+       * \param columnIdx is a column index of the matrix element.
+       * \param localIdx is the column index of the matrix element as well.
+       */
       __cuda_callable__
       DenseMatrixElement( RealType& value,
                           const IndexType& rowIdx,
@@ -35,18 +54,43 @@ class DenseMatrixElement
                           const IndexType& localIdx )  // localIdx is here only for compatibility with SparseMatrixElement
       : value_( value ), rowIdx( rowIdx ), columnIdx( columnIdx ) {};
 
+      /**
+       * \brief Returns reference on matrix element value.
+       *
+       * \return reference on matrix element value.
+       */
       __cuda_callable__
       RealType& value() { return value_; };
 
+      /**
+       * \brief Returns constant reference on matrix element value.
+       *
+       * \return constant reference on matrix element value.
+       */
       __cuda_callable__
       const RealType& value() const { return value_; };
 
+      /**
+       * \brief Returns constant reference on matrix element row index.
+       *
+       * \return constant reference on matrix element row index.
+       */
       __cuda_callable__
       const IndexType& rowIndex() const { return rowIdx; };
 
+      /**
+       * \brief Returns constant reference on matrix element column index.
+       *
+       * \return constant reference on matrix element column index.
+       */
       __cuda_callable__
       const IndexType& columnIndex() const { return columnIdx; };
 
+      /**
+       * \brief Returns constant reference on matrix element local index, which equals the column index for dense matrices.
+       *
+       * \return constant reference on matrix element column index.
+       */
       __cuda_callable__
       const IndexType& localIndex() const { return columnIdx; };
 
diff --git a/src/TNL/Matrices/DenseMatrixView.h b/src/TNL/Matrices/DenseMatrixView.h
index ea7f6dbe74d89fee10596363fccb710089486206..89a2219b3bddcc9b9139d8984694010fa298b61b 100644
--- a/src/TNL/Matrices/DenseMatrixView.h
+++ b/src/TNL/Matrices/DenseMatrixView.h
@@ -405,12 +405,25 @@ class DenseMatrixView : public MatrixView< Real, Device, Index >
        * \brief Method for performing general reduction on matrix rows.
        *
        * \tparam Fetch is a type of lambda function for data fetch declared as
-       *          `fetch( IndexType rowIdx, IndexType columnIdx, RealType elementValue ) -> FetchValue`.
-       *          The return type of this lambda can be any non void.
+       *
+       * ```
+       * auto fetch = [=] __cuda_callable__ ( IndexType rowIdx, IndexType columnIdx, RealType elementValue ) -> FetchValue { ... };
+       * ```
+       *
+       *   The return type of this lambda can be any non void.
        * \tparam Reduce is a type of lambda function for reduction declared as
-       *          `reduce( const FetchValue& v1, const FetchValue& v2 ) -> FetchValue`.
+       *
+       * ```
+       * auto reduce = [=] __cuda_callable__ ( const FetchValue& v1, const FetchValue& v2 ) -> FetchValue { ... };
+       * ```
+       *
        * \tparam Keep is a type of lambda function for storing results of reduction in each row.
-       *          It is declared as `keep( const IndexType rowIdx, const double& value )`.
+       *          It is declared as
+       *
+       * ```
+       * auto keep = [=] __cuda_callable__ ( const IndexType rowIdx, const double& value ) { ... };
+       * ```
+       *
        * \tparam FetchValue is type returned by the Fetch lambda function.
        *
        * \param begin defines beginning of the range [begin,end) of rows to be processed.
@@ -434,12 +447,25 @@ class DenseMatrixView : public MatrixView< Real, Device, Index >
        * \brief Method for performing general reduction on matrix rows for constant instances.
        *
        * \tparam Fetch is a type of lambda function for data fetch declared as
-       *          `fetch( IndexType rowIdx, IndexType columnIdx, RealType elementValue ) -> FetchValue`.
-       *          The return type of this lambda can be any non void.
+       *
+       * ```
+       * auto fetch = [=] __cuda_callable__ ( IndexType rowIdx, IndexType columnIdx, RealType elementValue ) -> FetchValue { ... };
+       * ```
+       *
+       *  The return type of this lambda can be any non void.
        * \tparam Reduce is a type of lambda function for reduction declared as
-       *          `reduce( const FetchValue& v1, const FetchValue& v2 ) -> FetchValue`.
+       *
+       * ```
+       * auto reduce = [=] __cuda_callable__ ( const FetchValue& v1, const FetchValue& v2 ) -> FetchValue { ... };
+       * ```
+       *
        * \tparam Keep is a type of lambda function for storing results of reduction in each row.
-       *          It is declared as `keep( const IndexType rowIdx, const double& value )`.
+       *          It is declared as
+       *
+       * ```
+       * auto keep = [=] __cuda_callable__ ( const IndexType rowIdx, const double& value ) { ... };
+       * ```
+       *
        * \tparam FetchValue is type returned by the Fetch lambda function.
        *
        * \param begin defines beginning of the range [begin,end) of rows to be processed.
@@ -463,12 +489,25 @@ class DenseMatrixView : public MatrixView< Real, Device, Index >
        * \brief Method for performing general reduction on ALL matrix rows.
        *
        * \tparam Fetch is a type of lambda function for data fetch declared as
-       *          `fetch( IndexType rowIdx, IndexType columnIdx, RealType elementValue ) -> FetchValue`.
-       *          The return type of this lambda can be any non void.
+       *
+       * ```
+       * auto fetch = [=] __cuda_callable__ ( IndexType rowIdx, IndexType columnIdx, RealType elementValue ) -> FetchValue { ... };
+       * ```
+       *
+       * The return type of this lambda can be any non void.
        * \tparam Reduce is a type of lambda function for reduction declared as
-       *          `reduce( const FetchValue& v1, const FetchValue& v2 ) -> FetchValue`.
+       *
+       * ```
+       * auto reduce = [=] __cuda_callable__ ( const FetchValue& v1, const FetchValue& v2 ) -> FetchValue { ... };
+       * ```
+       *
        * \tparam Keep is a type of lambda function for storing results of reduction in each row.
-       *          It is declared as `keep( const IndexType rowIdx, const double& value )`.
+       *   It is declared as
+       *
+       * ```
+       * auto keep = [=] __cuda_callable__ ( const IndexType rowIdx, const double& value ) { ... };
+       * ```
+       *
        * \tparam FetchValue is type returned by the Fetch lambda function.
        *
        * \param fetch is an instance of lambda function for data fetch.
@@ -490,12 +529,25 @@ class DenseMatrixView : public MatrixView< Real, Device, Index >
        * \brief Method for performing general reduction on ALL matrix rows for constant instances.
        *
        * \tparam Fetch is a type of lambda function for data fetch declared as
-       *          `fetch( IndexType rowIdx, IndexType columnIdx, RealType elementValue ) -> FetchValue`.
-       *          The return type of this lambda can be any non void.
+       *
+       * ```
+       * auto fetch = [=] __cuda_callable__ ( IndexType rowIdx, IndexType columnIdx, RealType elementValue ) -> FetchValue { ... };
+       * ```
+       *
+       *  The return type of this lambda can be any non void.
        * \tparam Reduce is a type of lambda function for reduction declared as
-       *          `reduce( const FetchValue& v1, const FetchValue& v2 ) -> FetchValue`.
+       *
+       * ```
+       * auto reduce = [=] __cuda_callable__ ( const FetchValue& v1, const FetchValue& v2 ) -> FetchValue { ... };
+       * ```
+       *
        * \tparam Keep is a type of lambda function for storing results of reduction in each row.
-       *          It is declared as `keep( const IndexType rowIdx, const double& value )`.
+       *  It is declared as
+       *
+       * ```
+       * auto keep = [=] __cuda_callable__ ( const IndexType rowIdx, const double& value ) { ... };
+       * ```
+       *
        * \tparam FetchValue is type returned by the Fetch lambda function.
        *
        * \param fetch is an instance of lambda function for data fetch.
@@ -518,10 +570,12 @@ class DenseMatrixView : public MatrixView< Real, Device, Index >
        *
        * \tparam Function is type of lambda function that will operate on matrix elements.
        *    It is should have form like
-       *  `function( IndexType rowIdx, IndexType columnIdx, IndexType columnIdx, const RealType& value, bool& compute )`.
+       *
+       * ```
+       * auto function = [=] __cuda_callable__ ( IndexType rowIdx, IndexType columnIdx, IndexType columnIdx_, const RealType& value ) { ... };
+       * ```
+       *
        *  The column index repeats twice only for compatibility with sparse matrices.
-       *  If the 'compute' variable is set to false the iteration over the row can
-       *  be interrupted.
        *
        * \param begin defines beginning of the range [begin,end) of rows to be processed.
        * \param end defines ending of the range [begin,end) of rows to be processed.
@@ -540,10 +594,12 @@ class DenseMatrixView : public MatrixView< Real, Device, Index >
        *
        * \tparam Function is type of lambda function that will operate on matrix elements.
        *    It is should have form like
-       *  `function( IndexType rowIdx, IndexType columnIdx, IndexType columnIdx, RealType& value, bool& compute )`.
+       *
+       * ```
+       * auto function = [=] __cuda_callable__ ( IndexType rowIdx, IndexType columnIdx, IndexType columnIdx_, RealType& value ) { ... };
+       * ```
+       *
        *  The column index repeats twice only for compatibility with sparse matrices.
-       *  If the 'compute' variable is set to false the iteration over the row can
-       *  be interrupted.
        *
        * \param begin defines beginning of the range [begin,end) of rows to be processed.
        * \param end defines ending of the range [begin,end) of rows to be processed.
@@ -602,10 +658,10 @@ class DenseMatrixView : public MatrixView< Real, Device, Index >
        * \param function is an instance of the lambda function to be called for each row.
        *
        * ```
-       * auto function = [] __cuda_callable__ ( RowViewType& row ) mutable { ... };
+       * auto function = [] __cuda_callable__ ( RowView& row ) mutable { ... };
        * ```
        *
-       * \e RowViewType represents matrix row - see \ref TNL::Matrices::DenseMatrix::RowViewType.
+       * \e RowView represents matrix row - see \ref TNL::Matrices::DenseMatrix::RowView.
        *
        * \par Example
        * \include Matrices/DenseMatrix/DenseMatrixExample_forRows.cpp
@@ -628,10 +684,10 @@ class DenseMatrixView : public MatrixView< Real, Device, Index >
        * \param function is an instance of the lambda function to be called for each row.
        *
        * ```
-       * auto function = [] __cuda_callable__ ( RowViewType& row ) { ... };
+       * auto function = [] __cuda_callable__ ( RowView& row ) { ... };
        * ```
        *
-       * \e RowViewType represents matrix row - see \ref TNL::Matrices::DenseMatrixView::RowViewType.
+       * \e RowView represents matrix row - see \ref TNL::Matrices::DenseMatrixView::RowView.
        *
        * \par Example
        * \include Matrices/DenseMatrix/DenseMatrixViewExample_forRows.cpp
@@ -652,10 +708,10 @@ class DenseMatrixView : public MatrixView< Real, Device, Index >
        * \param function is an instance of the lambda function to be called for each row.
        *
        * ```
-       * auto function = [] __cuda_callable__ ( RowViewType& row ) mutable { ... };
+       * auto function = [] __cuda_callable__ ( RowView& row ) mutable { ... };
        * ```
        *
-       * \e RowViewType represents matrix row - see \ref TNL::Matrices::DenseMatrixView::RowViewType.
+       * \e RowView represents matrix row - see \ref TNL::Matrices::DenseMatrixView::RowView.
        *
        * \par Example
        * \include Matrices/DenseMatrix/DenseMatrixViewExample_forRows.cpp
@@ -676,10 +732,10 @@ class DenseMatrixView : public MatrixView< Real, Device, Index >
        * \param function is an instance of the lambda function to be called for each row.
        *
        * ```
-       * auto function = [] __cuda_callable__ ( RowViewType& row ) { ... };
+       * auto function = [] __cuda_callable__ ( RowView& row ) { ... };
        * ```
        *
-       * \e RowViewType represents matrix row - see \ref TNL::Matrices::DenseMatrixView::RowViewType.
+       * \e RowView represents matrix row - see \ref TNL::Matrices::DenseMatrixView::RowView.
        *
        * \par Example
        * \include Matrices/DenseMatrix/DenseMatrixViewExample_forRows.cpp
@@ -694,10 +750,12 @@ class DenseMatrixView : public MatrixView< Real, Device, Index >
        *
        * \tparam Function is type of lambda function that will operate on matrix elements.
        *    It is should have form like
-       *  `function( IndexType rowIdx, IndexType columnIdx, IndexType columnIdx_, const RealType& value, bool& compute )`.
-       *  The column index repeats twice only for compatibility with sparse matrices.
-       *  If the 'compute' variable is set to false the iteration over the row can
-       *  be interrupted.
+       *
+       * ```
+       * auto function = [] __cuda_callable__ ( RowView& row ) { ... };
+       * ```
+       *
+       * \e RowView represents matrix row - see \ref TNL::Matrices::DenseMatrixView::RowView.
        *
        * \param begin defines beginning of the range [begin,end) of rows to be processed.
        * \param end defines ending of the range [begin,end) of rows to be processed.
@@ -711,10 +769,12 @@ class DenseMatrixView : public MatrixView< Real, Device, Index >
        *
        * \tparam Function is type of lambda function that will operate on matrix elements.
        *    It is should have form like
-       *  `function( IndexType rowIdx, IndexType columnIdx, IndexType columnIdx_, RealType& value, bool& compute )`.
-       *  The column index repeats twice only for compatibility with sparse matrices.
-       *  If the 'compute' variable is set to false the iteration over the row can
-       *  be interrupted.
+       *
+       * ```
+       * auto function = [] __cuda_callable__ ( RowView& row ) { ... };
+       * ```
+       *
+       * \e RowView represents matrix row - see \ref TNL::Matrices::DenseMatrixView::RowView.
        *
        * \param begin defines beginning of the range [begin,end) of rows to be processed.
        * \param end defines ending of the range [begin,end) of rows to be processed.
@@ -750,7 +810,9 @@ class DenseMatrixView : public MatrixView< Real, Device, Index >
        *
        * More precisely, it computes:
        *
-       * `outVector = matrixMultiplicator * ( *this ) * inVector + outVectorMultiplicator * outVector`
+       * ```
+       * outVector = matrixMultiplicator * ( *this ) * inVector + outVectorMultiplicator * outVector
+       * ```
        *
        * \tparam InVector is type of input vector.  It can be \ref Vector,
        *     \ref VectorView, \ref Array, \ref ArraView or similar container.
@@ -808,6 +870,42 @@ class DenseMatrixView : public MatrixView< Real, Device, Index >
        */
       DenseMatrixView& operator=( const DenseMatrixView& matrix );
 
+      /**
+       * \brief Equality comparison with another dense matrix view of the same organization.
+       *
+       * \param matrix is the right-hand side matrix view.
+       * \return \e true if both views have the same dimensions and equal values, \e false otherwise.
+       */
+      template< typename Real_, typename Device_, typename Index_ >
+      bool operator==( const DenseMatrixView< Real_, Device_, Index_, Organization >& matrix ) const;
+
+      /**
+       * \brief Inequality comparison with another dense matrix view of the same organization.
+       *
+       * \param matrix is the right-hand side matrix view.
+       * \return \e false if the RHS matrix view is equal, \e true otherwise.
+       */
+      template< typename Real_, typename Device_, typename Index_ >
+      bool operator!=( const DenseMatrixView< Real_, Device_, Index_, Organization >& matrix ) const;
+
+      /**
+       * \brief Equality comparison with another arbitrary matrix type.
+       *
+       * \param m is the right-hand side matrix.
+       * \return \e true if the RHS matrix is equal, \e false otherwise.
+       */
+      template< typename Matrix >
+      bool operator==( const Matrix& m ) const;
+
+      /**
+       * \brief Inequality comparison with another arbitrary matrix type.
+       *
+       * \param m is the right-hand side matrix.
+       * \return \e false if the RHS matrix is equal, \e true otherwise.
+       */
+      template< typename Matrix >
+      bool operator!=( const Matrix& m ) const;
+
       /**
        * \brief Method for saving the matrix view to the file with given filename.
        *
diff --git a/src/TNL/Matrices/DenseMatrixView.hpp b/src/TNL/Matrices/DenseMatrixView.hpp
index 7dd5428b593540225567c2954687563096284a42..3a44269d1d78923c5b0e40091c9b20d9eb303483 100644
--- a/src/TNL/Matrices/DenseMatrixView.hpp
+++ b/src/TNL/Matrices/DenseMatrixView.hpp
@@ -19,6 +19,194 @@
 namespace TNL {
 namespace Matrices {
 
+#ifdef HAVE_CUDA
+/**
+ * Experimental kernel for outVector = matrix * inVector (column-major storage); it is an attempt to map more CUDA threads (ThreadsPerRow) to one matrix row.
+ */
+template< int BlockSize, int ThreadsPerRow, typename Matrix, typename InVector, typename OutVector >
+__global__ void
+VectorColumnMajorDenseMatrixViewVectorMultiplicationKernel( const Matrix matrix, const InVector inVector, OutVector outVector, const int begin, const int end, int gridIdx )
+{
+   using Real = typename Matrix::RealType;
+   using Index = typename Matrix::IndexType;
+   constexpr int  inVectorCacheSize = 20480 / sizeof( Real );   // number of Real elements fitting into 20480 bytes
+   __shared__ Real inVectorCache[ inVectorCacheSize ];   // chunk of inVector shared by the whole block
+   __shared__ Real result_[ BlockSize ];   // one partial sum per thread
+   // Thread-to-matrix mapping. NOTE(review): the literal 256 presumably has to equal both BlockSize and blockDim.x - confirm.
+   constexpr Index rowsPerBlock = 256 / ThreadsPerRow;
+   const Index rowIdx = ( ( gridIdx * Cuda::getMaxGridSize() + blockIdx.x ) * 256 + threadIdx.x ) / ThreadsPerRow + begin;
+   const Index localColIdx = threadIdx.x / rowsPerBlock;
+   const Index localRowIdx = threadIdx.x % rowsPerBlock;
+   // NOTE(review): rowIdx groups consecutive threads while localRowIdx/localColIdx stride by rowsPerBlock - verify they agree for ThreadsPerRow > 1.
+   Real result( 0.0 );
+   Index columnIdx( 0 );
+   const auto& values = matrix.getValues();
+   const auto& rowsCount = matrix.getRows();
+   Index valuesPtr = rowIdx + localColIdx * rowsCount;   // column-major offset of element ( rowIdx, localColIdx )
+   // Walk over the columns in chunks of inVectorCacheSize; each chunk of inVector is first copied to shared memory.
+   while( columnIdx < matrix.getColumns() )
+   {
+      const Index lastIdx = min( matrix.getColumns(), columnIdx + inVectorCacheSize );
+      Index matrixColIdx = columnIdx + threadIdx.x;
+      Index cacheColIdx = threadIdx.x;
+      while( matrixColIdx < lastIdx )
+      {
+         inVectorCache[ cacheColIdx ] = inVector[ matrixColIdx ];
+         cacheColIdx += 256;
+         matrixColIdx += 256;
+      }
+      __syncthreads();
+      // Each thread accumulates every ThreadsPerRow-th column of the chunk, starting at its localColIdx.
+      matrixColIdx = columnIdx + localColIdx;
+      cacheColIdx = localColIdx;
+      if( rowIdx < end )
+         while( matrixColIdx < lastIdx )
+         {
+            result += values[ valuesPtr ] * inVectorCache[ cacheColIdx ];
+            cacheColIdx += ThreadsPerRow;
+            matrixColIdx += ThreadsPerRow;
+            valuesPtr += ThreadsPerRow * rowsCount;
+         }
+      columnIdx = lastIdx;   // NOTE(review): no barrier before the cache is refilled in the next iteration - check for a race
+   }
+   const int idx = localRowIdx * ThreadsPerRow + localColIdx;
+   result_[ idx ] = result;   // publish the partial sum, then fold the sums of one row in steps of 8, 4, 2 and 1
+   if( ThreadsPerRow > 8 && localColIdx < ThreadsPerRow - 8 )
+      result_[ idx ] += result_[ idx + 8 ];
+   __syncwarp();   // NOTE(review): __syncwarp() covers a single warp only; verify the cooperating threads share a warp
+   if( ThreadsPerRow > 4 && localColIdx < ThreadsPerRow - 4 )
+      result_[ idx ] += result_[ idx + 4 ];
+   __syncwarp();
+   if( ThreadsPerRow > 2 && localColIdx < ThreadsPerRow - 2 )
+      result_[ idx ] += result_[ idx + 2 ];
+   __syncwarp();
+   if( ThreadsPerRow > 1 && localColIdx < ThreadsPerRow - 1 )
+      result_[ idx ] += result_[ idx + 1 ];
+   __syncwarp();
+   // The thread with localColIdx == 0 stores the result of its row.
+   if( rowIdx < end && localColIdx == 0 )
+      outVector[ rowIdx ] = result_[ idx ];
+}
+
+/**
+ * \brief CUDA kernel computing outVector = matrix * inVector for rows [begin, end) of a column-major dense matrix.
+ *
+ * Each thread accumulates one row. The input vector is staged through shared memory in chunks
+ * that are loaded cooperatively by all threads of the block.
+ *
+ * \param gridIdx is the index of the grid when the launch is split into several grids.
+ */
+template< typename Matrix, typename InVector, typename OutVector >
+__global__ void
+ColumnMajorDenseMatrixViewVectorMultiplicationKernel( const Matrix matrix, const InVector inVector, OutVector outVector, const int begin, const int end, int gridIdx )
+{
+   using Real = typename Matrix::RealType;
+   using Index = typename Matrix::IndexType;
+   constexpr int blockSize = 256;   // the kernel is launched with 256 threads per block
+   constexpr int  inVectorCacheSize = 20480 / sizeof( Real );
+   __shared__ Real inVectorCache[ inVectorCacheSize ];
+
+   const int rowIdx = ( gridIdx * Cuda::getMaxGridSize() + blockIdx.x ) * blockSize + threadIdx.x + begin;
+
+   Real result( 0.0 );
+   Index columnIdx( 0 );
+   const auto& values = matrix.getValues();
+   const auto& rowsCount = matrix.getRows();
+   Index valuesPtr = rowIdx;   // element ( rowIdx, 0 ); the next column is rowsCount elements further
+
+   while( columnIdx < matrix.getColumns() )
+   {
+      // Stage the next chunk of the input vector in shared memory.
+      const Index lastIdx = min( matrix.getColumns(), columnIdx + inVectorCacheSize );
+      for( Index cacheColIdx = threadIdx.x; columnIdx + cacheColIdx < lastIdx; cacheColIdx += blockSize )
+         inVectorCache[ cacheColIdx ] = inVector[ columnIdx + cacheColIdx ];
+      __syncthreads();
+
+      if( rowIdx < end )
+         for( Index cacheColIdx = 0; columnIdx + cacheColIdx < lastIdx; cacheColIdx++, valuesPtr += rowsCount )
+            result += values[ valuesPtr ] * inVectorCache[ cacheColIdx ];
+
+      // All threads must be done reading the cache before the next iteration overwrites it.
+      __syncthreads();
+      columnIdx = lastIdx;
+   }
+   if( rowIdx < end )
+      outVector[ rowIdx ] = result;
+}
+/** CUDA kernel computing outVector = matrix * inVector for rows [first, last) of a row-major dense matrix; threadsPerRow threads cooperate on each row. */
+template< typename Matrix, typename InVector, typename OutVector >
+__global__ void
+RowMajorDenseMatrixViewVectorMultiplicationKernel( const Matrix matrix, const InVector inVector, OutVector outVector, const int first, const int last, int gridIdx )
+{
+   using Real = typename Matrix::RealType;
+   using Index = typename Matrix::IndexType;
+   constexpr int  inVectorCacheSize = 20480 / sizeof( Real );
+   __shared__ Real inVectorCache[ inVectorCacheSize ];   // NOTE(review): only referenced by the disabled code below
+
+   constexpr int threadsPerRow = 32;
+   //const Index rowIdx = begin + ((gridIdx * TNL::Cuda::getMaxGridXSize() ) + (blockIdx.x * blockDim.x) + threadIdx.x) / threadsPerRow;
+   const Index rowIdx = first + ( ( gridIdx * Cuda::getMaxGridSize() + blockIdx.x ) * 256 + threadIdx.x ) /  threadsPerRow;
+   // Every threadsPerRow consecutive threads share one rowIdx; laneID is the position of the thread among them.
+   Real result = 0.0;
+   const Index laneID = threadIdx.x & 31; // & is cheaper than %
+   const Real* values = matrix.getValues().getData();
+   // NOTE(review): the block comment below is a disabled variant that staged inVector through inVectorCache.
+   Index columnIdx( 0 );
+   /*while( columnIdx < matrix.getColumns() )
+   {
+      const Index lastIdx = min( matrix.getColumns(), columnIdx + inVectorCacheSize );
+      Index matrixColIdx = columnIdx + threadIdx.x;
+      Index cacheColIdx = threadIdx.x;
+      while( matrixColIdx < lastIdx )
+      {
+         inVectorCache[ cacheColIdx ] = inVector[ matrixColIdx ];
+         cacheColIdx += 256;
+         matrixColIdx += 256;
+      }
+      __syncthreads();
+
+      // Calculate result
+      if( rowIdx < last )
+      {
+         const Index begin = rowIdx * matrix.getColumns() + columnIdx;
+         const Index end = rowIdx * matrix.getColumns() + lastIdx;
+         Index localColumn( 0 );
+
+         for( Index i = begin + laneID; i < end; i += threadsPerRow, localColumn += threadsPerRow )
+            result += values[ i ] * inVectorCache[ localColumn ];
+      }
+      columnIdx = lastIdx;
+   }*/
+   // Partial dot product: the values of row rowIdx occupy [begin, end) and are visited with stride threadsPerRow.
+   if( rowIdx < last )
+   {
+      const Index begin = rowIdx * matrix.getColumns();
+      const Index end = begin + matrix.getColumns();
+
+      for( Index i = begin + laneID; i < end; i += threadsPerRow, columnIdx += threadsPerRow )
+         result += values[ i ] * inVector[ columnIdx ];   // NOTE(review): columnIdx starts at 0 for every lane while i starts at begin + laneID - confirm the intended pairing
+   }
+   // Sum the partial results of the cooperating threads with warp shuffles; lane 0 stores the total.
+   if( rowIdx < last )
+   {
+      // Reduction
+      if( threadsPerRow > 16 )
+         result += __shfl_down_sync(0xFFFFFFFF, result, 16 );
+      if( threadsPerRow > 8 )
+         result += __shfl_down_sync(0xFFFFFFFF, result,  8 );
+      if( threadsPerRow > 4 )
+         result += __shfl_down_sync(0xFFFFFFFF, result,  4 );
+      if( threadsPerRow > 2 )
+         result += __shfl_down_sync(0xFFFFFFFF, result,  2 );
+      if( threadsPerRow > 1 )
+         result += __shfl_down_sync(0xFFFFFFFF, result,  1 );
+      // Write result
+      if( laneID == 0 )
+         outVector[ rowIdx ] = result;
+   }
+}
+#endif
+
 template< typename Real,
           typename Device,
           typename Index,
@@ -297,7 +485,7 @@ reduceRows( IndexType begin, IndexType end, Fetch& fetch, const Reduce& reduce,
          return fetch( rowIdx, columnIdx, values_view[ globalIdx ] );
       return identity;
    };
-   this->segments.segmentsReduction( begin, end, fetch_, reduce, keep, identity );
+   this->segments.reduceSegments( begin, end, fetch_, reduce, keep, identity );
 }
 
 template< typename Real,
@@ -314,7 +502,7 @@ reduceRows( IndexType begin, IndexType end, Fetch& fetch, const Reduce& reduce,
          return fetch( rowIdx, columnIdx, values_view[ globalIdx ] );
       return identity;
    };
-   this->segments.segmentsReduction( begin, end, fetch_, reduce, keep, identity );
+   this->segments.reduceSegments( begin, end, fetch_, reduce, keep, identity );
 }
 
 template< typename Real,
@@ -351,8 +539,8 @@ DenseMatrixView< Real, Device, Index, Organization >::
 forElements( IndexType begin, IndexType end, Function&& function ) const
 {
    const auto values_view = this->values.getConstView();
-   auto f = [=] __cuda_callable__ ( IndexType rowIdx, IndexType columnIdx, IndexType globalIdx, bool& compute ) mutable {
-      function( rowIdx, columnIdx, columnIdx, values_view[ globalIdx ], compute );
+   auto f = [=] __cuda_callable__ ( IndexType rowIdx, IndexType columnIdx, IndexType globalIdx ) mutable {
+      function( rowIdx, columnIdx, columnIdx, values_view[ globalIdx ] );
    };
    this->segments.forElements( begin, end, f );
 }
@@ -367,8 +555,8 @@ DenseMatrixView< Real, Device, Index, Organization >::
 forElements( IndexType begin, IndexType end, Function&& function )
 {
    auto values_view = this->values.getView();
-   auto f = [=] __cuda_callable__ ( IndexType rowIdx, IndexType columnIdx, IndexType globalIdx, bool& compute ) mutable {
-      function( rowIdx, columnIdx, globalIdx, values_view[ globalIdx ], compute );
+   auto f = [=] __cuda_callable__ ( IndexType rowIdx, IndexType columnIdx, IndexType globalIdx ) mutable {
+      function( rowIdx, columnIdx, columnIdx, values_view[ globalIdx ] );   // the column index is passed twice, as documented and as in the const overload
    };
    this->segments.forElements( begin, end, f );
 }
@@ -531,13 +719,85 @@ vectorProduct( const InVector& inVector,
    const auto valuesView = this->values.getConstView();
    if( end == 0 )
       end = this->getRows();
+
+   // The plain product ( outVector = matrix * inVector ) on CUDA is handled by specialized kernels.
+   if( std::is_same< DeviceType, Devices::Cuda >::value &&
+      matrixMultiplicator == 1.0 && outVectorMultiplicator == 0.0 )
+   {
+#ifdef HAVE_CUDA
+      if( Organization == Algorithms::Segments::ColumnMajorOrder )
+      {
+         constexpr int BlockSize = 256;
+         constexpr int ThreadsPerRow = 1;
+         const size_t threadsCount = ( end - begin ) * ThreadsPerRow;
+         const size_t blocksCount = roundUpDivision( threadsCount, BlockSize );
+         const size_t gridsCount = roundUpDivision( blocksCount, Cuda::getMaxGridSize() );
+         const size_t sharedMemSize = 20480;   // NOTE(review): the kernel declares its shared arrays statically - confirm this dynamic allocation is needed
+         for( size_t gridIdx = 0; gridIdx < gridsCount; gridIdx++ )
+         {
+            dim3 blocks( Cuda::getMaxGridSize() );
+            if( gridIdx == gridsCount - 1 )   // the last grid gets the remaining blocks; `%` would yield 0 for an exact multiple
+               blocks.x = blocksCount - gridIdx * Cuda::getMaxGridSize();
+            ColumnMajorDenseMatrixViewVectorMultiplicationKernel<<< blocks, BlockSize, sharedMemSize >>>( *this, inVectorView, outVectorView, begin, end, gridIdx );
+         }
+         TNL_CHECK_CUDA_DEVICE;
+         return;
+      }
+      if( Organization == Algorithms::Segments::RowMajorOrder )
+      {
+         constexpr int BlockSize = 256;
+         constexpr int ThreadsPerRow = 32;   // must match threadsPerRow used inside the kernel
+         const size_t threadsCount = ( end - begin ) * ThreadsPerRow;
+         const size_t blocksCount = roundUpDivision( threadsCount, BlockSize );
+         const size_t gridsCount = roundUpDivision( blocksCount, Cuda::getMaxGridSize() );
+         const size_t sharedMemSize = 20480;   // NOTE(review): the kernel declares its shared arrays statically - confirm this dynamic allocation is needed
+         for( size_t gridIdx = 0; gridIdx < gridsCount; gridIdx++ )
+         {
+            dim3 blocks( Cuda::getMaxGridSize() );
+            if( gridIdx == gridsCount - 1 )   // the last grid gets the remaining blocks; `%` would yield 0 for an exact multiple
+               blocks.x = blocksCount - gridIdx * Cuda::getMaxGridSize();
+            RowMajorDenseMatrixViewVectorMultiplicationKernel<<< blocks, BlockSize, sharedMemSize >>>( *this, inVectorView, outVectorView, begin, end, gridIdx );
+         }
+         TNL_CHECK_CUDA_DEVICE;
+         return;
+      }
+#endif
+   }
+
+   /***
+    * The rest is general implementation based on segments
+    */
+
    auto fetch = [=] __cuda_callable__ ( IndexType row, IndexType column, IndexType offset, bool& compute ) -> RealType {
       return valuesView[ offset ] * inVectorView[ column ];
    };
-   auto keeper = [=] __cuda_callable__ ( IndexType row, const RealType& value ) mutable {
+   auto keeperGeneral = [=] __cuda_callable__ ( IndexType row, const RealType& value ) mutable {
       outVectorView[ row ] = matrixMultiplicator * value + outVectorMultiplicator * outVectorView[ row ];
    };
-   this->segments.segmentsReduction( begin, end, fetch, std::plus<>{}, keeper, ( RealType ) 0.0 );
+   auto keeperDirect = [=] __cuda_callable__ ( IndexType row, const RealType& value ) mutable {
+      outVectorView[ row ] = value;
+   };
+   auto keeperMatrixMult = [=] __cuda_callable__ ( IndexType row, const RealType& value ) mutable {
+      outVectorView[ row ] = matrixMultiplicator * value;
+   };
+   auto keeperVectorMult = [=] __cuda_callable__ ( IndexType row, const RealType& value ) mutable {
+      outVectorView[ row ] = outVectorMultiplicator * outVectorView[ row ] + value;
+   };
+
+   if( outVectorMultiplicator == 0.0 )
+   {
+      if( matrixMultiplicator == 1.0 )
+         this->segments.reduceSegments( begin, end, fetch, std::plus<>{}, keeperDirect, ( RealType ) 0.0 );
+      else
+         this->segments.reduceSegments( begin, end, fetch, std::plus<>{}, keeperMatrixMult, ( RealType ) 0.0 );
+   }
+   else
+   {
+      if( matrixMultiplicator == 1.0 )
+         this->segments.reduceSegments( begin, end, fetch, std::plus<>{}, keeperVectorMult, ( RealType ) 0.0 );
+      else
+         this->segments.reduceSegments( begin, end, fetch, std::plus<>{}, keeperGeneral, ( RealType ) 0.0 );
+   }
 }
 
 template< typename Real,
@@ -781,6 +1041,63 @@ operator=( const DenseMatrixView& matrix )
    return *this;
 }
 
+template< typename Real,
+          typename Device,
+          typename Index,
+          ElementsOrganization Organization >
+   template< typename Real_, typename Device_, typename Index_ >
+bool
+DenseMatrixView< Real, Device, Index, Organization >::
+operator==( const DenseMatrixView< Real_, Device_, Index_, Organization >& matrix ) const
+{
+   const bool sameShape = this->getRows() == matrix.getRows() && this->getColumns() == matrix.getColumns();
+   // The values are compared only when the dimensions agree.
+   return sameShape && this->getValues() == matrix.getValues();
+}
+
+template< typename Real,
+          typename Device,
+          typename Index,
+          ElementsOrganization Organization >
+   template< typename Real_, typename Device_, typename Index_ >
+bool
+DenseMatrixView< Real, Device, Index, Organization >::
+operator!=( const DenseMatrixView< Real_, Device_, Index_, Organization >& matrix ) const
+{
+   return ! ( *this == matrix );   // negation of operator== applied to the same operands
+}
+
+template< typename Real,
+          typename Device,
+          typename Index,
+          ElementsOrganization Organization >
+   template< typename Matrix >
+bool
+DenseMatrixView< Real, Device, Index, Organization >::
+operator==( const Matrix& m ) const
+{
+   // Matrices of different dimensions are never equal; the check also keeps getRow() below within the rows of `m`.
+   if( this->getRows() != m.getRows() || this->getColumns() != m.getColumns() ) return false;
+   const auto& view1 = *this;   // the lambda below captures a copy of the view
+   const auto view2 = m.getConstView();
+   auto fetch = [=] __cuda_callable__ ( const IndexType i ) -> bool { return view1.getRow( i ) == view2.getRow( i ); };
+   // Compare the matrices row by row and combine the results with logical AND.
+   return Algorithms::reduce< DeviceType >( ( IndexType ) 0, this->getRows(), fetch, std::logical_and<>{}, true );
+}
+
+template< typename Real,
+          typename Device,
+          typename Index,
+          ElementsOrganization Organization >
+   template< typename Matrix >
+bool
+DenseMatrixView< Real, Device, Index, Organization >::
+operator!=( const Matrix& m ) const
+{
+   return ! ( *this == m );   // negation of operator== applied to the same operands
+}
+
+
 template< typename Real,
           typename Device,
           typename Index,
diff --git a/src/TNL/Matrices/LambdaMatrix.h b/src/TNL/Matrices/LambdaMatrix.h
index 01d3a0b9101bb5e95e0c044f4c39d70b750bd95e..2b788ed5270540a58582153b8c37e87dd346c2a4 100644
--- a/src/TNL/Matrices/LambdaMatrix.h
+++ b/src/TNL/Matrices/LambdaMatrix.h
@@ -260,10 +260,12 @@ class LambdaMatrix
        *
        * \tparam Function is type of lambda function that will operate on matrix elements.
        *    It is should have form like
-       *  `function( IndexType rowIdx, IndexType columnIdx, IndexType columnIdx_, const RealType& value, bool& compute )`.
+       *
+       * ```
+       * auto function = [=] __cuda_callable__ ( IndexType rowIdx, IndexType columnIdx, IndexType columnIdx_, const RealType& value ) { ... };
+       * ```
+       *
        *  The column index repeats twice only for compatibility with sparse matrices.
-       *  If the 'compute' variable is set to false the iteration over the row can
-       *  be interrupted.
        *
        * \param begin defines beginning of the range [begin,end) of rows to be processed.
        * \param end defines ending of the range [begin,end) of rows to be processed.
@@ -275,7 +277,7 @@ class LambdaMatrix
        * \include LambdaMatrixExample_forRows.out
        */
       template< typename Function >
-      void forElements( IndexType first, IndexType last, Function& function ) const;
+      void forElements( IndexType begin, IndexType end, Function& function ) const;
 
       /**
        * \brief This method calls \e forElements for all matrix rows (for constant instances).
@@ -348,10 +350,12 @@ class LambdaMatrix
        *
        * \tparam Function is type of lambda function that will operate on matrix elements.
        *    It is should have form like
-       *  `function( IndexType rowIdx, IndexType columnIdx, IndexType columnIdx_, const RealType& value, bool& compute )`.
-       *  The column index repeats twice only for compatibility with sparse matrices.
-       *  If the 'compute' variable is set to false the iteration over the row can
-       *  be interrupted.
+       *
+       * ```
+       * auto function = [] __cuda_callable__ ( RowView& row ) { ... };
+       * ```
+       *
+       * \e RowView represents matrix row - see \ref TNL::Matrices::LambdaMatrix::RowView.
        *
        * \param begin defines beginning of the range [begin,end) of rows to be processed.
        * \param end defines ending of the range [begin,end) of rows to be processed.
@@ -375,16 +379,29 @@ class LambdaMatrix
        * \brief Method for performing general reduction on matrix rows.
        *
        * \tparam Fetch is a type of lambda function for data fetch declared as
-       *          `fetch( IndexType rowIdx, IndexType columnIdx, RealType elementValue ) -> FetchValue`.
-       *          The return type of this lambda can be any non void.
+       *
+       * ```
+       * auto fetch = [=] __cuda_callable__ ( IndexType rowIdx, IndexType columnIdx, RealType elementValue ) -> FetchValue
+       * ```
+       *
+       *  The return type of this lambda can be any non void.
        * \tparam Reduce is a type of lambda function for reduction declared as
-       *          `reduce( const FetchValue& v1, const FetchValue& v2 ) -> FetchValue`.
+       *
+       * ```
+       * auto reduce = [=] __cuda_callable__ ( const FetchValue& v1, const FetchValue& v2 ) -> FetchValue
+       * ```
+       *
        * \tparam Keep is a type of lambda function for storing results of reduction in each row.
-       *          It is declared as `keep( const IndexType rowIdx, const double& value )`.
+       *    It is declared as
+       *
+       * ```
+       * auto keep = [=] __cuda_callable__ ( const IndexType rowIdx, const double& value )
+       * ```
+       *
        * \tparam FetchValue is type returned by the Fetch lambda function.
        *
-       * \param begin defines beginning of the range [begin,end) of rows to be processed.
-       * \param end defines ending of the range [begin,end) of rows to be processed.
+       * \param begin defines beginning of the range [\e begin, \e end) of rows to be processed.
+       * \param end defines ending of the range [\e begin,\e end) of rows to be processed.
        * \param fetch is an instance of lambda function for data fetch.
        * \param reduce is an instance of lambda function for reduction.
        * \param keep in an instance of lambda function for storing results.
@@ -398,18 +415,31 @@ class LambdaMatrix
        * \include LambdaMatrixExample_reduceRows.out
        */
       template< typename Fetch, typename Reduce, typename Keep, typename FetchReal >
-      void reduceRows( IndexType first, IndexType last, Fetch& fetch, const Reduce& reduce, Keep& keep, const FetchReal& identity ) const;
+      void reduceRows( IndexType begin, IndexType end, Fetch& fetch, const Reduce& reduce, Keep& keep, const FetchReal& zero ) const;
 
       /**
        * \brief Method for performing general reduction on ALL matrix rows.
        *
        * \tparam Fetch is a type of lambda function for data fetch declared as
-       *          `fetch( IndexType rowIdx, IndexType columnIdx, RealType elementValue ) -> FetchValue`.
-       *          The return type of this lambda can be any non void.
+       *
+       * ```
+       * auto fetch = [=] __cuda_callable__ ( IndexType rowIdx, IndexType columnIdx, RealType elementValue ) -> FetchValue
+       * ```
+       *
+       *  The return type of this lambda can be any non void.
        * \tparam Reduce is a type of lambda function for reduction declared as
-       *          `reduce( const FetchValue& v1, const FetchValue& v2 ) -> FetchValue`.
+       *
+       * ```
+       * auto reduce = [=] __cuda_callable__ ( const FetchValue& v1, const FetchValue& v2 ) -> FetchValue
+       * ```
+       *
        * \tparam Keep is a type of lambda function for storing results of reduction in each row.
-       *          It is declared as `keep( const IndexType rowIdx, const double& value )`.
+       *   It is declared as
+       *
+       * ```
+       * auto keep = [=] __cuda_callable__ ( const IndexType rowIdx, const double& value )
+       * ```
+       *
        * \tparam FetchValue is type returned by the Fetch lambda function.
        *
        * \param fetch is an instance of lambda function for data fetch.
@@ -432,7 +462,9 @@ class LambdaMatrix
        *
        * More precisely, it computes:
        *
-       * `outVector = matrixMultiplicator * ( *this ) * inVector + outVectorMultiplicator * outVector`
+       * ```
+       * outVector = matrixMultiplicator * ( *this ) * inVector + outVectorMultiplicator * outVector
+       * ```
        *
        * \tparam InVector is type of input vector.  It can be \ref Vector,
        *     \ref VectorView, \ref Array, \ref ArraView or similar container.
diff --git a/src/TNL/Matrices/LambdaMatrix.hpp b/src/TNL/Matrices/LambdaMatrix.hpp
index 867016d474147f688a6c3f99e396ee33e0f43b38..77de0872c5379a8bd144c10c85a70e5755717805 100644
--- a/src/TNL/Matrices/LambdaMatrix.hpp
+++ b/src/TNL/Matrices/LambdaMatrix.hpp
@@ -317,14 +317,13 @@ forElements( IndexType first, IndexType last, Function& function ) const
    auto matrixElements = this->matrixElementsLambda;
    auto processRow = [=] __cuda_callable__ ( IndexType rowIdx ) mutable {
       const IndexType rowLength = rowLengths( rows, columns, rowIdx );
-      bool compute( true );
-      for( IndexType localIdx = 0; localIdx < rowLength && compute; localIdx++ )
+      for( IndexType localIdx = 0; localIdx < rowLength; localIdx++ )
       {
         IndexType elementColumn( 0 );
         RealType elementValue( 0.0 );
         matrixElements( rows, columns, rowIdx, localIdx, elementColumn, elementValue );
         if( elementValue != 0.0 )
-            function( rowIdx, localIdx, elementColumn, elementValue, compute );
+            function( rowIdx, localIdx, elementColumn, elementValue );
       }
    };
    Algorithms::ParallelFor< DeviceType >::exec( first, last, processRow );
diff --git a/src/TNL/Matrices/LambdaMatrixElement.h b/src/TNL/Matrices/LambdaMatrixElement.h
index 57ba698f3674dada2aadd9fa7d5c62d83eb138dd..b094eb004f7110d963417b5413dcf44c9ad76b9e 100644
--- a/src/TNL/Matrices/LambdaMatrixElement.h
+++ b/src/TNL/Matrices/LambdaMatrixElement.h
@@ -17,17 +17,36 @@
 namespace TNL {
 namespace Matrices {
 
-
+/**
+ * \brief Accessor for elements of lambda matrix.
+ *
+ * \tparam Real is type of matrix elements values.
+ * \tparam Index is a type of matrix elements column indexes.
+ */
 template< typename Real,
           typename Index >
 class LambdaMatrixElement
 {
    public:
 
+      /**
+       * \brief Type of matrix elements values.
+       */
       using RealType = Real;
 
+      /**
+       * \brief Type of matrix elements column indexes.
+       */
       using IndexType = Index;
 
+      /**
+       * \brief Constructor.
+       *
+       * \param value is matrix element value.
+       * \param rowIdx is row index of the matrix element.
+       * \param columnIdx is a column index of the matrix element.
+       * \param localIdx is the rank of the non-zero element in the matrix row.
+       */
       __cuda_callable__
       LambdaMatrixElement( const RealType& value,
                            const IndexType& rowIdx,
@@ -35,18 +54,43 @@ class LambdaMatrixElement
                            const IndexType& localIdx )
       : value_( value ), rowIdx( rowIdx ), columnIdx( columnIdx ), localIdx( localIdx ) {};
 
+      /**
+       * \brief Copy constructor.
+       *
+       * \param el is the source matrix element.
+       */
       __cuda_callable__
       LambdaMatrixElement( const LambdaMatrixElement& el ) = default;
 
+      /**
+       * \brief Returns a constant reference to the matrix element value.
+       *
+       * \return constant reference to the matrix element value.
+       */
       __cuda_callable__
       const RealType& value() const { return value_; };
 
+      /**
+       * \brief Returns a constant reference to the matrix element row index.
+       *
+       * \return constant reference to the matrix element row index.
+       */
       __cuda_callable__
       const IndexType& rowIndex() const { return rowIdx; };
 
+      /**
+       * \brief Returns a constant reference to the matrix element column index.
+       *
+       * \return constant reference to the matrix element column index.
+       */
       __cuda_callable__
       const IndexType& columnIndex() const { return columnIdx; };
 
+      /**
+       * \brief Returns a constant reference to the rank of the non-zero matrix element in the row.
+       *
+       * \return constant reference to the rank of the non-zero matrix element in the row.
+       */
       __cuda_callable__
       const IndexType& localIndex() const { return localIdx; };
 
diff --git a/src/TNL/Matrices/MatrixInfo.h b/src/TNL/Matrices/MatrixInfo.h
index d84afa39a15ecfb95298d706169d198131a8ad06..fd48ddc5fbc232ae581f0edd0c30e47ab683846b 100644
--- a/src/TNL/Matrices/MatrixInfo.h
+++ b/src/TNL/Matrices/MatrixInfo.h
@@ -15,6 +15,7 @@
 #include <TNL/Matrices/DenseMatrixView.h>
 #include <TNL/Matrices/SparseMatrix.h>
 #include <TNL/Matrices/SparseMatrixView.h>
+#include <TNL/Matrices/Sandbox/SparseSandboxMatrix.h>
 #include <TNL/Algorithms/Segments/CSRView.h>
 #include <TNL/Algorithms/Segments/EllpackView.h>
 #include <TNL/Algorithms/Segments/SlicedEllpackView.h>
@@ -66,10 +67,17 @@ struct MatrixInfo< SparseMatrixView< Real, Device, Index, MatrixType, SegmentsVi
 
    static String getFormat()
    {
+      String prefix;
       if( MatrixType::isSymmetric() )
-         return TNL::String( "Symmetric " ) + SegmentsView< Device, Index >::getSegmentsType();
-      else
-         return SegmentsView< Device, Index >::getSegmentsType();
+      {
+         if( std::is_same< Real, bool >::value )
+            prefix = "Symmetric Binary ";
+         else
+            prefix = "Symmetric ";
+      }
+      else if( std::is_same< Real, bool >::value )
+         prefix = "Binary ";
+      return prefix + SegmentsView< Device, Index >::getSegmentsType();
    };
 };
 
@@ -85,6 +93,34 @@ struct MatrixInfo< SparseMatrix< Real, Device, Index, MatrixType, Segments, Real
 {
 };
 
+template< typename Real,
+          typename Device,
+          typename Index,
+          typename MatrixType >
+struct MatrixInfo< Sandbox::SparseSandboxMatrixView< Real, Device, Index, MatrixType > >
+{
+   static String getDensity() { return String( "sparse" ); };
+
+   static String getFormat()
+   {
+      if( MatrixType::isSymmetric() )
+         return TNL::String( "Symmetric Sandbox" );
+      else
+         return TNL::String( "Sandbox" );
+   };
+};
+
+template< typename Real,
+          typename Device,
+          typename Index,
+          typename MatrixType,
+          typename RealAllocator,
+          typename IndexAllocator >
+struct MatrixInfo< Sandbox::SparseSandboxMatrix< Real, Device, Index, MatrixType, RealAllocator, IndexAllocator > >
+: public MatrixInfo< typename Sandbox::SparseSandboxMatrix< Real, Device, Index, MatrixType, RealAllocator, IndexAllocator >::ViewType >
+{
+};
+
 /////
 // Legacy matrices
 template< typename Real, typename Device, typename Index >
diff --git a/src/TNL/Matrices/MatrixRowViewIterator.h b/src/TNL/Matrices/MatrixRowViewIterator.h
index cf99bea295f56948226ace980ba1e0019bf90756..463ac3ca516f150192b5e70a22f9d96aff8816ca 100644
--- a/src/TNL/Matrices/MatrixRowViewIterator.h
+++ b/src/TNL/Matrices/MatrixRowViewIterator.h
@@ -72,15 +72,27 @@ class MatrixRowViewIterator
       __cuda_callable__
       bool operator!=( const MatrixRowViewIterator& other ) const;
 
+      /**
+       * \brief Increment operator.
+       */
       __cuda_callable__
       MatrixRowViewIterator& operator++();
 
+      /**
+       * \brief Decrement operator.
+       */
       __cuda_callable__
       MatrixRowViewIterator& operator--();
 
+      /**
+       * \brief Dereference operator.
+       */
       __cuda_callable__
       MatrixElementType operator*();
 
+      /**
+       * \brief Dereference operator for constant instances.
+       */
       __cuda_callable__
       const MatrixElementType operator*() const;
 
diff --git a/src/TNL/Matrices/MatrixRowViewIterator.hpp b/src/TNL/Matrices/MatrixRowViewIterator.hpp
index 7b233e47b07cc3acde2b41892966c92b8c49ec20..7d217bc7afc2b676127a684de36be26126795b73 100644
--- a/src/TNL/Matrices/MatrixRowViewIterator.hpp
+++ b/src/TNL/Matrices/MatrixRowViewIterator.hpp
@@ -1,5 +1,5 @@
 /***************************************************************************
-                          SparseMatrixRowView.hpp -  description
+                          MatrixRowViewIterator.hpp -  description
                              -------------------
     begin                : Mar 20, 2021
     copyright            : (C) 2021 by Tomas Oberhuber
diff --git a/src/TNL/Matrices/MatrixWrapping.h b/src/TNL/Matrices/MatrixWrapping.h
new file mode 100644
index 0000000000000000000000000000000000000000..6c0c6bb8ca0b57bf21f493201202b3eec8bbdfab
--- /dev/null
+++ b/src/TNL/Matrices/MatrixWrapping.h
@@ -0,0 +1,155 @@
+/***************************************************************************
+                          MatrixWrapping.h -  description
+                             -------------------
+    begin                : May 3, 2021
+    copyright            : (C) 2021 by Tomas Oberhuber
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+#pragma once
+
+#include <TNL/Algorithms/Segments/CSR.h>
+#include <TNL/Algorithms/Segments/Ellpack.h>
+#include <TNL/Matrices/SparseMatrixView.h>
+#include <TNL/Matrices/DenseMatrixView.h>
+
+namespace TNL {
+namespace Matrices {
+
+/**
+ * \brief Function for wrapping an array of values into a dense matrix view.
+ *
+ * \tparam Device is a device on which the array is allocated.
+ * \tparam Real is a type of array elements.
+ * \tparam Index is a type for indexing of matrix elements.
+ * \tparam Organization is matrix elements organization - see \ref TNL::Algorithms::Segments::ElementsOrganization.
+ * \param rows is a number of matrix rows.
+ * \param columns is a number of matrix columns.
+ * \param values is the array with matrix elements values.
+ * \return instance of DenseMatrixView wrapping the array.
+ *
+ * The array size must be equal to product of `rows` and `columns`. The dense matrix view does not deallocate the input
+ * array at the end of its lifespan.
+ *
+ * \par Example
+ * \include Matrices/DenseMatrix/DenseMatrixViewExample_wrap.cpp
+ * \par Output
+ * \include DenseMatrixViewExample_wrap.out
+ */
+template< typename Device,
+          typename Real,
+          typename Index,
+          ElementsOrganization Organization = Algorithms::Segments::DefaultElementsOrganization< Device >::getOrganization() >
+DenseMatrixView< Real, Device, Index, Organization >
+wrapDenseMatrix( const Index& rows, const Index& columns, Real* values )
+{
+   using MatrixView = DenseMatrixView< Real, Device, Index, Organization >;
+   using ValuesViewType = typename MatrixView::ValuesViewType;
+   return MatrixView( rows, columns, ValuesViewType( values, rows * columns ) );
+}
+
+/**
+ * \brief Function for wrapping of arrays defining [CSR format](https://en.wikipedia.org/wiki/Sparse_matrix#Compressed_sparse_row_(CSR,_CRS_or_Yale_format)) into a sparse matrix view.
+ *
+ * \tparam Device  is a device on which the arrays are allocated.
+ * \tparam Real is a type of matrix elements values.
+ * \tparam Index is a type for matrix elements indexing.
+ * \param rows is a number of matrix rows.
+ * \param columns is a number of matrix columns.
+ * \param rowPointers is an array holding row pointers of the CSR format ( `ROW_INDEX` [here](https://en.wikipedia.org/wiki/Sparse_matrix#Compressed_sparse_row_(CSR,_CRS_or_Yale_format)))
+ * \param values is an array with values of matrix elements ( `V` [here](https://en.wikipedia.org/wiki/Sparse_matrix#Compressed_sparse_row_(CSR,_CRS_or_Yale_format)))
+ * \param columnIndexes is an array with column indexes of matrix elements  ( `COL_INDEX` [here](https://en.wikipedia.org/wiki/Sparse_matrix#Compressed_sparse_row_(CSR,_CRS_or_Yale_format)))
+ * \return instance of SparseMatrixView with CSR format.
+ *
+ * The size of array \e rowPointers must be equal to `rows + 1`. The last element of the array equals the number of all nonzero matrix elements. The sizes of arrays `values` and
+ * `columnIndexes` must be equal to this number.
+ *
+ * \par Example
+ * \include Matrices/SparseMatrix/SparseMatrixViewExample_wrapCSR.cpp
+ * \par Output
+ * \include SparseMatrixViewExample_wrapCSR.out
+ */
+template< typename Device,
+          typename Real,
+          typename Index >
+SparseMatrixView< Real, Device, Index, GeneralMatrix, Algorithms::Segments::CSRViewDefault >
+wrapCSRMatrix( const Index& rows, const Index& columns, Index* rowPointers, Real* values, Index* columnIndexes )
+{
+   using MatrixView = SparseMatrixView< Real, Device, Index, GeneralMatrix, Algorithms::Segments::CSRViewDefault >;
+   using ValuesViewType = typename MatrixView::ValuesViewType;
+   using ColumnIndexesView = typename MatrixView::ColumnsIndexesViewType;
+   using SegmentsView = typename MatrixView::SegmentsViewType;
+   using KernelView = typename SegmentsView::KernelView;
+   using RowPointersView = typename SegmentsView::OffsetsView;
+   RowPointersView rowPointersView( rowPointers, rows + 1 );
+   Index elementsCount = rowPointersView.getElement( rows );
+   SegmentsView segments( rowPointersView, KernelView() );
+   ValuesViewType valuesView( values, elementsCount );
+   ColumnIndexesView columnIndexesView( columnIndexes, elementsCount );
+   return MatrixView( rows, columns, valuesView, columnIndexesView, segments );
+}
+
+/// This is to prevent the following class from appearing in the Doxygen documentation.
+/// \cond HIDDEN_CLASS
+template< typename Device,
+          ElementsOrganization Organization,
+          typename Real,
+          typename Index,
+          int Alignment = 1 >
+struct EllpackMatrixWrapper
+{
+   template< typename Device_, typename Index_ >
+   using EllpackSegments = Algorithms::Segments::EllpackView< Device_, Index_, Organization, Alignment >;
+   using MatrixView = SparseMatrixView< Real, Device, Index, GeneralMatrix, EllpackSegments >;
+
+   static MatrixView wrap( const Index& rows, const Index& columns, const Index& nonzerosPerRow, Real* values, Index* columnIndexes )
+   {
+      using ValuesViewType = typename MatrixView::ValuesViewType;
+      using ColumnIndexesView = typename MatrixView::ColumnsIndexesViewType;
+      using SegmentsView = Algorithms::Segments::EllpackView< Device, Index, Organization, Alignment >;
+      SegmentsView segments( rows, nonzerosPerRow );
+      Index elementsCount = segments.getStorageSize();
+      ValuesViewType valuesView( values, elementsCount );
+      ColumnIndexesView columnIndexesView( columnIndexes, elementsCount );
+      return MatrixView( rows, columns, valuesView, columnIndexesView, segments );
+   }
+};
+/// \endcond
+
+/**
+ * \brief Function for wrapping of arrays defining [Ellpack format](https://people.math.sc.edu/Burkardt/data/sparse_ellpack/sparse_ellpack.html) into a sparse matrix view.
+ *
+ * \tparam Device  is a device on which the arrays are allocated.
+ * \tparam Real is a type of matrix elements values.
+ * \tparam Index is a type for matrix elements indexing.
+ * \tparam Alignment defines alignment of data. The number of matrix rows is rounded to a multiple of this number. It is useful mainly for GPUs.
+ * \param rows is a number of matrix rows.
+ * \param columns is a number of matrix columns.
+ * \param nonzerosPerRow is number of nonzero matrix elements in each row.
+ * \param values is an array with values of matrix elements.
+ * \param columnIndexes is an array with column indexes of matrix elements.
+ * \return instance of SparseMatrixView with Ellpack format.
+ *
+ *  The sizes of arrays `values` and `columnIndexes` must be equal to `rows * nonzerosPerRow`. Use `-1` as a column index for padding zeros.
+ *
+ * \par Example
+ * \include Matrices/SparseMatrix/SparseMatrixViewExample_wrapEllpack.cpp
+ * \par Output
+ * \include SparseMatrixViewExample_wrapEllpack.out
+ */
+template< typename Device,
+          ElementsOrganization Organization,
+          typename Real,
+          typename Index,
+          int Alignment = 1 >
+auto
+wrapEllpackMatrix( const Index rows, const Index columns, const Index nonzerosPerRow, Real* values, Index* columnIndexes )
+-> decltype( EllpackMatrixWrapper< Device, Organization, Real, Index, Alignment >::wrap( rows, columns, nonzerosPerRow, values, columnIndexes ) )
+{
+   return EllpackMatrixWrapper< Device, Organization, Real, Index, Alignment >::wrap( rows, columns, nonzerosPerRow, values, columnIndexes );
+}
+
+   } //namespace Matrices
+} //namespace TNL
diff --git a/src/TNL/Matrices/MultidiagonalMatrix.h b/src/TNL/Matrices/MultidiagonalMatrix.h
index d938a106298c298b090ad5bdaf67b57768e109a4..3a74b0b44ff0d0b5529e87416b36790113a02f6f 100644
--- a/src/TNL/Matrices/MultidiagonalMatrix.h
+++ b/src/TNL/Matrices/MultidiagonalMatrix.h
@@ -245,7 +245,7 @@ class MultidiagonalMatrix : public Matrix< Real, Device, Index, RealAllocator >
       MultidiagonalMatrix( MultidiagonalMatrix&& matrix ) = default;
 
       /**
-       * \brief Returns a modifiable view of the mutlidiagonal matrix.
+       * \brief Returns a modifiable view of the multidiagonal matrix.
        *
        * See \ref MultidiagonalMatrixView.
        *
@@ -601,12 +601,24 @@ class MultidiagonalMatrix : public Matrix< Real, Device, Index, RealAllocator >
        * \brief Method for performing general reduction on matrix rows.
        *
        * \tparam Fetch is a type of lambda function for data fetch declared as
-       *          `fetch( IndexType rowIdx, IndexType& columnIdx, RealType& elementValue ) -> FetchValue`.
-       *          The return type of this lambda can be any non void.
+       *
+       * ```
+       * auto fetch = [=] __cuda_callable__ ( IndexType rowIdx, IndexType& columnIdx, RealType& elementValue ) -> FetchValue { ... };
+       * ```
+       *
+       * The return type of this lambda can be any non void.
        * \tparam Reduce is a type of lambda function for reduction declared as
-       *          `reduce( const FetchValue& v1, const FetchValue& v2 ) -> FetchValue`.
-       * \tparam Keep is a type of lambda function for storing results of reduction in each row.
-       *          It is declared as `keep( const IndexType rowIdx, const double& value )`.
+       *
+       * ```
+       * auto reduce = [=] __cuda_callable__ ( const FetchValue& v1, const FetchValue& v2 ) -> FetchValue { ... };
+       * ```
+       *
+       * \tparam Keep is a type of lambda function for storing results of reduction in each row. It is declared as
+       *
+       * ```
+       * auto keep = [=] __cuda_callable__ ( const IndexType rowIdx, const double& value ) { ... };
+       * ```
+       *
        * \tparam FetchValue is type returned by the Fetch lambda function.
        *
        * \param begin defines beginning of the range [begin,end) of rows to be processed.
@@ -624,22 +636,34 @@ class MultidiagonalMatrix : public Matrix< Real, Device, Index, RealAllocator >
        * \include MultidiagonalMatrixExample_reduceRows.out
        */
       template< typename Fetch, typename Reduce, typename Keep, typename FetchReal >
-      void reduceRows( IndexType first, IndexType last, Fetch& fetch, Reduce& reduce, Keep& keep, const FetchReal& identity );
+      void reduceRows( IndexType begin, IndexType end, Fetch& fetch, Reduce& reduce, Keep& keep, const FetchReal& zero );
 
       /**
        * \brief Method for performing general reduction on matrix rows for constant instances.
        *
        * \tparam Fetch is a type of lambda function for data fetch declared as
-       *          `fetch( IndexType rowIdx, IndexType& columnIdx, RealType& elementValue ) -> FetchValue`.
-       *          The return type of this lambda can be any non void.
+       *
+       * ```
+       * auto fetch = [=] __cuda_callable__ ( IndexType rowIdx, IndexType& columnIdx, RealType& elementValue ) -> FetchValue { ... };
+       * ```
+       *
+       * The return type of this lambda can be any non void.
        * \tparam Reduce is a type of lambda function for reduction declared as
-       *          `reduce( const FetchValue& v1, const FetchValue& v2 ) -> FetchValue`.
-       * \tparam Keep is a type of lambda function for storing results of reduction in each row.
-       *          It is declared as `keep( const IndexType rowIdx, const double& value )`.
+       *
+       * ```
+       * auto reduce = [=] __cuda_callable__ ( const FetchValue& v1, const FetchValue& v2 ) -> FetchValue { ... };
+       * ```
+       *
+       * \tparam Keep is a type of lambda function for storing results of reduction in each row. It is declared as
+       *
+       * ```
+       * auto keep = [=] __cuda_callable__ ( const IndexType rowIdx, const double& value ) { ... };
+       * ```
+       *
        * \tparam FetchValue is type returned by the Fetch lambda function.
        *
-       * \param begin defines beginning of the range [begin,end) of rows to be processed.
-       * \param end defines ending of the range [begin,end) of rows to be processed.
+       * \param begin defines beginning of the range [\e begin,\e end) of rows to be processed.
+       * \param end defines ending of the range [\e begin,\e end) of rows to be processed.
        * \param fetch is an instance of lambda function for data fetch.
        * \param reduce is an instance of lambda function for reduction.
        * \param keep in an instance of lambda function for storing results.
@@ -653,18 +677,30 @@ class MultidiagonalMatrix : public Matrix< Real, Device, Index, RealAllocator >
        * \include MultidiagonalMatrixExample_reduceRows.out
        */
       template< typename Fetch, typename Reduce, typename Keep, typename FetchReal >
-      void reduceRows( IndexType first, IndexType last, Fetch& fetch, Reduce& reduce, Keep& keep, const FetchReal& identity ) const;
+      void reduceRows( IndexType begin, IndexType end, Fetch& fetch, Reduce& reduce, Keep& keep, const FetchReal& zero ) const;
 
       /**
        * \brief Method for performing general reduction on all matrix rows.
        *
        * \tparam Fetch is a type of lambda function for data fetch declared as
-       *          `fetch( IndexType rowIdx, IndexType& columnIdx, RealType& elementValue ) -> FetchValue`.
-       *          The return type of this lambda can be any non void.
+       *
+       * ```
+       * auto fetch = [=] __cuda_callable__ ( IndexType rowIdx, IndexType& columnIdx, RealType& elementValue ) -> FetchValue { ... };
+       * ```
+       *
+       * The return type of this lambda can be any non void.
        * \tparam Reduce is a type of lambda function for reduction declared as
-       *          `reduce( const FetchValue& v1, const FetchValue& v2 ) -> FetchValue`.
-       * \tparam Keep is a type of lambda function for storing results of reduction in each row.
-       *          It is declared as `keep( const IndexType rowIdx, const double& value )`.
+       *
+       * ```
+       * auto reduce = [=] __cuda_callable__ ( const FetchValue& v1, const FetchValue& v2 ) -> FetchValue { ... };
+       * ```
+       *
+       * \tparam Keep is a type of lambda function for storing results of reduction in each row. It is declared as
+       *
+       * ```
+       * auto keep = [=] __cuda_callable__ ( const IndexType rowIdx, const double& value ) { ... };
+       * ```
+       *
        * \tparam FetchValue is type returned by the Fetch lambda function.
        *
        * \param fetch is an instance of lambda function for data fetch.
@@ -686,12 +722,24 @@ class MultidiagonalMatrix : public Matrix< Real, Device, Index, RealAllocator >
        * \brief Method for performing general reduction on all matrix rows for constant instances.
        *
        * \tparam Fetch is a type of lambda function for data fetch declared as
-       *          `fetch( IndexType rowIdx, IndexType& columnIdx, RealType& elementValue ) -> FetchValue`.
-       *          The return type of this lambda can be any non void.
+       *
+       * ```
+       * auto fetch = [=] __cuda_callable__ ( IndexType rowIdx, IndexType& columnIdx, RealType& elementValue ) -> FetchValue { ... };
+       * ```
+       *
+       * The return type of this lambda can be any non void.
        * \tparam Reduce is a type of lambda function for reduction declared as
-       *          `reduce( const FetchValue& v1, const FetchValue& v2 ) -> FetchValue`.
-       * \tparam Keep is a type of lambda function for storing results of reduction in each row.
-       *          It is declared as `keep( const IndexType rowIdx, const double& value )`.
+       *
+       * ```
+       * auto reduce = [=] __cuda_callable__ ( const FetchValue& v1, const FetchValue& v2 ) -> FetchValue { ... };
+       * ```
+       *
+       * \tparam Keep is a type of lambda function for storing results of reduction in each row. It is declared as
+       *
+       * ```
+       * auto keep = [=] __cuda_callable__ ( const IndexType rowIdx, const double& value ) { ... };
+       * ```
+       *
        * \tparam FetchValue is type returned by the Fetch lambda function.
        *
        * \param fetch is an instance of lambda function for data fetch.
@@ -712,10 +760,11 @@ class MultidiagonalMatrix : public Matrix< Real, Device, Index, RealAllocator >
       /**
        * \brief Method for iteration over matrix rows for constant instances.
        *
-       * \tparam Function is type of lambda function that will operate on matrix elements.
-       *    It is should have form like
+       * \tparam Function is a type of lambda function that will operate on matrix elements. It should have a form like
        *
-       *  `function( IndexType rowIdx, IndexType localIdx, IndexType columnIdx, const RealType& value, bool& compute )`,
+       * ```
+       * auto function = [=] __cuda_callable__ ( IndexType rowIdx, IndexType localIdx, IndexType columnIdx, const RealType& value ) { ... };
+       * ```
        *
        * where
        *
@@ -724,13 +773,10 @@ class MultidiagonalMatrix : public Matrix< Real, Device, Index, RealAllocator >
        * \e localIdx parameter is a rank of the non-zero element in given row. It is also, in fact,
        *  index of the matrix subdiagonal.
        *
-       * \e columnIdx is a column index of the matrx element.
+       * \e columnIdx is a column index of the matrix element.
        *
        * \e value is the matrix element value.
        *
-       * \e compute is a reference to a boolen variable. If it is set to false the iteration over the row can
-       *  be interrupted.
-       *
        * \param begin defines beginning of the range [begin,end) of rows to be processed.
        * \param end defines ending of the range [begin,end) of rows to be processed.
        * \param function is an instance of the lambda function to be called in each row.
@@ -746,10 +792,11 @@ class MultidiagonalMatrix : public Matrix< Real, Device, Index, RealAllocator >
       /**
        * \brief Method for iteration over matrix rows for non-constant instances.
        *
-       * \tparam Function is type of lambda function that will operate on matrix elements.
-       *    It is should have form like
+       * \tparam Function is a type of lambda function that will operate on matrix elements. It should have a form like
        *
-       *  `function( IndexType rowIdx, IndexType localIdx, IndexType columnIdx, const RealType& value, bool& compute )`,
+       * ```
+       * auto function = [=] __cuda_callable__ ( IndexType rowIdx, IndexType localIdx, IndexType columnIdx, const RealType& value ) { ... };
+       * ```
        *
        * where
        *
@@ -762,9 +809,6 @@ class MultidiagonalMatrix : public Matrix< Real, Device, Index, RealAllocator >
        *
        * \e value is a reference to the matrix element value. It can be used even for changing the matrix element value.
        *
-       * \e compute is a reference to a boolen variable. If it is set to false the iteration over the row can
-       *  be interrupted.
-       *
        * \param begin defines beginning of the range [begin,end) of rows to be processed.
        * \param end defines ending of the range [begin,end) of rows to be processed.
        * \param function is an instance of the lambda function to be called in each row.
@@ -912,12 +956,13 @@ class MultidiagonalMatrix : public Matrix< Real, Device, Index, RealAllocator >
       /**
        * \brief Method for sequential iteration over all matrix rows for constant instances.
        *
-       * \tparam Function is type of lambda function that will operate on matrix elements.
-       *    It is should have form like
-       *  `function( IndexType rowIdx, IndexType columnIdx, IndexType columnIdx_, const RealType& value, bool& compute )`.
+       * \tparam Function is a type of lambda function that will operate on matrix elements. It should have a form like
+       *
+       * ```
+       * auto function = [=] ( IndexType rowIdx, IndexType columnIdx, IndexType columnIdx_, const RealType& value ) { ... };
+       * ```
+       *
        *  The column index repeats twice only for compatibility with sparse matrices.
-       *  If the 'compute' variable is set to false the iteration over the row can
-       *  be interrupted.
        *
        * \param begin defines beginning of the range [begin,end) of rows to be processed.
        * \param end defines ending of the range [begin,end) of rows to be processed.
@@ -929,15 +974,16 @@ class MultidiagonalMatrix : public Matrix< Real, Device, Index, RealAllocator >
       /**
        * \brief Method for sequential iteration over all matrix rows for non-constant instances.
        *
-       * \tparam Function is type of lambda function that will operate on matrix elements.
-       *    It is should have form like
-       *  `function( IndexType rowIdx, IndexType columnIdx, IndexType columnIdx_, RealType& value, bool& compute )`.
+       * \tparam Function is a type of lambda function that will operate on matrix elements. It should have a form like
+       *
+       * ```
+       * auto function = [=] __cuda_callable__ ( IndexType rowIdx, IndexType columnIdx, IndexType columnIdx_, RealType& value ) { ... };
+       * ```
+       *
        *  The column index repeats twice only for compatibility with sparse matrices.
-       *  If the 'compute' variable is set to false the iteration over the row can
-       *  be interrupted.
        *
-       * \param begin defines beginning of the range [begin,end) of rows to be processed.
-       * \param end defines ending of the range [begin,end) of rows to be processed.
+       * \param begin defines beginning of the range [\e begin,\e end) of rows to be processed.
+       * \param end defines ending of the range [\e begin,\e end) of rows to be processed.
        * \param function is an instance of the lambda function to be called in each row.
        */
       template< typename Function >
@@ -970,7 +1016,9 @@ class MultidiagonalMatrix : public Matrix< Real, Device, Index, RealAllocator >
        *
        * More precisely, it computes:
        *
-       * `outVector = matrixMultiplicator * ( * this ) * inVector + outVectorMultiplicator * outVector`
+       * ```
+       * outVector = matrixMultiplicator * ( * this ) * inVector + outVectorMultiplicator * outVector
+       * ```
        *
        * \tparam InVector is type of input vector.  It can be \ref Vector,
        *     \ref VectorView, \ref Array, \ref ArraView or similar container.
diff --git a/src/TNL/Matrices/MultidiagonalMatrix.hpp b/src/TNL/Matrices/MultidiagonalMatrix.hpp
index b432814c6ab06481d1720df07f044e3df06e8e0e..244188831cf05b7d2dc5218060ff35c589e0d016 100644
--- a/src/TNL/Matrices/MultidiagonalMatrix.hpp
+++ b/src/TNL/Matrices/MultidiagonalMatrix.hpp
@@ -873,7 +873,7 @@ operator=( const MultidiagonalMatrix< Real_, Device_, Index_, Organization_, Rea
       if( std::is_same< Device, Device_ >::value )
       {
          const auto matrix_view = matrix.getView();
-         auto f = [=] __cuda_callable__ ( const IndexType& rowIdx, const IndexType& localIdx, const IndexType& column, Real& value, bool& compute ) mutable {
+         auto f = [=] __cuda_callable__ ( const IndexType& rowIdx, const IndexType& localIdx, const IndexType& column, Real& value ) mutable {
             value = matrix_view.getValues()[ matrix_view.getIndexer().getGlobalIndex( rowIdx, localIdx ) ];
          };
          this->forAllElements( f );
@@ -898,7 +898,7 @@ operator=( const MultidiagonalMatrix< Real_, Device_, Index_, Organization_, Rea
 
             ////
             // Copy matrix elements into buffer
-            auto f1 = [=] __cuda_callable__ ( RHSIndexType rowIdx, RHSIndexType localIdx, RHSIndexType columnIndex, const RHSRealType& value, bool& compute ) mutable {
+            auto f1 = [=] __cuda_callable__ ( RHSIndexType rowIdx, RHSIndexType localIdx, RHSIndexType columnIndex, const RHSRealType& value ) mutable {
                   const IndexType bufferIdx = ( rowIdx - baseRow ) * maxRowLength + localIdx;
                   matrixValuesBuffer_view[ bufferIdx ] = value;
             };
@@ -910,7 +910,7 @@ operator=( const MultidiagonalMatrix< Real_, Device_, Index_, Organization_, Rea
 
             ////
             // Copy matrix elements from the buffer to the matrix
-            auto f2 = [=] __cuda_callable__ ( const IndexType rowIdx, const IndexType localIdx, const IndexType columnIndex, RealType& value, bool& compute  ) mutable {
+            auto f2 = [=] __cuda_callable__ ( const IndexType rowIdx, const IndexType localIdx, const IndexType columnIndex, RealType& value ) mutable {
                const IndexType bufferIdx = ( rowIdx - baseRow ) * maxRowLength + localIdx;
                   value = thisValuesBuffer_view[ bufferIdx ];
             };
diff --git a/src/TNL/Matrices/MultidiagonalMatrixElement.h b/src/TNL/Matrices/MultidiagonalMatrixElement.h
index 3672526eabd584f7622f2743175d45068e1d7f9d..2c8c370276c802450e44fa40a3000f479dccdbec 100644
--- a/src/TNL/Matrices/MultidiagonalMatrixElement.h
+++ b/src/TNL/Matrices/MultidiagonalMatrixElement.h
@@ -18,16 +18,36 @@ namespace TNL {
 namespace Matrices {
 
 
+/**
+ * \brief Accessor for multidiagonal matrix elements.
+ *
+ * \tparam Real is a type of matrix elements values.
+ * \tparam Index is a type of matrix elements column indexes.
+ */
 template< typename Real,
           typename Index >
 class MultidiagonalMatrixElement
 {
    public:
 
+      /**
+       * \brief Type of matrix elements values.
+       */
       using RealType = Real;
 
+      /**
+       * \brief Type of matrix elements column indexes.
+       */
       using IndexType = Index;
 
+      /**
+       * \brief Constructor.
+       *
+       * \param value is the matrix element value.
+       * \param rowIdx is the row index of the matrix element.
+       * \param columnIdx is the column index of the matrix element.
+       * \param localIdx is the rank of the non-zero element in the matrix row.
+       */
       __cuda_callable__
       MultidiagonalMatrixElement( RealType& value,
                                   const IndexType& rowIdx,
@@ -35,21 +55,51 @@ class MultidiagonalMatrixElement
                                   const IndexType& localIdx )
       : value_( value ), rowIdx( rowIdx ), columnIdx( columnIdx ), localIdx( localIdx ) {};
 
+      /**
+       * \brief Returns a reference to the matrix element value.
+       *
+       * \return reference to the matrix element value.
+       */
       __cuda_callable__
       RealType& value() { return value_; };
 
+      /**
+       * \brief Returns a constant reference to the matrix element value.
+       *
+       * \return constant reference to the matrix element value.
+       */
       __cuda_callable__
       const RealType& value() const { return value_; };
 
+      /**
+       * \brief Returns a constant reference to the matrix element row index.
+       *
+       * \return constant reference to the matrix element row index.
+       */
       __cuda_callable__
       const IndexType& rowIndex() const { return rowIdx; };
 
+      /**
+       * \brief Returns a reference to the matrix element column index.
+       *
+       * \return reference to the matrix element column index.
+       */
       __cuda_callable__
       IndexType& columnIndex() { return columnIdx; };
 
+      /**
+       * \brief Returns a constant reference to the matrix element column index.
+       *
+       * \return constant reference to the matrix element column index.
+       */
       __cuda_callable__
       const IndexType& columnIndex() const { return columnIdx; };
 
+      /**
+       * \brief Returns a constant reference to the rank of the non-zero matrix element in the row.
+       *
+       * \return constant reference to the rank of the non-zero matrix element in the row.
+       */
       __cuda_callable__
       const IndexType& localIndex() const { return localIdx; };
 
diff --git a/src/TNL/Matrices/MultidiagonalMatrixView.h b/src/TNL/Matrices/MultidiagonalMatrixView.h
index 3575602136cc1596dd461dab2e3ad302645dd535..869ddb9732e8541fecf9847c1ed648f189012335 100644
--- a/src/TNL/Matrices/MultidiagonalMatrixView.h
+++ b/src/TNL/Matrices/MultidiagonalMatrixView.h
@@ -363,16 +363,28 @@ class MultidiagonalMatrixView : public MatrixView< Real, Device, Index >
        * \brief Method for performing general reduction on matrix rows for constant instances.
        *
        * \tparam Fetch is a type of lambda function for data fetch declared as
-       *          `fetch( IndexType rowIdx, IndexType& columnIdx, RealType& elementValue ) -> FetchValue`.
-       *          The return type of this lambda can be any non void.
+       *
+       * ```
+       * auto fetch = [=] __cuda_callable__ ( IndexType rowIdx, IndexType& columnIdx, RealType& elementValue ) -> FetchValue { ... };
+       * ```
+       *
+       * The return type of this lambda can be any non-void type.
        * \tparam Reduce is a type of lambda function for reduction declared as
-       *          `reduce( const FetchValue& v1, const FetchValue& v2 ) -> FetchValue`.
-       * \tparam Keep is a type of lambda function for storing results of reduction in each row.
-       *          It is declared as `keep( const IndexType rowIdx, const double& value )`.
+       *
+       * ```
+       * auto reduce = [=] __cuda_callable__ ( const FetchValue& v1, const FetchValue& v2 ) -> FetchValue { ... };
+       * ```
+       *
+       * \tparam Keep is a type of lambda function for storing results of reduction in each row. It is declared as
+       *
+       * ```
+       * auto keep = [=] __cuda_callable__ ( const IndexType rowIdx, const double& value ) { ... };
+       * ```
+       *
        * \tparam FetchValue is type returned by the Fetch lambda function.
        *
-       * \param begin defines beginning of the range [begin,end) of rows to be processed.
-       * \param end defines ending of the range [begin,end) of rows to be processed.
+       * \param begin defines beginning of the range [ \e begin, \e end ) of rows to be processed.
+       * \param end defines ending of the range [ \e begin, \e end ) of rows to be processed.
        * \param fetch is an instance of lambda function for data fetch.
        * \param reduce is an instance of lambda function for reduction.
        * \param keep in an instance of lambda function for storing results.
@@ -386,22 +398,34 @@ class MultidiagonalMatrixView : public MatrixView< Real, Device, Index >
        * \include MultidiagonalMatrixViewExample_reduceRows.out
        */
       template< typename Fetch, typename Reduce, typename Keep, typename FetchReal >
-      void reduceRows( IndexType first, IndexType last, Fetch& fetch, Reduce& reduce, Keep& keep, const FetchReal& identity ) const;
+      void reduceRows( IndexType begin, IndexType end, Fetch& fetch, Reduce& reduce, Keep& keep, const FetchReal& zero ) const;
 
       /**
        * \brief Method for performing general reduction on matrix rows.
        *
        * \tparam Fetch is a type of lambda function for data fetch declared as
-       *          `fetch( IndexType rowIdx, IndexType& columnIdx, RealType& elementValue ) -> FetchValue`.
-       *          The return type of this lambda can be any non void.
+       *
+       * ```
+       * auto fetch = [=] __cuda_callable__ ( IndexType rowIdx, IndexType& columnIdx, RealType& elementValue ) -> FetchValue { ... };
+       * ```
+       *
+       * The return type of this lambda can be any non-void type.
        * \tparam Reduce is a type of lambda function for reduction declared as
-       *          `reduce( const FetchValue& v1, const FetchValue& v2 ) -> FetchValue`.
-       * \tparam Keep is a type of lambda function for storing results of reduction in each row.
-       *          It is declared as `keep( const IndexType rowIdx, const double& value )`.
+       *
+       * ```
+       * auto reduce = [=] __cuda_callable__ ( const FetchValue& v1, const FetchValue& v2 ) -> FetchValue { ... };
+       * ```
+       *
+       * \tparam Keep is a type of lambda function for storing results of reduction in each row. It is declared as
+       *
+       * ```
+       * auto keep = [=] __cuda_callable__ ( const IndexType rowIdx, const double& value ) { ... };
+       * ```
+       *
        * \tparam FetchValue is type returned by the Fetch lambda function.
        *
-       * \param begin defines beginning of the range [begin,end) of rows to be processed.
-       * \param end defines ending of the range [begin,end) of rows to be processed.
+       * \param begin defines beginning of the range [ \e begin, \e end ) of rows to be processed.
+       * \param end defines ending of the range [ \e begin, \e end ) of rows to be processed.
        * \param fetch is an instance of lambda function for data fetch.
        * \param reduce is an instance of lambda function for reduction.
        * \param keep in an instance of lambda function for storing results.
@@ -415,18 +439,30 @@ class MultidiagonalMatrixView : public MatrixView< Real, Device, Index >
        * \include MultidiagonalMatrixViewExample_reduceRows.out
        */
       template< typename Fetch, typename Reduce, typename Keep, typename FetchReal >
-      void reduceRows( IndexType first, IndexType last, Fetch& fetch, Reduce& reduce, Keep& keep, const FetchReal& identity );
+      void reduceRows( IndexType begin, IndexType end, Fetch& fetch, Reduce& reduce, Keep& keep, const FetchReal& zero );
 
       /**
        * \brief Method for performing general reduction on all matrix rows for constant instances.
        *
        * \tparam Fetch is a type of lambda function for data fetch declared as
-       *          `fetch( IndexType rowIdx, IndexType& columnIdx, RealType& elementValue ) -> FetchValue`.
-       *          The return type of this lambda can be any non void.
+       *
+       * ```
+       * auto fetch = [=] __cuda_callable__ ( IndexType rowIdx, IndexType& columnIdx, RealType& elementValue ) -> FetchValue { ... };
+       * ```
+       *
+       * The return type of this lambda can be any non-void type.
        * \tparam Reduce is a type of lambda function for reduction declared as
-       *          `reduce( const FetchValue& v1, const FetchValue& v2 ) -> FetchValue`.
-       * \tparam Keep is a type of lambda function for storing results of reduction in each row.
-       *          It is declared as `keep( const IndexType rowIdx, const double& value )`.
+       *
+       * ```
+       * auto reduce = [=] __cuda_callable__ ( const FetchValue& v1, const FetchValue& v2 ) -> FetchValue { ... };
+       * ```
+       *
+       * \tparam Keep is a type of lambda function for storing results of reduction in each row. It is declared as
+       *
+       * ```
+       * auto keep = [=] __cuda_callable__ ( const IndexType rowIdx, const double& value ) { ... };
+       * ```
+       *
        * \tparam FetchValue is type returned by the Fetch lambda function.
        *
        * \param fetch is an instance of lambda function for data fetch.
@@ -448,10 +484,18 @@ class MultidiagonalMatrixView : public MatrixView< Real, Device, Index >
        * \brief Method for performing general reduction on all matrix rows.
        *
        * \tparam Fetch is a type of lambda function for data fetch declared as
-       *          `fetch( IndexType rowIdx, IndexType& columnIdx, RealType& elementValue ) -> FetchValue`.
-       *          The return type of this lambda can be any non void.
+       *
+       * ```
+       * auto fetch = [=] __cuda_callable__ ( IndexType rowIdx, IndexType& columnIdx, RealType& elementValue ) -> FetchValue { ... };
+       * ```
+       *
+       * The return type of this lambda can be any non-void type.
        * \tparam Reduce is a type of lambda function for reduction declared as
-       *          `reduce( const FetchValue& v1, const FetchValue& v2 ) -> FetchValue`.
+       *
+       * ```
+       * auto reduce = [=] __cuda_callable__ ( const FetchValue& v1, const FetchValue& v2 ) -> FetchValue { ... };
+       * ```
+       *
        * \tparam Keep is a type of lambda function for storing results of reduction in each row.
        *          It is declared as `keep( const IndexType rowIdx, const double& value )`.
        * \tparam FetchValue is type returned by the Fetch lambda function.
@@ -474,10 +518,11 @@ class MultidiagonalMatrixView : public MatrixView< Real, Device, Index >
       /**
        * \brief Method for iteration over all matrix rows for constant instances.
        *
-       * \tparam Function is type of lambda function that will operate on matrix elements.
-       *    It is should have form like
+       * \tparam Function is the type of the lambda function that will operate on matrix elements. It should have the following form:
        *
-       *  `function( IndexType rowIdx, IndexType localIdx, IndexType columnIdx, const RealType& value, bool& compute )`,
+       * ```
+       * auto function = [=] __cuda_callable__ ( IndexType rowIdx, IndexType localIdx, IndexType columnIdx, const RealType& value ) { ... };
+       * ```
        *
        * where
        *
@@ -486,15 +531,12 @@ class MultidiagonalMatrixView : public MatrixView< Real, Device, Index >
        * \e localIdx parameter is a rank of the non-zero element in given row. It is also, in fact,
        *  index of the matrix subdiagonal.
        *
-       * \e columnIdx is a column index of the matrx element.
+       * \e columnIdx is a column index of the matrix element.
        *
        * \e value is the matrix element value.
        *
-       * \e compute is a reference to a boolen variable. If it is set to false the iteration over the row can
-       *  be interrupted.
-       *
-       * \param begin defines beginning of the range [begin,end) of rows to be processed.
-       * \param end defines ending of the range [begin,end) of rows to be processed.
+       * \param begin defines beginning of the range [ \e begin, \e end ) of rows to be processed.
+       * \param end defines ending of the range [ \e begin, \e end ) of rows to be processed.
        * \param function is an instance of the lambda function to be called in each row.
        *
        * \par Example
@@ -503,15 +545,16 @@ class MultidiagonalMatrixView : public MatrixView< Real, Device, Index >
        * \include MultidiagonalMatrixViewExample_forRows.out
        */
       template< typename Function >
-      void forElements( IndexType first, IndexType last, Function& function ) const;
+      void forElements( IndexType begin, IndexType end, Function& function ) const;
 
       /**
        * \brief Method for iteration over all matrix rows for non-constant instances.
        *
-       * \tparam Function is type of lambda function that will operate on matrix elements.
-       *    It is should have form like
+       * \tparam Function is the type of the lambda function that will operate on matrix elements. It should have the following form:
        *
-       *  `function( IndexType rowIdx, IndexType localIdx, IndexType columnIdx, const RealType& value, bool& compute )`,
+       * ```
+       * auto function = [=] __cuda_callable__ ( IndexType rowIdx, IndexType localIdx, IndexType columnIdx, RealType& value ) { ... };
+       * ```
        *
        * where
        *
@@ -520,15 +563,12 @@ class MultidiagonalMatrixView : public MatrixView< Real, Device, Index >
        * \e localIdx parameter is a rank of the non-zero element in given row. It is also, in fact,
        *  index of the matrix subdiagonal.
        *
-       * \e columnIdx is a column index of the matrx element.
+       * \e columnIdx is a column index of the matrix element.
        *
        * \e value is a reference to the matrix element value. It can be used even for changing the matrix element value.
        *
-       * \e compute is a reference to a boolen variable. If it is set to false the iteration over the row can
-       *  be interrupted.
-       *
-       * \param begin defines beginning of the range [begin,end) of rows to be processed.
-       * \param end defines ending of the range [begin,end) of rows to be processed.
+       * \param begin defines beginning of the range [ \e begin, \e end ) of rows to be processed.
+       * \param end defines ending of the range [ \e begin, \e end ) of rows to be processed.
        * \param function is an instance of the lambda function to be called in each row.
        *
        * \par Example
@@ -537,7 +577,7 @@ class MultidiagonalMatrixView : public MatrixView< Real, Device, Index >
        * \include MultidiagonalMatrixViewExample_forRows.out
        */
       template< typename Function >
-      void forElements( IndexType first, IndexType last, Function& function );
+      void forElements( IndexType begin, IndexType end, Function& function );
 
       /**
        * \brief This method calls \e forElements for all matrix rows (for constant instances).
@@ -674,15 +714,16 @@ class MultidiagonalMatrixView : public MatrixView< Real, Device, Index >
       /**
        * \brief Method for sequential iteration over all matrix rows for constant instances.
        *
-       * \tparam Function is type of lambda function that will operate on matrix elements.
-       *    It is should have form like
-       *  `function( IndexType rowIdx, IndexType columnIdx, IndexType columnIdx_, const RealType& value, bool& compute )`.
-       *  The column index repeats twice only for compatibility with sparse matrices.
-       *  If the 'compute' variable is set to false the iteration over the row can
-       *  be interrupted.
+       * \tparam Function is the type of the lambda function that will operate on matrix elements. It should have the following form:
        *
-       * \param begin defines beginning of the range [begin,end) of rows to be processed.
-       * \param end defines ending of the range [begin,end) of rows to be processed.
+       * ```
+       * auto function = [] __cuda_callable__ ( const RowView& row ) { ... };
+       * ```
+       *
+       * \e RowView represents matrix row - see \ref TNL::Matrices::MultidiagonalMatrixView::RowView.
+       *
+       * \param begin defines beginning of the range [ \e begin, \e end ) of rows to be processed.
+       * \param end defines ending of the range [ \e begin, \e end ) of rows to be processed.
        * \param function is an instance of the lambda function to be called in each row.
        */
       template< typename Function >
@@ -691,15 +732,16 @@ class MultidiagonalMatrixView : public MatrixView< Real, Device, Index >
       /**
        * \brief Method for sequential iteration over all matrix rows for non-constant instances.
        *
-       * \tparam Function is type of lambda function that will operate on matrix elements.
-       *    It is should have form like
-       *  `function( IndexType rowIdx, IndexType columnIdx, IndexType columnIdx_, RealType& value, bool& compute )`.
-       *  The column index repeats twice only for compatibility with sparse matrices.
-       *  If the 'compute' variable is set to false the iteration over the row can
-       *  be interrupted.
+       * \tparam Function is the type of the lambda function that will operate on matrix elements. It should have the following form:
        *
-       * \param begin defines beginning of the range [begin,end) of rows to be processed.
-       * \param end defines ending of the range [begin,end) of rows to be processed.
+       * ```
+       * auto function = [] __cuda_callable__ ( RowView& row ) { ... };
+       * ```
+       *
+       * \e RowView represents matrix row - see \ref TNL::Matrices::MultidiagonalMatrixView::RowView.
+       *
+       * \param begin defines beginning of the range [ \e begin, \e end ) of rows to be processed.
+       * \param end defines ending of the range [ \e begin, \e end ) of rows to be processed.
        * \param function is an instance of the lambda function to be called in each row.
        */
       template< typename Function >
@@ -732,7 +774,9 @@ class MultidiagonalMatrixView : public MatrixView< Real, Device, Index >
        *
        * More precisely, it computes:
        *
-       * `outVector = matrixMultiplicator * ( * this ) * inVector + outVectorMultiplicator * outVector`
+       * ```
+       * outVector = matrixMultiplicator * ( * this ) * inVector + outVectorMultiplicator * outVector
+       * ```
        *
        * \tparam InVector is type of input vector.  It can be \ref Vector,
        *     \ref VectorView, \ref Array, \ref ArraView or similar container.
@@ -755,8 +799,8 @@ class MultidiagonalMatrixView : public MatrixView< Real, Device, Index >
                           OutVector& outVector,
                           const RealType matrixMultiplicator = 1.0,
                           const RealType outVectorMultiplicator = 0.0,
-                          const IndexType firstRow = 0,
-                          IndexType lastRow = 0 ) const;
+                          const IndexType begin = 0,
+                          IndexType end = 0 ) const;
 
       template< typename Real_, typename Device_, typename Index_, ElementsOrganization Organization_ >
       void addMatrix( const MultidiagonalMatrixView< Real_, Device_, Index_, Organization_ >& matrix,
diff --git a/src/TNL/Matrices/MultidiagonalMatrixView.hpp b/src/TNL/Matrices/MultidiagonalMatrixView.hpp
index 7621f16787363ab5e7ad2b0fcba1367b9bbd7904..550158e4d458261e65d89fb5bd5d4018f4eb9a32 100644
--- a/src/TNL/Matrices/MultidiagonalMatrixView.hpp
+++ b/src/TNL/Matrices/MultidiagonalMatrixView.hpp
@@ -216,7 +216,7 @@ setValue( const RealType& v )
    // we dont do this->values = v here because it would set even elements 'outside' the matrix
    // method getNumberOfNonzeroElements would not work well then
    const RealType newValue = v;
-   auto f = [=] __cuda_callable__ ( const IndexType& rowIdx, const IndexType& localIdx, const IndexType columnIdx, RealType& value, bool& compute ) mutable {
+   auto f = [=] __cuda_callable__ ( const IndexType& rowIdx, const IndexType& localIdx, const IndexType columnIdx, RealType& value ) mutable {
       value = newValue;
    };
    this->forAllElements( f );
@@ -443,13 +443,12 @@ forElements( IndexType first, IndexType last, Function& function ) const
    const IndexType diagonalsCount = this->diagonalsOffsets.getSize();
    const IndexType columns = this->getColumns();
    const auto indexer = this->indexer;
-   bool compute( true );
    auto f = [=] __cuda_callable__ ( IndexType rowIdx ) mutable {
       for( IndexType localIdx = 0; localIdx < diagonalsCount; localIdx++ )
       {
          const IndexType columnIdx = rowIdx + diagonalsOffsets_view[ localIdx ];
          if( columnIdx >= 0 && columnIdx < columns )
-            function( rowIdx, localIdx, columnIdx, values_view[ indexer.getGlobalIndex( rowIdx, localIdx ) ], compute );
+            function( rowIdx, localIdx, columnIdx, values_view[ indexer.getGlobalIndex( rowIdx, localIdx ) ] );
       }
    };
    Algorithms::ParallelFor< DeviceType >::exec( first, last, f );
@@ -469,13 +468,12 @@ forElements( IndexType first, IndexType last, Function& function )
    const IndexType diagonalsCount = this->diagonalsOffsets.getSize();
    const IndexType columns = this->getColumns();
    const auto indexer = this->indexer;
-   bool compute( true );
    auto f = [=] __cuda_callable__ ( IndexType rowIdx ) mutable {
-      for( IndexType localIdx = 0; localIdx < diagonalsCount && compute; localIdx++ )
+      for( IndexType localIdx = 0; localIdx < diagonalsCount; localIdx++ )
       {
          const IndexType columnIdx = rowIdx + diagonalsOffsets_view[ localIdx ];
          if( columnIdx >= 0 && columnIdx < columns )
-            function( rowIdx, localIdx, columnIdx, values_view[ indexer.getGlobalIndex( rowIdx, localIdx ) ], compute );
+            function( rowIdx, localIdx, columnIdx, values_view[ indexer.getGlobalIndex( rowIdx, localIdx ) ] );
       }
    };
    Algorithms::ParallelFor< DeviceType >::exec( first, last, f );
diff --git a/src/TNL/Matrices/Sandbox/SparseSandboxMatrix.h b/src/TNL/Matrices/Sandbox/SparseSandboxMatrix.h
new file mode 100644
index 0000000000000000000000000000000000000000..c4b3ae23f78beb2b1fa530e813621057cd19d248
--- /dev/null
+++ b/src/TNL/Matrices/Sandbox/SparseSandboxMatrix.h
@@ -0,0 +1,1179 @@
+/***************************************************************************
+                          SparseSandboxMatrix.h -  description
+                             -------------------
+    begin                : Apr 19, 2021
+    copyright            : (C) 2021 by Tomas Oberhuber
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+#pragma once
+
+#include <map>
+#include <TNL/Matrices/Matrix.h>
+#include <TNL/Matrices/MatrixType.h>
+#include <TNL/Allocators/Default.h>
+#include <TNL/Algorithms/Segments/CSR.h>
+#include <TNL/Matrices/Sandbox/SparseSandboxMatrixRowView.h>
+#include <TNL/Matrices/Sandbox/SparseSandboxMatrixView.h>
+#include <TNL/Matrices/DenseMatrix.h>
+
+namespace TNL {
+   namespace Matrices {
+      /**
+       * \brief Namespace for sandbox matrices.
+       */
+      namespace Sandbox {
+
+/**
+ * \brief Template of a sparse matrix that can be used for testing of new sparse-matrix formats.
+ *
+ * \tparam Real is a type of matrix elements. If \e Real equals \e bool the matrix is treated
+ *    as binary and so the matrix elements values are not stored in the memory since we need
+ *    to remember only coordinates of non-zero elements (which equal one).
+ * \tparam Device is a device where the matrix is allocated.
+ * \tparam Index is a type for indexing of the matrix elements.
+ * \tparam MatrixType specifies a symmetry of matrix. See \ref MatrixType. Symmetric
+ *    matrices store only lower part of the matrix and its diagonal. The upper part is reconstructed on the fly.
+ *    GeneralMatrix with no symmetry is used by default.
+ * \tparam RealAllocator is allocator for the matrix elements values.
+ * \tparam IndexAllocator is allocator for the matrix elements column indexes.
+ *
+ * This class can be used for rapid testing and development of new formats for sparse matrices. One may profit from
+ * several TNL tools compatible with interface of this templated class like:
+ *
+ * 1. Large set of existing unit tests.
+ * 2. Matrix reading from MTX files - to use \ref TNL::Matrices::MatrixReader, the following methods must be functional
+ *    a. \ref TNL::Matrices::Sandbox::SparseSandboxMatrix::setRowCapacities
+ *    b. \ref TNL::Matrices::Sandbox::SparseSandboxMatrix::setElement
+ *    c. \ref TNL::Matrices::Sandbox::SparseSandboxMatrix::operator= between different devices
+ * 3. Matrix benchmarks - the following methods must be functional
+ *    a. \ref TNL::Matrices::Sandbox::SparseSandboxMatrix::vectorProduct - for SpMV benchmark
+ * 4. Linear solvers
+ * 5. Simple comparison of performance with other matrix formats
+ *
+ * In the core of this class there is:
+ *
+ * 1. Vector `values` (\ref TNL::Matrices::Matrix::values) which is inherited from \ref TNL::Matrices::Matrix. This vector is used for storing
+ *    the values of matrix elements.
+ * 2. Vector `columnIndexes` (\ref TNL::Matrices::Sandbox::SparseSandboxMatrix::columnIndexes). This vector is used for storing the column indexes of matrix elements.
+ *
+ * This class contains a fully functional implementation of the CSR format, so the user has to replace only what they need to. Once you have
+ * successfully implemented the sparse matrix format in this form, you may consider extracting it into a form of segments to make it accessible
+ * even for other algorithms than SpMV.
+ *
+ * Parts of the code that need to be modified are marked by the SANDBOX_TODO tag. The whole implementation consists of the following classes:
+ *
+ * 1. \ref TNL::Matrices::Sandbox::SparseSandboxMatrix - this class, it serves for matrix setup and performing of the main operations.
+ * 2. \ref TNL::Matrices::Sandbox::SparseSandboxMatrixView - view class which is necessary mainly for passing the matrix to GPU kernels. Most methods of `SparseSandboxMatrix` are common
+ *    with `SparseSandboxMatrixView` and in this case they are implemented in the view class (and there is just redirection from this class). For this reason, `SparseSandboxMatrix` contains instance of the view class
+ *    (\ref TNL::Matrices::Sandbox::SparseSandboxMatrix::view) which needs to be regularly updated each time when metadata are changed. This is usually done by the means of
+ *    method \ref TNL::Matrices::Sandbox::SparseSandboxMatrix::getView.
+ * 3. \ref TNL::Matrices::Sandbox::SparseSandboxMatrixRowView - is a class for accessing particular matrix rows. It will, likely, require some changes as well.
+ *
+ * We suggest the following way of implementation of the new sparse matrix format:
+ *
+ * 1. Add metadata required by your format next to \ref TNL::Matrices::Sandbox::SparseSandboxMatrix::rowPointers but do not replace the row pointers. It will allow you
+ *    to implement your new format next to the original CSR and to check/compare with the valid CSR implementation any time you get into trouble. The advantage is that all
+ *    unit tests are working properly and you may just focus on modifying one method after another. The unit tests are called from
+ *    `src/UnitTests/Matrices/SparseMatrixTests_SandboxMatrix.h` and `src/UnitTests/Matrices/SparseMatrixVectorProductTests_SandboxMatrix.h`
+ * 2. Modify first the method \ref TNL::Matrices::Sandbox::SparseSandboxMatrix::setRowCapacities which is responsible for the setup of the format metadata.
+ * 3. Continue with modification of constructors, view class, \ref TNL::Matrices::Sandbox::SparseSandboxMatrix::getView and \ref TNL::Matrices::Sandbox::SparseSandboxMatrix::getConstView.
+ * 4. Next you need to modify \ref TNL::Matrices::Sandbox::SparseSandboxMatrix::setElement and \ref TNL::Matrices::Sandbox::SparseSandboxMatrix::getElement methods and assignment operator
+ *    at least for copying the matrix across different devices (i.e. from CPU to GPU). It will allow you to use \ref TNL::Matrices::MatrixReader. We recommend to have the same data layout
+ *    on both CPU and GPU so that the transfer of the matrix from CPU to GPU is trivial.
+ * 5. Finally proceed to \ref TNL::Matrices::Sandbox::SparseSandboxMatrix::vectorProduct to implement SpMV operation. We recommend to implement first the CPU version which is easier to
+ *     debug. Next proceed to GPU version.
+ * 6. When SpMV works it is time to delete the original CSR implementation, i.e. everything around `rowPointers`.
+ * 7. Optimize your implementation to the best performance and test with `tnl-benchmark-spmv` - you need to include your new matrix to `src/Benchmarks/SpMV/spmv.h` and modify this file
+ *    accordingly.
+ * 8. If you want, you may now generalize SpMV to \ref TNL::Matrices::Sandbox::SparseSandboxMatrix::reduceRows method.
+ * 9. If you have `reduceRows` implemented, you may use the original implementation of SpMV based just on the `reduceRows` method.
+ * 10. You may implement \ref TNL::Matrices::Sandbox::SparseSandboxMatrix::forRows and \ref TNL::Matrices::Sandbox::SparseSandboxMatrix::forElements.
+ * 11. Now you have complete implementation of new sparse matrix format. You may turn it into new type of segments (\ref TNL::Algorithms::Segments).
+ *
+ * During the implementation some unit tests may crash. If you do not need them at the moment, you may comment them in files
+ * `src/UnitTests/Matrices/SparseMatrixTests.h` and `src/UnitTests/Matrices/SparseMatrixVectorProductTests.h`
+ */
+template< typename Real =  double,
+          typename Device = Devices::Host,
+          typename Index = int,
+          typename MatrixType = GeneralMatrix,
+          typename RealAllocator = typename Allocators::Default< Device >::template Allocator< Real >,
+          typename IndexAllocator = typename Allocators::Default< Device >::template Allocator< Index > >
+class SparseSandboxMatrix : public Matrix< Real, Device, Index, RealAllocator >
+{
+   static_assert(
+         ! MatrixType::isSymmetric() ||
+         ! std::is_same< Device, Devices::Cuda >::value ||
+         ( std::is_same< Real, float >::value || std::is_same< Real, double >::value || std::is_same< Real, int >::value || std::is_same< Real, long long int >::value ),
+         "Given Real type is not supported by atomic operations on GPU which are necessary for symmetric operations." );
+
+   public:
+
+      // Supporting types - they are not important for the user
+      using BaseType = Matrix< Real, Device, Index, RealAllocator >;
+      using ValuesVectorType = typename Matrix< Real, Device, Index, RealAllocator >::ValuesType;
+      using ValuesViewType = typename ValuesVectorType::ViewType;
+      using ConstValuesViewType = typename ValuesViewType::ConstViewType;
+      using ColumnsIndexesVectorType = Containers::Vector< typename TNL::copy_const< Index >::template from< Real >::type, Device, Index, IndexAllocator >;
+      using ColumnsIndexesViewType = typename ColumnsIndexesVectorType::ViewType;
+      using ConstColumnsIndexesViewType = typename ColumnsIndexesViewType::ConstViewType;
+      using RowsCapacitiesType = Containers::Vector< std::remove_const_t< Index >, Device, Index, IndexAllocator >;
+      using RowsCapacitiesView = Containers::VectorView< std::remove_const_t< Index >, Device, Index >;
+      using ConstRowsCapacitiesView = typename RowsCapacitiesView::ConstViewType;
+
+      /**
+       * \brief Test of symmetric matrix type.
+       *
+       * \return \e true if the matrix is stored as symmetric and \e false otherwise.
+       */
+      static constexpr bool isSymmetric() { return MatrixType::isSymmetric(); };
+
+      /**
+       * \brief Test of binary matrix type.
+       *
+       * \return \e true if the matrix is stored as binary and \e false otherwise.
+       */
+      static constexpr bool isBinary() { return std::is_same< Real, bool >::value; };
+
+      /**
+       * \brief The type of matrix elements.
+       */
+      using RealType = std::remove_const_t< Real >;
+
+      /**
+       * \brief The device where the matrix is allocated.
+       */
+      using DeviceType = Device;
+
+      /**
+       * \brief The type used for matrix elements indexing.
+       */
+      using IndexType = Index;
+
+      /**
+       * \brief The allocator for matrix elements values.
+       */
+      using RealAllocatorType = RealAllocator;
+
+      /**
+       * \brief The allocator for matrix elements column indexes.
+       */
+      using IndexAllocatorType = IndexAllocator;
+
+      /**
+       * \brief Type of related matrix view.
+       *
+       * See \ref SparseSandboxMatrixView.
+       */
+      using ViewType = SparseSandboxMatrixView< Real, Device, Index, MatrixType >;
+
+      /**
+       * \brief Matrix view type for constant instances.
+       *
+       * See \ref SparseSandboxMatrixView.
+       */
+      using ConstViewType = SparseSandboxMatrixView< std::add_const_t< Real >, Device, Index, MatrixType >;
+
+      /**
+       * \brief Type for accessing matrix rows.
+       */
+      using RowView = SparseSandboxMatrixRowView< ValuesViewType, ColumnsIndexesViewType, isBinary() >;
+
+      /**
+       * \brief Type for accessing constant matrix rows.
+       */
+      using ConstRowView = SparseSandboxMatrixRowView< ConstValuesViewType, ConstColumnsIndexesViewType, isBinary() >;;
+
+      /**
+       * \brief Helper type for getting self type or its modifications.
+       */
+      template< typename _Real = Real,
+                typename _Device = Device,
+                typename _Index = Index,
+                typename _MatrixType = MatrixType,
+                typename _RealAllocator = typename Allocators::Default< _Device >::template Allocator< _Real >,
+                typename _IndexAllocator = typename Allocators::Default< _Device >::template Allocator< _Index > >
+      using Self = SparseSandboxMatrix< _Real, _Device, _Index, _MatrixType, _RealAllocator, _IndexAllocator >;
+
+      /**
+       * \brief Type of container for CSR row pointers.
+       *
+       * SANDBOX_TODO: You may replace it with containers for metadata of your format.
+       */
+      using RowPointers = TNL::Containers::Vector< IndexType, DeviceType, IndexType >;
+
+      /**
+       * \brief Constructor only with values and column indexes allocators.
+       *
+       * \param realAllocator is used for allocation of matrix elements values.
+       * \param indexAllocator is used for allocation of matrix elements column indexes.
+       */
+      SparseSandboxMatrix( const RealAllocatorType& realAllocator = RealAllocatorType(),
+                           const IndexAllocatorType& indexAllocator = IndexAllocatorType() );
+
+      /**
+       * \brief Copy constructor.
+       *
+       * \param matrix1 is the source matrix
+       */
+      SparseSandboxMatrix( const SparseSandboxMatrix& matrix1 ) = default;
+
+      /**
+       * \brief Move constructor.
+       *
+       * \param matrix is the source matrix
+       */
+      SparseSandboxMatrix( SparseSandboxMatrix&& matrix ) = default;
+
+      /**
+       * \brief Constructor with matrix dimensions.
+       *
+       * \param rows is number of matrix rows.
+       * \param columns is number of matrix columns.
+       * \param realAllocator is used for allocation of matrix elements values.
+       * \param indexAllocator is used for allocation of matrix elements column indexes.
+       */
+      template< typename Index_t, std::enable_if_t< std::is_integral< Index_t >::value, int > = 0 >
+      SparseSandboxMatrix( const Index_t rows,
+                           const Index_t columns,
+                           const RealAllocatorType& realAllocator = RealAllocatorType(),
+                           const IndexAllocatorType& indexAllocator = IndexAllocatorType() );
+
+      /**
+       * \brief Constructor with matrix rows capacities and number of columns.
+       *
+       * The number of matrix rows is given by the size of \e rowCapacities list.
+       *
+       * \tparam ListIndex is the initializer list values type.
+       * \param rowCapacities is a list telling how many matrix elements must be
+       *    allocated in each row.
+       * \param columns is the number of matrix columns.
+       * \param realAllocator is used for allocation of matrix elements values.
+       * \param indexAllocator is used for allocation of matrix elements column indexes.
+       *
+       * \par Example
+       * \include Matrices/SparseMatrix/SparseMatrixExample_Constructor_init_list_1.cpp
+       * \par Output
+       * \include SparseMatrixExample_Constructor_init_list_1.out
+       */
+      template< typename ListIndex >
+      explicit SparseSandboxMatrix( const std::initializer_list< ListIndex >& rowCapacities,
+                                    const IndexType columns,
+                                    const RealAllocatorType& realAllocator = RealAllocatorType(),
+                                    const IndexAllocatorType& indexAllocator = IndexAllocatorType() );
+
+      /**
+       * \brief Constructor with matrix rows capacities given as a vector and number of columns.
+       *
+       * The number of matrix rows is given by the size of \e rowCapacities vector.
+       *
+       * \tparam RowCapacitiesVector is the row capacities vector type. Usually it is some of
+       *    \ref TNL::Containers::Array, \ref TNL::Containers::ArrayView, \ref TNL::Containers::Vector or
+       *    \ref TNL::Containers::VectorView.
+       * \param rowCapacities is a vector telling how many matrix elements must be
+       *    allocated in each row.
+       * \param columns is the number of matrix columns.
+       * \param realAllocator is used for allocation of matrix elements values.
+       * \param indexAllocator is used for allocation of matrix elements column indexes.
+       *
+       * \par Example
+       * \include Matrices/SparseMatrix/SparseMatrixExample_Constructor_rowCapacities_vector.cpp
+       * \par Output
+       * \include SparseMatrixExample_Constructor_rowCapacities_vector.out
+       */
+      template< typename RowCapacitiesVector, std::enable_if_t< TNL::IsArrayType< RowCapacitiesVector >::value, int > = 0 >
+      explicit SparseSandboxMatrix( const RowCapacitiesVector& rowCapacities,
+                                    const IndexType columns,
+                                    const RealAllocatorType& realAllocator = RealAllocatorType(),
+                                    const IndexAllocatorType& indexAllocator = IndexAllocatorType() );
+
+      /**
+       * \brief Constructor with matrix dimensions and data in initializer list.
+       *
+       * The matrix elements values are given as a list \e data of triples:
+       * { { row1, column1, value1 },
+       *   { row2, column2, value2 },
+       * ... }.
+       *
+       * \param rows is number of matrix rows.
+       * \param columns is number of matrix columns.
+       * \param data is a list of matrix elements values.
+       * \param realAllocator is used for allocation of matrix elements values.
+       * \param indexAllocator is used for allocation of matrix elements column indexes.
+       *
+       * \par Example
+       * \include Matrices/SparseMatrix/SparseMatrixExample_Constructor_init_list_2.cpp
+       * \par Output
+       * \include SparseMatrixExample_Constructor_init_list_2.out
+       */
+      explicit SparseSandboxMatrix( const IndexType rows,
+                                    const IndexType columns,
+                                    const std::initializer_list< std::tuple< IndexType, IndexType, RealType > >& data,
+                                    const RealAllocatorType& realAllocator = RealAllocatorType(),
+                                    const IndexAllocatorType& indexAllocator = IndexAllocatorType() );
+
+      /**
+       * \brief Constructor with matrix dimensions and data in std::map.
+       *
+       * The matrix elements values are given as a map \e data where keys are
+       * std::pair of matrix coordinates ( {row, column} ) and value is the
+       * matrix element value.
+       *
+       * \tparam MapIndex is a type for indexing rows and columns.
+       * \tparam MapValue is a type for matrix elements values in the map.
+       *
+       * \param rows is number of matrix rows.
+       * \param columns is number of matrix columns.
+       * \param map is std::map containing matrix elements.
+       * \param realAllocator is used for allocation of matrix elements values.
+       * \param indexAllocator is used for allocation of matrix elements column indexes.
+       *
+       * \par Example
+       * \include Matrices/SparseMatrix/SparseMatrixExample_Constructor_std_map.cpp
+       * \par Output
+       * \include SparseMatrixExample_Constructor_std_map.out
+       */
+      template< typename MapIndex,
+                typename MapValue >
+      explicit SparseSandboxMatrix( const IndexType rows,
+                                    const IndexType columns,
+                                    const std::map< std::pair< MapIndex, MapIndex >, MapValue >& map,
+                                    const RealAllocatorType& realAllocator = RealAllocatorType(),
+                                    const IndexAllocatorType& indexAllocator = IndexAllocatorType() );
+
+      /**
+       * \brief Returns a modifiable view of the sparse matrix.
+       *
+       * See \ref SparseSandboxMatrixView.
+       *
+       * \return sparse matrix view.
+       */
+      ViewType getView() const; // TODO: remove const
+
+      /**
+       * \brief Returns a non-modifiable view of the sparse matrix.
+       *
+       * See \ref SparseSandboxMatrixView.
+       *
+       * \return sparse matrix view.
+       */
+      ConstViewType getConstView() const;
+
+      /**
+       * \brief Returns string with serialization type.
+       *
+       * The string has a form `Matrices::SparseSandboxMatrix< RealType,  [any_device], IndexType, General/Symmetric, Format, [any_allocator] >`.
+       *
+       * \return \ref String with the serialization type.
+       *
+       * \par Example
+       * \include Matrices/SparseMatrix/SparseMatrixExample_getSerializationType.cpp
+       * \par Output
+       * \include SparseMatrixExample_getSerializationType.out
+       */
+      static String getSerializationType();
+
+      /**
+       * \brief Returns string with serialization type.
+       *
+       * See \ref SparseSandboxMatrix::getSerializationType.
+       *
+       * \return \e String with the serialization type.
+       *
+       * \par Example
+       * \include Matrices/SparseMatrix/SparseMatrixExample_getSerializationType.cpp
+       * \par Output
+       * \include SparseMatrixExample_getSerializationType.out
+       */
+      virtual String getSerializationTypeVirtual() const override;
+
+      /**
+       * \brief Set number of rows and columns of this matrix.
+       *
+       * \param rows is the number of matrix rows.
+       * \param columns is the number of matrix columns.
+       */
+      virtual void setDimensions( const IndexType rows,
+                                  const IndexType columns ) override;
+
+      /**
+       * \brief Set the number of matrix rows and columns by the given matrix.
+       *
+       * \tparam Matrix is matrix type. This can be any matrix having methods
+       *  \ref getRows and \ref getColumns.
+       *
+       * \param matrix is the input matrix whose dimensions are to be adopted.
+       */
+      template< typename Matrix >
+      void setLike( const Matrix& matrix );
+
+      /**
+       * \brief Allocates memory for non-zero matrix elements.
+       *
+       * The size of the input vector must be equal to the number of matrix rows.
+       * The number of allocated matrix elements for each matrix row depends on
+       * the sparse matrix format. Some formats may allocate more elements than
+       * required.
+       *
+       * \tparam RowsCapacitiesVector is a type of vector/array used for row
+       *    capacities setting.
+       *
+       * \param rowCapacities is a vector telling the number of required non-zero
+       *    matrix elements in each row.
+       *
+       * \par Example
+       * \include Matrices/SparseMatrix/SparseMatrixExample_setRowCapacities.cpp
+       * \par Output
+       * \include SparseMatrixExample_setRowCapacities.out
+       */
+      template< typename RowsCapacitiesVector >
+      void setRowCapacities( const RowsCapacitiesVector& rowCapacities );
+
+      /**
+       * \brief Compute capacities of all rows.
+       *
+       * The row capacities are not stored explicitly and must be computed.
+       *
+       * \param rowCapacities is a vector where the row capacities will be stored.
+       */
+      template< typename Vector >
+      void getRowCapacities( Vector& rowCapacities ) const;
+
+      /**
+       * \brief This method sets the sparse matrix elements from initializer list.
+       *
+       * The number of matrix rows and columns must be set already.
+       * The matrix elements values are given as a list \e data of triples:
+       * { { row1, column1, value1 },
+       *   { row2, column2, value2 },
+       * ... }.
+       *
+       * \param data is an initializer list of tuples ( row, column, value )
+       * describing the matrix elements to be set.
+       *
+       * \par Example
+       * \include Matrices/SparseMatrix/SparseMatrixExample_setElements.cpp
+       * \par Output
+       * \include SparseMatrixExample_setElements.out
+       */
+      void setElements( const std::initializer_list< std::tuple< IndexType, IndexType, RealType > >& data );
+
+      /**
+       * \brief This method sets the sparse matrix elements from std::map.
+       *
+       * The matrix elements values are given as a map \e data where keys are
+       * std::pair of matrix coordinates ( {row, column} ) and value is the
+       * matrix element value.
+       *
+       * \tparam MapIndex is a type for indexing rows and columns.
+       * \tparam MapValue is a type for matrix elements values in the map.
+       *
+       * \param map is std::map containing matrix elements.
+       *
+       * \par Example
+       * \include Matrices/SparseMatrix/SparseMatrixExample_setElements_map.cpp
+       * \par Output
+       * \include SparseMatrixExample_setElements_map.out
+       */
+      template< typename MapIndex,
+                typename MapValue >
+      void setElements( const std::map< std::pair< MapIndex, MapIndex > , MapValue >& map );
+
+      /**
+       * \brief Computes number of non-zeros in each row.
+       *
+       * \param rowLengths is a vector into which the number of non-zeros in each row
+       * will be stored.
+       *
+       * \par Example
+       * \include Matrices/SparseMatrix/SparseMatrixExample_getCompressedRowLengths.cpp
+       * \par Output
+       * \include SparseMatrixExample_getCompressedRowLengths.out
+       */
+      template< typename Vector >
+      void getCompressedRowLengths( Vector& rowLengths ) const;
+
+
+      /**
+       * \brief Returns capacity of given matrix row.
+       *
+       * \param row index of matrix row.
+       * \return number of matrix elements allocated for the row.
+       */
+      __cuda_callable__
+      IndexType getRowCapacity( const IndexType row ) const;
+
+      /**
+       * \brief Returns number of non-zero matrix elements.
+       *
+       * This method really counts the non-zero matrix elements and so
+       * it returns zero for matrix having all allocated elements set to zero.
+       *
+       * \return number of non-zero matrix elements.
+       */
+      IndexType getNonzeroElementsCount() const;
+
+      /**
+       * \brief Resets the matrix to zero dimensions.
+       */
+      void reset();
+
+      /**
+       * \brief Constant getter of simple structure for accessing given matrix row.
+       *
+       * \param rowIdx is matrix row index.
+       *
+       * \return RowView for accessing given matrix row.
+       *
+       * \par Example
+       * \include Matrices/SparseMatrix/SparseMatrixExample_getConstRow.cpp
+       * \par Output
+       * \include SparseMatrixExample_getConstRow.out
+       *
+       * See \ref SparseSandboxMatrixRowView.
+       */
+      __cuda_callable__
+      const ConstRowView getRow( const IndexType& rowIdx ) const;
+
+      /**
+       * \brief Non-constant getter of simple structure for accessing given matrix row.
+       *
+       * \param rowIdx is matrix row index.
+       *
+       * \return RowView for accessing given matrix row.
+       *
+       * \par Example
+       * \include Matrices/SparseMatrix/SparseMatrixExample_getRow.cpp
+       * \par Output
+       * \include SparseMatrixExample_getRow.out
+       *
+       * See \ref SparseSandboxMatrixRowView.
+       */
+      __cuda_callable__
+      RowView getRow( const IndexType& rowIdx );
+
+      /**
+       * \brief Sets element at given \e row and \e column to given \e value.
+       *
+       * This method can be called from the host system (CPU) no matter
+       * where the matrix is allocated. If the matrix is allocated on GPU this method
+       * can be called even from device kernels. If the matrix is allocated on the GPU device
+       * and this method is called from the CPU, it transfers values of each matrix element separately and so the
+       * performance is very low. For higher performance see \ref SparseMatrix::getRow
+       * or \ref SparseMatrix::forElements and \ref SparseMatrix::forAllElements.
+       * The call may fail if the matrix row capacity is exhausted.
+       *
+       * \param row is row index of the element.
+       * \param column is columns index of the element.
+       * \param value is the value the element will be set to.
+       *
+       * \par Example
+       * \include Matrices/SparseMatrix/SparseMatrixExample_setElement.cpp
+       * \par Output
+       * \include SparseMatrixExample_setElement.out
+       */
+      __cuda_callable__
+      void setElement( const IndexType row,
+                       const IndexType column,
+                       const RealType& value );
+
+      /**
+       * \brief Adds given \e value to the element at given \e row and \e column.
+       *
+       * This method can be called from the host system (CPU) no matter
+       * where the matrix is allocated. If the matrix is allocated on GPU this method
+       * can be called even from device kernels. If the matrix is allocated on the GPU device
+       * and this method is called from the CPU, it transfers values of each matrix element separately and so the
+       * performance is very low. For higher performance see \ref SparseMatrix::getRow
+       * or \ref SparseMatrix::forElements and \ref SparseMatrix::forAllElements.
+       * The call may fail if the matrix row capacity is exhausted.
+       *
+       * \param row is row index of the element.
+       * \param column is columns index of the element.
+       * \param value is the value to be added to the element.
+       * \param thisElementMultiplicator is multiplicator the original matrix element
+       *   value is multiplied by before addition of given \e value.
+       *
+       * \par Example
+       * \include Matrices/SparseMatrix/SparseMatrixExample_addElement.cpp
+       * \par Output
+       * \include SparseMatrixExample_addElement.out
+       *
+       */
+      __cuda_callable__
+      void addElement( const IndexType row,
+                       const IndexType column,
+                       const RealType& value,
+                       const RealType& thisElementMultiplicator );
+
+      /**
+       * \brief Returns value of matrix element at position given by its row and column index.
+       *
+       * This method can be called from the host system (CPU) no matter
+       * where the matrix is allocated. If the matrix is allocated on GPU this method
+       * can be called even from device kernels. If the matrix is allocated on the GPU device
+       * and this method is called from the CPU, it transfers values of each matrix element separately and so the
+       * performance is very low. For higher performance see \ref SparseMatrix::getRow
+       * or \ref SparseMatrix::forElements and \ref SparseMatrix::forAllElements.
+       *
+       * \param row is a row index of the matrix element.
+       * \param column is a column index of the matrix element.
+       *
+       * \return value of given matrix element.
+       *
+       * \par Example
+       * \include Matrices/SparseMatrix/SparseMatrixExample_getElement.cpp
+       * \par Output
+       * \include SparseMatrixExample_getElement.out
+       *
+       */
+      __cuda_callable__
+      RealType getElement( const IndexType row,
+                           const IndexType column ) const;
+
+      /**
+       * \brief Method for performing general reduction on matrix rows.
+       *
+       * \tparam Fetch is a type of lambda function for data fetch declared as
+       *          `fetch( IndexType rowIdx, IndexType& columnIdx, RealType& elementValue ) -> FetchValue`.
+       *          The return type of this lambda can be any non void.
+       * \tparam Reduce is a type of lambda function for reduction declared as
+       *          `reduce( const FetchValue& v1, const FetchValue& v2 ) -> FetchValue`.
+       * \tparam Keep is a type of lambda function for storing results of reduction in each row.
+       *          It is declared as `keep( const IndexType rowIdx, const double& value )`.
+       * \tparam FetchReal is the type returned by the Fetch lambda function (denoted FetchValue above).
+       *
+       * \param begin defines beginning of the range [begin,end) of rows to be processed.
+       * \param end defines ending of the range [begin,end) of rows to be processed.
+       * \param fetch is an instance of lambda function for data fetch.
+       * \param reduce is an instance of lambda function for reduction.
+       * \param keep is an instance of lambda function for storing results.
+       * \param zero is zero of given reduction operation also known as identity element.
+       *
+       * \par Example
+       * \include Matrices/SparseMatrix/SparseMatrixExample_reduceRows.cpp
+       * \par Output
+       * \include SparseMatrixExample_reduceRows.out
+       */
+      template< typename Fetch, typename Reduce, typename Keep, typename FetchReal >
+      void reduceRows( IndexType begin, IndexType end, Fetch& fetch, const Reduce& reduce, Keep& keep, const FetchReal& zero );
+
+      /**
+       * \brief Method for performing general reduction on matrix rows for constant instances.
+       *
+       * \tparam Fetch is a type of lambda function for data fetch declared as
+       *          `fetch( IndexType rowIdx, IndexType& columnIdx, RealType& elementValue ) -> FetchValue`.
+       *          The return type of this lambda can be any non-void type.
+       * \tparam Reduce is a type of lambda function for reduction declared as
+       *          `reduce( const FetchValue& v1, const FetchValue& v2 ) -> FetchValue`.
+       * \tparam Keep is a type of lambda function for storing results of reduction in each row.
+       *          It is declared as `keep( const IndexType rowIdx, const double& value )`.
+       * \tparam FetchReal is the type of \e zero, i.e. the FetchValue type returned by the Fetch lambda function.
+       *
+       * \param begin defines beginning of the range [begin,end) of rows to be processed.
+       * \param end defines ending of the range [begin,end) of rows to be processed.
+       * \param fetch is an instance of lambda function for data fetch.
+       * \param reduce is an instance of lambda function for reduction.
+       * \param keep is an instance of lambda function for storing results.
+       * \param zero is zero of given reduction operation also known as the identity (neutral) element.
+       *
+       * \par Example
+       * \include Matrices/SparseMatrix/SparseMatrixExample_reduceRows.cpp
+       * \par Output
+       * \include SparseMatrixExample_reduceRows.out
+       */
+      template< typename Fetch, typename Reduce, typename Keep, typename FetchReal >
+      void reduceRows( IndexType begin, IndexType end, Fetch& fetch, const Reduce& reduce, Keep& keep, const FetchReal& zero ) const;
+
+      /**
+       * \brief Method for performing general reduction on all matrix rows.
+       *
+       * \tparam Fetch is a type of lambda function for data fetch declared as
+       *          `fetch( IndexType rowIdx, IndexType& columnIdx, RealType& elementValue ) -> FetchValue`.
+       *          The return type of this lambda can be any non-void type.
+       * \tparam Reduce is a type of lambda function for reduction declared as
+       *          `reduce( const FetchValue& v1, const FetchValue& v2 ) -> FetchValue`.
+       * \tparam Keep is a type of lambda function for storing results of reduction in each row.
+       *          It is declared as `keep( const IndexType rowIdx, const double& value )`.
+       * \tparam FetchReal is the type of \e zero, i.e. the FetchValue type returned by the Fetch lambda function.
+       *
+       * \param fetch is an instance of lambda function for data fetch.
+       * \param reduce is an instance of lambda function for reduction.
+       * \param keep is an instance of lambda function for storing results.
+       * \param zero is zero of given reduction operation also known as the identity (neutral) element.
+       * \note \e reduce is declared as `const Reduce&&`, a const rvalue reference rather than a forwarding reference (TODO review: confirm this is intended).
+       * \par Example
+       * \include Matrices/SparseMatrix/SparseMatrixExample_reduceAllRows.cpp
+       * \par Output
+       * \include SparseMatrixExample_reduceAllRows.out
+       */
+      template< typename Fetch, typename Reduce, typename Keep, typename FetchReal >
+      void reduceAllRows( Fetch&& fetch, const Reduce&& reduce, Keep&& keep, const FetchReal& zero );
+
+      /**
+       * \brief Method for performing general reduction on all matrix rows for constant instances.
+       *
+       * \tparam Fetch is a type of lambda function for data fetch declared as
+       *          `fetch( IndexType rowIdx, IndexType& columnIdx, RealType& elementValue ) -> FetchValue`.
+       *          The return type of this lambda can be any non-void type.
+       * \tparam Reduce is a type of lambda function for reduction declared as
+       *          `reduce( const FetchValue& v1, const FetchValue& v2 ) -> FetchValue`.
+       * \tparam Keep is a type of lambda function for storing results of reduction in each row.
+       *          It is declared as `keep( const IndexType rowIdx, const double& value )`.
+       * \tparam FetchReal is the type of \e zero, i.e. the FetchValue type returned by the Fetch lambda function.
+       *
+       * \param fetch is an instance of lambda function for data fetch.
+       * \param reduce is an instance of lambda function for reduction.
+       * \param keep is an instance of lambda function for storing results.
+       * \param zero is zero of given reduction operation also known as the identity (neutral) element.
+       *
+       * \par Example
+       * \include Matrices/SparseMatrix/SparseMatrixExample_reduceAllRows.cpp
+       * \par Output
+       * \include SparseMatrixExample_reduceAllRows.out
+       */
+      template< typename Fetch, typename Reduce, typename Keep, typename FetchReal >
+      void reduceAllRows( Fetch& fetch, const Reduce& reduce, Keep& keep, const FetchReal& zero ) const;
+
+      /**
+       * \brief Method for parallel iteration over matrix elements of given rows for constant instances.
+       *
+       * \tparam Function is type of lambda function that will operate on matrix elements.
+       *
+       * \param begin defines beginning of the range [ \e begin,\e end ) of rows to be processed.
+       * \param end defines ending of the range [ \e begin, \e end ) of rows to be processed.
+       * \param function is an instance of the lambda function to be called for element of given rows.
+       *
+       * The lambda function `function` should be declared like follows:
+       *
+       * ```
+       * auto function = [] __cuda_callable__ ( IndexType rowIdx, IndexType localIdx, IndexType columnIdx, const RealType& value ) { ... };
+       * ```
+       *
+       *  The \e localIdx parameter is a rank of the non-zero element in given row.
+       *
+       * \par Example
+       * \include Matrices/SparseMatrix/SparseMatrixExample_forElements.cpp
+       * \par Output
+       * \include SparseMatrixExample_forElements.out
+       */
+      template< typename Function >
+      void forElements( IndexType begin, IndexType end, Function&& function ) const;
+
+      /**
+       * \brief Method for parallel iteration over all matrix elements of given rows for non-constant instances.
+       *
+       * \tparam Function is type of lambda function that will operate on matrix elements.
+       *
+       * \param begin defines beginning of the range [ \e begin,\e end ) of rows to be processed.
+       * \param end defines ending of the range [ \e begin, \e end ) of rows to be processed.
+       * \param function is an instance of the lambda function to be called for each element of given rows.
+       *
+       * The lambda function `function` should be declared like follows:
+       *
+       * ```
+       * auto function = [] __cuda_callable__ ( IndexType rowIdx, IndexType localIdx, IndexType columnIdx, const RealType& value ) mutable { ... }
+       * ```
+       *
+       *  The \e localIdx parameter is a rank of the non-zero element in given row.
+       *
+       * \par Example
+       * \include Matrices/SparseMatrix/SparseMatrixExample_forElements.cpp
+       * \par Output
+       * \include SparseMatrixExample_forElements.out
+       */
+      template< typename Function >
+      void forElements( IndexType begin, IndexType end, Function&& function );
+
+      /**
+       * \brief Method for parallel iteration over all matrix elements for constant instances.
+       *
+       * See \ref SparseMatrix::forElements.
+       *
+       * \tparam Function is a type of lambda function that will operate on matrix elements.
+       * \param function  is an instance of the lambda function to be called for each matrix element.
+       *
+       * \par Example
+       * \include Matrices/SparseMatrix/SparseMatrixExample_forElements.cpp
+       * \par Output
+       * \include SparseMatrixExample_forElements.out
+       */
+      template< typename Function >
+      void forAllElements( Function&& function ) const;
+
+      /**
+       * \brief Method for parallel iteration over all matrix elements for non-constant instances.
+       *
+       * See \ref SparseMatrix::forElements.
+       *
+       * \tparam Function is a type of lambda function that will operate on matrix elements.
+       * \param function  is an instance of the lambda function to be called for each matrix element.
+       *
+       * \par Example
+       * \include Matrices/SparseMatrix/SparseMatrixExample_forElements.cpp
+       * \par Output
+       * \include SparseMatrixExample_forElements.out
+       */
+      template< typename Function >
+      void forAllElements( Function&& function );
+
+      /**
+       * \brief Method for parallel iteration over matrix rows from interval [ \e begin, \e end).
+       *
+       * In each row, given lambda function is performed. Each row is processed by at most one thread unlike the method
+       * \ref SparseMatrix::forElements where more than one thread can be mapped to each row.
+       *
+       * \tparam Function is type of the lambda function.
+       *
+       * \param begin defines beginning of the range [ \e begin,\e end ) of rows to be processed.
+       * \param end defines ending of the range [ \e begin, \e end ) of rows to be processed.
+       * \param function is an instance of the lambda function to be called for each row.
+       *
+       * ```
+       * auto function = [] __cuda_callable__ ( RowView& row ) mutable { ... };
+       * ```
+       *
+       * \e RowView represents matrix row - see \ref TNL::Matrices::SparseMatrix::RowView.
+       *
+       * \par Example
+       * \include Matrices/SparseMatrix/SparseMatrixExample_forRows.cpp
+       * \par Output
+       * \include SparseMatrixExample_forRows.out
+       */
+      template< typename Function >
+      void forRows( IndexType begin, IndexType end, Function&& function );
+
+      /**
+       * \brief Method for parallel iteration over matrix rows from interval [ \e begin, \e end) for constant instances.
+       *
+       * In each row, given lambda function is performed. Each row is processed by at most one thread unlike the method
+       * \ref SparseMatrix::forElements where more than one thread can be mapped to each row.
+       *
+       * \tparam Function is type of the lambda function.
+       *
+       * \param begin defines beginning of the range [ \e begin,\e end ) of rows to be processed.
+       * \param end defines ending of the range [ \e begin, \e end ) of rows to be processed.
+       * \param function is an instance of the lambda function to be called for each row.
+       *
+       * ```
+       * auto function = [] __cuda_callable__ ( RowView& row ) { ... };
+       * ```
+       *
+       * \e RowView represents matrix row - see \ref TNL::Matrices::SparseMatrix::RowView.
+       *
+       * \par Example
+       * \include Matrices/SparseMatrix/SparseMatrixExample_forRows.cpp
+       * \par Output
+       * \include SparseMatrixExample_forRows.out
+       */
+      template< typename Function >
+      void forRows( IndexType begin, IndexType end, Function&& function ) const;
+
+      /**
+       * \brief Method for parallel iteration over all matrix rows.
+       *
+       * In each row, given lambda function is performed. Each row is processed by at most one thread unlike the method
+       * \ref SparseMatrix::forAllElements where more than one thread can be mapped to each row.
+       *
+       * \tparam Function is type of the lambda function.
+       *
+       * \param function is an instance of the lambda function to be called for each row.
+       *
+       * ```
+       * auto function = [] __cuda_callable__ ( RowView& row ) mutable { ... };
+       * ```
+       *
+       * \e RowView represents matrix row - see \ref TNL::Matrices::SparseMatrix::RowView.
+       *
+       * \par Example
+       * \include Matrices/SparseMatrix/SparseMatrixExample_forRows.cpp
+       * \par Output
+       * \include SparseMatrixExample_forRows.out
+       */
+      template< typename Function >
+      void forAllRows( Function&& function );
+
+      /**
+       * \brief Method for parallel iteration over all matrix rows for constant instances.
+       *
+       * In each row, given lambda function is performed. Each row is processed by at most one thread unlike the method
+       * \ref SparseMatrix::forAllElements where more than one thread can be mapped to each row.
+       *
+       * \tparam Function is type of the lambda function.
+       *
+       * \param function is an instance of the lambda function to be called for each row.
+       *
+       * ```
+       * auto function = [] __cuda_callable__ ( RowView& row ) { ... };
+       * ```
+       *
+       * \e RowView represents matrix row - see \ref TNL::Matrices::SparseMatrix::RowView.
+       *
+       * \par Example
+       * \include Matrices/SparseMatrix/SparseMatrixExample_forRows.cpp
+       * \par Output
+       * \include SparseMatrixExample_forRows.out
+       */
+      template< typename Function >
+      void forAllRows( Function&& function ) const;
+
+      /**
+       * \brief Method for sequential iteration over matrix rows from interval [ \e begin, \e end) for constant instances.
+       *
+       * \tparam Function is type of lambda function that will operate on matrix elements.
+       *    It should have a form like
+       *  `function( IndexType rowIdx, IndexType columnIdx, IndexType columnIdx_, const RealType& value )`.
+       *  The column index repeats twice only for compatibility with sparse matrices.
+       *
+       * \param begin defines beginning of the range [begin,end) of rows to be processed.
+       * \param end defines ending of the range [begin,end) of rows to be processed.
+       * \param function is an instance of the lambda function to be called in each row.
+       */
+      template< typename Function >
+      void sequentialForRows( IndexType begin, IndexType end, Function& function ) const;
+
+      /**
+       * \brief Method for sequential iteration over matrix rows from interval [ \e begin, \e end) for non-constant instances.
+       *
+       * \tparam Function is type of lambda function that will operate on matrix elements.
+       *    It should have a form like
+       *  `function( IndexType rowIdx, IndexType columnIdx, IndexType columnIdx_, RealType& value )`.
+       *  The column index repeats twice only for compatibility with sparse matrices.
+       *
+       * \param begin defines beginning of the range [begin,end) of rows to be processed.
+       * \param end defines ending of the range [begin,end) of rows to be processed.
+       * \param function is an instance of the lambda function to be called in each row.
+       */
+      template< typename Function >
+      void sequentialForRows( IndexType begin, IndexType end, Function& function );
+
+      /**
+       * \brief This method calls \e sequentialForRows for all matrix rows (for constant instances).
+       *
+       * See \ref SparseMatrix::sequentialForRows.
+       *
+       * \tparam Function is a type of lambda function that will operate on matrix elements.
+       * \param function  is an instance of the lambda function to be called in each row.
+       */
+      template< typename Function >
+      void sequentialForAllRows( Function& function ) const;
+
+      /**
+       * \brief This method calls \e sequentialForRows for all matrix rows.
+       *
+       * See \ref SparseMatrix::sequentialForRows.
+       *
+       * \tparam Function is a type of lambda function that will operate on matrix elements.
+       * \param function  is an instance of the lambda function to be called in each row.
+       */
+      template< typename Function >
+      void sequentialForAllRows( Function& function );
+
+      /**
+       * \brief Computes product of matrix and vector.
+       *
+       * More precisely, it computes:
+       *
+       * `outVector = matrixMultiplicator * ( * this ) * inVector + outVectorMultiplicator * outVector`
+       *
+       * \tparam InVector is type of input vector.  It can be \ref Vector,
+       *     \ref VectorView, \ref Array, \ref ArrayView or similar container.
+       * \tparam OutVector is type of output vector. It can be \ref Vector,
+       *     \ref VectorView, \ref Array, \ref ArrayView or similar container.
+       *
+       * \param inVector is input vector.
+       * \param outVector is output vector.
+       * \param matrixMultiplicator is a factor by which the matrix is multiplied. It is one by default.
+       * \param outVectorMultiplicator is a factor by which the outVector is multiplied before added
+       *    to the result of matrix-vector product. It is zero by default.
+       * \param firstRow is the beginning of the rows range for which the vector product
+       *    is computed. It is zero by default.
+       * \param lastRow is the end of the rows range for which the vector product is computed. The default
+       *    value 0 is presumably treated as the number of matrix rows (confirm in the implementation).
+       */
+      template< typename InVector,
+                typename OutVector >
+      void vectorProduct( const InVector& inVector,
+                          OutVector& outVector,
+                          const RealType& matrixMultiplicator = 1.0,
+                          const RealType& outVectorMultiplicator = 0.0,
+                          const IndexType firstRow = 0,
+                          const IndexType lastRow = 0 ) const;
+
+      /*template< typename Real2, typename Index2 >
+      void addMatrix( const SparseMatrix< Real2, Segments, Device, Index2 >& matrix,
+                      const RealType& matrixMultiplicator = 1.0,
+                      const RealType& thisMatrixMultiplicator = 1.0 );
+
+      template< typename Real2, typename Index2 >
+      void getTransposition( const SparseMatrix< Real2, Segments, Device, Index2 >& matrix,
+                             const RealType& matrixMultiplicator = 1.0 );
+       */
+
+      template< typename Vector1, typename Vector2 >
+      bool performSORIteration( const Vector1& b,
+                                const IndexType row,
+                                Vector2& x,
+                                const RealType& omega = 1.0 ) const;
+
+      /**
+       * \brief Assignment of exactly the same matrix type.
+       *
+       * \param matrix is input matrix for the assignment.
+       * \return reference to this matrix.
+       */
+      SparseSandboxMatrix& operator=( const SparseSandboxMatrix& matrix );
+
+      /**
+       * \brief Assignment of exactly the same matrix type but different device.
+       *
+       * \param matrix is input matrix for the assignment.
+       * \return reference to this matrix.
+       */
+      template< typename Device_ >
+      SparseSandboxMatrix& operator=( const SparseSandboxMatrix< RealType, Device_, IndexType, MatrixType, RealAllocator, IndexAllocator >& matrix );
+
+      /**
+       * \brief Assignment of dense matrix
+       *
+       * \param matrix is input matrix for the assignment.
+       * \return reference to this matrix.
+       */
+      template< typename Real_, typename Device_, typename Index_, ElementsOrganization Organization, typename RealAllocator_ >
+      SparseSandboxMatrix& operator=( const DenseMatrix< Real_, Device_, Index_, Organization, RealAllocator_ >& matrix );
+
+
+      /**
+       * \brief Assignment of any matrix type other than this and dense.
+       *
+       * **Warning: Assignment of symmetric sparse matrix to general sparse matrix does not give correct result, currently. Only the diagonal and the lower part of the matrix is assigned.**
+       *
+       * \param matrix is input matrix for the assignment.
+       * \return reference to this matrix.
+       */
+      template< typename RHSMatrix >
+      SparseSandboxMatrix& operator=( const RHSMatrix& matrix );
+
+      /**
+       * \brief Comparison operator with another arbitrary matrix type.
+       *
+       * \param m is the right-hand side matrix.
+       * \return \e true if the RHS matrix is equal, \e false otherwise.
+       */
+      template< typename Matrix >
+      bool operator==( const Matrix& m ) const;
+
+      /**
+       * \brief Inequality comparison operator with another arbitrary matrix type.
+       *
+       * \param m is the right-hand side matrix.
+       * \return \e true if the RHS matrix is not equal, \e false otherwise.
+       */
+      template< typename Matrix >
+      bool operator!=( const Matrix& m ) const;
+
+      /**
+       * \brief Method for saving the matrix to the file with given filename.
+       *
+       * \param fileName is name of the file.
+       */
+      void save( const String& fileName ) const;
+
+      /**
+       * \brief Method for loading the matrix from the file with given filename.
+       *
+       * \param fileName is name of the file.
+       */
+      void load( const String& fileName );
+
+      /**
+       * \brief Method for saving the matrix to a file.
+       *
+       * \param file is the output file.
+       */
+      virtual void save( File& file ) const override;
+
+      /**
+       * \brief Method for loading the matrix from a file.
+       *
+       * \param file is the input file.
+       */
+      virtual void load( File& file ) override;
+
+      /**
+       * \brief Method for printing the matrix to output stream.
+       *
+       * \param str is the output stream.
+       */
+      virtual void print( std::ostream& str ) const override;
+
+      /**
+       * \brief Returns a padding index value.
+       *
+       * Padding index is used for column indexes of padding zeros. Padding zeros
+       * are used in some sparse matrix formats for better data alignment in memory.
+       *
+       * \return value of the padding index.
+       */
+      __cuda_callable__
+      IndexType getPaddingIndex() const;
+
+      /**
+       * \brief Getter of segments for non-constant instances.
+       *
+       * \e Segments are a structure for addressing the matrix elements columns and values.
+       * In fact, \e Segments represent the sparse matrix format.
+       *
+       * \return Non-constant reference to segments.
+       */
+      //SegmentsType& getSegments();
+
+      /**
+       * \brief Getter of segments for constant instances.
+       *
+       * \e Segments are a structure for addressing the matrix elements columns and values.
+       * In fact, \e Segments represent the sparse matrix format.
+       *
+       * \return Constant reference to segments.
+       */
+      //const SegmentsType& getSegments() const;
+
+      /**
+       * \brief Getter of column indexes for constant instances.
+       *
+       * \return Constant reference to a vector with matrix elements column indexes.
+       */
+      const ColumnsIndexesVectorType& getColumnIndexes() const;
+
+      /**
+       * \brief Getter of column indexes for nonconstant instances.
+       *
+       * \return Reference to a vector with matrix elements column indexes.
+       */
+      ColumnsIndexesVectorType& getColumnIndexes();
+
+   protected:
+
+      ColumnsIndexesVectorType columnIndexes;
+
+      IndexAllocator indexAllocator;
+
+      ViewType view;
+
+      /**
+       * \brief Container for CSR row pointers.
+       *
+       * SANDBOX_TODO: You may replace it with containers and metadata required by your format.
+       */
+
+      RowPointers rowPointers;
+};
+
+      } // namespace Sandbox
+   } // namespace Matrices
+} // namespace TNL
+
+#include <TNL/Matrices/Sandbox/SparseSandboxMatrix.hpp>
diff --git a/src/TNL/Matrices/Sandbox/SparseSandboxMatrix.hpp b/src/TNL/Matrices/Sandbox/SparseSandboxMatrix.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..e21e420426ad351908abba1c9cd1d6a6c9607465
--- /dev/null
+++ b/src/TNL/Matrices/Sandbox/SparseSandboxMatrix.hpp
@@ -0,0 +1,1198 @@
+/***************************************************************************
+                          SparseSandboxMatrix.hpp -  description
+                             -------------------
+    begin                : Apr 19, 2021
+    copyright            : (C) 2021 by Tomas Oberhuber
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+#pragma once
+
+#include <functional>
+#include <sstream>
+#include <TNL/Algorithms/reduce.h>
+#include <TNL/Matrices/Sandbox/SparseSandboxMatrix.h>
+
+namespace TNL {
+   namespace Matrices {
+      namespace Sandbox {
+
+template< typename Real,
+          typename Device,
+          typename Index,
+          typename MatrixType,
+          typename RealAllocator,
+          typename IndexAllocator >
+SparseSandboxMatrix< Real, Device, Index, MatrixType, RealAllocator, IndexAllocator >::
+SparseSandboxMatrix( const RealAllocatorType& realAllocator,
+                     const IndexAllocatorType& indexAllocator )
+: BaseType( realAllocator ), columnIndexes( indexAllocator ), rowPointers( ( IndexType ) 1, ( IndexType ) 0, indexAllocator )
+{
+   this->view = this->getView();
+}
+
+template< typename Real,
+          typename Device,
+          typename Index,
+          typename MatrixType,
+          typename RealAllocator,
+          typename IndexAllocator >
+   template< typename Index_t, std::enable_if_t< std::is_integral< Index_t >::value, int > >
+SparseSandboxMatrix< Real, Device, Index, MatrixType, RealAllocator, IndexAllocator >::
+SparseSandboxMatrix( const Index_t rows,
+                     const Index_t columns,
+                     const RealAllocatorType& realAllocator,
+                     const IndexAllocatorType& indexAllocator )
+: BaseType( rows, columns, realAllocator ), columnIndexes( indexAllocator ), rowPointers( rows + 1, ( IndexType ) 0, indexAllocator )
+{
+   this->view = this->getView();
+}
+
+template< typename Real,
+          typename Device,
+          typename Index,
+          typename MatrixType,
+          typename RealAllocator,
+          typename IndexAllocator >
+   template< typename ListIndex >
+SparseSandboxMatrix< Real, Device, Index, MatrixType, RealAllocator, IndexAllocator >::
+SparseSandboxMatrix( const std::initializer_list< ListIndex >& rowCapacities,
+                     const IndexType columns,
+                     const RealAllocatorType& realAllocator,
+                     const IndexAllocatorType& indexAllocator )
+: BaseType( rowCapacities.size(), columns, realAllocator ), columnIndexes( indexAllocator ), rowPointers( rowCapacities.size() + 1, ( IndexType ) 0, indexAllocator )
+{
+   this->setRowCapacities( RowsCapacitiesType( rowCapacities ) );
+}
+
+template< typename Real,
+          typename Device,
+          typename Index,
+          typename MatrixType,
+          typename RealAllocator,
+          typename IndexAllocator >
+   template< typename RowCapacitiesVector, std::enable_if_t< TNL::IsArrayType< RowCapacitiesVector >::value, int > >
+SparseSandboxMatrix< Real, Device, Index, MatrixType, RealAllocator, IndexAllocator >::
+SparseSandboxMatrix( const RowCapacitiesVector& rowCapacities,
+                     const IndexType columns,
+                     const RealAllocatorType& realAllocator,
+                     const IndexAllocatorType& indexAllocator )
+: BaseType( rowCapacities.getSize(), columns, realAllocator ), columnIndexes( indexAllocator ), rowPointers( rowCapacities.getSize() + 1, ( IndexType ) 0, indexAllocator )
+{
+   this->setRowCapacities( rowCapacities );
+}
+
+template< typename Real,
+          typename Device,
+          typename Index,
+          typename MatrixType,
+          typename RealAllocator,
+          typename IndexAllocator >
+SparseSandboxMatrix< Real, Device, Index, MatrixType, RealAllocator, IndexAllocator >::
+SparseSandboxMatrix( const IndexType rows,
+                     const IndexType columns,
+                     const std::initializer_list< std::tuple< IndexType, IndexType, RealType > >& data,
+                     const RealAllocatorType& realAllocator,
+                     const IndexAllocatorType& indexAllocator )
+: BaseType( rows, columns, realAllocator ), columnIndexes( indexAllocator ), rowPointers( rows + 1, ( IndexType ) 0, indexAllocator )
+{
+   this->setElements( data );
+   this->view = this->getView();
+}
+
+template< typename Real,
+          typename Device,
+          typename Index,
+          typename MatrixType,
+          typename RealAllocator,
+          typename IndexAllocator >
+   template< typename MapIndex,
+             typename MapValue >
+SparseSandboxMatrix< Real, Device, Index, MatrixType, RealAllocator, IndexAllocator >::
+SparseSandboxMatrix( const IndexType rows,
+              const IndexType columns,
+              const std::map< std::pair< MapIndex, MapIndex > , MapValue >& map,
+              const RealAllocatorType& realAllocator,
+              const IndexAllocatorType& indexAllocator )
+: BaseType( rows, columns, realAllocator ), columnIndexes( indexAllocator ), rowPointers( rows + 1, ( IndexType ) 0, indexAllocator )
+{
+   this->setDimensions( rows, columns );
+   this->setElements( map );
+   this->view = this->getView();
+}
+
+template< typename Real,
+          typename Device,
+          typename Index,
+          typename MatrixType,
+          typename RealAllocator,
+          typename IndexAllocator >
+auto
+SparseSandboxMatrix< Real, Device, Index, MatrixType, RealAllocator, IndexAllocator >::
+getView() const -> ViewType
+{
+   return ViewType( this->getRows(),
+                    this->getColumns(),
+                    const_cast< SparseSandboxMatrix* >( this )->getValues().getView(),  // TODO: remove const_cast
+                    const_cast< SparseSandboxMatrix* >( this )->columnIndexes.getView(),
+                    const_cast< SparseSandboxMatrix* >( this )->rowPointers.getView() );
+}
+
+template< typename Real,
+          typename Device,
+          typename Index,
+          typename MatrixType,
+          typename RealAllocator,
+          typename IndexAllocator >
+auto
+SparseSandboxMatrix< Real, Device, Index, MatrixType, RealAllocator, IndexAllocator >::
+getConstView() const -> ConstViewType
+{  // Builds a constant view from the dimensions, values, column indexes and row pointers of this matrix.
+   return ConstViewType( this->getRows(),
+                         this->getColumns(),
+                         this->getValues().getConstView(),
+                         this->columnIndexes.getConstView(),
+                         this->rowPointers.getConstView() );  // CSR row pointers, the same container getView() passes
+}
+
+template< typename Real,
+          typename Device,
+          typename Index,
+          typename MatrixType,
+          typename RealAllocator,
+          typename IndexAllocator >
+String
+SparseSandboxMatrix< Real, Device, Index, MatrixType, RealAllocator, IndexAllocator >::
+getSerializationType()
+{
+   return ViewType::getSerializationType();
+}
+
+template< typename Real,
+          typename Device,
+          typename Index,
+          typename MatrixType,
+          typename RealAllocator,
+          typename IndexAllocator >
+String
+SparseSandboxMatrix< Real, Device, Index, MatrixType, RealAllocator, IndexAllocator >::
+getSerializationTypeVirtual() const
+{
+   return this->getSerializationType();
+}
+
+template< typename Real,
+          typename Device,
+          typename Index,
+          typename MatrixType,
+          typename RealAllocator,
+          typename IndexAllocator >
+void
+SparseSandboxMatrix< Real, Device, Index, MatrixType, RealAllocator, IndexAllocator >::
+setDimensions( const IndexType rows,
+               const IndexType columns )
+{
+   BaseType::setDimensions( rows, columns );
+   this->view = this->getView();
+}
+
+// Sets up this matrix after the pattern of `matrix`.
+//
+// The base class handles its part first, then the format metadata
+// (`rowPointers`) is copied from `matrix` and the cached view is rebuilt.
+// No matrix elements are assigned here.
+template< typename Real, typename Device, typename Index, typename MatrixType, typename RealAllocator, typename IndexAllocator >
+   template< typename Matrix_ >
+void
+SparseSandboxMatrix< Real, Device, Index, MatrixType, RealAllocator, IndexAllocator >::setLike( const Matrix_& matrix )
+{
+   BaseType::setLike( matrix );
+
+   // SANDBOX_TODO: Replace the following line with assignment of metadata required by your format.
+   //               Do not assign matrix elements here.
+   rowPointers = matrix.rowPointers;
+
+   // Refresh the cached view after the metadata has changed.
+   view = getView();
+}
+
+// Allocates the matrix storage according to the requested row capacities.
+//
+// `rowsCapacities` must have as many elements as the matrix has rows (this is
+// asserted). The capacities are copied into the first getRows() entries of
+// `rowPointers`, the last entry is set to zero and an in-place exclusive scan
+// turns the capacities into row offsets; the last entry then holds the total
+// storage size. Values (unless the matrix is binary) and column indexes are
+// resized to that size and initialized with zero and the padding index
+// respectively. Finally the cached view is rebuilt.
+template< typename Real, typename Device, typename Index, typename MatrixType, typename RealAllocator, typename IndexAllocator >
+   template< typename RowsCapacitiesVector >
+void
+SparseSandboxMatrix< Real, Device, Index, MatrixType, RealAllocator, IndexAllocator >::setRowCapacities( const RowsCapacitiesVector& rowsCapacities )
+{
+   TNL_ASSERT_EQ( rowsCapacities.getSize(), this->getRows(), "Number of matrix rows does not fit with rowCapacities vector size." );
+   using RowsCapacitiesVectorDevice = typename RowsCapacitiesVector::DeviceType;
+   constexpr bool capacitiesOnThisDevice = std::is_same< DeviceType, RowsCapacitiesVectorDevice >::value;
+
+   // SANDBOX_TODO: Replace the following lines with the setup of your sparse matrix format based on
+   //               `rowsCapacities`. This container has the same number of elements as is the number of
+   //               rows of this matrix. Each element says how many nonzero elements the user needs to have
+   //               in each row. This number can be increased if the sparse matrix format uses padding zeros.
+   rowPointers.setSize( this->getRows() + 1 );
+   if( ! capacitiesOnThisDevice )
+   {
+      // The capacities live on another device: copy them to a container of
+      // this matrix first and only then into the row pointers.
+      RowsCapacitiesType localCapacities;
+      localCapacities = rowsCapacities;
+      if( this->getRows() > 0 ) {
+         auto capacitiesSlot = rowPointers.getView( 0, this->getRows() );
+         capacitiesSlot = localCapacities;
+      }
+   }
+   else
+   {
+      // GOTCHA: when this->getRows() == 0, getView returns a full view with size == 1
+      if( this->getRows() > 0 ) {
+         auto capacitiesSlot = rowPointers.getView( 0, this->getRows() );
+         capacitiesSlot = rowsCapacities;
+      }
+   }
+   // Turn the capacities into row offsets.
+   rowPointers.setElement( this->getRows(), 0 );
+   Algorithms::inplaceExclusiveScan( rowPointers );
+   // End of sparse matrix format initiation.
+
+   // SANDBOX_TODO: Compute number of all elements that need to be allocated by your format.
+   const auto storageSize = rowPointers.getElement( this->getRows() );
+
+   // The rest of this method needs no changes.
+   if( ! isBinary() )
+   {
+      this->values.setSize( storageSize );
+      this->values = ( RealType ) 0;
+   }
+   this->columnIndexes.setSize( storageSize );
+   this->columnIndexes = this->getPaddingIndex();
+   view = getView();
+}
+
+// Fills `rowCapacities` with the row capacities of the matrix; the work is
+// delegated to the cached view.
+template< typename Real, typename Device, typename Index, typename MatrixType, typename RealAllocator, typename IndexAllocator >
+   template< typename Vector >
+void
+SparseSandboxMatrix< Real, Device, Index, MatrixType, RealAllocator, IndexAllocator >::getRowCapacities( Vector& rowCapacities ) const
+{
+   view.getRowCapacities( rowCapacities );
+}
+
+// Sets the matrix elements from an initializer list of
+// (row index, column index, value) tuples.
+//
+// The elements are first inserted into a temporary host matrix of the same
+// dimensions, which is then assigned to this matrix. Throws std::logic_error
+// when a row index is not smaller than the number of rows or a column index
+// is not smaller than the number of columns.
+template< typename Real, typename Device, typename Index, typename MatrixType, typename RealAllocator, typename IndexAllocator >
+void
+SparseSandboxMatrix< Real, Device, Index, MatrixType, RealAllocator, IndexAllocator >::setElements( const std::initializer_list< std::tuple< IndexType, IndexType, RealType > >& data )
+{
+   const auto& rowsCount = this->getRows();
+   const auto& columnsCount = this->getColumns();
+
+   // First pass: validate the row indexes and count the elements of each row.
+   Containers::Vector< IndexType, Devices::Host, IndexType > elementsPerRow( rowsCount, 0 );
+   for( const auto& entry : data )
+   {
+      const auto& rowIdx = std::get< 0 >( entry );
+      if( rowIdx >= rowsCount )
+      {
+         std::stringstream message;
+         message << "Wrong row index " << rowIdx << " in an initializer list";
+         throw std::logic_error( message.str() );
+      }
+      elementsPerRow[ rowIdx ]++;
+   }
+
+   // Second pass: validate the column indexes and insert the elements into a host matrix.
+   SparseSandboxMatrix< Real, Devices::Host, Index, MatrixType > stagingMatrix( rowsCount, columnsCount );
+   stagingMatrix.setRowCapacities( elementsPerRow );
+   for( const auto& entry : data )
+   {
+      const auto& columnIdx = std::get< 1 >( entry );
+      if( columnIdx >= columnsCount )
+      {
+         std::stringstream message;
+         message << "Wrong column index " << columnIdx << " in an initializer list";
+         throw std::logic_error( message.str() );
+      }
+      stagingMatrix.setElement( std::get< 0 >( entry ), columnIdx, std::get< 2 >( entry ) );
+   }
+
+   // Transfer the result to this matrix.
+   *this = stagingMatrix;
+}
+
+// Sets the matrix elements from a map keyed by (row index, column index)
+// pairs.
+//
+// The number of elements in each row is counted first and used as the row
+// capacities. When this matrix is not a host matrix, the elements are inserted
+// into a temporary host matrix which is then assigned to this one; otherwise
+// they are inserted directly.
+//
+// The map entries are traversed by const reference so that no key/value pair
+// is copied during the traversal.
+//
+// NOTE(review): unlike the initializer-list overload, the indexes are not
+// validated against the matrix dimensions here.
+template< typename Real,
+          typename Device,
+          typename Index,
+          typename MatrixType,
+          typename RealAllocator,
+          typename IndexAllocator >
+   template< typename MapIndex,
+             typename MapValue >
+void
+SparseSandboxMatrix< Real, Device, Index, MatrixType, RealAllocator, IndexAllocator >::
+setElements( const std::map< std::pair< MapIndex, MapIndex > , MapValue >& map )
+{
+   Containers::Vector< IndexType, Devices::Host, IndexType > rowsCapacities( this->getRows(), 0 );
+   for( const auto& element : map )
+      rowsCapacities[ element.first.first ]++;
+   if( !std::is_same< DeviceType, Devices::Host >::value )
+   {
+      SparseSandboxMatrix< Real, Devices::Host, Index, MatrixType > hostMatrix( this->getRows(), this->getColumns() );
+      hostMatrix.setRowCapacities( rowsCapacities );
+      for( const auto& element : map )
+         hostMatrix.setElement( element.first.first, element.first.second, element.second );
+      *this = hostMatrix;
+   }
+   else
+   {
+      this->setRowCapacities( rowsCapacities );
+      for( const auto& element : map )
+         this->setElement( element.first.first, element.first.second, element.second );
+   }
+}
+
+// Fills `rowLengths` with the compressed row lengths of the matrix; the work
+// is delegated to the cached view.
+template< typename Real, typename Device, typename Index, typename MatrixType, typename RealAllocator, typename IndexAllocator >
+   template< typename Vector >
+void
+SparseSandboxMatrix< Real, Device, Index, MatrixType, RealAllocator, IndexAllocator >::getCompressedRowLengths( Vector& rowLengths ) const
+{
+   view.getCompressedRowLengths( rowLengths );
+}
+
+// Returns the capacity of the given row as reported by the cached view.
+template< typename Real, typename Device, typename Index, typename MatrixType, typename RealAllocator, typename IndexAllocator >
+__cuda_callable__
+Index
+SparseSandboxMatrix< Real, Device, Index, MatrixType, RealAllocator, IndexAllocator >::getRowCapacity( const IndexType row ) const
+{
+   return view.getRowCapacity( row );
+}
+
+// Returns the number of nonzero matrix elements as reported by the cached view.
+template< typename Real, typename Device, typename Index, typename MatrixType, typename RealAllocator, typename IndexAllocator >
+Index
+SparseSandboxMatrix< Real, Device, Index, MatrixType, RealAllocator, IndexAllocator >::getNonzeroElementsCount() const
+{
+   return view.getNonzeroElementsCount();
+}
+
+// Resets the matrix: the base class, the column indexes and the format
+// metadata (`rowPointers`) are reset and the cached view is rebuilt.
+template< typename Real, typename Device, typename Index, typename MatrixType, typename RealAllocator, typename IndexAllocator >
+void
+SparseSandboxMatrix< Real, Device, Index, MatrixType, RealAllocator, IndexAllocator >::reset()
+{
+   BaseType::reset();
+   this->columnIndexes.reset();
+
+   // SANDBOX_TODO: Reset the metadata required by your format here.
+   rowPointers.reset();
+
+   // The cached view must not refer to the released data.
+   view = getView();
+}
+
+// Returns a constant row view of the row `rowIdx`, obtained from the cached
+// matrix view.
+template< typename Real, typename Device, typename Index, typename MatrixType, typename RealAllocator, typename IndexAllocator >
+__cuda_callable__ auto
+SparseSandboxMatrix< Real, Device, Index, MatrixType, RealAllocator, IndexAllocator >::getRow( const IndexType& rowIdx ) const -> const ConstRowView
+{
+   return view.getRow( rowIdx );
+}
+
+// Returns a row view of the row `rowIdx`, obtained from the cached matrix view.
+template< typename Real, typename Device, typename Index, typename MatrixType, typename RealAllocator, typename IndexAllocator >
+__cuda_callable__ auto
+SparseSandboxMatrix< Real, Device, Index, MatrixType, RealAllocator, IndexAllocator >::getRow( const IndexType& rowIdx ) -> RowView
+{
+   return view.getRow( rowIdx );
+}
+
+// Sets the element at position (row, column) to `value`; the work is
+// delegated to the cached view.
+template< typename Real, typename Device, typename Index, typename MatrixType, typename RealAllocator, typename IndexAllocator >
+__cuda_callable__ void
+SparseSandboxMatrix< Real, Device, Index, MatrixType, RealAllocator, IndexAllocator >::setElement( const IndexType row,
+                                                                                                 const IndexType column,
+                                                                                                 const RealType& value )
+{
+   view.setElement( row, column, value );
+}
+
+// Forwards (row, column, value, thisElementMultiplicator) to addElement() of
+// the cached view.
+template< typename Real, typename Device, typename Index, typename MatrixType, typename RealAllocator, typename IndexAllocator >
+__cuda_callable__ void
+SparseSandboxMatrix< Real, Device, Index, MatrixType, RealAllocator, IndexAllocator >::addElement( const IndexType row,
+                                                                                                 const IndexType column,
+                                                                                                 const RealType& value,
+                                                                                                 const RealType& thisElementMultiplicator )
+{
+   view.addElement( row, column, value, thisElementMultiplicator );
+}
+
+// Returns the element at position (row, column) as reported by the cached view.
+template< typename Real, typename Device, typename Index, typename MatrixType, typename RealAllocator, typename IndexAllocator >
+__cuda_callable__
+auto
+SparseSandboxMatrix< Real, Device, Index, MatrixType, RealAllocator, IndexAllocator >::getElement( const IndexType row,
+                                                                                                 const IndexType column ) const -> RealType
+{
+   return view.getElement( row, column );
+}
+
+// Matrix-vector product; all arguments are forwarded unchanged to
+// vectorProduct() of the cached view.
+template< typename Real, typename Device, typename Index, typename MatrixType, typename RealAllocator, typename IndexAllocator >
+   template< typename InVector, typename OutVector >
+void
+SparseSandboxMatrix< Real, Device, Index, MatrixType, RealAllocator, IndexAllocator >::vectorProduct( const InVector& inVector,
+                                                                                                    OutVector& outVector,
+                                                                                                    const RealType& matrixMultiplicator,
+                                                                                                    const RealType& outVectorMultiplicator,
+                                                                                                    const IndexType firstRow,
+                                                                                                    const IndexType lastRow ) const
+{
+   view.vectorProduct( inVector,
+                       outVector,
+                       matrixMultiplicator,
+                       outVectorMultiplicator,
+                       firstRow,
+                       lastRow );
+}
+
+// Row-wise reduction over the rows given by `begin` and `end`; all arguments
+// are forwarded to reduceRows() of the cached view.
+template< typename Real, typename Device, typename Index, typename MatrixType, typename RealAllocator, typename IndexAllocator >
+   template< typename Fetch, typename Reduce, typename Keep, typename FetchValue >
+void
+SparseSandboxMatrix< Real, Device, Index, MatrixType, RealAllocator, IndexAllocator >::reduceRows( IndexType begin,
+                                                                                                 IndexType end,
+                                                                                                 Fetch& fetch,
+                                                                                                 const Reduce& reduce,
+                                                                                                 Keep& keep,
+                                                                                                 const FetchValue& zero )
+{
+   view.reduceRows( begin, end, fetch, reduce, keep, zero );
+}
+
+// Const variant of the row-wise reduction; all arguments are forwarded to
+// reduceRows() of the cached view.
+template< typename Real, typename Device, typename Index, typename MatrixType, typename RealAllocator, typename IndexAllocator >
+   template< typename Fetch, typename Reduce, typename Keep, typename FetchValue >
+void
+SparseSandboxMatrix< Real, Device, Index, MatrixType, RealAllocator, IndexAllocator >::reduceRows( IndexType begin,
+                                                                                                 IndexType end,
+                                                                                                 Fetch& fetch,
+                                                                                                 const Reduce& reduce,
+                                                                                                 Keep& keep,
+                                                                                                 const FetchValue& zero ) const
+{
+   view.reduceRows( begin, end, fetch, reduce, keep, zero );
+}
+
+// Row-wise reduction over all rows: calls reduceRows() with the row range
+// 0 .. getRows().
+template< typename Real, typename Device, typename Index, typename MatrixType, typename RealAllocator, typename IndexAllocator >
+   template< typename Fetch, typename Reduce, typename Keep, typename FetchReal >
+void
+SparseSandboxMatrix< Real, Device, Index, MatrixType, RealAllocator, IndexAllocator >::reduceAllRows( Fetch&& fetch,
+                                                                                                    const Reduce&& reduce,
+                                                                                                    Keep&& keep,
+                                                                                                    const FetchReal& zero )
+{
+   const IndexType rowsEnd = this->getRows();
+   this->reduceRows( 0, rowsEnd, fetch, reduce, keep, zero );
+}
+
+// Const variant of the row-wise reduction over all rows: calls reduceRows()
+// with the row range 0 .. getRows().
+template< typename Real, typename Device, typename Index, typename MatrixType, typename RealAllocator, typename IndexAllocator >
+   template< typename Fetch, typename Reduce, typename Keep, typename FetchReal >
+void
+SparseSandboxMatrix< Real, Device, Index, MatrixType, RealAllocator, IndexAllocator >::reduceAllRows( Fetch& fetch,
+                                                                                                    const Reduce& reduce,
+                                                                                                    Keep& keep,
+                                                                                                    const FetchReal& zero ) const
+{
+   const IndexType rowsEnd = this->getRows();
+   this->reduceRows( 0, rowsEnd, fetch, reduce, keep, zero );
+}
+
+// Const variant: passes `function` together with the row range given by
+// `begin` and `end` to forElements() of the cached view.
+template< typename Real, typename Device, typename Index, typename MatrixType, typename RealAllocator, typename IndexAllocator >
+   template< typename Function >
+void
+SparseSandboxMatrix< Real, Device, Index, MatrixType, RealAllocator, IndexAllocator >::forElements( IndexType begin,
+                                                                                                  IndexType end,
+                                                                                                  Function&& function ) const
+{
+   view.forElements( begin, end, function );
+}
+
+// Passes `function` together with the row range given by `begin` and `end`
+// to forElements() of the cached view.
+template< typename Real, typename Device, typename Index, typename MatrixType, typename RealAllocator, typename IndexAllocator >
+   template< typename Function >
+void
+SparseSandboxMatrix< Real, Device, Index, MatrixType, RealAllocator, IndexAllocator >::forElements( IndexType begin,
+                                                                                                  IndexType end,
+                                                                                                  Function&& function )
+{
+   view.forElements( begin, end, function );
+}
+
+// Const variant: calls forElements() with the row range 0 .. getRows().
+template< typename Real, typename Device, typename Index, typename MatrixType, typename RealAllocator, typename IndexAllocator >
+   template< typename Function >
+void
+SparseSandboxMatrix< Real, Device, Index, MatrixType, RealAllocator, IndexAllocator >::forAllElements( Function&& function ) const
+{
+   const IndexType rowsEnd = this->getRows();
+   this->forElements( 0, rowsEnd, function );
+}
+
+// Calls forElements() with the row range 0 .. getRows().
+template< typename Real, typename Device, typename Index, typename MatrixType, typename RealAllocator, typename IndexAllocator >
+   template< typename Function >
+void
+SparseSandboxMatrix< Real, Device, Index, MatrixType, RealAllocator, IndexAllocator >::forAllElements( Function&& function )
+{
+   const IndexType rowsEnd = this->getRows();
+   this->forElements( 0, rowsEnd, function );
+}
+
+// Passes `function` together with the row range given by `begin` and `end`
+// to forRows() of a freshly created view of this matrix.
+template< typename Real, typename Device, typename Index, typename MatrixType, typename RealAllocator, typename IndexAllocator >
+   template< typename Function >
+void
+SparseSandboxMatrix< Real, Device, Index, MatrixType, RealAllocator, IndexAllocator >::forRows( IndexType begin,
+                                                                                              IndexType end,
+                                                                                              Function&& function )
+{
+   auto matrixView = this->getView();
+   matrixView.forRows( begin, end, function );
+}
+
+// Const variant: passes `function` together with the row range given by
+// `begin` and `end` to forRows() of a freshly created constant view.
+template< typename Real, typename Device, typename Index, typename MatrixType, typename RealAllocator, typename IndexAllocator >
+   template< typename Function >
+void
+SparseSandboxMatrix< Real, Device, Index, MatrixType, RealAllocator, IndexAllocator >::forRows( IndexType begin,
+                                                                                              IndexType end,
+                                                                                              Function&& function ) const
+{
+   auto matrixConstView = this->getConstView();
+   matrixConstView.forRows( begin, end, function );
+}
+
+// Passes `function` to forAllRows() of a freshly created view of this matrix.
+template< typename Real, typename Device, typename Index, typename MatrixType, typename RealAllocator, typename IndexAllocator >
+   template< typename Function >
+void
+SparseSandboxMatrix< Real, Device, Index, MatrixType, RealAllocator, IndexAllocator >::forAllRows( Function&& function )
+{
+   auto matrixView = this->getView();
+   matrixView.forAllRows( function );
+}
+
+// Const variant: passes `function` to forAllRows() of a constant view of this
+// matrix.
+//
+// The view is obtained with getConstView(), the same accessor the const
+// forRows() overload uses.
+template< typename Real,
+          typename Device,
+          typename Index,
+          typename MatrixType,
+          typename RealAllocator,
+          typename IndexAllocator >
+   template< typename Function >
+void
+SparseSandboxMatrix< Real, Device, Index, MatrixType, RealAllocator, IndexAllocator >::
+forAllRows( Function&& function ) const
+{
+   this->getConstView().forAllRows( function );
+}
+
+// Const variant: passes `function` together with the row range given by
+// `begin` and `end` to sequentialForRows() of the cached view.
+template< typename Real, typename Device, typename Index, typename MatrixType, typename RealAllocator, typename IndexAllocator >
+   template< typename Function >
+void
+SparseSandboxMatrix< Real, Device, Index, MatrixType, RealAllocator, IndexAllocator >::sequentialForRows( IndexType begin,
+                                                                                                        IndexType end,
+                                                                                                        Function& function ) const
+{
+   view.sequentialForRows( begin, end, function );
+}
+
+// Passes `function` together with the row range given by `first` and `last`
+// to sequentialForRows() of the cached view.
+template< typename Real, typename Device, typename Index, typename MatrixType, typename RealAllocator, typename IndexAllocator >
+   template< typename Function >
+void
+SparseSandboxMatrix< Real, Device, Index, MatrixType, RealAllocator, IndexAllocator >::sequentialForRows( IndexType first,
+                                                                                                        IndexType last,
+                                                                                                        Function& function )
+{
+   view.sequentialForRows( first, last, function );
+}
+
+// Const variant: calls sequentialForRows() with the row range 0 .. getRows().
+template< typename Real, typename Device, typename Index, typename MatrixType, typename RealAllocator, typename IndexAllocator >
+   template< typename Function >
+void
+SparseSandboxMatrix< Real, Device, Index, MatrixType, RealAllocator, IndexAllocator >::sequentialForAllRows( Function& function ) const
+{
+   const IndexType rowsEnd = this->getRows();
+   this->sequentialForRows( 0, rowsEnd, function );
+}
+
+// Calls sequentialForRows() with the row range 0 .. getRows().
+template< typename Real, typename Device, typename Index, typename MatrixType, typename RealAllocator, typename IndexAllocator >
+   template< typename Function >
+void
+SparseSandboxMatrix< Real, Device, Index, MatrixType, RealAllocator, IndexAllocator >::sequentialForAllRows( Function& function )
+{
+   const IndexType rowsEnd = this->getRows();
+   this->sequentialForRows( 0, rowsEnd, function );
+}
+
+
+/*template< typename Real,
+          template< typename, typename, typename > class Segments,
+          typename Device,
+          typename Index,
+          typename RealAllocator,
+          typename IndexAllocator >
+template< typename Real2, template< typename, typename > class Segments2, typename Index2, typename RealAllocator2, typename IndexAllocator2 >
+void
+SparseSandboxMatrix< Real, Device, Index, MatrixType, RealAllocator, IndexAllocator >::
+addMatrix( const SparseSandboxMatrix< Real2, Segments2, Device, Index2, RealAllocator2, IndexAllocator2 >& matrix,
+           const RealType& matrixMultiplicator,
+           const RealType& thisMatrixMultiplicator )
+{
+
+}
+
+template< typename Real,
+          template< typename, typename, typename > class Segments,
+          typename Device,
+          typename Index,
+          typename RealAllocator,
+          typename IndexAllocator >
+template< typename Real2, typename Index2 >
+void
+SparseSandboxMatrix< Real, Device, Index, MatrixType, RealAllocator, IndexAllocator >::
+getTransposition( const SparseSandboxMatrix< Real2, Device, Index2 >& matrix,
+                  const RealType& matrixMultiplicator )
+{
+
+}*/
+
+// Placeholder for one SOR iteration on the given row.
+//
+// Nothing is computed here: all arguments are ignored and the method always
+// returns false.
+template< typename Real, typename Device, typename Index, typename MatrixType, typename RealAllocator, typename IndexAllocator >
+   template< typename Vector1, typename Vector2 >
+bool
+SparseSandboxMatrix< Real, Device, Index, MatrixType, RealAllocator, IndexAllocator >::performSORIteration( const Vector1& b,
+                                                                                                          const IndexType row,
+                                                                                                          Vector2& x,
+                                                                                                          const RealType& omega ) const
+{
+   return false;
+}
+
+// Copy assignment.
+//
+// The base class part is assigned first, then the column indexes and the
+// format metadata (`rowPointers`) are copied and the cached view is rebuilt.
+// Returns a reference to this matrix.
+template< typename Real, typename Device, typename Index, typename MatrixType, typename RealAllocator, typename IndexAllocator >
+SparseSandboxMatrix< Real, Device, Index, MatrixType, RealAllocator, IndexAllocator >&
+SparseSandboxMatrix< Real, Device, Index, MatrixType, RealAllocator, IndexAllocator >::operator=( const SparseSandboxMatrix& matrix )
+{
+   Matrix< Real, Device, Index >::operator=( matrix );
+   this->columnIndexes = matrix.columnIndexes;
+
+   // SANDBOX_TODO: Replace the following line with an assignment of metadata required by your sparse matrix format.
+   rowPointers = matrix.rowPointers;
+
+   // The cached view has to refer to the newly assigned data.
+   view = getView();
+   return *this;
+}
+
+// Assignment of a sandbox matrix with the same real, index, matrix-type and
+// allocator parameters but a possibly different device.
+//
+// The base class part is assigned first, then the column indexes and the
+// format metadata (`rowPointers`) are copied and the cached view is rebuilt.
+// Returns a reference to this matrix.
+template< typename Real, typename Device, typename Index, typename MatrixType, typename RealAllocator, typename IndexAllocator >
+   template< typename Device_ >
+SparseSandboxMatrix< Real, Device, Index, MatrixType, RealAllocator, IndexAllocator >&
+SparseSandboxMatrix< Real, Device, Index, MatrixType, RealAllocator, IndexAllocator >::operator=( const SparseSandboxMatrix< RealType, Device_, IndexType, MatrixType, RealAllocator, IndexAllocator >& matrix )
+{
+   Matrix< Real, Device, Index >::operator=( matrix );
+   this->columnIndexes = matrix.columnIndexes;
+
+   // SANDBOX_TODO: Replace the following line with an assignment of metadata required by your sparse matrix format.
+   rowPointers = matrix.rowPointers;
+
+   // The cached view has to refer to the newly assigned data.
+   view = getView();
+   return *this;
+}
+
+// Assigns a dense matrix to this sparse matrix; only the elements of `matrix`
+// that compare unequal to zero are stored.
+//
+// The row capacities are taken from matrix.getCompressedRowLengths(). When
+// both matrices use the same device, the nonzero elements are written directly
+// through views of this matrix. Otherwise the rows are transferred in chunks
+// of 128 rows: the dense values are gathered into a buffer on the source
+// device, the buffer is copied to a buffer on this device and the nonzero
+// values are then picked from it row by row.
+//
+// Returns a reference to this matrix.
+template< typename Real,
+          typename Device,
+          typename Index,
+          typename MatrixType,
+          typename RealAllocator,
+          typename IndexAllocator >
+   template< typename Real_, typename Device_, typename Index_, ElementsOrganization Organization, typename RealAllocator_ >
+SparseSandboxMatrix< Real, Device, Index, MatrixType, RealAllocator, IndexAllocator >&
+SparseSandboxMatrix< Real, Device, Index, MatrixType, RealAllocator, IndexAllocator >::
+operator=( const DenseMatrix< Real_, Device_, Index_, Organization, RealAllocator_ >& matrix )
+{
+   using RHSMatrix = DenseMatrix< Real_, Device_, Index_, Organization, RealAllocator_ >;
+   using RHSIndexType = typename RHSMatrix::IndexType;
+   using RHSRealType = typename RHSMatrix::RealType;
+   using RHSDeviceType = typename RHSMatrix::DeviceType;
+   using RHSRealAllocatorType = typename RHSMatrix::RealAllocatorType;
+
+   Containers::Vector< RHSIndexType, RHSDeviceType, RHSIndexType > rowLengths;
+   matrix.getCompressedRowLengths( rowLengths );
+   // Only the dimensions are taken over from the dense matrix, in the same way
+   // as the generic operator= does. setLike() is not used because it also
+   // copies `matrix.rowPointers`, i.e. metadata of this sparse format which a
+   // dense matrix is not expected to provide.
+   this->setDimensions( matrix.getRows(), matrix.getColumns() );
+   this->setRowCapacities( rowLengths );
+   // Per-row counters: the number of elements already stored (same-device
+   // path) or the next dense column to be inspected (cross-device path).
+   Containers::Vector< IndexType, DeviceType, IndexType > rowLocalIndexes( matrix.getRows() );
+   rowLocalIndexes = 0;
+
+   const IndexType paddingIndex = this->getPaddingIndex();
+   auto columns_view = this->columnIndexes.getView();
+   auto values_view = this->values.getView();
+   auto rowLocalIndexes_view = rowLocalIndexes.getView();
+   columns_view = paddingIndex;
+
+   if( std::is_same< DeviceType, RHSDeviceType >::value )
+   {
+      // SANDBOX_TODO: Modify the following according to your format.
+      // The offset of the first element of each row is read from `rowPointers`,
+      // the metadata set up by setRowCapacities().
+      auto row_pointers_view = this->rowPointers.getView();
+      auto f = [=] __cuda_callable__ ( RHSIndexType rowIdx, RHSIndexType localIdx, RHSIndexType columnIdx, const RHSRealType& value ) mutable {
+         if( value != 0.0 )
+         {
+            IndexType thisGlobalIdx = row_pointers_view[ rowIdx ] + rowLocalIndexes_view[ rowIdx ]++;
+            columns_view[ thisGlobalIdx ] = columnIdx;
+            if( ! isBinary() )
+               values_view[ thisGlobalIdx ] = value;
+         }
+      };
+      matrix.forAllElements( f );
+   }
+   else
+   {
+      const IndexType maxRowLength = matrix.getColumns();
+      const IndexType bufferRowsCount( 128 );
+      const size_t bufferSize = bufferRowsCount * maxRowLength;
+      Containers::Vector< RHSRealType, RHSDeviceType, RHSIndexType, RHSRealAllocatorType > matrixValuesBuffer( bufferSize );
+      Containers::Vector< RealType, DeviceType, IndexType, RealAllocatorType > thisValuesBuffer( bufferSize );
+      Containers::Vector< IndexType, DeviceType, IndexType, IndexAllocatorType > thisColumnsBuffer( bufferSize );
+      auto matrixValuesBuffer_view = matrixValuesBuffer.getView();
+      auto thisValuesBuffer_view = thisValuesBuffer.getView();
+
+      IndexType baseRow( 0 );
+      const IndexType rowsCount = this->getRows();
+      while( baseRow < rowsCount )
+      {
+         const IndexType lastRow = min( baseRow + bufferRowsCount, rowsCount );
+         thisColumnsBuffer = paddingIndex;
+
+         ////
+         // Copy matrix elements into buffer
+         auto f1 = [=] __cuda_callable__ ( RHSIndexType rowIdx, RHSIndexType localIdx, RHSIndexType columnIndex, const RHSRealType& value ) mutable {
+            const IndexType bufferIdx = ( rowIdx - baseRow ) * maxRowLength + localIdx;
+            matrixValuesBuffer_view[ bufferIdx ] = value;
+         };
+         matrix.forElements( baseRow, lastRow, f1 );
+
+         ////
+         // Copy the source matrix buffer to this matrix buffer
+         thisValuesBuffer_view = matrixValuesBuffer_view;
+
+         ////
+         // Copy matrix elements from the buffer to the matrix and ignoring
+         // zero matrix elements.
+         const IndexType matrix_columns = this->getColumns();
+         auto f2 = [=] __cuda_callable__ ( IndexType rowIdx, IndexType localIdx, IndexType& columnIndex, RealType& value ) mutable {
+            RealType inValue( 0.0 );
+            // Continue scanning the buffered dense row at the column where
+            // the previous element of this row stopped.
+            IndexType bufferIdx, column( rowLocalIndexes_view[ rowIdx ] );
+            while( inValue == 0.0 && column < matrix_columns )
+            {
+               bufferIdx = ( rowIdx - baseRow ) * maxRowLength + column++;
+               inValue = thisValuesBuffer_view[ bufferIdx ];
+            }
+            rowLocalIndexes_view[ rowIdx ] = column;
+            if( inValue == 0.0 )
+            {
+               // No further nonzero value in this row: mark the slot as padding.
+               columnIndex = paddingIndex;
+               value = 0.0;
+            }
+            else
+            {
+               // `column` was already advanced past the value that was found.
+               columnIndex = column - 1;
+               value = inValue;
+            }
+         };
+         this->forElements( baseRow, lastRow, f2 );
+         baseRow += bufferRowsCount;
+      }
+      //std::cerr << "This matrix = " << std::endl << *this << std::endl;
+   }
+   this->view = this->getView();
+   return *this;
+
+}
+
+// Assigns a matrix of an arbitrary type to this sparse matrix.
+//
+// RHSMatrix has to provide the IndexType, RealType, DeviceType and
+// RealAllocatorType type aliases and the methods getRowCapacities(),
+// getCompressedRowLengths(), getRows(), getColumns(), getView(),
+// forElements() and forAllElements(), all of which are used below.
+//
+// This matrix gets the dimensions and row capacities of `matrix`. Elements
+// whose value is zero or whose column index equals the padding index of this
+// matrix are not copied. When both matrices use the same device, the elements
+// are written directly through views of this matrix; otherwise they are
+// transferred in chunks of 128 rows through buffers on both devices.
+//
+// Returns a reference to this matrix.
+template< typename Real,
+          typename Device,
+          typename Index,
+          typename MatrixType,
+          typename RealAllocator,
+          typename IndexAllocator >
+   template< typename RHSMatrix >
+SparseSandboxMatrix< Real, Device, Index, MatrixType, RealAllocator, IndexAllocator >&
+SparseSandboxMatrix< Real, Device, Index, MatrixType, RealAllocator, IndexAllocator >::
+operator=( const RHSMatrix& matrix )
+{
+   using RHSIndexType = typename RHSMatrix::IndexType;
+   using RHSRealType = typename RHSMatrix::RealType;
+   using RHSDeviceType = typename RHSMatrix::DeviceType;
+   using RHSRealAllocatorType = typename RHSMatrix::RealAllocatorType;
+
+   // Take over the dimensions and the row capacities of the source matrix.
+   Containers::Vector< RHSIndexType, RHSDeviceType, RHSIndexType > rowCapacities;
+   matrix.getRowCapacities( rowCapacities );
+   this->setDimensions( matrix.getRows(), matrix.getColumns() );
+   this->setRowCapacities( rowCapacities );
+   // Per-row counters: the number of elements already stored (same-device
+   // path) or the next buffer position to be inspected (cross-device path).
+   Containers::Vector< IndexType, DeviceType, IndexType > rowLocalIndexes( matrix.getRows() );
+   rowLocalIndexes = 0;
+
+   // TODO: use getConstView when it works
+   // NOTE(review): matrixView is not referenced anywhere below - confirm whether it can be removed.
+   const auto matrixView = const_cast< RHSMatrix& >( matrix ).getView();
+   const IndexType paddingIndex = this->getPaddingIndex();
+   auto columns_view = this->columnIndexes.getView();
+   auto values_view = this->values.getView();
+   auto rowLocalIndexes_view = rowLocalIndexes.getView();
+   columns_view = paddingIndex;
+
+   // SANDBOX_TODO: Modify the following according to your format
+   auto row_pointers_view = this->rowPointers.getView();
+   if( std::is_same< DeviceType, RHSDeviceType >::value )
+   {
+      // Same device: append each nonzero, non-padding element behind the
+      // elements already stored in its row.
+      auto f = [=] __cuda_callable__ ( RHSIndexType rowIdx, RHSIndexType localIdx_, RHSIndexType columnIndex, const RHSRealType& value ) mutable {
+         IndexType localIdx( rowLocalIndexes_view[ rowIdx ] );
+         IndexType thisRowBegin = row_pointers_view[ rowIdx ];
+         if( value != 0.0 && columnIndex != paddingIndex )
+         {
+            IndexType thisGlobalIdx = thisRowBegin + localIdx++;
+            columns_view[ thisGlobalIdx ] = columnIndex;
+            if( ! isBinary() )
+               values_view[ thisGlobalIdx ] = value;
+            rowLocalIndexes_view[ rowIdx ] = localIdx;
+         }
+      };
+      matrix.forAllElements( f );
+   }
+   else
+   {
+      // Different devices: the buffers hold 128 rows, each with room for the
+      // largest row capacity of the source matrix.
+      const IndexType maxRowLength = max( rowCapacities );
+      const IndexType bufferRowsCount( 128 );
+      const size_t bufferSize = bufferRowsCount * maxRowLength;
+      Containers::Vector< RHSRealType, RHSDeviceType, RHSIndexType, RHSRealAllocatorType > matrixValuesBuffer( bufferSize );
+      Containers::Vector< RHSIndexType, RHSDeviceType, RHSIndexType > matrixColumnsBuffer( bufferSize );
+      Containers::Vector< RealType, DeviceType, IndexType, RealAllocatorType > thisValuesBuffer( bufferSize );
+      Containers::Vector< IndexType, DeviceType, IndexType > thisColumnsBuffer( bufferSize );
+      Containers::Vector< IndexType, DeviceType, IndexType > thisRowLengths;
+      Containers::Vector< RHSIndexType, RHSDeviceType, RHSIndexType > rhsRowLengths;
+      matrix.getCompressedRowLengths( rhsRowLengths );
+      thisRowLengths= rhsRowLengths;
+      auto matrixValuesBuffer_view = matrixValuesBuffer.getView();
+      auto matrixColumnsBuffer_view = matrixColumnsBuffer.getView();
+      auto thisValuesBuffer_view = thisValuesBuffer.getView();
+      auto thisColumnsBuffer_view = thisColumnsBuffer.getView();
+      // NOTE(review): the values buffer is zeroed only once here, while the
+      // columns buffers are reset for every chunk of rows - confirm that stale
+      // values left over from a previous chunk cannot be picked up.
+      matrixValuesBuffer_view = 0.0;
+
+      IndexType baseRow( 0 );
+      const IndexType rowsCount = this->getRows();
+      while( baseRow < rowsCount )
+      {
+         const IndexType lastRow = min( baseRow + bufferRowsCount, rowsCount );
+         thisColumnsBuffer = paddingIndex;
+         matrixColumnsBuffer_view = paddingIndex;
+
+         ////
+         // Copy matrix elements into buffer
+         auto f1 = [=] __cuda_callable__ ( RHSIndexType rowIdx, RHSIndexType localIdx, RHSIndexType columnIndex, const RHSRealType& value ) mutable {
+            if( columnIndex != paddingIndex )
+            {
+               TNL_ASSERT_LT( rowIdx - baseRow, bufferRowsCount, "" );
+               TNL_ASSERT_LT( localIdx, maxRowLength, "" );
+               const IndexType bufferIdx = ( rowIdx - baseRow ) * maxRowLength + localIdx;
+               TNL_ASSERT_LT( bufferIdx, ( IndexType ) bufferSize, "" );
+               matrixColumnsBuffer_view[ bufferIdx ] = columnIndex;
+               matrixValuesBuffer_view[ bufferIdx ] = value;
+            }
+         };
+         matrix.forElements( baseRow, lastRow, f1 );
+
+         ////
+         // Copy the source matrix buffer to this matrix buffer
+         thisValuesBuffer_view = matrixValuesBuffer_view;
+         thisColumnsBuffer_view = matrixColumnsBuffer_view;
+
+         ////
+         // Copy matrix elements from the buffer to the matrix and ignoring
+         // zero matrix elements
+         //const IndexType matrix_columns = this->getColumns();
+         const auto thisRowLengths_view = thisRowLengths.getConstView();
+         auto f2 = [=] __cuda_callable__ ( IndexType rowIdx, IndexType localIdx, IndexType& columnIndex, RealType& value ) mutable {
+            RealType inValue( 0.0 );
+            size_t bufferIdx;
+            // Continue scanning the buffered row where the previous element
+            // of this row stopped.
+            IndexType bufferLocalIdx( rowLocalIndexes_view[ rowIdx ] );
+            // `localIdx` is not modified in the loop; the scan ends when a
+            // nonzero value is read from the buffer.
+            while( inValue == 0.0 && localIdx < thisRowLengths_view[ rowIdx ] )
+            {
+               bufferIdx = ( rowIdx - baseRow ) * maxRowLength + bufferLocalIdx++;
+               TNL_ASSERT_LT( bufferIdx, bufferSize, "" );
+               inValue = thisValuesBuffer_view[ bufferIdx ];
+            }
+            rowLocalIndexes_view[ rowIdx ] = bufferLocalIdx;
+            if( inValue == 0.0 )
+            {
+               // No nonzero value was found for this slot: mark it as padding.
+               columnIndex = paddingIndex;
+               value = 0.0;
+            }
+            else
+            {
+               columnIndex = thisColumnsBuffer_view[ bufferIdx ];//column - 1;
+               value = inValue;
+            }
+         };
+         this->forElements( baseRow, lastRow, f2 );
+         baseRow += bufferRowsCount;
+      }
+   }
+   // Rebuild the cached view so that it refers to the assigned data.
+   this->view = this->getView();
+   return *this;
+}
+
+/**
+ * Equality comparison with another matrix; the comparison itself is
+ * delegated to the view of this matrix.
+ */
+template< typename Real,
+          typename Device,
+          typename Index,
+          typename MatrixType,
+          typename RealAllocator,
+          typename IndexAllocator >
+   template< typename Matrix >
+bool
+SparseSandboxMatrix< Real, Device, Index, MatrixType, RealAllocator, IndexAllocator >::
+operator==( const Matrix& m ) const
+{
+   return this->view == m;
+}
+
+/**
+ * Inequality comparison with another matrix; the comparison itself is
+ * delegated to the view of this matrix.
+ */
+template< typename Real,
+          typename Device,
+          typename Index,
+          typename MatrixType,
+          typename RealAllocator,
+          typename IndexAllocator >
+   template< typename Matrix >
+bool
+SparseSandboxMatrix< Real, Device, Index, MatrixType, RealAllocator, IndexAllocator >::
+operator!=( const Matrix& m ) const
+{
+   return this->view != m;
+}
+
+/**
+ * Saves the matrix into the given file by forwarding to the matrix view.
+ */
+template< typename Real,
+          typename Device,
+          typename Index,
+          typename MatrixType,
+          typename RealAllocator,
+          typename IndexAllocator >
+void
+SparseSandboxMatrix< Real, Device, Index, MatrixType, RealAllocator, IndexAllocator >::
+save( File& file ) const
+{
+   view.save( file );
+}
+
+/**
+ * Loads the matrix from the given file.
+ *
+ * The base class part is loaded first, then the column indexes and the row
+ * pointers are read from the file (in this order). Finally the internal view
+ * is refreshed so that it refers to the newly loaded data.
+ */
+template< typename Real,
+          typename Device,
+          typename Index,
+          typename MatrixType,
+          typename RealAllocator,
+          typename IndexAllocator >
+void
+SparseSandboxMatrix< Real, Device, Index, MatrixType, RealAllocator, IndexAllocator >::
+load( File& file )
+{
+   Matrix< RealType, DeviceType, IndexType >::load( file );
+   file >> this->columnIndexes;
+   // SANDBOX_TODO: Replace the following line with loading of metadata required by your sparse matrix format.
+   file >> rowPointers;
+   // The containers may have been reallocated by the load, so the view must be rebuilt.
+   this->view = this->getView();
+}
+
+/**
+ * Saves the matrix into a file with the given name by delegating to Object::save.
+ */
+template< typename Real,
+          typename Device,
+          typename Index,
+          typename MatrixType,
+          typename RealAllocator,
+          typename IndexAllocator >
+void
+SparseSandboxMatrix< Real, Device, Index, MatrixType, RealAllocator, IndexAllocator >::
+save( const String& fileName ) const
+{
+   Object::save( fileName );
+}
+
+/**
+ * Loads the matrix from a file with the given name by delegating to Object::load.
+ */
+template< typename Real,
+          typename Device,
+          typename Index,
+          typename MatrixType,
+          typename RealAllocator,
+          typename IndexAllocator >
+void
+SparseSandboxMatrix< Real, Device, Index, MatrixType, RealAllocator, IndexAllocator >::
+load( const String& fileName )
+{
+   Object::load( fileName );
+}
+
+/**
+ * Prints the matrix into the given output stream by forwarding to the matrix view.
+ */
+template< typename Real,
+          typename Device,
+          typename Index,
+          typename MatrixType,
+          typename RealAllocator,
+          typename IndexAllocator >
+void
+SparseSandboxMatrix< Real, Device, Index, MatrixType, RealAllocator, IndexAllocator >::
+print( std::ostream& str ) const
+{
+   view.print( str );
+}
+
+/**
+ * Returns the column index value which marks padding (unused) matrix
+ * elements. The value is always -1 converted to the index type.
+ */
+template< typename Real,
+          typename Device,
+          typename Index,
+          typename MatrixType,
+          typename RealAllocator,
+          typename IndexAllocator >
+__cuda_callable__
+Index
+SparseSandboxMatrix< Real, Device, Index, MatrixType, RealAllocator, IndexAllocator >::
+getPaddingIndex() const
+{
+   return static_cast< Index >( -1 );
+}
+
+/**
+ * Returns a constant reference to the vector with the column indexes of the
+ * matrix elements.
+ */
+template< typename Real,
+          typename Device,
+          typename Index,
+          typename MatrixType,
+          typename RealAllocator,
+          typename IndexAllocator >
+auto
+SparseSandboxMatrix< Real, Device, Index, MatrixType, RealAllocator, IndexAllocator >::
+getColumnIndexes() const -> const ColumnsIndexesVectorType&
+{
+   return this->columnIndexes;
+}
+
+/**
+ * Returns a modifiable reference to the vector with the column indexes of the
+ * matrix elements.
+ */
+template< typename Real,
+          typename Device,
+          typename Index,
+          typename MatrixType,
+          typename RealAllocator,
+          typename IndexAllocator >
+auto
+SparseSandboxMatrix< Real, Device, Index, MatrixType, RealAllocator, IndexAllocator >::
+getColumnIndexes() -> ColumnsIndexesVectorType&
+{
+   return this->columnIndexes;
+}
+
+      } // namespace Sandbox
+   } // namespace Matrices
+} // namespace TNL
diff --git a/src/TNL/Matrices/Sandbox/SparseSandboxMatrixRowView.h b/src/TNL/Matrices/Sandbox/SparseSandboxMatrixRowView.h
new file mode 100644
index 0000000000000000000000000000000000000000..cabf7b7fd524c19d6c1564bf2c3a5a1c4795d708
--- /dev/null
+++ b/src/TNL/Matrices/Sandbox/SparseSandboxMatrixRowView.h
@@ -0,0 +1,282 @@
+ /***************************************************************************
+                          SparseSandboxMatrixRowView.h -  description
+                             -------------------
+    begin                : Apr 20, 2021
+    copyright            : (C) 2021 by Tomas Oberhuber
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+#pragma once
+
+#include <ostream>
+
+#include <TNL/Cuda/CudaCallable.h>
+#include <TNL/Matrices/MatrixRowViewIterator.h>
+
+namespace TNL {
+   namespace Matrices {
+      namespace Sandbox {
+
+/**
+ * \brief RowView is a simple structure for accessing rows of sparse matrix.
+ *
+ * \tparam ValuesView is a vector view storing the matrix elements values.
+ * \tparam ColumnsIndexesView is a vector view storing the column indexes of the matrix element.
+ * \tparam isBinary tells if the parent matrix is a binary matrix.
+ *
+ * See \ref SparseSandboxMatrix and \ref SparseSandboxMatrixView.
+ *
+ * \par Example
+ * \include Matrices/SparseMatrix/SparseMatrixExample_getRow.cpp
+ * \par Output
+ * \include SparseMatrixExample_getRow.out
+ *
+ * \par Example
+ * \include Matrices/SparseMatrix/SparseMatrixViewExample_getRow.cpp
+ * \par Output
+ * \include SparseMatrixViewExample_getRow.out
+ */
+template< typename ValuesView,
+          typename ColumnsIndexesView,
+          bool isBinary_ >
+class SparseSandboxMatrixRowView
+{
+   public:
+
+      /**
+       * \brief The type of matrix elements.
+       */
+      using RealType = typename ValuesView::RealType;
+
+      /**
+       * \brief The type used for matrix elements indexing.
+       */
+      using IndexType = typename ColumnsIndexesView::IndexType;
+
+      /**
+       * \brief Type of container view used for storing the matrix elements values.
+       */
+      using ValuesViewType = ValuesView;
+
+      /**
+       * \brief Type of container view used for storing the column indexes of the matrix elements.
+       */
+      using ColumnsIndexesViewType = ColumnsIndexesView;
+
+      /**
+       * \brief Type of constant container view used for storing the matrix elements values.
+       */
+      using ConstValuesViewType = typename ValuesViewType::ConstViewType;
+
+      /**
+       * \brief Type of constant container view used for storing the column indexes of the matrix elements.
+       */
+      using ConstColumnsIndexesViewType = typename ColumnsIndexesViewType::ConstViewType;
+
+      /**
+       * \brief Type of sparse matrix row view.
+       */
+      using RowView = SparseSandboxMatrixRowView< ValuesViewType, ColumnsIndexesViewType, isBinary_ >;
+
+      /**
+       * \brief Type of constant sparse matrix row view.
+       */
+      using ConstView = SparseSandboxMatrixRowView< ConstValuesViewType, ConstColumnsIndexesViewType, isBinary_ >;
+
+      /**
+       * \brief The type of related matrix element.
+       */
+      using MatrixElementType = SparseMatrixElement< RealType, IndexType >;
+
+      /**
+       * \brief Type of iterator for the matrix row.
+       */
+      using IteratorType = MatrixRowViewIterator< RowView >;
+
+      /**
+       * \brief Tells whether the parent matrix is a binary matrix.
+       * @return `true` if the matrix is binary.
+       */
+      static constexpr bool isBinary() { return isBinary_; };
+
+      /**
+       * \brief Constructor with \e rowIdx, \e offset, \e size, \e values and \e columnIndexes.
+       *
+       * \param rowIdx is row index.
+       * \param offset is the beginning of the matrix row in arrays with values and column indexes of matrix elements.
+       * \param size is row size, i.e. number of nonzero matrix elements in the row.
+       * \param values is a container view for storing the matrix elements values.
+       * \param columnIndexes is a container view for storing the column indexes of the matrix elements.
+       */
+      __cuda_callable__
+      SparseSandboxMatrixRowView( IndexType rowIdx,
+                                  IndexType offset,
+                                  IndexType size,
+                                  const ValuesViewType& values,
+                                  const ColumnsIndexesViewType& columnIndexes );
+
+      /**
+       * \brief Returns size of the matrix row, i.e. number of matrix elements in this row.
+       *
+       * \return Size of the matrix row.
+       */
+      __cuda_callable__
+      IndexType getSize() const;
+
+      /**
+       * \brief Returns the matrix row index.
+       *
+       * \return matrix row index.
+       */
+      __cuda_callable__
+      const IndexType& getRowIndex() const;
+
+      /**
+       * \brief Returns constant reference to a column index of an element with given rank in the row.
+       *
+       * \param localIdx is the rank of the non-zero element in given row.
+       *
+       * \return constant reference to the matrix element column index.
+       */
+      __cuda_callable__
+      const IndexType& getColumnIndex( const IndexType localIdx ) const;
+
+      /**
+       * \brief Returns non-constant reference to a column index of an element with given rank in the row.
+       *
+       * \param localIdx is the rank of the non-zero element in given row.
+       *
+       * \return non-constant reference to the matrix element column index.
+       */
+      __cuda_callable__
+      IndexType& getColumnIndex( const IndexType localIdx );
+
+      /**
+       * \brief Returns constant reference to value of an element with given rank in the row.
+       *
+       * \param localIdx is the rank of the non-zero element in given row.
+       *
+       * \return constant reference to the matrix element value.
+       */
+      __cuda_callable__
+      const RealType& getValue( const IndexType localIdx ) const;
+
+      /**
+       * \brief Returns non-constant reference to value of an element with given rank in the row.
+       *
+       * \param localIdx is the rank of the non-zero element in given row.
+       *
+       * \return non-constant reference to the matrix element value.
+       */
+      __cuda_callable__
+      RealType& getValue( const IndexType localIdx );
+
+      /**
+       * \brief Sets a value of matrix element with given rank in the matrix row.
+       *
+       * \param localIdx is the rank of the matrix element in the row.
+       * \param value is the new value of the matrix element.
+       */
+      __cuda_callable__
+      void setValue( const IndexType localIdx,
+                     const RealType& value );
+
+      /**
+       * \brief Sets a column index of matrix element with given rank in the matrix row.
+       *
+       * \param localIdx is the rank of the matrix element in the row.
+       * \param columnIndex is the new column index of the matrix element.
+       */
+      __cuda_callable__
+      void setColumnIndex( const IndexType localIdx,
+                           const IndexType& columnIndex );
+
+      /**
+       * \brief Sets both a value and a column index of matrix element with given rank in the matrix row.
+       *
+       * \param localIdx is the rank of the matrix element in the row.
+       * \param columnIndex is the new column index of the matrix element.
+       * \param value is the new value of the matrix element.
+       */
+      __cuda_callable__
+      void setElement( const IndexType localIdx,
+                       const IndexType columnIndex,
+                       const RealType& value );
+
+      /**
+       * \brief Comparison of two matrix rows.
+       *
+       * The other matrix row can be from any other matrix.
+       *
+       * \param other is another matrix row.
+       * \return \e true if both rows are the same, \e false otherwise.
+       */
+      template< typename _ValuesView,
+                typename _ColumnsIndexesView,
+                bool _isBinary >
+      __cuda_callable__
+      bool operator==( const SparseSandboxMatrixRowView< _ValuesView, _ColumnsIndexesView, _isBinary >& other ) const;
+
+      /**
+       * \brief Returns iterator pointing at the beginning of the matrix row.
+       *
+       * \return iterator pointing at the beginning.
+       */
+      __cuda_callable__
+      IteratorType begin();
+
+      /**
+       * \brief Returns iterator pointing at the end of the matrix row.
+       *
+       * \return iterator pointing at the end.
+       */
+      __cuda_callable__
+      IteratorType end();
+
+      /**
+       * \brief Returns constant iterator pointing at the beginning of the matrix row.
+       *
+       * \return iterator pointing at the beginning.
+       */
+      __cuda_callable__
+      const IteratorType cbegin() const;
+
+      /**
+       * \brief Returns constant iterator pointing at the end of the matrix row.
+       *
+       * \return iterator pointing at the end.
+       */
+      __cuda_callable__
+      const IteratorType cend() const;
+
+   protected:
+
+      // Index of the row in the matrix and the number of elements in the row.
+      IndexType rowIdx, size;
+
+      // SANDBOX_TODO: Replace the following line with data required by your format.
+      // Position of the first element of this row in `values` and `columnIndexes`.
+      IndexType offset;
+
+      // View of the matrix elements values.
+      ValuesViewType values;
+
+      // View of the column indexes of the matrix elements.
+      ColumnsIndexesViewType columnIndexes;
+};
+
+/**
+ * \brief Insertion operator for a sparse matrix row.
+ *
+ * \param str is an output stream.
+ * \param row is an input sparse matrix row.
+ * \return reference to the output stream.
+ */
+template< typename ValuesView,
+          typename ColumnsIndexesView,
+          bool isBinary_ >
+std::ostream& operator<<( std::ostream& str, const SparseSandboxMatrixRowView< ValuesView, ColumnsIndexesView, isBinary_ >& row );
+
+      } // namespace Sandbox
+   } // namespace Matrices
+} // namespace TNL
+
+#include <TNL/Matrices/Sandbox/SparseSandboxMatrixRowView.hpp>
diff --git a/src/TNL/Matrices/Sandbox/SparseSandboxMatrixRowView.hpp b/src/TNL/Matrices/Sandbox/SparseSandboxMatrixRowView.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..b4bd74c443ff44901d18cec5f2daf084aff315bb
--- /dev/null
+++ b/src/TNL/Matrices/Sandbox/SparseSandboxMatrixRowView.hpp
@@ -0,0 +1,240 @@
+/***************************************************************************
+                          SparseSandboxMatrixRowView.hpp -  description
+                             -------------------
+    begin                : Apr 20, 2021
+    copyright            : (C) 2021 by Tomas Oberhuber
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+#pragma once
+
+#include <TNL/Matrices/Sandbox/SparseSandboxMatrixRowView.h>
+#include <TNL/Assert.h>
+
+namespace TNL {
+   namespace Matrices {
+      namespace Sandbox {
+
+// SANDBOX_TODO: Modify the following constructor by your needs
+//
+// Stores the row index, the row offset, the row size and the views with the
+// values and the column indexes of the matrix elements.
+template< typename ValuesView,
+          typename ColumnsIndexesView,
+          bool isBinary_ >
+__cuda_callable__
+SparseSandboxMatrixRowView< ValuesView, ColumnsIndexesView, isBinary_ >::
+SparseSandboxMatrixRowView( IndexType rowIdx,
+                            IndexType offset,
+                            IndexType size,
+                            const ValuesViewType& values,
+                            const ColumnsIndexesViewType& columnIndexes )
+ : rowIdx( rowIdx ),
+   size( size ),
+   offset( offset ),
+   values( values ),
+   columnIndexes( columnIndexes )
+{}
+
+// Returns the number of matrix elements of this row as given to the constructor.
+template< typename ValuesView,
+          typename ColumnsIndexesView,
+          bool isBinary_ >
+__cuda_callable__ auto
+SparseSandboxMatrixRowView< ValuesView, ColumnsIndexesView, isBinary_ >::
+getSize() const -> IndexType
+{
+   return size;
+}
+
+// Returns a constant reference to the stored index of this row.
+template< typename ValuesView,
+          typename ColumnsIndexesView,
+          bool isBinary_ >
+__cuda_callable__
+auto
+SparseSandboxMatrixRowView< ValuesView, ColumnsIndexesView, isBinary_ >::
+getRowIndex() const -> const IndexType&
+{
+   return rowIdx;
+}
+
+// Returns a constant reference to the column index of the element with rank
+// `localIdx` in this row.
+template< typename ValuesView,
+          typename ColumnsIndexesView,
+          bool isBinary_ >
+__cuda_callable__ auto
+SparseSandboxMatrixRowView< ValuesView, ColumnsIndexesView, isBinary_ >::
+getColumnIndex( const IndexType localIdx ) const -> const IndexType&
+{
+   TNL_ASSERT_LT( localIdx, this->getSize(), "Local index exceeds matrix row capacity." );
+   // SANDBOX_TODO: Modify the following line to match with your sparse format.
+   const IndexType globalIdx = offset + localIdx;
+   return columnIndexes[ globalIdx ];
+}
+
+// Returns a modifiable reference to the column index of the element with rank
+// `localIdx` in this row.
+template< typename ValuesView,
+          typename ColumnsIndexesView,
+          bool isBinary_ >
+__cuda_callable__ auto
+SparseSandboxMatrixRowView< ValuesView, ColumnsIndexesView, isBinary_ >::
+getColumnIndex( const IndexType localIdx ) -> IndexType&
+{
+   TNL_ASSERT_LT( localIdx, this->getSize(), "Local index exceeds matrix row capacity." );
+   // SANDBOX_TODO: Modify the following line to match with your sparse format.
+   const IndexType globalIdx = offset + localIdx;
+   return columnIndexes[ globalIdx ];
+}
+
+// Returns a constant reference to the value of the element with rank
+// `localIdx` in this row. Must not be called on a row of a binary matrix.
+template< typename ValuesView,
+          typename ColumnsIndexesView,
+          bool isBinary_ >
+__cuda_callable__ auto
+SparseSandboxMatrixRowView< ValuesView, ColumnsIndexesView, isBinary_ >::
+getValue( const IndexType localIdx ) const -> const RealType&
+{
+   TNL_ASSERT_LT( localIdx, this->getSize(), "Local index exceeds matrix row capacity." );
+   TNL_ASSERT_FALSE( isBinary(), "Cannot call this method for binary matrix row." );
+   // SANDBOX_TODO: Modify the following line to match with your sparse format.
+   const IndexType globalIdx = offset + localIdx;
+   return values[ globalIdx ];
+}
+
+// Returns a modifiable reference to the value of the element with rank
+// `localIdx` in this row. Must not be called on a row of a binary matrix.
+template< typename ValuesView,
+          typename ColumnsIndexesView,
+          bool isBinary_ >
+__cuda_callable__ auto
+SparseSandboxMatrixRowView< ValuesView, ColumnsIndexesView, isBinary_ >::
+getValue( const IndexType localIdx ) -> RealType&
+{
+   TNL_ASSERT_LT( localIdx, this->getSize(), "Local index exceeds matrix row capacity." );
+   TNL_ASSERT_FALSE( isBinary(), "Cannot call this method for binary matrix row." );
+   // SANDBOX_TODO: Modify the following line to match with your sparse format.
+   const IndexType globalIdx = offset + localIdx;
+   return values[ globalIdx ];
+}
+
+// Sets the value of the element with rank `localIdx` in this row. For a row of
+// a binary matrix nothing is written.
+template< typename ValuesView,
+          typename ColumnsIndexesView,
+          bool isBinary_ >
+__cuda_callable__ void
+SparseSandboxMatrixRowView< ValuesView, ColumnsIndexesView, isBinary_ >::
+setValue( const IndexType localIdx,
+          const RealType& value )
+{
+   TNL_ASSERT_LT( localIdx, this->getSize(), "Local index exceeds matrix row capacity." );
+   if( isBinary() )
+      return;
+   // SANDBOX_TODO: Modify the following line to match with your sparse format.
+   values[ offset + localIdx ] = value;
+}
+
+// Sets the column index of the element with rank `localIdx` in this row.
+template< typename ValuesView,
+          typename ColumnsIndexesView,
+          bool isBinary_ >
+__cuda_callable__ void
+SparseSandboxMatrixRowView< ValuesView, ColumnsIndexesView, isBinary_ >::
+setColumnIndex( const IndexType localIdx,
+                const IndexType& columnIndex )
+{
+   TNL_ASSERT_LT( localIdx, this->getSize(), "Local index exceeds matrix row capacity." );
+   // SANDBOX_TODO: Modify the following line to match with your sparse format.
+   columnIndexes[ offset + localIdx ] = columnIndex;
+}
+
+// Sets both the column index and the value of the element with rank
+// `localIdx` in this row. For a row of a binary matrix only the column index
+// is written.
+template< typename ValuesView,
+          typename ColumnsIndexesView,
+          bool isBinary_ >
+__cuda_callable__ void
+SparseSandboxMatrixRowView< ValuesView, ColumnsIndexesView, isBinary_ >::
+setElement( const IndexType localIdx,
+            const IndexType column,
+            const RealType& value )
+{
+   TNL_ASSERT_LT( localIdx, this->getSize(), "Local index exceeds matrix row capacity." );
+   // SANDBOX_TODO: Modify the following line to match with your sparse format.
+   const IndexType globalIdx = offset + localIdx;
+   columnIndexes[ globalIdx ] = column;
+   if( isBinary() )
+      return;
+   values[ globalIdx ] = value;
+}
+
+/**
+ * Compares this row with a row of a (possibly different) sparse matrix.
+ *
+ * The rows are equal if, over their common length, the column indexes (and
+ * the values, when both rows store them) agree and all remaining elements of
+ * the longer row have negative column indexes, i.e. they are padding.
+ *
+ * Values are compared only if neither of the rows is binary: getValue()
+ * asserts that it is not called on a binary row, so when at least one of the
+ * rows is binary only the column indexes are compared.
+ *
+ * \param other is the row to compare with.
+ * \return \e true if the rows are the same, \e false otherwise.
+ */
+template< typename ValuesView,
+          typename ColumnsIndexesView,
+          bool isBinary_ >
+   template< typename _ValuesView,
+             typename _ColumnsIndexesView,
+             bool _isBinary >
+__cuda_callable__
+bool
+SparseSandboxMatrixRowView< ValuesView, ColumnsIndexesView, isBinary_ >::
+operator==( const SparseSandboxMatrixRowView< _ValuesView, _ColumnsIndexesView, _isBinary >& other ) const
+{
+   // getValue() may be called only on non-binary rows, both sides must be checked.
+   constexpr bool compareValues = ! isBinary_ && ! _isBinary;
+   IndexType i = 0;
+   while( i < getSize() && i < other.getSize() ) {
+      if( getColumnIndex( i ) != other.getColumnIndex( i ) )
+         return false;
+      if( compareValues && getValue( i ) != other.getValue( i ) )
+         return false;
+      ++i;
+   }
+   // Whatever is left in the longer row must be padding only.
+   for( IndexType j = i; j < getSize(); j++ )
+      // TODO: use ... != getPaddingIndex()
+      if( getColumnIndex( j ) >= 0 )
+         return false;
+   for( IndexType j = i; j < other.getSize(); j++ )
+      // TODO: use ... != getPaddingIndex()
+      if( other.getColumnIndex( j ) >= 0 )
+         return false;
+   return true;
+}
+
+// Returns an iterator constructed from this row and position 0.
+template< typename ValuesView,
+          typename ColumnsIndexesView,
+          bool isBinary_ >
+__cuda_callable__ auto
+SparseSandboxMatrixRowView< ValuesView, ColumnsIndexesView, isBinary_ >::
+begin() -> IteratorType
+{
+   return IteratorType( *this, 0 );
+}
+
+// Returns an iterator constructed from this row and the position equal to
+// the row size.
+template< typename ValuesView,
+          typename ColumnsIndexesView,
+          bool isBinary_ >
+__cuda_callable__ auto
+SparseSandboxMatrixRowView< ValuesView, ColumnsIndexesView, isBinary_ >::
+end() -> IteratorType
+{
+   return IteratorType( *this, getSize() );
+}
+
+// Returns a constant iterator constructed from this row and position 0.
+template< typename ValuesView,
+          typename ColumnsIndexesView,
+          bool isBinary_ >
+__cuda_callable__ auto
+SparseSandboxMatrixRowView< ValuesView, ColumnsIndexesView, isBinary_ >::
+cbegin() const -> const IteratorType
+{
+   return IteratorType( *this, 0 );
+}
+
+// Returns a constant iterator constructed from this row and the position
+// equal to the row size.
+template< typename ValuesView,
+          typename ColumnsIndexesView,
+          bool isBinary_ >
+__cuda_callable__ auto
+SparseSandboxMatrixRowView< ValuesView, ColumnsIndexesView, isBinary_ >::
+cend() const -> const IteratorType
+{
+   return IteratorType( *this, getSize() );
+}
+
+// Prints all elements of the row in the form ` [ column ] = value, `. For a
+// row of a binary matrix the printed value is the result of `column >= 0`.
+template< typename ValuesView,
+          typename ColumnsIndexesView,
+          bool isBinary_ >
+std::ostream& operator<<( std::ostream& str, const SparseSandboxMatrixRowView< ValuesView, ColumnsIndexesView, isBinary_ >& row )
+{
+   using NonConstIndex = std::remove_const_t< typename SparseSandboxMatrixRowView< ValuesView, ColumnsIndexesView, isBinary_ >::IndexType >;
+   for( NonConstIndex i = 0; i < row.getSize(); i++ ) {
+      str << " [ " << row.getColumnIndex( i ) << " ] = ";
+      if( isBinary_ )
+         // TODO: check getPaddingIndex(), print only the column indices of non-zeros but not the values
+         str << (row.getColumnIndex( i ) >= 0);
+      else
+         str << row.getValue( i );
+      str << ", ";
+   }
+   return str;
+}
+
+      } // namespace Sandbox
+   } // namespace Matrices
+} // namespace TNL
diff --git a/src/TNL/Matrices/Sandbox/SparseSandboxMatrixView.h b/src/TNL/Matrices/Sandbox/SparseSandboxMatrixView.h
new file mode 100644
index 0000000000000000000000000000000000000000..66247a349dfe9b9d63e5084abe0c4b41b6c922bb
--- /dev/null
+++ b/src/TNL/Matrices/Sandbox/SparseSandboxMatrixView.h
@@ -0,0 +1,871 @@
+/***************************************************************************
+                          SparseSandboxMatrixView.h -  description
+                             -------------------
+    begin                : Apr 20, 2021
+    copyright            : (C) 2021 by Tomas Oberhuber
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+#pragma once
+
+#include <TNL/Matrices/Matrix.h>
+#include <TNL/Matrices/MatrixType.h>
+#include <TNL/Allocators/Default.h>
+#include <TNL/Algorithms/Segments/CSR.h>
+#include <TNL/Matrices/Sandbox/SparseSandboxMatrixRowView.h>
+#include <TNL/TypeTraits.h>
+
+namespace TNL {
+   namespace Matrices {
+      namespace Sandbox {
+
+/**
+ * \brief Implementation of sparse sandbox matrix view.
+ *
+ * It serves as an accessor to \ref SparseSandboxMatrix for example when passing the
+ * matrix to lambda functions. SparseSandboxMatrix view can be also created in CUDA kernels.
+ *
+ * \tparam Real is a type of matrix elements. If \e Real equals \e bool the matrix is treated
+ *    as binary and so the matrix elements values are not stored in the memory since we need
+ *    to remember only coordinates of non-zero elements( which equal one).
+ * \tparam Device is a device where the matrix is allocated.
+ * \tparam Index is a type for indexing of the matrix elements.
+ * \tparam MatrixType specifies a symmetry of matrix. See \ref MatrixType. Symmetric
+ *    matrices store only lower part of the matrix and its diagonal. The upper part is reconstructed on the fly.
+ *    GeneralMatrix with no symmetry is used by default.
+ * \tparam Segments is a structure representing the sparse matrix format. Depending on the pattern of the non-zero elements
+ *    different matrix formats can perform differently especially on GPUs. By default \ref CSR format is used. See also
+ *    \ref Ellpack, \ref SlicedEllpack, \ref ChunkedEllpack or \ref BiEllpack.
+ * \tparam ComputeReal is the same as \e Real mostly but for binary matrices it is set to \e Index type. This can be changed
+ *    by the user, of course.
+ *
+ */
+template< typename Real,
+          typename Device = Devices::Host,
+          typename Index = int,
+          typename MatrixType = GeneralMatrix >
+class SparseSandboxMatrixView : public MatrixView< Real, Device, Index >
+{
+   static_assert(
+      ! MatrixType::isSymmetric() ||
+      ! std::is_same< Device, Devices::Cuda >::value ||
+      ( std::is_same< Real, float >::value || std::is_same< Real, double >::value || std::is_same< Real, int >::value || std::is_same< Real, long long int >::value ),
+      "Given Real type is not supported by atomic operations on GPU which are necessary for symmetric operations." );
+
+   public:
+
+      // Supporting types - they are not important for the user
+      using BaseType = MatrixView< Real, Device, Index >;
+      using ValuesViewType = typename BaseType::ValuesView;
+      using ConstValuesViewType = typename ValuesViewType::ConstViewType;
+      using ColumnsIndexesViewType = Containers::VectorView< typename TNL::copy_const< Index >::template from< Real >::type, Device, Index >;
+      using ConstColumnsIndexesViewType = typename ColumnsIndexesViewType::ConstViewType;
+      using RowsCapacitiesView = Containers::VectorView< Index, Device, Index >;
+      using ConstRowsCapacitiesView = typename RowsCapacitiesView::ConstViewType;
+
+      /**
+       * \brief Test of symmetric matrix type.
+       *
+       * \return \e true if the matrix is stored as symmetric and \e false otherwise.
+       */
+      static constexpr bool isSymmetric() { return MatrixType::isSymmetric(); };
+
+      /**
+       * \brief Test of binary matrix type.
+       *
+       * \return \e true if the matrix is stored as binary and \e false otherwise.
+       */
+      static constexpr bool isBinary() { return std::is_same< Real, bool >::value; };
+
+      /**
+       * \brief The type of matrix elements.
+       */
+      using RealType = Real;
+
+      //using ComputeRealType = ComputeReal;
+
+      /**
+       * \brief The device where the matrix is allocated.
+       */
+      using DeviceType = Device;
+
+      /**
+       * \brief The type used for matrix elements indexing.
+       */
+      using IndexType = Index;
+
+      /**
+       * \brief Templated type of segments view, i.e. sparse matrix format.
+       */
+      //template< typename Device_, typename Index_ >
+      //using SegmentsViewTemplate = SegmentsView< Device_, Index_ >;
+
+      /**
+       * \brief Type of segments view used by this matrix. It represents the sparse matrix format.
+       */
+      //using SegmentsViewType = SegmentsView< Device, Index >;
+
+      /**
+       * \brief Type of related matrix view.
+       */
+      using ViewType = SparseSandboxMatrixView< Real, Device, Index, MatrixType >;
+
+      /**
+       * \brief Matrix view type for constant instances.
+       */
+      using ConstViewType = SparseSandboxMatrixView< std::add_const_t< Real >, Device, Index, MatrixType >;
+
+      /**
+       * \brief Type for accessing matrix rows.
+       */
+      using RowView = SparseSandboxMatrixRowView< ValuesViewType, ColumnsIndexesViewType, isBinary() >;
+
+      /**
+       * \brief Type for accessing constant matrix rows.
+       */
+      using ConstRowView = SparseSandboxMatrixRowView< ConstValuesViewType, ConstColumnsIndexesViewType, isBinary() >;;
+
+      /**
+       * \brief Helper type for getting self type or its modifications.
+       */
+      template< typename _Real = Real,
+                typename _Device = Device,
+                typename _Index = Index,
+                typename _MatrixType = MatrixType >
+      using Self = SparseSandboxMatrixView< _Real, _Device, _Index, _MatrixType >;
+
+      /**
+       * \brief Type of container view for CSR row pointers.
+       *
+       * SANDBOX_TODO: You may replace it with containers views for metadata of your format.
+       */
+      using RowPointersView = TNL::Containers::VectorView< IndexType, DeviceType, IndexType >;
+
+      /**
+       * \brief Constructor with no parameters.
+       */
+      __cuda_callable__
+      SparseSandboxMatrixView();
+
+      /**
+       * \brief Constructor with all necessary data and views.
+       *
+       * \param rows is a number of matrix rows.
+       * \param columns is a number of matrix columns.
+       * \param values is a vector view with matrix elements values.
+       * \param columnIndexes is a vector view with matrix elements column indexes.
+       * \param rowPointers is a container view with row pointers.
+       *
+       * SANDBOX_TODO: Replace `rowPointers` with metadata by your needs.
+       */
+      __cuda_callable__
+      SparseSandboxMatrixView( const IndexType rows,
+                               const IndexType columns,
+                               const ValuesViewType& values,
+                               const ColumnsIndexesViewType& columnIndexes,
+                               const RowPointersView& rowPointers );
+
+      /**
+       * \brief Copy constructor.
+       *
+       * \param matrix is an input sparse matrix view.
+       */
+      __cuda_callable__
+      SparseSandboxMatrixView( const SparseSandboxMatrixView& matrix ) = default;
+
+      /**
+       * \brief Move constructor.
+       *
+       * \param matrix is an input sparse matrix view.
+       */
+      __cuda_callable__
+      SparseSandboxMatrixView( SparseSandboxMatrixView&& matrix ) = default;
+
+      /**
+       * \brief Returns a modifiable view of the sparse matrix.
+       *
+       * \return sparse matrix view.
+       */
+      __cuda_callable__
+      ViewType getView();
+
+      /**
+       * \brief Returns a non-modifiable view of the sparse matrix.
+       *
+       * \return sparse matrix view.
+       */
+      __cuda_callable__
+      ConstViewType getConstView() const;
+
+      /**
+       * \brief Returns string with serialization type.
+       *
+       * The string has a form `Matrices::SparseSandboxMatrix< RealType,  [any_device], IndexType, General/Symmetric, Format, [any_allocator] >`.
+       *
+       * \return \ref String with the serialization type.
+       *
+       * \par Example
+       * \include Matrices/SparseMatrix/SparseMatrixViewExample_getSerializationType.cpp
+       * \par Output
+       * \include SparseMatrixViewExample_getSerializationType.out
+       */
+      static String getSerializationType();
+
+      /**
+       * \brief Returns string with serialization type.
+       *
+       * See \ref SparseSandboxMatrix::getSerializationType.
+       *
+       * \return \e String with the serialization type.
+       *
+       * \par Example
+       * \include Matrices/SparseMatrix/SparseMatrixExample_getSerializationType.cpp
+       * \par Output
+       * \include SparseMatrixExample_getSerializationType.out
+       */
+      virtual String getSerializationTypeVirtual() const;
+
+      /**
+       * \brief Computes number of non-zeros in each row.
+       *
+       * \param rowLengths is a vector into which the number of non-zeros in each row
+       * will be stored.
+       *
+       * \par Example
+       * \include Matrices/SparseMatrix/SparseMatrixViewExample_getCompressedRowLengths.cpp
+       * \par Output
+       * \include SparseMatrixViewExample_getCompressedRowLengths.out
+       */
+      template< typename Vector >
+      void getCompressedRowLengths( Vector& rowLengths ) const;
+
+      /**
+       * \brief Compute capacities of all rows.
+       *
+       * The row capacities are not stored explicitly and must be computed.
+       *
+       * \param rowCapacities is a vector where the row capacities will be stored.
+       */
+      template< typename Vector >
+      void getRowCapacities( Vector& rowCapacities ) const;
+
+      /**
+       * \brief Returns capacity of given matrix row.
+       *
+       * \param row index of matrix row.
+       * \return number of matrix elements allocated for the row.
+       */
+      __cuda_callable__
+      IndexType getRowCapacity( const IndexType row ) const;
+
+      /**
+       * \brief Returns number of non-zero matrix elements.
+       *
+       * This method really counts the non-zero matrix elements and so
+       * it returns zero for matrix having all allocated elements set to zero.
+       *
+       * \return number of non-zero matrix elements.
+       */
+      IndexType getNonzeroElementsCount() const;
+
+      /**
+       * \brief Constant getter of simple structure for accessing given matrix row.
+       *
+       * \param rowIdx is matrix row index.
+       *
+       * \return RowView for accessing given matrix row.
+       *
+       * \par Example
+       * \include Matrices/SparseMatrix/SparseMatrixViewExample_getConstRow.cpp
+       * \par Output
+       * \include SparseMatrixViewExample_getConstRow.out
+       *
+       * See \ref SparseSandboxMatrixRowView.
+       */
+      __cuda_callable__
+      ConstRowView getRow( const IndexType& rowIdx ) const;
+
+      /**
+       * \brief Non-constant getter of simple structure for accessing given matrix row.
+       *
+       * \param rowIdx is matrix row index.
+       *
+       * \return RowView for accessing given matrix row.
+       *
+       * \par Example
+       * \include Matrices/SparseMatrix/SparseMatrixViewExample_getRow.cpp
+       * \par Output
+       * \include SparseMatrixViewExample_getRow.out
+       *
+       * See \ref SparseSandboxMatrixRowView.
+       */
+      __cuda_callable__
+      RowView getRow( const IndexType& rowIdx );
+
+      /**
+       * \brief Sets element at given \e row and \e column to given \e value.
+       *
+       * This method can be called from the host system (CPU) no matter
+       * where the matrix is allocated. If the matrix is allocated on GPU this method
+       * can be called even from device kernels. If the matrix is allocated on a GPU device and
+       * this method is called from CPU, it transfers values of each matrix element separately and so the
+       * performance is very low. For higher performance see \ref SparseSandboxMatrix::getRow
+       * or \ref SparseSandboxMatrix::forElements and \ref SparseSandboxMatrix::forAllElements.
+       * The call may fail if the matrix row capacity is exhausted.
+       *
+       * \param row is row index of the element.
+       * \param column is column index of the element.
+       * \param value is the value the element will be set to.
+       *
+       * \par Example
+       * \include Matrices/SparseMatrix/SparseMatrixViewExample_setElement.cpp
+       * \par Output
+       * \include SparseMatrixViewExample_setElement.out
+       */
+      __cuda_callable__
+      void setElement( const IndexType row,
+                       const IndexType column,
+                       const RealType& value );
+
+      /**
+       * \brief Adds given \e value to the matrix element at given \e row and \e column.
+       *
+       * This method can be called from the host system (CPU) no matter
+       * where the matrix is allocated. If the matrix is allocated on GPU this method
+       * can be called even from device kernels. If the matrix is allocated on a GPU device and
+       * this method is called from CPU, it transfers values of each matrix element separately and so the
+       * performance is very low. For higher performance see \ref SparseSandboxMatrix::getRow
+       * or \ref SparseSandboxMatrix::forElements and \ref SparseSandboxMatrix::forAllElements.
+       * The call may fail if the matrix row capacity is exhausted.
+       *
+       * \param row is row index of the element.
+       * \param column is column index of the element.
+       * \param value is the value to be added to the element.
+       * \param thisElementMultiplicator is multiplicator the original matrix element
+       *   value is multiplied by before addition of given \e value.
+       *
+       * \par Example
+       * \include Matrices/SparseMatrix/SparseMatrixViewExample_addElement.cpp
+       * \par Output
+       * \include SparseMatrixViewExample_addElement.out
+       */
+      __cuda_callable__
+      void addElement( IndexType row,
+                       IndexType column,
+                       const RealType& value,
+                       const RealType& thisElementMultiplicator = 1.0 );
+
+      /**
+       * \brief Returns value of matrix element at position given by its row and column index.
+       *
+       * This method can be called from the host system (CPU) no matter
+       * where the matrix is allocated. If the matrix is allocated on GPU this method
+       * can be called even from device kernels. If the matrix is allocated on a GPU device and
+       * this method is called from CPU, it transfers values of each matrix element separately and so the
+       * performance is very low. For higher performance see \ref SparseSandboxMatrix::getRow
+       * or \ref SparseSandboxMatrix::forElements and \ref SparseSandboxMatrix::forAllElements.
+       *
+       * \param row is a row index of the matrix element.
+       * \param column is a column index of the matrix element.
+       *
+       * \return value of given matrix element.
+       *
+       * \par Example
+       * \include Matrices/SparseMatrix/SparseMatrixViewExample_getElement.cpp
+       * \par Output
+       * \include SparseMatrixViewExample_getElement.out
+       *
+       */
+      __cuda_callable__
+      RealType getElement( IndexType row,
+                           IndexType column ) const;
+
+      /**
+       * \brief Method for performing general reduction on matrix rows.
+       *
+       * \tparam Fetch is a type of lambda function for data fetch declared as
+       *          `fetch( IndexType rowIdx, IndexType& columnIdx, RealType& elementValue ) -> FetchValue`.
+       *          The return type of this lambda can be any non void.
+       * \tparam Reduce is a type of lambda function for reduction declared as
+       *          `reduce( const FetchValue& v1, const FetchValue& v2 ) -> FetchValue`.
+       * \tparam Keep is a type of lambda function for storing results of reduction in each row.
+       *          It is declared as `keep( const IndexType rowIdx, const double& value )`.
+       * \tparam FetchReal is the type of \e zero; it corresponds to the `FetchValue` type returned by the Fetch lambda function.
+       *
+       * \param begin defines beginning of the range [begin,end) of rows to be processed.
+       * \param end defines ending of the range [begin,end) of rows to be processed.
+       * \param fetch is an instance of lambda function for data fetch.
+       * \param reduce is an instance of lambda function for reduction.
+       * \param keep is an instance of lambda function for storing results.
+       * \param zero is zero of given reduction operation, i.e. its identity element.
+       *
+       * \par Example
+       * \include Matrices/SparseMatrix/SparseMatrixViewExample_reduceRows.cpp
+       * \par Output
+       * \include SparseMatrixViewExample_reduceRows.out
+       */
+      template< typename Fetch, typename Reduce, typename Keep, typename FetchReal >
+      void reduceRows( IndexType begin, IndexType end, Fetch& fetch, const Reduce& reduce, Keep& keep, const FetchReal& zero );
+
+      /**
+       * \brief Method for performing general reduction on matrix rows for constant instances.
+       *
+       * \tparam Fetch is a type of lambda function for data fetch declared as
+       *          `fetch( IndexType rowIdx, IndexType& columnIdx, RealType& elementValue ) -> FetchValue`.
+       *          The return type of this lambda can be any non void.
+       * \tparam Reduce is a type of lambda function for reduction declared as
+       *          `reduce( const FetchValue& v1, const FetchValue& v2 ) -> FetchValue`.
+       * \tparam Keep is a type of lambda function for storing results of reduction in each row.
+       *          It is declared as `keep( const IndexType rowIdx, const double& value )`.
+       * \tparam FetchReal is the type of \e zero; it corresponds to the `FetchValue` type returned by the Fetch lambda function.
+       *
+       * \param begin defines beginning of the range [begin,end) of rows to be processed.
+       * \param end defines ending of the range [begin,end) of rows to be processed.
+       * \param fetch is an instance of lambda function for data fetch.
+       * \param reduce is an instance of lambda function for reduction.
+       * \param keep is an instance of lambda function for storing results.
+       * \param zero is zero of given reduction operation, i.e. its identity element.
+       *
+       * \par Example
+       * \include Matrices/SparseMatrix/SparseMatrixViewExample_reduceRows.cpp
+       * \par Output
+       * \include SparseMatrixViewExample_reduceRows.out
+       */
+      template< typename Fetch, typename Reduce, typename Keep, typename FetchReal >
+      void reduceRows( IndexType begin, IndexType end, Fetch& fetch, const Reduce& reduce, Keep& keep, const FetchReal& zero ) const;
+
+      /**
+       * \brief Method for performing general reduction on all matrix rows.
+       *
+       * \tparam Fetch is a type of lambda function for data fetch declared as
+       *          `fetch( IndexType rowIdx, IndexType& columnIdx, RealType& elementValue ) -> FetchValue`.
+       *          The return type of this lambda can be any non void.
+       * \tparam Reduce is a type of lambda function for reduction declared as
+       *          `reduce( const FetchValue& v1, const FetchValue& v2 ) -> FetchValue`.
+       * \tparam Keep is a type of lambda function for storing results of reduction in each row.
+       *          It is declared as `keep( const IndexType rowIdx, const double& value )`.
+       * \tparam FetchReal is the type of \e zero; it corresponds to the `FetchValue` type returned by the Fetch lambda function.
+       *
+       * \param fetch is an instance of lambda function for data fetch.
+       * \param reduce is an instance of lambda function for reduction.
+       * \param keep is an instance of lambda function for storing results.
+       * \param zero is zero of given reduction operation, i.e. its identity element.
+       *
+       * \par Example
+       * \include Matrices/SparseMatrix/SparseMatrixViewExample_reduceAllRows.cpp
+       * \par Output
+       * \include SparseMatrixViewExample_reduceAllRows.out
+       */
+      template< typename Fetch, typename Reduce, typename Keep, typename FetchReal >
+      void reduceAllRows( Fetch& fetch, const Reduce& reduce, Keep& keep, const FetchReal& zero );
+
+      /**
+       * \brief Method for performing general reduction on all matrix rows for constant instances.
+       *
+       * \tparam Fetch is a type of lambda function for data fetch declared as
+       *          `fetch( IndexType rowIdx, IndexType& columnIdx, RealType& elementValue ) -> FetchValue`.
+       *          The return type of this lambda can be any non void.
+       * \tparam Reduce is a type of lambda function for reduction declared as
+       *          `reduce( const FetchValue& v1, const FetchValue& v2 ) -> FetchValue`.
+       * \tparam Keep is a type of lambda function for storing results of reduction in each row.
+       *          It is declared as `keep( const IndexType rowIdx, const double& value )`.
+       * \tparam FetchReal is the type of \e zero; it corresponds to the `FetchValue` type returned by the Fetch lambda function.
+       *
+       * \param fetch is an instance of lambda function for data fetch.
+       * \param reduce is an instance of lambda function for reduction.
+       * \param keep is an instance of lambda function for storing results.
+       * \param zero is zero of given reduction operation, i.e. its identity element.
+       *
+       * \par Example
+       * \include Matrices/SparseMatrix/SparseMatrixViewExample_reduceAllRows.cpp
+       * \par Output
+       * \include SparseMatrixViewExample_reduceAllRows.out
+       */
+      template< typename Fetch, typename Reduce, typename Keep, typename FetchReal >
+      void reduceAllRows( Fetch& fetch, const Reduce& reduce, Keep& keep, const FetchReal& zero ) const;
+
+      /**
+       * \brief Method for iteration over matrix elements in rows from interval [begin,end) for constant instances.
+       *
+       * \tparam Function is type of lambda function that will operate on matrix elements.
+       *    It should have form like
+       *  `function( IndexType rowIdx, IndexType localIdx, IndexType columnIdx, const RealType& value )`.
+       *  The \e localIdx parameter is a rank of the non-zero element in given row.
+       *
+       * \param begin defines beginning of the range [begin,end) of rows to be processed.
+       * \param end defines ending of the range [begin,end) of rows to be processed.
+       * \param function is an instance of the lambda function to be called in each row.
+       *
+       * \par Example
+       * \include Matrices/SparseMatrix/SparseMatrixViewExample_forRows.cpp
+       * \par Output
+       * \include SparseMatrixViewExample_forRows.out
+       */
+      template< typename Function >
+      void forElements( IndexType begin, IndexType end, Function& function ) const;
+
+      /**
+       * \brief Method for iteration over matrix elements in rows from interval [begin,end) for non-constant instances.
+       *
+       * \tparam Function is type of lambda function that will operate on matrix elements.
+       *    It should have form like
+       *  `function( IndexType rowIdx, IndexType localIdx, IndexType columnIdx, const RealType& value )`.
+       *  The \e localIdx parameter is a rank of the non-zero element in given row.
+       *
+       * \param begin defines beginning of the range [begin,end) of rows to be processed.
+       * \param end defines ending of the range [begin,end) of rows to be processed.
+       * \param function is an instance of the lambda function to be called in each row.
+       *
+       * \par Example
+       * \include Matrices/SparseMatrix/SparseMatrixViewExample_forRows.cpp
+       * \par Output
+       * \include SparseMatrixViewExample_forRows.out
+       */
+      template< typename Function >
+      void forElements( IndexType begin, IndexType end, Function& function );
+
+      /**
+       * \brief This method calls \e forElements for all matrix rows (for constant instances).
+       *
+       * See \ref SparseSandboxMatrix::forElements.
+       *
+       * \tparam Function is a type of lambda function that will operate on matrix elements.
+       * \param function  is an instance of the lambda function to be called in each row.
+       *
+       * \par Example
+       * \include Matrices/SparseMatrix/SparseMatrixViewExample_forAllRows.cpp
+       * \par Output
+       * \include SparseMatrixViewExample_forAllRows.out
+       */
+      template< typename Function >
+      void forAllElements( Function& function ) const;
+
+      /**
+       * \brief This method calls \e forElements for all matrix rows.
+       *
+       * See \ref SparseSandboxMatrix::forElements.
+       *
+       * \tparam Function is a type of lambda function that will operate on matrix elements.
+       * \param function  is an instance of the lambda function to be called in each row.
+       *
+       * \par Example
+       * \include Matrices/SparseMatrix/SparseMatrixViewExample_forAllRows.cpp
+       * \par Output
+       * \include SparseMatrixViewExample_forAllRows.out
+       */
+      template< typename Function >
+      void forAllElements( Function& function );
+
+      /**
+       * \brief Method for parallel iteration over matrix rows from interval [ \e begin, \e end).
+       *
+       * In each row, given lambda function is performed. Each row is processed by at most one thread unlike the method
+       * \ref SparseSandboxMatrixView::forElements where more than one thread can be mapped to each row.
+
+       *
+       * \tparam Function is type of the lambda function.
+       *
+       * \param begin defines beginning of the range [ \e begin,\e end ) of rows to be processed.
+       * \param end defines ending of the range [ \e begin, \e end ) of rows to be processed.
+       * \param function is an instance of the lambda function to be called for each row.
+       *
+       * ```
+       * auto function = [] __cuda_callable__ ( RowView& row ) mutable { ... };
+       * ```
+       *
+       * \e RowView represents matrix row - see \ref TNL::Matrices::SparseSandboxMatrixView::RowView.
+       *
+       * \par Example
+       * \include Matrices/SparseMatrix/SparseMatrixViewExample_forRows.cpp
+       * \par Output
+       * \include SparseMatrixViewExample_forRows.out
+       */
+      template< typename Function >
+      void forRows( IndexType begin, IndexType end, Function&& function );
+
+      /**
+       * \brief Method for parallel iteration over matrix rows from interval [ \e begin, \e end) for constant instances.
+       *
+       * In each row, given lambda function is performed. Each row is processed by at most one thread unlike the method
+       * \ref SparseSandboxMatrixView::forElements where more than one thread can be mapped to each row.
+       *
+       * \tparam Function is type of the lambda function.
+       *
+       * \param begin defines beginning of the range [ \e begin,\e end ) of rows to be processed.
+       * \param end defines ending of the range [ \e begin, \e end ) of rows to be processed.
+       * \param function is an instance of the lambda function to be called for each row.
+       *
+       * ```
+       * auto function = [] __cuda_callable__ ( RowView& row ) { ... };
+       * ```
+       *
+       * \e RowView represents matrix row - see \ref TNL::Matrices::SparseSandboxMatrixView::RowView.
+       *
+       * \par Example
+       * \include Matrices/SparseMatrix/SparseMatrixViewExample_forRows.cpp
+       * \par Output
+       * \include SparseMatrixViewExample_forRows.out
+       */
+      template< typename Function >
+      void forRows( IndexType begin, IndexType end, Function&& function ) const;
+
+      /**
+       * \brief Method for parallel iteration over all matrix rows.
+       *
+       * In each row, given lambda function is performed. Each row is processed by at most one thread unlike the method
+       * \ref SparseSandboxMatrixView::forAllElements where more than one thread can be mapped to each row.
+       *
+       * \tparam Function is type of the lambda function.
+       *
+       * \param function is an instance of the lambda function to be called for each row.
+       *
+       * ```
+       * auto function = [] __cuda_callable__ ( RowView& row ) mutable { ... };
+       * ```
+       *
+       * \e RowView represents matrix row - see \ref TNL::Matrices::SparseSandboxMatrixView::RowView.
+       *
+       * \par Example
+       * \include Matrices/SparseMatrix/SparseMatrixViewExample_forRows.cpp
+       * \par Output
+       * \include SparseMatrixViewExample_forRows.out
+       */
+      template< typename Function >
+      void forAllRows( Function&& function );
+
+      /**
+       * \brief Method for parallel iteration over all matrix rows for constant instances.
+       *
+       * In each row, given lambda function is performed. Each row is processed by at most one thread unlike the method
+       * \ref SparseSandboxMatrixView::forAllElements where more than one thread can be mapped to each row.
+       *
+       * \tparam Function is type of the lambda function.
+       *
+       * \param function is an instance of the lambda function to be called for each row.
+       *
+       * ```
+       * auto function = [] __cuda_callable__ ( RowView& row ) { ... };
+       * ```
+       *
+       * \e RowView represents matrix row - see \ref TNL::Matrices::SparseSandboxMatrixView::RowView.
+       *
+       * \par Example
+       * \include Matrices/SparseMatrix/SparseMatrixViewExample_forRows.cpp
+       * \par Output
+       * \include SparseMatrixViewExample_forRows.out
+       */
+      template< typename Function >
+      void forAllRows( Function&& function ) const;
+
+      /**
+       * \brief Method for sequential iteration over all matrix rows for constant instances.
+       *
+       * \tparam Function is type of lambda function that will operate on matrix elements.
+       *    It should have form like
+       *  `function( IndexType rowIdx, IndexType columnIdx, IndexType columnIdx_, const RealType& value )`.
+       *  The column index repeats twice only for compatibility with sparse matrices.
+       *
+       * \param begin defines beginning of the range [begin,end) of rows to be processed.
+       * \param end defines ending of the range [begin,end) of rows to be processed.
+       * \param function is an instance of the lambda function to be called in each row.
+       */
+      template< typename Function >
+      void sequentialForRows( IndexType begin, IndexType end, Function& function ) const;
+
+      /**
+       * \brief Method for sequential iteration over all matrix rows for non-constant instances.
+       *
+       * \tparam Function is type of lambda function that will operate on matrix elements.
+       *    It should have form like
+       *  `function( IndexType rowIdx, IndexType columnIdx, IndexType columnIdx_, RealType& value )`.
+       *  The column index repeats twice only for compatibility with sparse matrices.
+       *
+       * \param begin defines beginning of the range [begin,end) of rows to be processed.
+       * \param end defines ending of the range [begin,end) of rows to be processed.
+       * \param function is an instance of the lambda function to be called in each row.
+       */
+      template< typename Function >
+      void sequentialForRows( IndexType begin, IndexType end, Function& function );
+
+      /**
+       * \brief This method calls \e sequentialForRows for all matrix rows (for constant instances).
+       *
+       * See \ref SparseSandboxMatrixView::sequentialForRows.
+       *
+       * \tparam Function is a type of lambda function that will operate on matrix elements.
+       * \param function  is an instance of the lambda function to be called in each row.
+       */
+      template< typename Function >
+      void sequentialForAllRows( Function& function ) const;
+
+      /**
+       * \brief This method calls \e sequentialForRows for all matrix rows.
+       *
+       * See \ref SparseSandboxMatrixView::sequentialForRows.
+       *
+       * \tparam Function is a type of lambda function that will operate on matrix elements.
+       * \param function  is an instance of the lambda function to be called in each row.
+       */
+      template< typename Function >
+      void sequentialForAllRows( Function& function );
+
+      /**
+       * \brief Computes product of matrix and vector.
+       *
+       * More precisely, it computes:
+       *
+       * `outVector = matrixMultiplicator * ( * this ) * inVector + outVectorMultiplicator * outVector`
+       *
+       * \tparam InVector is type of input vector. It can be \ref Vector,
+       *     \ref VectorView, \ref Array, \ref ArrayView or similar container.
+       * \tparam OutVector is type of output vector. It can be \ref Vector,
+       *     \ref VectorView, \ref Array, \ref ArrayView or similar container.
+       *
+       * \param inVector is input vector.
+       * \param outVector is output vector.
+       * \param matrixMultiplicator is a factor by which the matrix is multiplied. It is one by default.
+       * \param outVectorMultiplicator is a factor by which the outVector is multiplied before added
+       *    to the result of matrix-vector product. It is zero by default.
+       * \param begin is the beginning of the rows range for which the vector product
+       *    is computed. It is zero by default.
+       * \param end is the end of the rows range for which the vector product
+       *    is computed. It is the number of matrix rows by default.
+       */
+      template< typename InVector,
+                typename OutVector >
+      void vectorProduct( const InVector& inVector,
+                          OutVector& outVector,
+                          const RealType matrixMultiplicator = 1.0,
+                          const RealType outVectorMultiplicator = 0.0,
+                          const IndexType begin = 0,
+                          IndexType end = 0 ) const;
+
+      template< typename Vector1, typename Vector2 >
+      bool performSORIteration( const Vector1& b,  // NOTE(review): method is undocumented; `b` is presumably the right-hand side vector - confirm in the .hpp
+                                const IndexType row,  // presumably the index of the matrix row being processed - TODO confirm
+                                Vector2& x,  // taken by non-const reference, so presumably updated in place - TODO confirm
+                                const RealType& omega = 1.0 ) const;  // presumably the SOR relaxation parameter; defaults to 1.0
+
+      /**
+       * \brief Assignment of another sparse sandbox matrix view.
+       *
+       * \param matrix is input matrix for the assignment.
+       * \return reference to this matrix.
+       */
+      SparseSandboxMatrixView& operator=( const SparseSandboxMatrixView& matrix );
+
+      /**
+       * \brief Equality comparison operator with another arbitrary matrix type.
+       *
+       * \param m is the right-hand side matrix.
+       * \return \e true if the RHS matrix is equal, \e false otherwise.
+       */
+      template< typename Matrix >
+      bool operator==( const Matrix& m ) const;
+
+      /**
+       * \brief Inequality comparison operator with another arbitrary matrix type.
+       *
+       * \param m is the right-hand side matrix.
+       * \return \e true if the RHS matrix is not equal, \e false otherwise.
+       */
+      template< typename Matrix >
+      bool operator!=( const Matrix& m ) const;
+
+      /**
+       * \brief Method for saving the matrix to the file with given filename.
+       *
+       * \param fileName is name of the file.
+       */
+      void save( const String& fileName ) const;
+
+      /**
+       * \brief Method for saving the matrix to a file.
+       *
+       * \param file is the output file.
+       */
+      void save( File& file ) const;
+
+      /**
+       * \brief Method for printing the matrix to output stream.
+       *
+       * \param str is the output stream.
+       */
+      void print( std::ostream& str ) const;
+
+      /**
+       * \brief Getter of segments for non-constant instances.
+       *
+       * \e Segments are a structure for addressing the matrix elements columns and values.
+       * In fact, \e Segments represent the sparse matrix format.
+       *
+       * \return Non-constant reference to segments.
+       */
+      //SegmentsViewType& getSegments();
+
+      /**
+       * \brief Getter of segments for constant instances.
+       *
+       * \e Segments are a structure for addressing the matrix elements columns and values.
+       * In fact, \e Segments represent the sparse matrix format.
+       *
+       * \return Constant reference to segments.
+       */
+      //const SegmentsViewType& getSegments() const;
+
+      /**
+       * \brief Getter of column indexes for constant instances.
+       *
+       * \return Constant reference to a vector with matrix elements column indexes.
+       */
+      const ColumnsIndexesViewType& getColumnIndexes() const;
+
+      /**
+       * \brief Getter of column indexes for nonconstant instances.
+       *
+       * \return Reference to a vector with matrix elements column indexes.
+       */
+      ColumnsIndexesViewType& getColumnIndexes();
+
+      /**
+       * \brief Returns a padding index value.
+       *
+       * Padding index is used for column indexes of padding zeros. Padding zeros
+       * are used in some sparse matrix formats for better data alignment in memory.
+       *
+       * \return value of the padding index.
+       */
+      __cuda_callable__
+      IndexType getPaddingIndex() const;
+
+   protected:
+
+      ColumnsIndexesViewType columnIndexes;  // vector view with matrix elements column indexes
+
+      RowPointersView rowPointers;  // container view with row pointers (the sandbox format metadata)
+      //SegmentsViewType segments;
+
+   private:
+      // TODO: this should be probably moved into a detail namespace
+      template< typename VectorOrView,
+                std::enable_if_t< HasSetSizeMethod< VectorOrView >::value, bool > = true >  // overload enabled when HasSetSizeMethod< VectorOrView >::value is true
+      static void set_size_if_resizable( VectorOrView& v, IndexType size )
+      {
+         v.setSize( size );  // resize `v` to `size` through its setSize method
+      }
+
+      template< typename VectorOrView,
+                std::enable_if_t< ! HasSetSizeMethod< VectorOrView >::value, bool > = true >  // overload enabled when HasSetSizeMethod< VectorOrView >::value is false
+      static void set_size_if_resizable( VectorOrView& v, IndexType size )
+      {
+         TNL_ASSERT_EQ( v.getSize(), size, "view has wrong size" );  // no resizing here: only asserts that `v` already has `size` elements
+      }
+};
+
+      } // namespace Sandbox
+   } // namespace Matrices
+} // namespace TNL
+
+#include <TNL/Matrices/Sandbox/SparseSandboxMatrixView.hpp>
diff --git a/src/TNL/Matrices/Sandbox/SparseSandboxMatrixView.hpp b/src/TNL/Matrices/Sandbox/SparseSandboxMatrixView.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..07342e8e758065aceab6baf38af6f0db1e43eba4
--- /dev/null
+++ b/src/TNL/Matrices/Sandbox/SparseSandboxMatrixView.hpp
@@ -0,0 +1,1030 @@
+/***************************************************************************
+                          SparseSandboxMatrixView.hpp -  description
+                             -------------------
+    begin                : Apr 20, 2021
+    copyright            : (C) 2021 by Tomas Oberhuber
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+#pragma once
+
+#include <functional>
+#include <TNL/Matrices/Sandbox/SparseSandboxMatrixView.h>
+#include <TNL/Algorithms/reduce.h>
+#include <TNL/Algorithms/AtomicOperations.h>
+#include <TNL/Matrices/details/SparseMatrix.h>
+
+namespace TNL {
+   namespace Matrices {
+      namespace Sandbox {
+
+/**
+ * \brief Default constructor; the body is empty, so all members are
+ *        default-initialized.
+ */
+template< typename Real,
+          typename Device,
+          typename Index,
+          typename MatrixType >
+__cuda_callable__
+SparseSandboxMatrixView< Real, Device, Index, MatrixType >::
+SparseSandboxMatrixView()
+{
+}
+
+/**
+ * \brief Constructs the view from matrix dimensions and the arrays describing the matrix.
+ *
+ * \param rows is the number of matrix rows (forwarded to the MatrixView base).
+ * \param columns is the number of matrix columns (forwarded to the MatrixView base).
+ * \param values is a view of the matrix elements values (forwarded to the MatrixView base).
+ * \param columnIndexes is a view of the matrix elements column indexes.
+ * \param rowPointers is a view of the row offsets; row `i` spans the global indexes
+ *        [ rowPointers[ i ], rowPointers[ i + 1 ] ).
+ */
+template< typename Real,
+          typename Device,
+          typename Index,
+          typename MatrixType >
+__cuda_callable__
+SparseSandboxMatrixView< Real, Device, Index, MatrixType >::
+SparseSandboxMatrixView( const IndexType rows,
+                         const IndexType columns,
+                         const ValuesViewType& values,
+                         const ColumnsIndexesViewType& columnIndexes,
+                         const RowPointersView& rowPointers )
+: MatrixView< Real, Device, Index >( rows, columns, values ), columnIndexes( columnIndexes ), rowPointers( rowPointers )
+{
+}
+
+/**
+ * \brief Returns a modifiable view of this matrix.
+ *
+ * The result is assembled from the matrix dimensions and from views of the
+ * values, column indexes and row pointers arrays.
+ *
+ * \return Sparse matrix view.
+ */
+template< typename Real,
+          typename Device,
+          typename Index,
+          typename MatrixType >
+__cuda_callable__
+auto
+SparseSandboxMatrixView< Real, Device, Index, MatrixType >::
+getView() -> ViewType
+{
+   // The row layout is stored in `rowPointers`; this class has no `segments` member.
+   return ViewType( this->getRows(),
+                    this->getColumns(),
+                    this->getValues().getView(),
+                    this->columnIndexes.getView(),
+                    this->rowPointers.getView() );
+}
+
+/**
+ * \brief Returns a non-modifiable view of this matrix.
+ *
+ * The result is assembled from the matrix dimensions and from constant views
+ * of the values, column indexes and row pointers arrays.
+ *
+ * \return Constant sparse matrix view.
+ */
+template< typename Real,
+          typename Device,
+          typename Index,
+          typename MatrixType >
+__cuda_callable__
+auto
+SparseSandboxMatrixView< Real, Device, Index, MatrixType >::
+getConstView() const -> ConstViewType
+{
+   // Access the members directly: the getter is named `getColumnIndexes` (there is
+   // no `getColumnsIndexes`) and the row layout is stored in `rowPointers`, not `segments`.
+   return ConstViewType( this->getRows(),
+                         this->getColumns(),
+                         this->getValues().getConstView(),
+                         this->columnIndexes.getConstView(),
+                         this->rowPointers.getConstView() );
+}
+
+/**
+ * \brief Returns the string identifying this matrix type in serialized data.
+ *
+ * The string is composed of the serialization names of RealType, IndexType
+ * and MatrixType; both allocator slots are written as `[any_allocator]`.
+ *
+ * \return String with the serialization type.
+ */
+template< typename Real,
+          typename Device,
+          typename Index,
+          typename MatrixType >
+String
+SparseSandboxMatrixView< Real, Device, Index, MatrixType >::
+getSerializationType()
+{
+   const auto realName = TNL::getSerializationType< RealType >();
+   const auto indexName = TNL::getSerializationType< IndexType >();
+   const auto matrixTypeName = MatrixType::getSerializationType();
+   return String( "Matrices::Sandbox::SparseMatrix< " ) +
+             realName + ", " +
+             indexName + ", " +
+             matrixTypeName + ", [any_allocator], [any_allocator] >";
+}
+
+/**
+ * \brief Member-function counterpart of getSerializationType().
+ *
+ * \return The same string as getSerializationType().
+ */
+template< typename Real,
+          typename Device,
+          typename Index,
+          typename MatrixType >
+String
+SparseSandboxMatrixView< Real, Device, Index, MatrixType >::
+getSerializationTypeVirtual() const
+{
+   return this->getSerializationType();
+}
+
+/**
+ * \brief Computes, for each row, the number of stored elements whose value is not zero.
+ *
+ * \param rowLengths is resized to getRows() if it is resizable (otherwise it
+ *        must already have that size) and filled with the per-row counts.
+ */
+template< typename Real,
+          typename Device,
+          typename Index,
+          typename MatrixType >
+   template< typename Vector >
+void
+SparseSandboxMatrixView< Real, Device, Index, MatrixType >::
+getCompressedRowLengths( Vector& rowLengths ) const
+{
+   details::set_size_if_resizable( rowLengths, this->getRows() );
+   rowLengths = 0;
+   auto lengthsView = rowLengths.getView();
+   // Each visited element contributes one when its value differs from zero.
+   auto countNonzero = [] __cuda_callable__ ( IndexType rowIdx, IndexType columnIdx, const RealType& elementValue ) -> IndexType {
+      return ( elementValue != 0.0 );
+   };
+   // Store the reduced count of a row.
+   auto storeLength = [=] __cuda_callable__ ( const IndexType rowIdx, const IndexType length ) mutable {
+      lengthsView[ rowIdx ] = length;
+   };
+   this->reduceAllRows( countNonzero, std::plus<>{}, storeLength, 0 );
+}
+
+/**
+ * \brief Computes, for each row, the number of elements visited by reduceAllRows(),
+ *        i.e. the stored elements whose column index is not the padding index.
+ *
+ * \param rowLengths is resized to getRows() if it is resizable (otherwise it
+ *        must already have that size) and filled with the per-row counts.
+ */
+template< typename Real,
+          typename Device,
+          typename Index,
+          typename MatrixType >
+   template< typename Vector >
+void
+SparseSandboxMatrixView< Real, Device, Index, MatrixType >::
+getRowCapacities( Vector& rowLengths ) const
+{
+   details::set_size_if_resizable( rowLengths, this->getRows() );
+   rowLengths = 0;
+   auto capacitiesView = rowLengths.getView();
+   // Every visited element contributes exactly one, regardless of its value.
+   auto countElement = [] __cuda_callable__ ( IndexType rowIdx, IndexType columnIdx, const RealType& elementValue ) -> IndexType {
+      return 1;
+   };
+   // Store the reduced count of a row.
+   auto storeCapacity = [=] __cuda_callable__ ( const IndexType rowIdx, const IndexType capacity ) mutable {
+      capacitiesView[ rowIdx ] = capacity;
+   };
+   this->reduceAllRows( countElement, std::plus<>{}, storeCapacity, 0 );
+}
+
+/**
+ * \brief Returns the number of elements allocated for the given row.
+ *
+ * The count is the distance between two consecutive row pointers, so it
+ * includes slots holding the padding index.
+ *
+ * \param row is the row index.
+ * \return Number of elements allocated for the row.
+ */
+template< typename Real,
+          typename Device,
+          typename Index,
+          typename MatrixType >
+__cuda_callable__
+Index
+SparseSandboxMatrixView< Real, Device, Index, MatrixType >::
+getRowCapacity( const IndexType row ) const
+{
+   // The row layout is stored in `rowPointers`; this class has no `segments` member.
+   return this->rowPointers[ row + 1 ] - this->rowPointers[ row ];
+}
+
+/**
+ * \brief Counts the matrix elements whose column index is not the padding index.
+ *
+ * For a symmetric matrix, each stored off-diagonal element is counted twice
+ * when its transposed position lies inside the matrix.
+ *
+ * \return Number of such matrix elements.
+ */
+template< typename Real,
+          typename Device,
+          typename Index,
+          typename MatrixType >
+Index
+SparseSandboxMatrixView< Real, Device, Index, MatrixType >::
+getNonzeroElementsCount() const
+{
+   const auto columnIndexesView = this->columnIndexes.getConstView();
+   const IndexType paddingIndex = this->getPaddingIndex();
+
+   // General matrix: every slot that is not padding counts exactly once.
+   if( ! isSymmetric() )
+   {
+      auto isStored = [=] __cuda_callable__ ( const IndexType i ) -> IndexType {
+         return ( columnIndexesView[ i ] != paddingIndex );
+      };
+      return Algorithms::reduce< DeviceType >( ( IndexType ) 0, this->columnIndexes.getSize(), isStored, std::plus<>{}, 0 );
+   }
+
+   // Symmetric matrix: count row by row and sum the per-row results.
+   const auto rows = this->getRows();
+   const auto columns = this->getColumns();
+   Containers::Vector< IndexType, DeviceType, IndexType > rowCounts( this->getRows(), 0 );
+   auto rowCountsView = rowCounts.getView();
+   auto rowPointersView = this->rowPointers.getConstView();
+   // SANDBOX_TODO: Replace the following lambda function (or more) with code computing the number of nonzero matrix elements
+   //               of a symmetric matrix. Note that this is required only by symmetric matrices and that the highest performance
+   //               is not a priority here.
+   auto countRow = [=] __cuda_callable__ ( IndexType rowIdx ) mutable {
+      const auto rowBegin = rowPointersView[ rowIdx ];
+      const auto rowEnd = rowPointersView[ rowIdx + 1 ];
+      IndexType count( 0 );
+      for( IndexType globalIdx = rowBegin; globalIdx < rowEnd; globalIdx++ )
+      {
+         const IndexType column = columnIndexesView[ globalIdx ];
+         if( column == paddingIndex )
+            continue;
+         // One for the stored element, plus one for its mirror image if that lies inside the matrix.
+         count += 1 + ( column != rowIdx && column < rows && rowIdx < columns );
+      }
+      rowCountsView[ rowIdx ] = count;
+   };
+   TNL::Algorithms::ParallelFor< DeviceType >::exec( ( IndexType ) 0, this->getRows(), countRow );
+   return sum( rowCounts );
+}
+
+/**
+ * \brief Returns a constant accessor of the given matrix row.
+ *
+ * \param rowIdx is the row index; it must be smaller than getRows().
+ * \return Constant row view of the row.
+ */
+template< typename Real,
+          typename Device,
+          typename Index,
+          typename MatrixType >
+__cuda_callable__ auto
+SparseSandboxMatrixView< Real, Device, Index, MatrixType >::
+getRow( const IndexType& rowIdx ) const -> ConstRowView
+{
+   TNL_ASSERT_LT( rowIdx, this->getRows(), "Row index is larger than number of matrix rows." );
+   // SANDBOX_TODO: Replace the following with creation of RowView corresponding with your sparse matrix format.
+   const auto rowBegin = this->rowPointers[ rowIdx ];                    // global index of the first element of the row
+   const auto rowCapacity = this->rowPointers[ rowIdx + 1 ] - rowBegin;  // number of elements allocated for the row
+   return ConstRowView( rowIdx, rowBegin, rowCapacity, this->values, this->columnIndexes );
+}
+
+/**
+ * \brief Returns a modifiable accessor of the given matrix row.
+ *
+ * \param rowIdx is the row index; it must be smaller than getRows().
+ * \return Row view of the row.
+ */
+template< typename Real,
+          typename Device,
+          typename Index,
+          typename MatrixType >
+__cuda_callable__ auto
+SparseSandboxMatrixView< Real, Device, Index, MatrixType >::
+getRow( const IndexType& rowIdx ) -> RowView
+{
+   TNL_ASSERT_LT( rowIdx, this->getRows(), "Row index is larger than number of matrix rows." );
+   // SANDBOX_TODO: Replace this with RowView constructor by your needs.
+   const auto rowBegin = rowPointers[ rowIdx ];                    // global index of the first element of the row
+   const auto rowCapacity = rowPointers[ rowIdx + 1 ] - rowBegin;  // number of elements allocated for the row
+   return RowView( rowIdx, rowBegin, rowCapacity, this->values, this->columnIndexes );
+}
+
+/**
+ * \brief Sets the matrix element at position (\e row, \e column) to \e value.
+ *
+ * Implemented as addElement() with the multiplicator of the current value
+ * equal to zero, so everything documented for addElement() applies here too.
+ *
+ * \param row is the row index of the element.
+ * \param column is the column index of the element.
+ * \param value is the new value of the element.
+ */
+template< typename Real,
+          typename Device,
+          typename Index,
+          typename MatrixType >
+__cuda_callable__ void
+SparseSandboxMatrixView< Real, Device, Index, MatrixType >::
+setElement( const IndexType row,
+            const IndexType column,
+            const RealType& value )
+{
+   this->addElement( row, column, value, 0.0 );
+}
+
+/**
+ * \brief Adds \e value to the matrix element at position (\e row, \e column).
+ *
+ * If the element is already stored, it becomes
+ * `thisElementMultiplicator * oldValue + value`. Otherwise the element is
+ * inserted with the value \e value; the elements of the row are kept ordered by
+ * the column index and followed by padding slots. For symmetric matrices a
+ * position with `row < column` is transposed first. For binary matrices only the
+ * column indexes are touched.
+ *
+ * \param row is the row index of the element.
+ * \param column is the column index of the element.
+ * \param value is the value to be added.
+ * \param thisElementMultiplicator multiplies the current value of the element.
+ *
+ * In host code, std::logic_error is thrown when the whole row was scanned without
+ * finding the column, a padding slot or a larger column index. In CUDA device
+ * code an assertion is triggered instead.
+ */
+template< typename Real,
+          typename Device,
+          typename Index,
+          typename MatrixType >
+__cuda_callable__ void
+SparseSandboxMatrixView< Real, Device, Index, MatrixType >::
+addElement( IndexType row,
+            IndexType column,
+            const RealType& value,
+            const RealType& thisElementMultiplicator )
+{
+   TNL_ASSERT_GE( row, 0, "Sparse matrix row index cannot be negative." );
+   TNL_ASSERT_LT( row, this->getRows(), "Sparse matrix row index is larger than number of matrix rows." );
+   TNL_ASSERT_GE( column, 0, "Sparse matrix column index cannot be negative." );
+   TNL_ASSERT_LT( column, this->getColumns(), "Sparse matrix column index is larger than number of matrix columns." );
+
+   // Elements with `row < column` are redirected to the transposed position in symmetric matrices.
+   if( isSymmetric() && row < column )
+   {
+      swap( row, column );
+      TNL_ASSERT_LT( row, this->getRows(), "Column index is out of the symmetric part of the matrix after transposition." );
+      TNL_ASSERT_LT( column,this->getColumns(), "Row index is out of the symmetric part of the matrix after transposition." );
+   }
+
+   // SANDBOX_TODO: Replace the following line with a code that computes the number of matrix elements allocated for
+   //               the matrix row with index `row`. Note that the code works on both host and GPU kernel. To achieve
+   //               the same effect, you may use macro __CUDA_ARCH__ as can be seen below in this method.
+   const IndexType rowSize = this->rowPointers.getElement( row + 1 ) - this->rowPointers.getElement( row );
+   IndexType col( this->getPaddingIndex() );
+   IndexType i;
+   IndexType globalIdx;
+   // Scan the row until the requested column, a padding slot or a larger column index is found.
+   for( i = 0; i < rowSize; i++ )
+   {
+      // SANDBOX_TODO: Replace the following line with a code that computes a global index of `i`-th nonzero matrix element
+      //               in the `row`-th matrix row. The global index is a pointer to arrays `values` and `columnIndexes` storing
+      //               the matrix elements values and column indexes respectively.
+      globalIdx = this->rowPointers.getElement( row ) + i;
+      TNL_ASSERT_LT( globalIdx, this->columnIndexes.getSize(), "" );
+      col = this->columnIndexes.getElement( globalIdx );
+      if( col == column )
+      {
+         // The element already exists - update it in place.
+         if( ! isBinary() )
+            this->values.setElement( globalIdx, thisElementMultiplicator * this->values.getElement( globalIdx ) + value );
+         return;
+      }
+      if( col == this->getPaddingIndex() || col > column )
+         break;
+   }
+   // The loop ran through the whole row: there is no slot for the new element.
+   if( i == rowSize )
+   {
+#ifndef __CUDA_ARCH__
+      std::stringstream msg;
+      msg << "The capacity of the sparse matrix row number "  << row << " was exceeded.";
+      throw std::logic_error( msg.str() );
+#else
+      TNL_ASSERT_TRUE( false, "");
+      return;
+#endif
+   }
+   if( col == this->getPaddingIndex() )
+   {
+      // The scan stopped at a padding slot - the new element takes it.
+      this->columnIndexes.setElement( globalIdx, column );
+      if( ! isBinary() )
+         this->values.setElement( globalIdx, value );
+      return;
+   }
+   else
+   {
+      // The scan stopped at an element with a larger column index: shift the rest of the row
+      // by one position to the right and insert the new element at position `i`.
+      // NOTE(review): the shift overwrites the last slot of the row without checking that it
+      // holds the padding index - confirm that callers guarantee free capacity in the row.
+      IndexType j = rowSize - 1;
+      while( j > i )
+      {
+         // SANDBOX_TODO: Replace the following two lines with a code that computes global indexes of `j`-th and `j-1`-th nonzero matrix elements
+         //               in the `row`-th matrix row. The global index is a pointer to arrays `values` and `columnIndexes` storing
+         //               the matrix elements values and column indexes respectively.
+         const IndexType globalIdx1 = this->rowPointers.getElement( row ) + j;
+         const IndexType globalIdx2 = globalIdx1 - 1;
+         // End of code replacement.
+         TNL_ASSERT_LT( globalIdx1, this->columnIndexes.getSize(), "" );
+         TNL_ASSERT_LT( globalIdx2, this->columnIndexes.getSize(), "" );
+         this->columnIndexes.setElement( globalIdx1, this->columnIndexes.getElement( globalIdx2 ) );
+         if( ! isBinary() )
+            this->values.setElement( globalIdx1, this->values.getElement( globalIdx2 ) );
+         j--;
+      }
+
+      this->columnIndexes.setElement( globalIdx, column );
+      if( ! isBinary() )
+         this->values.setElement( globalIdx, value );
+      return;
+   }
+}
+
+/**
+ * \brief Returns the value of the matrix element at position (\e row, \e column).
+ *
+ * For symmetric matrices a position with `row < column` is transposed first.
+ * Binary matrices yield 1 for every stored element. Positions that are not
+ * stored yield zero.
+ *
+ * \param row is the row index of the element.
+ * \param column is the column index of the element.
+ * \return Value of the matrix element.
+ */
+template< typename Real,
+          typename Device,
+          typename Index,
+          typename MatrixType >
+__cuda_callable__
+auto
+SparseSandboxMatrixView< Real, Device, Index, MatrixType >::
+getElement( IndexType row,
+            IndexType column ) const -> RealType
+{
+   TNL_ASSERT_GE( row, 0, "Sparse matrix row index cannot be negative." );
+   TNL_ASSERT_LT( row, this->getRows(), "Sparse matrix row index is larger than number of matrix rows." );
+   TNL_ASSERT_GE( column, 0, "Sparse matrix column index cannot be negative." );
+   TNL_ASSERT_LT( column, this->getColumns(), "Sparse matrix column index is larger than number of matrix columns." );
+
+   if( isSymmetric() && row < column )
+   {
+      swap( row, column );
+      // The transposed position may fall outside of a non-square matrix.
+      if( row >= this->getRows() || column >= this->getColumns() )
+         return 0.0;
+   }
+
+   // SANDBOX_TODO: Replace the following lines with a code for getting the range of indexes in arrays `values`
+   //               and `columnIndexes` allocated for the given row.
+   const IndexType rowBegin = this->rowPointers.getElement( row );
+   const IndexType rowEnd = this->rowPointers.getElement( row + 1 );
+   for( IndexType globalIdx = rowBegin; globalIdx < rowEnd; globalIdx++ )
+   {
+      TNL_ASSERT_LT( globalIdx, this->columnIndexes.getSize(), "" );
+      if( this->columnIndexes.getElement( globalIdx ) != column )
+         continue;
+      if( isBinary() )
+         return 1;
+      return this->values.getElement( globalIdx );
+   }
+   return 0.0;
+}
+
+/**
+ * \brief Computes the product of the matrix and a vector.
+ *
+ * For each row in [ \e firstRow, \e lastRow ) the method evaluates
+ *
+ *    outVector = matrixMultiplicator * ( A * inVector ) + outVectorMultiplicator * outVector
+ *
+ * When \e outVectorMultiplicator is zero, the original content of \e outVector
+ * is not read at all.
+ *
+ * \param inVector is the input vector; its size must equal the number of matrix columns.
+ * \param outVector is the output vector; its size must equal the number of matrix rows.
+ * \param matrixMultiplicator multiplies the result of the matrix-vector product.
+ * \param outVectorMultiplicator multiplies the original content of \e outVector.
+ * \param firstRow is the first row to be processed.
+ * \param lastRow is the row after the last row to be processed; zero stands for
+ *        the number of matrix rows.
+ *
+ * NOTE(review): the simple implementation selected by HAVE_SANDBOX_SIMPLE_SPMV
+ * multiplies only the stored elements, i.e. it does not add the mirrored
+ * contributions of symmetric matrices - confirm whether this is acceptable.
+ */
+template< typename Real,
+          typename Device,
+          typename Index,
+          typename MatrixType >
+template< typename InVector,
+       typename OutVector >
+void
+SparseSandboxMatrixView< Real, Device, Index, MatrixType >::
+vectorProduct( const InVector& inVector,
+               OutVector& outVector,
+               const RealType matrixMultiplicator,
+               const RealType outVectorMultiplicator,
+               const IndexType firstRow,
+               IndexType lastRow ) const
+{
+   TNL_ASSERT_EQ( this->getColumns(), inVector.getSize(), "Matrix columns do not fit with input vector." );
+   TNL_ASSERT_EQ( this->getRows(), outVector.getSize(), "Matrix rows do not fit with output vector." );
+
+   using OutVectorReal = typename OutVector::RealType;
+   static_assert(
+         ! MatrixType::isSymmetric() ||
+         ! std::is_same< Device, Devices::Cuda >::value ||
+         ( std::is_same< OutVectorReal, float >::value ||
+           std::is_same< OutVectorReal, double >::value ||
+           std::is_same< OutVectorReal, int >::value ||
+           std::is_same< OutVectorReal, long long int >::value ),
+         "Given Real type is not supported by atomic operations on GPU which are necessary for symmetric operations." );
+
+   // Zero `lastRow` stands for "all rows". This must be resolved before any of
+   // the implementations below runs, otherwise no row would be processed.
+   if( lastRow == 0 )
+      lastRow = this->getRows();
+
+   const auto inVectorView = inVector.getConstView();
+   auto outVectorView = outVector.getView();
+   const auto valuesView = this->values.getConstView();
+   const auto columnIndexesView = this->columnIndexes.getConstView();
+   const auto rowPointersView = this->rowPointers.getConstView();
+   const IndexType paddingIndex = this->getPaddingIndex();
+#define HAVE_SANDBOX_SIMPLE_SPMV
+#ifdef HAVE_SANDBOX_SIMPLE_SPMV
+   // SANDBOX_TODO: The following is a simple direct implementation of the SpMV operation with the CSR format. We recommend to start by
+   //               replacing this part with SpMV based on your sparse format.
+   if( std::is_same< DeviceType, TNL::Devices::Host >::value )          // this way you may easily specialize for different device types
+   {
+      // SANDBOX_TODO: This is a simple and naive implementation for CPU.
+      for( IndexType rowIdx = firstRow; rowIdx < lastRow; rowIdx++ )
+      {
+         const auto begin = rowPointers[ rowIdx ];
+         const auto end = rowPointers[ rowIdx + 1 ];
+         RealType sum( 0.0 );
+         for( IndexType globalIdx = begin; globalIdx < end; globalIdx++ )
+         {
+            const auto columnIdx = this->columnIndexes[ globalIdx ];
+            if( columnIdx == paddingIndex )
+               continue;
+            // Binary matrices are handled without reading `values`, the same way as in `getElement` and `reduceRows`.
+            if( isBinary() )
+               sum += inVector[ columnIdx ];
+            else
+               sum += this->values[ globalIdx ] * inVector[ columnIdx ];
+         }
+         // With zero `outVectorMultiplicator` the output vector must not be read - it may hold
+         // NaNs or uninitialized data which would otherwise propagate into the result.
+         // SANDBOX_TODO: It is better to specialize the code also for the case when `matrixMultiplicator` is one -
+         //               see the full implementation below.
+         if( outVectorMultiplicator == 0.0 )
+            outVector[ rowIdx ] = matrixMultiplicator * sum;
+         else
+            outVector[ rowIdx ] = outVector[ rowIdx ] * outVectorMultiplicator + matrixMultiplicator * sum;
+      }
+   }
+   else
+   {
+      // SANDBOX_TODO: The following is a general implementation based on ParallelFor and a lambda function. It would work even on CPU.
+      auto f = [=] __cuda_callable__ ( IndexType rowIdx ) mutable {
+         const auto begin = rowPointersView[ rowIdx ];
+         const auto end = rowPointersView[ rowIdx + 1 ];
+         RealType sum( 0.0 );
+         for( IndexType globalIdx = begin; globalIdx < end; globalIdx++ )
+         {
+            const IndexType columnIdx = columnIndexesView[ globalIdx ];
+            // Padding slots hold no valid column index - skip them as the host branch does.
+            if( columnIdx == paddingIndex )
+               continue;
+            if( isBinary() )
+               sum += inVectorView[ columnIdx ];
+            else
+               sum += valuesView[ globalIdx ] * inVectorView[ columnIdx ];
+         }
+         if( outVectorMultiplicator == 0.0 )
+            outVectorView[ rowIdx ] = matrixMultiplicator * sum;
+         else
+            outVectorView[ rowIdx ] = outVectorView[ rowIdx ] * outVectorMultiplicator + matrixMultiplicator * sum;
+      };
+      TNL::Algorithms::ParallelFor< DeviceType >::exec( firstRow, lastRow, f );
+   }
+#else
+   // SANDBOX_TODO: The following is fully functional implementation based on method `reduceRows`.
+   if( isSymmetric() )
+      outVector *= outVectorMultiplicator;
+   auto symmetricFetch = [=] __cuda_callable__ ( IndexType row, IndexType localIdx, IndexType globalIdx, bool& compute ) mutable -> RealType {
+      const IndexType column = columnIndexesView[ globalIdx ];
+      compute = ( column != paddingIndex );
+      if( ! compute )
+         return 0.0;
+      if( isSymmetric() && column < row )
+      {
+         if( isBinary() )
+            Algorithms::AtomicOperations< DeviceType >::add( outVectorView[ column ], ( OutVectorReal ) matrixMultiplicator * inVectorView[ row ] );
+         else
+            Algorithms::AtomicOperations< DeviceType >::add( outVectorView[ column ], ( OutVectorReal ) matrixMultiplicator * valuesView[ globalIdx ] * inVectorView[ row ] );
+      }
+      if( isBinary() )
+         return inVectorView[ column ];
+      return valuesView[ globalIdx ] * inVectorView[ column ];
+   };
+   auto fetch = [=] __cuda_callable__ ( IndexType globalIdx, bool& compute ) mutable -> RealType {
+      const IndexType column = columnIndexesView[ globalIdx ];
+      if( isBinary() )
+         return inVectorView[ column ];
+      return valuesView[ globalIdx ] * inVectorView[ column ];
+   };
+
+   auto keeperGeneral = [=] __cuda_callable__ ( IndexType row, const RealType& value ) mutable {
+      if( isSymmetric() )
+      {
+         typename OutVector::RealType aux = matrixMultiplicator * value;
+         Algorithms::AtomicOperations< DeviceType >::add( outVectorView[ row ], aux );
+      }
+      else
+      {
+         if( outVectorMultiplicator == 0.0 )
+            outVectorView[ row ] = matrixMultiplicator * value;
+         else
+            outVectorView[ row ] = outVectorMultiplicator * outVectorView[ row ] + matrixMultiplicator * value;
+      }
+   };
+   auto keeperDirect = [=] __cuda_callable__ ( IndexType row, const RealType& value ) mutable {
+      outVectorView[ row ] = value;
+   };
+   auto keeperMatrixMult = [=] __cuda_callable__ ( IndexType row, const RealType& value ) mutable {
+      outVectorView[ row ] = matrixMultiplicator * value;
+   };
+   auto keeperVectorMult = [=] __cuda_callable__ ( IndexType row, const RealType& value ) mutable {
+      outVectorView[ row ] = outVectorMultiplicator * outVectorView[ row ] + value;
+   };
+
+   if( isSymmetric() )
+      this->reduceRows( firstRow, lastRow, symmetricFetch, std::plus<>{}, keeperGeneral, ( RealType ) 0.0 );
+   else
+   {
+      if( outVectorMultiplicator == 0.0 )
+      {
+         if( matrixMultiplicator == 1.0 )
+            this->reduceRows( firstRow, lastRow, fetch, std::plus<>{}, keeperDirect, ( RealType ) 0.0 );
+         else
+            this->reduceRows( firstRow, lastRow, fetch, std::plus<>{}, keeperMatrixMult, ( RealType ) 0.0 );
+      }
+      else
+      {
+         if( matrixMultiplicator == 1.0 )
+            this->reduceRows( firstRow, lastRow, fetch, std::plus<>{}, keeperVectorMult, ( RealType ) 0.0 );
+         else
+            this->reduceRows( firstRow, lastRow, fetch, std::plus<>{}, keeperGeneral, ( RealType ) 0.0 );
+      }
+   }
+#endif
+}
+
+/**
+ * \brief Performs a reduction within each matrix row of the range [ \e begin, \e end ).
+ *
+ * Slots holding the padding index are skipped.
+ *
+ * \param begin is the first row to be processed.
+ * \param end is the row after the last row to be processed.
+ * \param fetch is called as `fetch( rowIdx, columnIdx, value )` for each visited element;
+ *        `columnIdx` is passed as a modifiable reference, binary matrices pass 1 as the value.
+ * \param reduce combines the running result of a row with the value returned by \e fetch.
+ * \param keep is called as `keep( rowIdx, result )` once for each processed row.
+ * \param zero is the initial value of the reduction in each row.
+ */
+template< typename Real,
+          typename Device,
+          typename Index,
+          typename MatrixType >
+   template< typename Fetch, typename Reduce, typename Keep, typename FetchValue >
+void
+SparseSandboxMatrixView< Real, Device, Index, MatrixType >::
+reduceRows( IndexType begin, IndexType end, Fetch& fetch, const Reduce& reduce, Keep& keep, const FetchValue& zero )
+{
+   auto columnIndexesView = this->columnIndexes.getView();
+   auto valuesView = this->values.getView();
+   auto rowPointersView = this->rowPointers.getConstView();
+   const IndexType padding = this->getPaddingIndex();
+   // SANDBOX_TODO: Replace the following code with the one for computing reduction in rows by your format.
+   //               Note that this method can be used for implementation of SpMV.
+   auto processRow = [=] __cuda_callable__ ( IndexType rowIdx ) mutable {
+      const auto rowBegin = rowPointersView[ rowIdx ];
+      const auto rowEnd = rowPointersView[ rowIdx + 1 ];
+      FetchValue result = zero;
+      for( IndexType globalIdx = rowBegin; globalIdx < rowEnd; globalIdx++ )
+      {
+         IndexType& columnIdx = columnIndexesView[ globalIdx ];
+         if( columnIdx == padding )
+            continue;
+         if( isBinary() )
+            result = reduce( result, fetch( rowIdx, columnIdx, 1 ) );
+         else
+            result = reduce( result, fetch( rowIdx, columnIdx, valuesView[ globalIdx ] ) );
+      }
+      keep( rowIdx, result );
+   };
+   TNL::Algorithms::ParallelFor< DeviceType >::exec( begin, end, processRow );
+}
+
+/**
+ * \brief Performs a reduction within each matrix row of the range [ \e begin, \e end )
+ *        of a constant matrix.
+ *
+ * Slots holding the padding index are skipped.
+ *
+ * \param begin is the first row to be processed.
+ * \param end is the row after the last row to be processed.
+ * \param fetch is called as `fetch( rowIdx, columnIdx, value )` for each visited element;
+ *        binary matrices pass 1 as the value.
+ * \param reduce combines the running result of a row with the value returned by \e fetch.
+ * \param keep is called as `keep( rowIdx, result )` once for each processed row.
+ * \param zero is the initial value of the reduction in each row.
+ */
+template< typename Real,
+          typename Device,
+          typename Index,
+          typename MatrixType >
+   template< typename Fetch, typename Reduce, typename Keep, typename FetchValue >
+void
+SparseSandboxMatrixView< Real, Device, Index, MatrixType >::
+reduceRows( IndexType begin, IndexType end, Fetch& fetch, const Reduce& reduce, Keep& keep, const FetchValue& zero ) const
+{
+   auto columnIndexesView = this->columnIndexes.getConstView();
+   auto valuesView = this->values.getConstView();
+   const IndexType padding = this->getPaddingIndex();
+   // SANDBOX_TODO: Replace the following code with the one for computing reduction in rows by your format.
+   //               Note that this method can be used for implementation of SpMV.
+   auto rowPointersView = this->rowPointers.getConstView();
+   auto processRow = [=] __cuda_callable__ ( IndexType rowIdx ) mutable {
+      const auto rowBegin = rowPointersView[ rowIdx ];
+      const auto rowEnd = rowPointersView[ rowIdx + 1 ];
+      FetchValue result = zero;
+      for( IndexType globalIdx = rowBegin; globalIdx < rowEnd; globalIdx++ )
+      {
+         const IndexType& columnIdx = columnIndexesView[ globalIdx ];
+         if( columnIdx == padding )
+            continue;
+         if( isBinary() )
+            result = reduce( result, fetch( rowIdx, columnIdx, 1 ) );
+         else
+            result = reduce( result, fetch( rowIdx, columnIdx, valuesView[ globalIdx ] ) );
+      }
+      keep( rowIdx, result );
+   };
+   TNL::Algorithms::ParallelFor< DeviceType >::exec( begin, end, processRow );
+}
+
+/**
+ * \brief Performs a reduction within each matrix row.
+ *
+ * Forwards to reduceRows() with the row range [ 0, getRows() ); see that
+ * method for the meaning of \e fetch, \e reduce, \e keep and \e zero.
+ */
+template< typename Real,
+          typename Device,
+          typename Index,
+          typename MatrixType >
+   template< typename Fetch, typename Reduce, typename Keep, typename FetchReal >
+void
+SparseSandboxMatrixView< Real, Device, Index, MatrixType >::
+reduceAllRows( Fetch& fetch, const Reduce& reduce, Keep& keep, const FetchReal& zero )
+{
+   this->reduceRows( 0, this->getRows(), fetch, reduce, keep, zero );
+}
+
+/**
+ * \brief Performs a reduction within each row of a constant matrix.
+ *
+ * Forwards to the constant reduceRows() with the row range [ 0, getRows() );
+ * see that method for the meaning of \e fetch, \e reduce, \e keep and \e zero.
+ */
+template< typename Real,
+          typename Device,
+          typename Index,
+          typename MatrixType >
+   template< typename Fetch, typename Reduce, typename Keep, typename FetchReal >
+void
+SparseSandboxMatrixView< Real, Device, Index, MatrixType >::
+reduceAllRows( Fetch& fetch, const Reduce& reduce, Keep& keep, const FetchReal& zero ) const
+{
+   this->reduceRows( 0, this->getRows(), fetch, reduce, keep, zero );
+}
+
+/**
+ * \brief Calls \e function for every element allocated in the rows [ \e begin, \e end )
+ *        of a constant matrix.
+ *
+ * Padding slots are visited as well. The function is called as
+ * `function( rowIdx, localIdx, columnIdx, value )`, where `localIdx` is the
+ * position of the element within its row; binary matrices pass 1 as the value.
+ *
+ * \param begin is the first row to be processed.
+ * \param end is the row after the last row to be processed.
+ * \param function is the function called for each element.
+ */
+template< typename Real,
+          typename Device,
+          typename Index,
+          typename MatrixType >
+   template< typename Function >
+void
+SparseSandboxMatrixView< Real, Device, Index, MatrixType >::
+forElements( IndexType begin, IndexType end, Function& function ) const
+{
+   const auto columnIndexesView = this->columnIndexes.getConstView();
+   const auto valuesView = this->values.getConstView();
+   // SANDBOX_TODO: Replace the following code with the one for iterating over all allocated matrix elements.
+   auto rowPointersView = this->rowPointers.getConstView();
+   auto processRow = [=] __cuda_callable__ ( IndexType rowIdx ) mutable {
+      const auto rowBegin = rowPointersView[ rowIdx ];
+      const auto rowEnd = rowPointersView[ rowIdx + 1 ];
+      IndexType localIdx( 0 );
+      for( IndexType globalIdx = rowBegin; globalIdx < rowEnd; globalIdx++, localIdx++ )
+      {
+         if( isBinary() )
+            function( rowIdx, localIdx, columnIndexesView[ globalIdx ], ( RealType ) 1 );
+         else
+            function( rowIdx, localIdx, columnIndexesView[ globalIdx ], valuesView[ globalIdx ] );
+      }
+   };
+   TNL::Algorithms::ParallelFor< DeviceType >::exec( begin, end, processRow );
+}
+
+/**
+ * \brief Calls \e function for every element allocated in the rows [ \e begin, \e end ).
+ *
+ * Padding slots are visited as well. The function is called as
+ * `function( rowIdx, localIdx, columnIdx, value )`, where `localIdx` is the
+ * position of the element within its row. For binary matrices the value
+ * argument is a row-local variable initialized to one.
+ *
+ * \param begin is the first row to be processed.
+ * \param end is the row after the last row to be processed.
+ * \param function is the function called for each element.
+ */
+template< typename Real,
+          typename Device,
+          typename Index,
+          typename MatrixType >
+   template< typename Function >
+void
+SparseSandboxMatrixView< Real, Device, Index, MatrixType >::
+forElements( IndexType begin, IndexType end, Function& function )
+{
+   auto columnIndexesView = this->columnIndexes.getView();
+   auto valuesView = this->values.getView();
+   // SANDBOX_TODO: Replace the following code with the one for iterating over all allocated matrix elements.
+   auto rowPointersView = this->rowPointers.getConstView();
+   auto processRow = [=] __cuda_callable__ ( IndexType rowIdx ) mutable {
+      const auto rowBegin = rowPointersView[ rowIdx ];
+      const auto rowEnd = rowPointersView[ rowIdx + 1 ];
+      IndexType localIdx( 0 );
+      RealType one( 1.0 );   // stand-in value handed over for elements of binary matrices
+      for( IndexType globalIdx = rowBegin; globalIdx < rowEnd; globalIdx++, localIdx++ )
+      {
+         if( isBinary() )
+            function( rowIdx, localIdx, columnIndexesView[ globalIdx ], one ); // TODO: Fix this without using `one`.
+         else
+            function( rowIdx, localIdx, columnIndexesView[ globalIdx ], valuesView[ globalIdx ] );
+      }
+   };
+   TNL::Algorithms::ParallelFor< DeviceType >::exec( begin, end, processRow );
+}
+
+/**
+ * \brief Calls \e function for every allocated element of a constant matrix.
+ *
+ * Forwards to the constant forElements() with the row range [ 0, getRows() ).
+ *
+ * \param function is the function called for each element.
+ */
+template< typename Real,
+          typename Device,
+          typename Index,
+          typename MatrixType >
+   template< typename Function >
+void
+SparseSandboxMatrixView< Real, Device, Index, MatrixType >::
+forAllElements( Function& function ) const
+{
+   this->forElements( 0, this->getRows(), function );
+}
+
+/**
+ * \brief Calls \e function for every allocated element of the matrix.
+ *
+ * Forwards to forElements() with the row range [ 0, getRows() ).
+ *
+ * \param function is the function called for each element.
+ */
+template< typename Real,
+          typename Device,
+          typename Index,
+          typename MatrixType >
+   template< typename Function >
+void
+SparseSandboxMatrixView< Real, Device, Index, MatrixType >::
+forAllElements( Function& function )
+{
+   this->forElements( 0, this->getRows(), function );
+}
+
+/**
+ * \brief Calls \e function with a row view of each matrix row in the range [ \e begin, \e end ).
+ *
+ * \param begin is the first row to be processed.
+ * \param end is the row after the last row to be processed.
+ * \param function is called as `function( rowView )`, where `rowView` is a RowView of the row.
+ */
+template< typename Real,
+          typename Device,
+          typename Index,
+          typename MatrixType >
+   template< typename Function >
+void
+SparseSandboxMatrixView< Real, Device, Index, MatrixType >::
+forRows( IndexType begin, IndexType end, Function&& function )
+{
+   auto columnIndexesView = this->columnIndexes.getView();
+   auto valuesView = this->values.getView();
+   // SANDBOX_TODO: Replace the following code with the one for iteration over matrix rows.
+   auto rowPointersView = this->rowPointers.getConstView();
+   auto processRow = [=] __cuda_callable__ ( IndexType rowIdx ) mutable {
+      const auto rowBegin = rowPointersView[ rowIdx ];                    // global index of the first element of the row
+      const auto rowCapacity = rowPointersView[ rowIdx + 1 ] - rowBegin;  // number of elements allocated for the row
+      RowView rowView( rowIdx, rowBegin, rowCapacity, valuesView, columnIndexesView );
+      function( rowView );
+   };
+   TNL::Algorithms::ParallelFor< DeviceType >::exec( begin, end, processRow );
+}
+
+/**
+ * \brief Calls \e function with a constant row view of each matrix row in the range [ \e begin, \e end ).
+ *
+ * \param begin is the first row to be processed.
+ * \param end is the row after the last row to be processed.
+ * \param function is called as `function( rowView )`, where `rowView` is a ConstRowView of the row.
+ */
+template< typename Real,
+          typename Device,
+          typename Index,
+          typename MatrixType >
+   template< typename Function >
+void
+SparseSandboxMatrixView< Real, Device, Index, MatrixType >::
+forRows( IndexType begin, IndexType end, Function&& function ) const
+{
+   const auto columnIndexesView = this->columnIndexes.getConstView();
+   const auto valuesView = this->values.getConstView();
+   // SANDBOX_TODO: Replace the following code with the one for iteration over matrix rows.
+   auto rowPointersView = this->rowPointers.getConstView();
+   auto processRow = [=] __cuda_callable__ ( IndexType rowIdx ) {
+      const auto rowBegin = rowPointersView[ rowIdx ];                    // global index of the first element of the row
+      const auto rowCapacity = rowPointersView[ rowIdx + 1 ] - rowBegin;  // number of elements allocated for the row
+      ConstRowView rowView( rowIdx, rowBegin, rowCapacity, valuesView, columnIndexesView );
+      function( rowView );
+   };
+   TNL::Algorithms::ParallelFor< DeviceType >::exec( begin, end, processRow );
+}
+
+/**
+ * \brief Calls \e function with a row view of every matrix row.
+ *
+ * Forwards to forRows() with the row range [ 0, getRows() ).
+ *
+ * \param function is the function called for each row.
+ */
+template< typename Real,
+          typename Device,
+          typename Index,
+          typename MatrixType >
+   template< typename Function >
+void
+SparseSandboxMatrixView< Real, Device, Index, MatrixType >::
+forAllRows( Function&& function )
+{
+   this->forRows( 0, this->getRows(), function );
+}
+
+/**
+ * \brief Calls \e function with a constant row view of every matrix row.
+ *
+ * Forwards to the constant forRows() with the row range [ 0, getRows() ).
+ *
+ * \param function is the function called for each row.
+ */
+template< typename Real,
+          typename Device,
+          typename Index,
+          typename MatrixType >
+   template< typename Function >
+void
+SparseSandboxMatrixView< Real, Device, Index, MatrixType >::
+forAllRows( Function&& function ) const
+{
+   this->forRows( 0, this->getRows(), function );
+}
+
+template< typename Real,
+          typename Device,
+          typename Index,
+          typename MatrixType >
+   template< typename Function >
+void
+SparseSandboxMatrixView< Real, Device, Index, MatrixType >::
+sequentialForRows( IndexType begin, IndexType end, Function& function ) const
+{
+   for( IndexType row = begin; row < end; row ++ )   // rows are visited one after another, in increasing order
+      this->forRows( row, row + 1, function );       // each forRows call is given the single-row range ( row, row + 1 )
+}
+
+template< typename Real,
+          typename Device,
+          typename Index,
+          typename MatrixType >
+   template< typename Function >
+void
+SparseSandboxMatrixView< Real, Device, Index, MatrixType >::
+sequentialForRows( IndexType begin, IndexType end, Function& function )
+{
+   for( IndexType row = begin; row < end; row ++ )   // rows are visited one after another, in increasing order
+      this->forRows( row, row + 1, function );       // each forRows call is given the single-row range ( row, row + 1 )
+}
+
+template< typename Real,
+          typename Device,
+          typename Index,
+          typename MatrixType >
+   template< typename Function >
+void
+SparseSandboxMatrixView< Real, Device, Index, MatrixType >::
+sequentialForAllRows( Function& function ) const
+{
+   this->sequentialForRows( 0, this->getRows(), function );   // delegates to sequentialForRows with begin = 0 and end = getRows()
+}
+
+template< typename Real,
+          typename Device,
+          typename Index,
+          typename MatrixType >
+   template< typename Function >
+void
+SparseSandboxMatrixView< Real, Device, Index, MatrixType >::
+sequentialForAllRows( Function& function )
+{
+   this->sequentialForRows( 0, this->getRows(), function );   // delegates to sequentialForRows with begin = 0 and end = getRows()
+}
+
+/*template< typename Real,
+          template< typename, typename > class SegmentsView,
+          typename Device,
+          typename Index,
+          typename RealAllocator,
+          typename IndexAllocator >
+template< typename Real2, template< typename, typename > class Segments2, typename Index2, typename RealAllocator2, typename IndexAllocator2 >
+void
+SparseSandboxMatrixView< Real, Device, Index, MatrixType >::
+addMatrix( const SparseSandboxMatrixView< Real2, Segments2, Device, Index2, RealAllocator2, IndexAllocator2 >& matrix,
+           const RealType& matrixMultiplicator,
+           const RealType& thisMatrixMultiplicator )
+{
+
+}
+
+template< typename Real,
+          template< typename, typename > class SegmentsView,
+          typename Device,
+          typename Index,
+          typename RealAllocator,
+          typename IndexAllocator >
+template< typename Real2, typename Index2 >
+void
+SparseSandboxMatrixView< Real, Device, Index, MatrixType >::
+getTransposition( const SparseSandboxMatrixView< Real2, Device, Index2 >& matrix,
+                  const RealType& matrixMultiplicator )
+{
+
+}*/
+
+template< typename Real,
+          typename Device,
+          typename Index,
+          typename MatrixType >
+template< typename Vector1, typename Vector2 >
+bool
+SparseSandboxMatrixView< Real, Device, Index, MatrixType >::
+performSORIteration( const Vector1& b,
+                     const IndexType row,
+                     Vector2& x,
+                     const RealType& omega ) const
+{
+   return false;   // stub: no SOR step is performed, the arguments are not used
+}
+
+template< typename Real,
+          typename Device,
+          typename Index,
+          typename MatrixType >
+SparseSandboxMatrixView< Real, Device, Index, MatrixType >&
+SparseSandboxMatrixView< Real, Device, Index, MatrixType >::
+operator=( const SparseSandboxMatrixView< Real, Device, Index, MatrixType >& matrix )
+{
+   MatrixView< Real, Device, Index >::operator=( matrix );   // base-class part of the assignment
+   this->columnIndexes.bind( matrix.columnIndexes );         // bind the column indexes of `matrix` into this view
+   // SANDBOX_TODO: Replace the following line with assignment of metadata required by your
+   //               sparse format.
+   this->rowPointers.bind( matrix.rowPointers );
+   return *this;
+}
+
+template< typename Real,
+          typename Device,
+          typename Index,
+          typename MatrixType >
+   template< typename Matrix >
+bool
+SparseSandboxMatrixView< Real, Device, Index, MatrixType >::
+operator==( const Matrix& m ) const
+{
+   const auto& view1 = *this;
+   // FIXME: getConstView does not work
+   //const auto view2 = m.getConstView();
+   const auto view2 = m.getView();   // non-constant view used instead, see the FIXME above
+   auto fetch = [=] __cuda_callable__ ( const IndexType i ) -> bool
+   {
+      return view1.getRow( i ) == view2.getRow( i );   // compares the i-th rows of both matrices
+   };
+   return Algorithms::reduce< DeviceType >( 0, this->getRows(), fetch, std::logical_and<>{}, true );   // logical AND of the fetch results, with true as the initial value
+}
+
+template< typename Real,
+          typename Device,
+          typename Index,
+          typename MatrixType >
+   template< typename Matrix >
+bool
+SparseSandboxMatrixView< Real, Device, Index, MatrixType >::
+operator!=( const Matrix& m ) const
+{
+   return ! operator==( m );   // negation of operator==
+}
+
+template< typename Real,
+          typename Device,
+          typename Index,
+          typename MatrixType >
+void
+SparseSandboxMatrixView< Real, Device, Index, MatrixType >::
+save( File& file ) const
+{
+   MatrixView< RealType, DeviceType, IndexType >::save( file );   // base-class part is written first
+   file << this->columnIndexes
+        << this->rowPointers;  // SANDBOX_TODO: Replace this with metadata required by your format
+}
+
+template< typename Real,
+          typename Device,
+          typename Index,
+          typename MatrixType >
+void
+SparseSandboxMatrixView< Real, Device, Index, MatrixType >::
+save( const String& fileName ) const
+{
+   Object::save( fileName );   // forwards the file name to Object::save
+}
+
+template< typename Real,
+          typename Device,
+          typename Index,
+          typename MatrixType >
+void
+SparseSandboxMatrixView< Real, Device, Index, MatrixType >::
+print( std::ostream& str ) const
+{
+   if( isSymmetric() )   // symmetric case: every ( row, column ) pair is read through getElement()
+   {
+      for( IndexType row = 0; row < this->getRows(); row++ )
+      {
+         str <<"Row: " << row << " -> ";
+         for( IndexType column = 0; column < this->getColumns(); column++ )
+         {
+            auto value = this->getElement( row, column );
+            if( value != ( RealType ) 0 )   // only non-zero values are printed
+               str << " Col:" << column << "->" << value << "\t";
+         }
+         str << std::endl;
+      }
+   }
+   else
+      for( IndexType row = 0; row < this->getRows(); row++ )
+      {
+         str <<"Row: " << row << " -> ";
+         // SANDBOX_TODO: Replace the following line with a code for computing number of elements allocated for given matrix row.
+         const auto rowLength = this->rowPointers.getElement( row + 1 ) - this->rowPointers.getElement( row );
+         for( IndexType i = 0; i < rowLength; i++ )
+         {
+            // SANDBOX_TODO: Replace the following line with a code for getting index of the matrix element in arrays `values` and `columnIndexes`.
+            const IndexType globalIdx = this->rowPointers.getElement( row ) + i;
+            const IndexType column = this->columnIndexes.getElement( globalIdx );
+            if( column == this->getPaddingIndex() )
+               break;   // padding index reached: stop printing this row
+            RealType value;
+            if( isBinary() )
+               value = ( RealType ) 1.0;   // binary matrix: 1 is used as the value, `values` is not read
+            else
+               value = this->values.getElement( globalIdx );
+            if( value )   // zero values are skipped
+            {
+               std::stringstream str_;
+               str_ << std::setw( 4 ) << std::right << column << ":" << std::setw( 4 ) << std::left << value;
+               str << std::setw( 10 ) << str_.str();
+            }
+         }
+         str << std::endl;
+      }
+}
+
+template< typename Real,
+          typename Device,
+          typename Index,
+          typename MatrixType >
+__cuda_callable__
+Index
+SparseSandboxMatrixView< Real, Device, Index, MatrixType >::
+getPaddingIndex() const
+{
+   return -1;   // the padding column index is the constant -1
+}
+
+template< typename Real,
+          typename Device,
+          typename Index,
+          typename MatrixType >
+auto
+SparseSandboxMatrixView< Real, Device, Index, MatrixType >::
+getColumnIndexes() const -> const ColumnsIndexesViewType&
+{
+   return this->columnIndexes;   // constant reference to the column indexes member
+}
+
+template< typename Real,
+          typename Device,
+          typename Index,
+          typename MatrixType >
+auto
+SparseSandboxMatrixView< Real, Device, Index, MatrixType >::
+getColumnIndexes() -> ColumnsIndexesViewType&
+{
+   return this->columnIndexes;   // non-constant reference to the column indexes member
+}
+
+      } // namespace Sandbox
+   } //namespace Matrices
+} // namespace  TNL
diff --git a/src/TNL/Matrices/SparseMatrix.h b/src/TNL/Matrices/SparseMatrix.h
index 237417d66aa903b176b1e502a7f92cec99c81049..d6420438137bdace3f5a96082bbc11c5e8cce2f4 100644
--- a/src/TNL/Matrices/SparseMatrix.h
+++ b/src/TNL/Matrices/SparseMatrix.h
@@ -54,7 +54,7 @@ class SparseMatrix : public Matrix< Real, Device, Index, RealAllocator >
    static_assert(
          ! MatrixType::isSymmetric() ||
          ! std::is_same< Device, Devices::Cuda >::value ||
-         ( std::is_same< Real, float >::value || std::is_same< Real, double >::value || std::is_same< Real, int >::value || std::is_same< Real, long long int >::value ),
+         ( std::is_same< Real, float >::value || std::is_same< Real, double >::value || std::is_same< Real, int >::value || std::is_same< Real, long long int >::value || std::is_same< Real, bool >::value ),
          "Given Real type is not supported by atomic operations on GPU which are necessary for symmetric operations." );
 
    public:
@@ -604,16 +604,28 @@ class SparseMatrix : public Matrix< Real, Device, Index, RealAllocator >
        * \brief Method for performing general reduction on matrix rows.
        *
        * \tparam Fetch is a type of lambda function for data fetch declared as
-       *          `fetch( IndexType rowIdx, IndexType& columnIdx, RealType& elementValue ) -> FetchValue`.
-       *          The return type of this lambda can be any non void.
+       *
+       * ```
+       * auto fetch = [=] __cuda_callable__ ( IndexType rowIdx, IndexType& columnIdx, RealType& elementValue ) -> FetchValue { ... };
+       * ```
+       *
+       *  The return type of this lambda can be any non void.
        * \tparam Reduce is a type of lambda function for reduction declared as
-       *          `reduce( const FetchValue& v1, const FetchValue& v2 ) -> FetchValue`.
-       * \tparam Keep is a type of lambda function for storing results of reduction in each row.
-       *          It is declared as `keep( const IndexType rowIdx, const double& value )`.
+       *
+       * ```
+       * auto reduce = [=] __cuda_callable__ ( const FetchValue& v1, const FetchValue& v2 ) -> FetchValue { ... };
+       * ```
+       *
+       * \tparam Keep is a type of lambda function for storing results of reduction in each row. It is declared as
+       *
+       * ```
+       * auto keep = [=] __cuda_callable__ ( const IndexType rowIdx, const double& value ) { ... };
+       * ```
+       *
        * \tparam FetchValue is type returned by the Fetch lambda function.
        *
-       * \param begin defines beginning of the range [begin,end) of rows to be processed.
-       * \param end defines ending of the range [begin,end) of rows to be processed.
+       * \param begin defines beginning of the range [ \e begin, \e end ) of rows to be processed.
+       * \param end defines ending of the range [ \e begin, \e end ) of rows to be processed.
        * \param fetch is an instance of lambda function for data fetch.
        * \param reduce is an instance of lambda function for reduction.
        * \param keep in an instance of lambda function for storing results.
@@ -633,16 +645,28 @@ class SparseMatrix : public Matrix< Real, Device, Index, RealAllocator >
        * \brief Method for performing general reduction on matrix rows for constant instances.
        *
        * \tparam Fetch is a type of lambda function for data fetch declared as
-       *          `fetch( IndexType rowIdx, IndexType& columnIdx, RealType& elementValue ) -> FetchValue`.
-       *          The return type of this lambda can be any non void.
+       *
+       * ```
+       * auto fetch = [=] __cuda_callable__ ( IndexType rowIdx, IndexType& columnIdx, RealType& elementValue ) -> FetchValue { ... };
+       * ```
+       *
+       *  The return type of this lambda can be any non void.
        * \tparam Reduce is a type of lambda function for reduction declared as
-       *          `reduce( const FetchValue& v1, const FetchValue& v2 ) -> FetchValue`.
-       * \tparam Keep is a type of lambda function for storing results of reduction in each row.
-       *          It is declared as `keep( const IndexType rowIdx, const double& value )`.
+       *
+       * ```
+       * auto reduce = [=] __cuda_callable__ ( const FetchValue& v1, const FetchValue& v2 ) -> FetchValue { ... };
+       * ```
+       *
+       * \tparam Keep is a type of lambda function for storing results of reduction in each row. It is declared as
+       *
+       * ```
+       * auto keep = [=] __cuda_callable__ ( const IndexType rowIdx, const double& value ) { ... };
+       * ```
+       *
        * \tparam FetchValue is type returned by the Fetch lambda function.
        *
-       * \param begin defines beginning of the range [begin,end) of rows to be processed.
-       * \param end defines ending of the range [begin,end) of rows to be processed.
+       * \param begin defines beginning of the range [ \e begin, \e end ) of rows to be processed.
+       * \param end defines ending of the range [ \e begin, \e end ) of rows to be processed.
        * \param fetch is an instance of lambda function for data fetch.
        * \param reduce is an instance of lambda function for reduction.
        * \param keep in an instance of lambda function for storing results.
@@ -662,12 +686,24 @@ class SparseMatrix : public Matrix< Real, Device, Index, RealAllocator >
        * \brief Method for performing general reduction on all matrix rows.
        *
        * \tparam Fetch is a type of lambda function for data fetch declared as
-       *          `fetch( IndexType rowIdx, IndexType& columnIdx, RealType& elementValue ) -> FetchValue`.
-       *          The return type of this lambda can be any non void.
+       *
+       * ```
+       * auto fetch = [=] __cuda_callable__ ( IndexType rowIdx, IndexType& columnIdx, RealType& elementValue ) -> FetchValue { ... };
+       * ```
+       *
+       *  The return type of this lambda can be any non void.
        * \tparam Reduce is a type of lambda function for reduction declared as
-       *          `reduce( const FetchValue& v1, const FetchValue& v2 ) -> FetchValue`.
-       * \tparam Keep is a type of lambda function for storing results of reduction in each row.
-       *          It is declared as `keep( const IndexType rowIdx, const double& value )`.
+       *
+       * ```
+       * auto reduce = [=] __cuda_callable__ ( const FetchValue& v1, const FetchValue& v2 ) -> FetchValue { ... };
+       * ```
+       *
+       * \tparam Keep is a type of lambda function for storing results of reduction in each row. It is declared as
+       *
+       * ```
+       * auto keep = [=] __cuda_callable__ ( const IndexType rowIdx, const double& value ) { ... };
+       * ```
+       *
        * \tparam FetchValue is type returned by the Fetch lambda function.
        *
        * \param fetch is an instance of lambda function for data fetch.
@@ -689,12 +725,24 @@ class SparseMatrix : public Matrix< Real, Device, Index, RealAllocator >
        * \brief Method for performing general reduction on all matrix rows for constant instances.
        *
        * \tparam Fetch is a type of lambda function for data fetch declared as
-       *          `fetch( IndexType rowIdx, IndexType& columnIdx, RealType& elementValue ) -> FetchValue`.
-       *          The return type of this lambda can be any non void.
+       *
+       * ```
+       * auto fetch = [=] __cuda_callable__ ( IndexType rowIdx, IndexType& columnIdx, RealType& elementValue ) -> FetchValue { ... };
+       * ```
+       *
+       *  The return type of this lambda can be any non void.
        * \tparam Reduce is a type of lambda function for reduction declared as
-       *          `reduce( const FetchValue& v1, const FetchValue& v2 ) -> FetchValue`.
-       * \tparam Keep is a type of lambda function for storing results of reduction in each row.
-       *          It is declared as `keep( const IndexType rowIdx, const double& value )`.
+       *
+       * ```
+       * auto reduce = [=] __cuda_callable__ ( const FetchValue& v1, const FetchValue& v2 ) -> FetchValue { ... };
+       * ```
+       *
+       * \tparam Keep is a type of lambda function for storing results of reduction in each row. It is declared as
+       *
+       * ```
+       * auto keep = [=] __cuda_callable__ ( const IndexType rowIdx, const double& value ) { ... };
+       * ```
+       *
        * \tparam FetchValue is type returned by the Fetch lambda function.
        *
        * \param fetch is an instance of lambda function for data fetch.
@@ -717,19 +765,17 @@ class SparseMatrix : public Matrix< Real, Device, Index, RealAllocator >
        *
        * \tparam Function is type of lambda function that will operate on matrix elements.
        *
-       * \param begin defines beginning of the range [ \e begin,\e end ) of rows to be processed.
+       * \param begin defines beginning of the range [ \e begin, \e end ) of rows to be processed.
        * \param end defines ending of the range [ \e begin, \e end ) of rows to be processed.
        * \param function is an instance of the lambda function to be called for element of given rows.
        *
        * The lambda function `function` should be declared like follows:
        *
        * ```
-       * auto function = [] __cuda_callable__ ( IndexType rowIdx, IndexType localIdx, IndexType columnIdx, const RealType& value, bool& compute ) { ... };
+       * auto function = [] __cuda_callable__ ( IndexType rowIdx, IndexType localIdx, IndexType columnIdx, const RealType& value ) { ... };
        * ```
        *
        *  The \e localIdx parameter is a rank of the non-zero element in given row.
-       *  If the 'compute' variable is set to false the iteration over the row can
-       *  be interrupted.
        *
        * \par Example
        * \include Matrices/SparseMatrix/SparseMatrixExample_forElements.cpp
@@ -751,12 +797,10 @@ class SparseMatrix : public Matrix< Real, Device, Index, RealAllocator >
        * The lambda function `function` should be declared like follows:
        *
        * ```
-       * auto function = [] __cuda_callable__ ( IndexType rowIdx, IndexType localIdx, IndexType columnIdx, const RealType& value, bool& compute ) mutable { ... }
+       * auto function = [] __cuda_callable__ ( IndexType rowIdx, IndexType localIdx, IndexType& columnIdx, const RealType& value ) mutable { ... }
        * ```
        *
        *  The \e localIdx parameter is a rank of the non-zero element in given row.
-       *  If the 'compute' variable is set to false the iteration over the row can
-       *  be interrupted.
        *
        * \par Example
        * \include Matrices/SparseMatrix/SparseMatrixExample_forElements.cpp
@@ -901,15 +945,16 @@ class SparseMatrix : public Matrix< Real, Device, Index, RealAllocator >
       /**
        * \brief Method for sequential iteration over all matrix rows for constant instances.
        *
-       * \tparam Function is type of lambda function that will operate on matrix elements.
-       *    It is should have form like
-       *  `function( IndexType rowIdx, IndexType columnIdx, IndexType columnIdx_, const RealType& value, bool& compute )`.
-       *  The column index repeats twice only for compatibility with sparse matrices.
-       *  If the 'compute' variable is set to false the iteration over the row can
-       *  be interrupted.
-       *
-       * \param begin defines beginning of the range [begin,end) of rows to be processed.
-       * \param end defines ending of the range [begin,end) of rows to be processed.
+       * \tparam Function is type of lambda function that will operate on matrix elements. It should have the form
+       *
+       * ```
+       * auto function = [] __cuda_callable__ ( RowView& row ) { ... };
+       * ```
+       *
+       * \e RowView represents matrix row - see \ref TNL::Matrices::SparseMatrix::RowView.
+       *
+       * \param begin defines beginning of the range [ \e begin, \e end ) of rows to be processed.
+       * \param end defines ending of the range [ \e begin, \e end ) of rows to be processed.
        * \param function is an instance of the lambda function to be called in each row.
        */
       template< typename Function >
@@ -918,15 +963,16 @@ class SparseMatrix : public Matrix< Real, Device, Index, RealAllocator >
       /**
        * \brief Method for sequential iteration over all matrix rows for non-constant instances.
        *
-       * \tparam Function is type of lambda function that will operate on matrix elements.
-       *    It is should have form like
-       *  `function( IndexType rowIdx, IndexType columnIdx, IndexType columnIdx_, RealType& value, bool& compute )`.
-       *  The column index repeats twice only for compatibility with sparse matrices.
-       *  If the 'compute' variable is set to false the iteration over the row can
-       *  be interrupted.
-       *
-       * \param begin defines beginning of the range [begin,end) of rows to be processed.
-       * \param end defines ending of the range [begin,end) of rows to be processed.
+       * \tparam Function is type of lambda function that will operate on matrix elements. It should have the form
+       *
+       * ```
+       * auto function = [] __cuda_callable__ ( RowView& row ) { ... };
+       * ```
+       *
+       * \e RowView represents matrix row - see \ref TNL::Matrices::SparseMatrix::RowView.
+       *
+       * \param begin defines beginning of the range [ \e begin, \e end ) of rows to be processed.
+       * \param end defines ending of the range [ \e begin, \e end ) of rows to be processed.
        * \param function is an instance of the lambda function to be called in each row.
        */
       template< typename Function >
@@ -959,7 +1005,9 @@ class SparseMatrix : public Matrix< Real, Device, Index, RealAllocator >
        *
        * More precisely, it computes:
        *
-       * `outVector = matrixMultiplicator * ( * this ) * inVector + outVectorMultiplicator * outVector`
+       * ```
+       * outVector = matrixMultiplicator * ( * this ) * inVector + outVectorMultiplicator * outVector
+       * ```
        *
        * \tparam InVector is type of input vector.  It can be \ref Vector,
        *     \ref VectorView, \ref Array, \ref ArraView or similar container.
@@ -982,8 +1030,8 @@ class SparseMatrix : public Matrix< Real, Device, Index, RealAllocator >
                           OutVector& outVector,
                           const ComputeRealType& matrixMultiplicator = 1.0,
                           const ComputeRealType& outVectorMultiplicator = 0.0,
-                          const IndexType firstRow = 0,
-                          const IndexType lastRow = 0 ) const;
+                          const IndexType begin = 0,
+                          const IndexType end = 0 ) const;
 
       /*template< typename Real2, typename Index2 >
       void addMatrix( const SparseMatrix< Real2, Segments, Device, Index2 >& matrix,
diff --git a/src/TNL/Matrices/SparseMatrix.hpp b/src/TNL/Matrices/SparseMatrix.hpp
index 08e55190f4e1ba9ff6a538cfb3e9afb45f78aae2..dd11c6cf74cd4b961331f220829bb0c79428f69a 100644
--- a/src/TNL/Matrices/SparseMatrix.hpp
+++ b/src/TNL/Matrices/SparseMatrix.hpp
@@ -165,7 +165,7 @@ getConstView() const -> ConstViewType
                          this->getColumns(),
                          this->getValues().getConstView(),
                          this->columnIndexes.getConstView(),
-                         this->segments.getConstView() );
+                         const_cast< SparseMatrix* >( this )->segments.getView() );
 }
 
 template< typename Real,
@@ -417,6 +417,7 @@ SparseMatrix< Real, Device, Index, MatrixType, Segments, ComputeReal, RealAlloca
 reset()
 {
    BaseType::reset();
+   this->columnIndexes.reset();
    this->segments.reset();
    this->view = this->getView();
    TNL_ASSERT_EQ( this->getRows(), segments.getSegmentsCount(), "mismatched segments count" );
@@ -523,7 +524,7 @@ vectorProduct( const InVector& inVector,
                const IndexType firstRow,
                const IndexType lastRow ) const
 {
-   this->view.vectorProduct( inVector, outVector, matrixMultiplicator, outVectorMultiplicator, firstRow, lastRow );
+   this->getView().vectorProduct( inVector, outVector, matrixMultiplicator, outVectorMultiplicator, firstRow, lastRow );
 }
 
 template< typename Real,
@@ -890,7 +891,7 @@ operator=( const DenseMatrix< Real_, Device_, Index_, Organization, RealAllocato
    if( std::is_same< DeviceType, RHSDeviceType >::value )
    {
       const auto segments_view = this->segments.getView();
-      auto f = [=] __cuda_callable__ ( RHSIndexType rowIdx, RHSIndexType localIdx, RHSIndexType columnIdx, const RHSRealType& value, bool& compute ) mutable {
+      auto f = [=] __cuda_callable__ ( RHSIndexType rowIdx, RHSIndexType localIdx, RHSIndexType columnIdx, const RHSRealType& value ) mutable {
          if( value != 0.0 )
          {
             IndexType thisGlobalIdx = segments_view.getGlobalIndex( rowIdx, rowLocalIndexes_view[ rowIdx ]++ );
@@ -921,7 +922,7 @@ operator=( const DenseMatrix< Real_, Device_, Index_, Organization, RealAllocato
 
          ////
          // Copy matrix elements into buffer
-         auto f1 = [=] __cuda_callable__ ( RHSIndexType rowIdx, RHSIndexType localIdx, RHSIndexType columnIndex, const RHSRealType& value, bool& compute ) mutable {
+         auto f1 = [=] __cuda_callable__ ( RHSIndexType rowIdx, RHSIndexType localIdx, RHSIndexType columnIndex, const RHSRealType& value ) mutable {
             const IndexType bufferIdx = ( rowIdx - baseRow ) * maxRowLength + localIdx;
             matrixValuesBuffer_view[ bufferIdx ] = value;
          };
@@ -935,7 +936,7 @@ operator=( const DenseMatrix< Real_, Device_, Index_, Organization, RealAllocato
          // Copy matrix elements from the buffer to the matrix and ignoring
          // zero matrix elements.
          const IndexType matrix_columns = this->getColumns();
-         auto f2 = [=] __cuda_callable__ ( IndexType rowIdx, IndexType localIdx, IndexType& columnIndex, RealType& value, bool& compute  ) mutable {
+         auto f2 = [=] __cuda_callable__ ( IndexType rowIdx, IndexType localIdx, IndexType& columnIndex, RealType& value ) mutable {
             RealType inValue( 0.0 );
             IndexType bufferIdx, column( rowLocalIndexes_view[ rowIdx ] );
             while( inValue == 0.0 && column < matrix_columns )
@@ -1001,7 +1002,7 @@ operator=( const RHSMatrix& matrix )
    if( std::is_same< DeviceType, RHSDeviceType >::value )
    {
       const auto segments_view = this->segments.getView();
-      auto f = [=] __cuda_callable__ ( RHSIndexType rowIdx, RHSIndexType localIdx_, RHSIndexType columnIndex, const RHSRealType& value, bool& compute ) mutable {
+      auto f = [=] __cuda_callable__ ( RHSIndexType rowIdx, RHSIndexType localIdx_, RHSIndexType columnIndex, const RHSRealType& value ) mutable {
          IndexType localIdx( rowLocalIndexes_view[ rowIdx ] );
          if( value != 0.0 && columnIndex != paddingIndex )
          {
@@ -1043,7 +1044,7 @@ operator=( const RHSMatrix& matrix )
 
          ////
          // Copy matrix elements into buffer
-         auto f1 = [=] __cuda_callable__ ( RHSIndexType rowIdx, RHSIndexType localIdx, RHSIndexType columnIndex, const RHSRealType& value, bool& compute ) mutable {
+         auto f1 = [=] __cuda_callable__ ( RHSIndexType rowIdx, RHSIndexType localIdx, RHSIndexType columnIndex, const RHSRealType& value ) mutable {
             if( columnIndex != paddingIndex )
             {
                TNL_ASSERT_LT( rowIdx - baseRow, bufferRowsCount, "" );
@@ -1066,7 +1067,7 @@ operator=( const RHSMatrix& matrix )
          // zero matrix elements
          //const IndexType matrix_columns = this->getColumns();
          const auto thisRowLengths_view = thisRowLengths.getConstView();
-         auto f2 = [=] __cuda_callable__ ( IndexType rowIdx, IndexType localIdx, IndexType& columnIndex, RealType& value, bool& compute ) mutable {
+         auto f2 = [=] __cuda_callable__ ( IndexType rowIdx, IndexType localIdx, IndexType& columnIndex, RealType& value ) mutable {
             RealType inValue( 0.0 );
             size_t bufferIdx;
             IndexType bufferLocalIdx( rowLocalIndexes_view[ rowIdx ] );
diff --git a/src/TNL/Matrices/SparseMatrixElement.h b/src/TNL/Matrices/SparseMatrixElement.h
index 485fb919b95a0938dc9d4b3b67093be31008f0a6..3dcb74379d63c5c2f9fe1af7fc74d43dd86492da 100644
--- a/src/TNL/Matrices/SparseMatrixElement.h
+++ b/src/TNL/Matrices/SparseMatrixElement.h
@@ -18,17 +18,43 @@ namespace TNL {
 namespace Matrices {
 
 
+/**
+ * \brief Accessor for sparse matrix elements.
+ *
+ * \tparam Real is a type of matrix elements values.
+ * \tparam Index is a type of matrix elements column indexes.
+ */
 template< typename Real,
-          typename Index,
-          bool isBinary_ = false >
+          typename Index >
 class SparseMatrixElement
 {
    public:
 
+      /**
+       * \brief Test of binary matrix type.
+       *
+       * \return \e true if the matrix is stored as binary and \e false otherwise.
+       */
+      static constexpr bool isBinary() { return std::is_same< std::remove_const_t< Real >, bool >::value; };
+
+      /**
+       * \brief Type of matrix elements values.
+       */
       using RealType = Real;
 
+      /**
+       * \brief Type of matrix elements column indexes.
+       */
       using IndexType = Index;
 
+      /**
+       * \brief Constructor.
+       *
+       * \param value is matrix element value.
+       * \param rowIdx is row index of the matrix element.
+       * \param columnIdx is a column index of the matrix element.
+       * \param localIdx is the rank of the non-zero element in the matrix row.
+       */
       __cuda_callable__
       SparseMatrixElement( RealType& value,
                            const IndexType& rowIdx,
@@ -36,21 +62,51 @@ class SparseMatrixElement
                            const IndexType& localIdx )
       : value_( value ), rowIdx( rowIdx ), columnIdx( columnIdx ), localIdx( localIdx ) {};
 
+      /**
+       * \brief Returns reference on matrix element value.
+       *
+       * \return reference on matrix element value.
+       */
       __cuda_callable__
       RealType& value() { return value_; };
 
+      /**
+       * \brief Returns constant reference on matrix element value.
+       *
+       * \return constant reference on matrix element value.
+       */
       __cuda_callable__
       const RealType& value() const { return value_; };
 
+      /**
+       * \brief Returns constant reference on matrix element row index.
+       *
+       * \return constant reference on matrix element row index.
+       */
       __cuda_callable__
       const IndexType& rowIndex() const { return rowIdx; };
 
+      /**
+       * \brief Returns reference on matrix element column index.
+       *
+       * \return reference on matrix element column index.
+       */
       __cuda_callable__
       IndexType& columnIndex() { return columnIdx; };
 
+      /**
+       * \brief Returns constant reference on matrix element column index.
+       *
+       * \return constant reference on matrix element column index.
+       */
       __cuda_callable__
       const IndexType& columnIndex() const { return columnIdx; };
 
+      /**
+       * \brief Returns constant reference on the rank of the non-zero matrix element in the row.
+       *
+       * \return constant reference on the rank of the non-zero matrix element in the row.
+       */
       __cuda_callable__
       const IndexType& localIndex() const { return localIdx; };
 
diff --git a/src/TNL/Matrices/SparseMatrixRowView.h b/src/TNL/Matrices/SparseMatrixRowView.h
index 4976a420e22fb4544a4fbb454e7a17574105098d..10236d94dca47f220646f10b3ced04dc0e871dc1 100644
--- a/src/TNL/Matrices/SparseMatrixRowView.h
+++ b/src/TNL/Matrices/SparseMatrixRowView.h
@@ -14,6 +14,7 @@
 
 #include <TNL/Cuda/CudaCallable.h>
 #include <TNL/Matrices/MatrixRowViewIterator.h>
+#include <TNL/Matrices/details/SparseMatrixRowViewValueGetter.h>
 
 namespace TNL {
 namespace Matrices {
@@ -24,7 +25,6 @@ namespace Matrices {
  * \tparam SegmentView is a segment view of segments representing the matrix format.
  * \tparam ValuesView is a vector view storing the matrix elements values.
  * \tparam ColumnsIndexesView is a vector view storing the column indexes of the matrix element.
- * \tparam isBinary tells if the the parent matrix is a binary matrix.
  *
  * See \ref SparseMatrix and \ref SparseMatrixView.
  *
@@ -40,12 +40,17 @@ namespace Matrices {
  */
 template< typename SegmentView,
           typename ValuesView,
-          typename ColumnsIndexesView,
-          bool isBinary_ >
+          typename ColumnsIndexesView >
 class SparseMatrixRowView
 {
    public:
 
+      /**
+       * \brief Tells whether the parent matrix is a binary matrix.
+       * @return `true` if the matrix is binary.
+       */
+      static constexpr bool isBinary() { return std::is_same< std::remove_const_t< RealType >, bool >::value; };
+
       /**
        * \brief The type of matrix elements.
        */
@@ -84,12 +89,12 @@ class SparseMatrixRowView
       /**
        * \brief Type of sparse matrix row view.
        */
-      using RowView = SparseMatrixRowView< SegmentView, ValuesViewType, ColumnsIndexesViewType, isBinary_ >;
+      using RowView = SparseMatrixRowView< SegmentView, ValuesViewType, ColumnsIndexesViewType >;
 
       /**
        * \brief Type of constant sparse matrix row view.
        */
-      using ConstView = SparseMatrixRowView< SegmentView, ConstValuesViewType, ConstColumnsIndexesViewType, isBinary_ >;
+      using ConstView = SparseMatrixRowView< SegmentView, ConstValuesViewType, ConstColumnsIndexesViewType >;
 
       /**
        * \brief The type of related matrix element.
@@ -101,11 +106,7 @@ class SparseMatrixRowView
        */
       using IteratorType = MatrixRowViewIterator< RowView >;
 
-      /**
-       * \brief Tells whether the parent matrix is a binary matrix.
-       * @return `true` if the matrix is binary.
-       */
-      static constexpr bool isBinary() { return isBinary_; };
+      using ValueGetterType = details::SparseMatrixRowViewValueGetter< SegmentView, ValuesView, ColumnsIndexesView >;
 
       /**
        * \brief Constructor with \e segmentView, \e values and \e columnIndexes.
@@ -163,7 +164,7 @@ class SparseMatrixRowView
        * \return constant reference to the matrix element value.
        */
       __cuda_callable__
-      const RealType& getValue( const IndexType localIdx ) const;
+      auto getValue( const IndexType localIdx ) const -> typename ValueGetterType::ConstResultType;
 
       /**
        * \brief Returns non-constants reference to value of an element with given rank in the row.
@@ -173,7 +174,7 @@ class SparseMatrixRowView
        * \return non-constant reference to the matrix element value.
        */
       __cuda_callable__
-      RealType& getValue( const IndexType localIdx );
+      auto getValue( const IndexType localIdx ) -> typename ValueGetterType::ResultType;
 
       /**
        * \brief Sets a value of matrix element with given rank in the matrix row.
@@ -217,10 +218,9 @@ class SparseMatrixRowView
        */
       template< typename _SegmentView,
                 typename _ValuesView,
-                typename _ColumnsIndexesView,
-                bool _isBinary >
+                typename _ColumnsIndexesView >
       __cuda_callable__
-      bool operator==( const SparseMatrixRowView< _SegmentView, _ValuesView, _ColumnsIndexesView, _isBinary >& other ) const;
+      bool operator==( const SparseMatrixRowView< _SegmentView, _ValuesView, _ColumnsIndexesView >& other ) const;
 
       /**
        * \brief Returns iterator pointing at the beginning of the matrix row.
@@ -254,6 +254,9 @@ class SparseMatrixRowView
       __cuda_callable__
       const IteratorType cend() const;
 
+      __cuda_callable__
+      IndexType getPaddingIndex() const { return -1; };
+
    protected:
 
       SegmentViewType segmentView;
@@ -272,9 +275,8 @@ class SparseMatrixRowView
  */
 template< typename SegmentView,
           typename ValuesView,
-          typename ColumnsIndexesView,
-          bool isBinary_ >
-std::ostream& operator<<( std::ostream& str, const SparseMatrixRowView< SegmentView, ValuesView, ColumnsIndexesView, isBinary_ >& row );
+          typename ColumnsIndexesView >
+std::ostream& operator<<( std::ostream& str, const SparseMatrixRowView< SegmentView, ValuesView, ColumnsIndexesView >& row );
 
 } // namespace Matrices
 } // namespace TNL
diff --git a/src/TNL/Matrices/SparseMatrixRowView.hpp b/src/TNL/Matrices/SparseMatrixRowView.hpp
index 82ae9b8706e4998fc88f2403afd109bb68f61f51..2f14774df3e6153e1c335f3f4b659eb32e30abd0 100644
--- a/src/TNL/Matrices/SparseMatrixRowView.hpp
+++ b/src/TNL/Matrices/SparseMatrixRowView.hpp
@@ -18,10 +18,9 @@ namespace Matrices {
 
 template< typename SegmentView,
           typename ValuesView,
-          typename ColumnsIndexesView,
-          bool isBinary_ >
+          typename ColumnsIndexesView >
 __cuda_callable__
-SparseMatrixRowView< SegmentView, ValuesView, ColumnsIndexesView, isBinary_ >::
+SparseMatrixRowView< SegmentView, ValuesView, ColumnsIndexesView >::
 SparseMatrixRowView( const SegmentViewType& segmentView,
                      const ValuesViewType& values,
                      const ColumnsIndexesViewType& columnIndexes )
@@ -31,10 +30,9 @@ SparseMatrixRowView( const SegmentViewType& segmentView,
 
 template< typename SegmentView,
           typename ValuesView,
-          typename ColumnsIndexesView,
-          bool isBinary_ >
+          typename ColumnsIndexesView >
 __cuda_callable__ auto
-SparseMatrixRowView< SegmentView, ValuesView, ColumnsIndexesView, isBinary_ >::
+SparseMatrixRowView< SegmentView, ValuesView, ColumnsIndexesView >::
 getSize() const -> IndexType
 {
    return segmentView.getSize();
@@ -42,11 +40,10 @@ getSize() const -> IndexType
 
 template< typename SegmentView,
           typename ValuesView,
-          typename ColumnsIndexesView,
-          bool isBinary_ >
+          typename ColumnsIndexesView >
 __cuda_callable__
 auto
-SparseMatrixRowView< SegmentView, ValuesView, ColumnsIndexesView, isBinary_ >::
+SparseMatrixRowView< SegmentView, ValuesView, ColumnsIndexesView >::
 getRowIndex() const -> const IndexType&
 {
    return segmentView.getSegmentIndex();
@@ -54,10 +51,9 @@ getRowIndex() const -> const IndexType&
 
 template< typename SegmentView,
           typename ValuesView,
-          typename ColumnsIndexesView,
-          bool isBinary_ >
+          typename ColumnsIndexesView >
 __cuda_callable__ auto
-SparseMatrixRowView< SegmentView, ValuesView, ColumnsIndexesView, isBinary_ >::
+SparseMatrixRowView< SegmentView, ValuesView, ColumnsIndexesView >::
 getColumnIndex( const IndexType localIdx ) const -> const IndexType&
 {
    TNL_ASSERT_LT( localIdx, this->getSize(), "Local index exceeds matrix row capacity." );
@@ -66,10 +62,9 @@ getColumnIndex( const IndexType localIdx ) const -> const IndexType&
 
 template< typename SegmentView,
           typename ValuesView,
-          typename ColumnsIndexesView,
-          bool isBinary_ >
+          typename ColumnsIndexesView >
 __cuda_callable__ auto
-SparseMatrixRowView< SegmentView, ValuesView, ColumnsIndexesView, isBinary_ >::
+SparseMatrixRowView< SegmentView, ValuesView, ColumnsIndexesView >::
 getColumnIndex( const IndexType localIdx ) -> IndexType&
 {
    TNL_ASSERT_LT( localIdx, this->getSize(), "Local index exceeds matrix row capacity." );
@@ -78,36 +73,37 @@ getColumnIndex( const IndexType localIdx ) -> IndexType&
 
 template< typename SegmentView,
           typename ValuesView,
-          typename ColumnsIndexesView,
-          bool isBinary_ >
+          typename ColumnsIndexesView >
 __cuda_callable__ auto
-SparseMatrixRowView< SegmentView, ValuesView, ColumnsIndexesView, isBinary_ >::
-getValue( const IndexType localIdx ) const -> const RealType&
+SparseMatrixRowView< SegmentView, ValuesView, ColumnsIndexesView >::
+getValue( const IndexType localIdx ) const -> typename ValueGetterType::ConstResultType
 {
    TNL_ASSERT_LT( localIdx, this->getSize(), "Local index exceeds matrix row capacity." );
-   TNL_ASSERT_FALSE( isBinary(), "Cannot call this method for binary matrix row." );
-   return values[ segmentView.getGlobalIndex( localIdx ) ];
+   return ValueGetterType::getValue( segmentView.getGlobalIndex( localIdx ),
+                                     values,
+                                     columnIndexes,
+                                     this->getPaddingIndex() );
 }
 
 template< typename SegmentView,
           typename ValuesView,
-          typename ColumnsIndexesView,
-          bool isBinary_ >
+          typename ColumnsIndexesView >
 __cuda_callable__ auto
-SparseMatrixRowView< SegmentView, ValuesView, ColumnsIndexesView, isBinary_ >::
-getValue( const IndexType localIdx ) -> RealType&
+SparseMatrixRowView< SegmentView, ValuesView, ColumnsIndexesView >::
+getValue( const IndexType localIdx ) -> typename ValueGetterType::ResultType
 {
    TNL_ASSERT_LT( localIdx, this->getSize(), "Local index exceeds matrix row capacity." );
-   TNL_ASSERT_FALSE( isBinary(), "Cannot call this method for binary matrix row." );
-   return values[ segmentView.getGlobalIndex( localIdx ) ];
+   return ValueGetterType::getValue( segmentView.getGlobalIndex( localIdx ),
+                                     values,
+                                     columnIndexes,
+                                     this->getPaddingIndex() );
 }
 
 template< typename SegmentView,
           typename ValuesView,
-          typename ColumnsIndexesView,
-          bool isBinary_ >
+          typename ColumnsIndexesView >
 __cuda_callable__ void
-SparseMatrixRowView< SegmentView, ValuesView, ColumnsIndexesView, isBinary_ >::
+SparseMatrixRowView< SegmentView, ValuesView, ColumnsIndexesView >::
 setValue( const IndexType localIdx,
           const RealType& value )
 {
@@ -120,10 +116,9 @@ setValue( const IndexType localIdx,
 
 template< typename SegmentView,
           typename ValuesView,
-          typename ColumnsIndexesView,
-          bool isBinary_ >
+          typename ColumnsIndexesView >
 __cuda_callable__ void
-SparseMatrixRowView< SegmentView, ValuesView, ColumnsIndexesView, isBinary_ >::
+SparseMatrixRowView< SegmentView, ValuesView, ColumnsIndexesView >::
 setColumnIndex( const IndexType localIdx,
                 const IndexType& columnIndex )
 {
@@ -134,10 +129,9 @@ setColumnIndex( const IndexType localIdx,
 
 template< typename SegmentView,
           typename ValuesView,
-          typename ColumnsIndexesView,
-          bool isBinary_ >
+          typename ColumnsIndexesView >
 __cuda_callable__ void
-SparseMatrixRowView< SegmentView, ValuesView, ColumnsIndexesView, isBinary_ >::
+SparseMatrixRowView< SegmentView, ValuesView, ColumnsIndexesView >::
 setElement( const IndexType localIdx,
             const IndexType column,
             const RealType& value )
@@ -151,22 +145,20 @@ setElement( const IndexType localIdx,
 
 template< typename SegmentView,
           typename ValuesView,
-          typename ColumnsIndexesView,
-          bool isBinary_ >
+          typename ColumnsIndexesView >
    template< typename _SegmentView,
              typename _ValuesView,
-             typename _ColumnsIndexesView,
-             bool _isBinary >
+             typename _ColumnsIndexesView >
 __cuda_callable__
 bool
-SparseMatrixRowView< SegmentView, ValuesView, ColumnsIndexesView, isBinary_ >::
-operator==( const SparseMatrixRowView< _SegmentView, _ValuesView, _ColumnsIndexesView, _isBinary >& other ) const
+SparseMatrixRowView< SegmentView, ValuesView, ColumnsIndexesView >::
+operator==( const SparseMatrixRowView< _SegmentView, _ValuesView, _ColumnsIndexesView >& other ) const
 {
    IndexType i = 0;
    while( i < getSize() && i < other.getSize() ) {
       if( getColumnIndex( i ) != other.getColumnIndex( i ) )
          return false;
-      if( ! _isBinary && getValue( i ) != other.getValue( i ) )
+      if( ! isBinary() && getValue( i ) != other.getValue( i ) )
          return false;
       ++i;
    }
@@ -183,10 +175,9 @@ operator==( const SparseMatrixRowView< _SegmentView, _ValuesView, _ColumnsIndexe
 
 template< typename SegmentView,
           typename ValuesView,
-          typename ColumnsIndexesView,
-          bool isBinary_ >
+          typename ColumnsIndexesView >
 __cuda_callable__ auto
-SparseMatrixRowView< SegmentView, ValuesView, ColumnsIndexesView, isBinary_ >::
+SparseMatrixRowView< SegmentView, ValuesView, ColumnsIndexesView >::
 begin() -> IteratorType
 {
    return IteratorType( *this, 0 );
@@ -194,10 +185,9 @@ begin() -> IteratorType
 
 template< typename SegmentView,
           typename ValuesView,
-          typename ColumnsIndexesView,
-          bool isBinary_ >
+          typename ColumnsIndexesView >
 __cuda_callable__ auto
-SparseMatrixRowView< SegmentView, ValuesView, ColumnsIndexesView, isBinary_ >::
+SparseMatrixRowView< SegmentView, ValuesView, ColumnsIndexesView >::
 end() -> IteratorType
 {
    return IteratorType( *this, this->getSize() );
@@ -205,10 +195,9 @@ end() -> IteratorType
 
 template< typename SegmentView,
           typename ValuesView,
-          typename ColumnsIndexesView,
-          bool isBinary_ >
+          typename ColumnsIndexesView >
 __cuda_callable__ auto
-SparseMatrixRowView< SegmentView, ValuesView, ColumnsIndexesView, isBinary_ >::
+SparseMatrixRowView< SegmentView, ValuesView, ColumnsIndexesView >::
 cbegin() const -> const IteratorType
 {
    return IteratorType( *this, 0 );
@@ -216,10 +205,9 @@ cbegin() const -> const IteratorType
 
 template< typename SegmentView,
           typename ValuesView,
-          typename ColumnsIndexesView,
-          bool isBinary_ >
+          typename ColumnsIndexesView >
 __cuda_callable__ auto
-SparseMatrixRowView< SegmentView, ValuesView, ColumnsIndexesView, isBinary_ >::
+SparseMatrixRowView< SegmentView, ValuesView, ColumnsIndexesView >::
 cend() const -> const IteratorType
 {
    return IteratorType( *this, this->getSize() );
@@ -227,13 +215,12 @@ cend() const -> const IteratorType
 
 template< typename SegmentView,
           typename ValuesView,
-          typename ColumnsIndexesView,
-          bool isBinary_ >
-std::ostream& operator<<( std::ostream& str, const SparseMatrixRowView< SegmentView, ValuesView, ColumnsIndexesView, isBinary_ >& row )
+          typename ColumnsIndexesView >
+std::ostream& operator<<( std::ostream& str, const SparseMatrixRowView< SegmentView, ValuesView, ColumnsIndexesView >& row )
 {
-   using NonConstIndex = std::remove_const_t< typename SparseMatrixRowView< SegmentView, ValuesView, ColumnsIndexesView, isBinary_ >::IndexType >;
+   using NonConstIndex = std::remove_const_t< typename SparseMatrixRowView< SegmentView, ValuesView, ColumnsIndexesView >::IndexType >;
    for( NonConstIndex i = 0; i < row.getSize(); i++ )
-      if( isBinary_ )
+      if( row.isBinary() )
          // TODO: check getPaddingIndex(), print only the column indices of non-zeros but not the values
          str << " [ " << row.getColumnIndex( i ) << " ] = " << (row.getColumnIndex( i ) >= 0) << ", ";
       else
diff --git a/src/TNL/Matrices/SparseMatrixView.h b/src/TNL/Matrices/SparseMatrixView.h
index 40a89b628a4d0474f811dfb16ae62d05407d5302..a20964b0c11f3fce145e1d9f54ec828c17ea628b 100644
--- a/src/TNL/Matrices/SparseMatrixView.h
+++ b/src/TNL/Matrices/SparseMatrixView.h
@@ -67,7 +67,7 @@ class SparseMatrixView : public MatrixView< Real, Device, Index >
    static_assert(
       ! MatrixType::isSymmetric() ||
       ! std::is_same< Device, Devices::Cuda >::value ||
-      ( std::is_same< Real, float >::value || std::is_same< Real, double >::value || std::is_same< Real, int >::value || std::is_same< Real, long long int >::value ),
+      ( std::is_same< Real, float >::value || std::is_same< Real, double >::value || std::is_same< Real, int >::value || std::is_same< Real, long long int >::value || std::is_same< Real, bool >::value ),
       "Given Real type is not supported by atomic operations on GPU which are necessary for symmetric operations." );
 
    public:
@@ -98,7 +98,7 @@ class SparseMatrixView : public MatrixView< Real, Device, Index >
       /**
        * \brief The type of matrix elements.
        */
-      using RealType = Real;
+      using RealType = std::remove_const_t< Real >;
 
       using ComputeRealType = ComputeReal;
 
@@ -136,12 +136,12 @@ class SparseMatrixView : public MatrixView< Real, Device, Index >
       /**
        * \brief Type for accessing matrix rows.
        */
-      using RowView = SparseMatrixRowView< typename SegmentsViewType::SegmentViewType, ValuesViewType, ColumnsIndexesViewType, isBinary() >;
+      using RowView = SparseMatrixRowView< typename SegmentsViewType::SegmentViewType, ValuesViewType, ColumnsIndexesViewType >;
 
       /**
        * \brief Type for accessing constant matrix rows.
        */
-      using ConstRowView = SparseMatrixRowView< typename SegmentsViewType::SegmentViewType, ConstValuesViewType, ConstColumnsIndexesViewType, isBinary() >;;
+      using ConstRowView = SparseMatrixRowView< typename SegmentsViewType::SegmentViewType, ConstValuesViewType, ConstColumnsIndexesViewType >;;
 
       /**
        * \brief Helper type for getting self type or its modifications.
@@ -395,16 +395,28 @@ class SparseMatrixView : public MatrixView< Real, Device, Index >
        * \brief Method for performing general reduction on matrix rows.
        *
        * \tparam Fetch is a type of lambda function for data fetch declared as
-       *          `fetch( IndexType rowIdx, IndexType& columnIdx, RealType& elementValue ) -> FetchValue`.
-       *          The return type of this lambda can be any non void.
+       *
+       * ```
+       * auto fetch = [=] __cuda_callable__ ( IndexType rowIdx, IndexType& columnIdx, RealType& elementValue ) -> FetchValue { ... };
+       * ```
+       *
+       * The return type of this lambda can be any non-void type.
        * \tparam Reduce is a type of lambda function for reduction declared as
-       *          `reduce( const FetchValue& v1, const FetchValue& v2 ) -> FetchValue`.
-       * \tparam Keep is a type of lambda function for storing results of reduction in each row.
-       *          It is declared as `keep( const IndexType rowIdx, const double& value )`.
+       *
+       * ```
+       * auto reduce = [=] __cuda_callable__ ( const FetchValue& v1, const FetchValue& v2 ) -> FetchValue { ... };
+       * ```
+       *
+       * \tparam Keep is a type of lambda function for storing results of reduction in each row. It is declared as
+       *
+       * ```
+       * auto keep = [=] __cuda_callable__ ( const IndexType rowIdx, const double& value ) { ... };
+       * ```
+       *
        * \tparam FetchValue is type returned by the Fetch lambda function.
        *
-       * \param begin defines beginning of the range [begin,end) of rows to be processed.
-       * \param end defines ending of the range [begin,end) of rows to be processed.
+       * \param begin defines beginning of the range [ \e begin, \e end ) of rows to be processed.
+       * \param end defines ending of the range [ \e begin, \e end ) of rows to be processed.
        * \param fetch is an instance of lambda function for data fetch.
        * \param reduce is an instance of lambda function for reduction.
        * \param keep in an instance of lambda function for storing results.
@@ -424,16 +436,28 @@ class SparseMatrixView : public MatrixView< Real, Device, Index >
        * \brief Method for performing general reduction on matrix rows for constant instances.
        *
        * \tparam Fetch is a type of lambda function for data fetch declared as
-       *          `fetch( IndexType rowIdx, IndexType& columnIdx, RealType& elementValue ) -> FetchValue`.
-       *          The return type of this lambda can be any non void.
+       *
+       * ```
+       * auto fetch = [=] __cuda_callable__ ( IndexType rowIdx, IndexType& columnIdx, RealType& elementValue ) -> FetchValue { ... };
+       * ```
+       *
+       * The return type of this lambda can be any non-void type.
        * \tparam Reduce is a type of lambda function for reduction declared as
-       *          `reduce( const FetchValue& v1, const FetchValue& v2 ) -> FetchValue`.
-       * \tparam Keep is a type of lambda function for storing results of reduction in each row.
-       *          It is declared as `keep( const IndexType rowIdx, const double& value )`.
+       *
+       * ```
+       * auto reduce = [=] __cuda_callable__ ( const FetchValue& v1, const FetchValue& v2 ) -> FetchValue { ... };
+       * ```
+       *
+       * \tparam Keep is a type of lambda function for storing results of reduction in each row. It is declared as
+       *
+       * ```
+       * auto keep = [=] __cuda_callable__ ( const IndexType rowIdx, const RealType& value ) { ... };
+       * ```
+       *
        * \tparam FetchValue is type returned by the Fetch lambda function.
        *
-       * \param begin defines beginning of the range [begin,end) of rows to be processed.
-       * \param end defines ending of the range [begin,end) of rows to be processed.
+       * \param begin defines beginning of the range [ \e begin, \e end ) of rows to be processed.
+       * \param end defines ending of the range [ \e begin, \e end ) of rows to be processed.
        * \param fetch is an instance of lambda function for data fetch.
        * \param reduce is an instance of lambda function for reduction.
        * \param keep in an instance of lambda function for storing results.
@@ -453,12 +477,24 @@ class SparseMatrixView : public MatrixView< Real, Device, Index >
        * \brief Method for performing general reduction on all matrix rows.
        *
        * \tparam Fetch is a type of lambda function for data fetch declared as
-       *          `fetch( IndexType rowIdx, IndexType& columnIdx, RealType& elementValue ) -> FetchValue`.
-       *          The return type of this lambda can be any non void.
+       *
+       * ```
+       * auto fetch = [=] __cuda_callable__ ( IndexType rowIdx, IndexType& columnIdx, RealType& elementValue ) -> FetchValue { ... };
+       * ```
+       *
+       * The return type of this lambda can be any non-void type.
        * \tparam Reduce is a type of lambda function for reduction declared as
-       *          `reduce( const FetchValue& v1, const FetchValue& v2 ) -> FetchValue`.
-       * \tparam Keep is a type of lambda function for storing results of reduction in each row.
-       *          It is declared as `keep( const IndexType rowIdx, const double& value )`.
+       *
+       * ```
+       * auto reduce = [=] __cuda_callable__ ( const FetchValue& v1, const FetchValue& v2 ) -> FetchValue { ... };
+       * ```
+       *
+       * \tparam Keep is a type of lambda function for storing results of reduction in each row. It is declared as
+       *
+       * ```
+       * auto keep = [=] __cuda_callable__ ( const IndexType rowIdx, const double& value ) { ... };
+       * ```
+       *
        * \tparam FetchValue is type returned by the Fetch lambda function.
        *
        * \param fetch is an instance of lambda function for data fetch.
@@ -480,12 +516,24 @@ class SparseMatrixView : public MatrixView< Real, Device, Index >
        * \brief Method for performing general reduction on all matrix rows for constant instances.
        *
        * \tparam Fetch is a type of lambda function for data fetch declared as
-       *          `fetch( IndexType rowIdx, IndexType& columnIdx, RealType& elementValue ) -> FetchValue`.
-       *          The return type of this lambda can be any non void.
+       *
+       * ```
+       * auto fetch = [=] __cuda_callable__ ( IndexType rowIdx, IndexType& columnIdx, RealType& elementValue ) -> FetchValue { ... };
+       * ```
+       *
+       * The return type of this lambda can be any non-void type.
        * \tparam Reduce is a type of lambda function for reduction declared as
-       *          `reduce( const FetchValue& v1, const FetchValue& v2 ) -> FetchValue`.
-       * \tparam Keep is a type of lambda function for storing results of reduction in each row.
-       *          It is declared as `keep( const IndexType rowIdx, const double& value )`.
+       *
+       * ```
+       * auto reduce = [=] __cuda_callable__ ( const FetchValue& v1, const FetchValue& v2 ) -> FetchValue { ... };
+       * ```
+       *
+       * \tparam Keep is a type of lambda function for storing results of reduction in each row. It is declared as
+       *
+       * ```
+       * auto keep = [=] __cuda_callable__ ( const IndexType rowIdx, const double& value ) { ... };
+       * ```
+       *
        * \tparam FetchValue is type returned by the Fetch lambda function.
        *
        * \param fetch is an instance of lambda function for data fetch.
@@ -506,15 +554,16 @@ class SparseMatrixView : public MatrixView< Real, Device, Index >
       /**
        * \brief Method for iteration over all matrix rows for constant instances.
        *
-       * \tparam Function is type of lambda function that will operate on matrix elements.
-       *    It is should have form like
-       *  `function( IndexType rowIdx, IndexType localIdx, IndexType columnIdx, const RealType& value, bool& compute )`.
+       * \tparam Function is a type of lambda function that will operate on matrix elements. It should have the form
+       *
+       * ```
+       * auto function = [=] __cuda_callable__ ( IndexType rowIdx, IndexType localIdx, IndexType columnIdx, const RealType& value ) { ... };
+       * ```
+       *
        *  The \e localIdx parameter is a rank of the non-zero element in given row.
-       *  If the 'compute' variable is set to false the iteration over the row can
-       *  be interrupted.
        *
-       * \param begin defines beginning of the range [begin,end) of rows to be processed.
-       * \param end defines ending of the range [begin,end) of rows to be processed.
+       * \param begin defines beginning of the range [ \e begin, \e end ) of rows to be processed.
+       * \param end defines ending of the range [ \e begin, \e end ) of rows to be processed.
        * \param function is an instance of the lambda function to be called in each row.
        *
        * \par Example
@@ -528,15 +577,16 @@ class SparseMatrixView : public MatrixView< Real, Device, Index >
       /**
        * \brief Method for iteration over all matrix rows for non-constant instances.
        *
-       * \tparam Function is type of lambda function that will operate on matrix elements.
-       *    It is should have form like
-       *  `function( IndexType rowIdx, IndexType localIdx, IndexType columnIdx, const RealType& value, bool& compute )`.
+       * \tparam Function is a type of lambda function that will operate on matrix elements. It should have the form
+       *
+       * ```
+       * auto function = [=] __cuda_callable__ ( IndexType rowIdx, IndexType localIdx, IndexType columnIdx, const RealType& value ) { ... };
+       * ```
+       *
        *  The \e localIdx parameter is a rank of the non-zero element in given row.
-       *  If the 'compute' variable is set to false the iteration over the row can
-       *  be interrupted.
        *
-       * \param begin defines beginning of the range [begin,end) of rows to be processed.
-       * \param end defines ending of the range [begin,end) of rows to be processed.
+       * \param begin defines beginning of the range [ \e begin, \e end ) of rows to be processed.
+       * \param end defines ending of the range [ \e begin, \e end ) of rows to be processed.
        * \param function is an instance of the lambda function to be called in each row.
        *
        * \par Example
@@ -683,15 +733,16 @@ class SparseMatrixView : public MatrixView< Real, Device, Index >
       /**
        * \brief Method for sequential iteration over all matrix rows for constant instances.
        *
-       * \tparam Function is type of lambda function that will operate on matrix elements.
-       *    It is should have form like
-       *  `function( IndexType rowIdx, IndexType columnIdx, IndexType columnIdx_, const RealType& value, bool& compute )`.
-       *  The column index repeats twice only for compatibility with sparse matrices.
-       *  If the 'compute' variable is set to false the iteration over the row can
-       *  be interrupted.
+       * \tparam Function is a type of lambda function that will operate on matrix elements. It should have the form
        *
-       * \param begin defines beginning of the range [begin,end) of rows to be processed.
-       * \param end defines ending of the range [begin,end) of rows to be processed.
+       * ```
+       * auto function = [] __cuda_callable__ ( RowView& row ) { ... };
+       * ```
+       *
+       * \e RowView represents matrix row - see \ref TNL::Matrices::SparseMatrixView::RowView.
+       *
+       * \param begin defines beginning of the range [ \e begin, \e end ) of rows to be processed.
+       * \param end defines ending of the range [ \e begin, \e end ) of rows to be processed.
        * \param function is an instance of the lambda function to be called in each row.
        */
       template< typename Function >
@@ -700,15 +751,16 @@ class SparseMatrixView : public MatrixView< Real, Device, Index >
       /**
        * \brief Method for sequential iteration over all matrix rows for non-constant instances.
        *
-       * \tparam Function is type of lambda function that will operate on matrix elements.
-       *    It is should have form like
-       *  `function( IndexType rowIdx, IndexType columnIdx, IndexType columnIdx_, RealType& value, bool& compute )`.
-       *  The column index repeats twice only for compatibility with sparse matrices.
-       *  If the 'compute' variable is set to false the iteration over the row can
-       *  be interrupted.
+       * \tparam Function is a type of lambda function that will operate on matrix elements. It should have the form
        *
-       * \param begin defines beginning of the range [begin,end) of rows to be processed.
-       * \param end defines ending of the range [begin,end) of rows to be processed.
+       * ```
+       * auto function = [] __cuda_callable__ ( RowView& row ) { ... };
+       * ```
+       *
+       * \e RowView represents matrix row - see \ref TNL::Matrices::SparseMatrixView::RowView.
+       *
+       * \param begin defines beginning of the range [ \e begin, \e end ) of rows to be processed.
+       * \param end defines ending of the range [ \e begin, \e end ) of rows to be processed.
        * \param function is an instance of the lambda function to be called in each row.
        */
       template< typename Function >
@@ -741,7 +793,9 @@ class SparseMatrixView : public MatrixView< Real, Device, Index >
        *
        * More precisely, it computes:
        *
-       * `outVector = matrixMultiplicator * ( * this ) * inVector + outVectorMultiplicator * outVector`
+       * ```
+       * outVector = matrixMultiplicator * ( * this ) * inVector + outVectorMultiplicator * outVector
+       * ```
        *
        * \tparam InVector is type of input vector.  It can be \ref Vector,
        *     \ref VectorView, \ref Array, \ref ArraView or similar container.
@@ -888,7 +942,7 @@ class SparseMatrixView : public MatrixView< Real, Device, Index >
       }
 };
 
-} // namespace Conatiners
+   } // namespace Matrices
 } // namespace TNL
 
 #include <TNL/Matrices/SparseMatrixView.hpp>
diff --git a/src/TNL/Matrices/SparseMatrixView.hpp b/src/TNL/Matrices/SparseMatrixView.hpp
index 94a8fc5729de884077305197cb55ef7cda8ce3e5..c3f7387fd05f6c75e8e5140d5620bb8852cf420c 100644
--- a/src/TNL/Matrices/SparseMatrixView.hpp
+++ b/src/TNL/Matrices/SparseMatrixView.hpp
@@ -80,8 +80,8 @@ getConstView() const -> ConstViewType
    return ConstViewType( this->getRows(),
                          this->getColumns(),
                          this->getValues().getConstView(),
-                         this->getColumnsIndexes().getConstView(),
-                         this->segments.getConstView() );
+                         this->getColumnIndexes().getConstView(),
+                         const_cast< SparseMatrixView* >( this )->segments.getView() );
 }
 
 template< typename Real,
@@ -210,7 +210,7 @@ getNonzeroElementsCount() const
       auto keeper = [=] __cuda_callable__ ( IndexType row, const IndexType& value ) mutable {
          row_sums_view[ row ] = value;
       };
-      this->segments.segmentsReduction( (IndexType) 0, this->getRows(), fetch, std::plus<>{}, keeper, ( IndexType ) 0 );
+      this->segments.reduceSegments( 0, this->getRows(), fetch, std::plus<>{}, keeper, ( IndexType ) 0 );
       return sum( row_sums );
    }
 }
@@ -417,7 +417,7 @@ vectorProduct( const InVector& inVector,
    const auto valuesView = this->values.getConstView();
    const auto columnIndexesView = this->columnIndexes.getConstView();
    const IndexType paddingIndex = this->getPaddingIndex();
-   if( isSymmetric() )
+   if( isSymmetric() && outVectorMultiplicator != 1.0 )
       outVector *= outVectorMultiplicator;
    auto symmetricFetch = [=] __cuda_callable__ ( IndexType row, IndexType localIdx, IndexType globalIdx, bool& compute ) mutable -> ComputeRealType {
       const IndexType column = columnIndexesView[ globalIdx ];
@@ -475,22 +475,22 @@ vectorProduct( const InVector& inVector,
    if( lastRow == 0 )
       lastRow = this->getRows();
    if( isSymmetric() )
-      this->segments.segmentsReduction( firstRow, lastRow, symmetricFetch, std::plus<>{}, keeperGeneral, ( ComputeRealType ) 0.0 );
+      this->segments.reduceSegments( firstRow, lastRow, symmetricFetch, std::plus<>{}, keeperGeneral, ( ComputeRealType ) 0.0 );
    else
    {
       if( outVectorMultiplicator == 0.0 )
       {
          if( matrixMultiplicator == 1.0 )
-            this->segments.segmentsReduction( firstRow, lastRow, fetch, std::plus<>{}, keeperDirect, ( ComputeRealType ) 0.0 );
+            this->segments.reduceSegments( firstRow, lastRow, fetch, std::plus<>{}, keeperDirect, ( ComputeRealType ) 0.0 );
          else
-            this->segments.segmentsReduction( firstRow, lastRow, fetch, std::plus<>{}, keeperMatrixMult, ( ComputeRealType ) 0.0 );
+            this->segments.reduceSegments( firstRow, lastRow, fetch, std::plus<>{}, keeperMatrixMult, ( ComputeRealType ) 0.0 );
       }
       else
       {
          if( matrixMultiplicator == 1.0 )
-            this->segments.segmentsReduction( firstRow, lastRow, fetch, std::plus<>{}, keeperVectorMult, ( ComputeRealType ) 0.0 );
+            this->segments.reduceSegments( firstRow, lastRow, fetch, std::plus<>{}, keeperVectorMult, ( ComputeRealType ) 0.0 );
          else
-            this->segments.segmentsReduction( firstRow, lastRow, fetch, std::plus<>{}, keeperGeneral, ( ComputeRealType ) 0.0 );
+            this->segments.reduceSegments( firstRow, lastRow, fetch, std::plus<>{}, keeperGeneral, ( ComputeRealType ) 0.0 );
       }
    }
 }
@@ -520,7 +520,7 @@ reduceRows( IndexType begin, IndexType end, Fetch& fetch, const Reduce& reduce,
       }
       return identity;
    };
-   this->segments.segmentsReduction( begin, end, fetch_, reduce, keep, identity );
+   this->segments.reduceSegments( begin, end, fetch_, reduce, keep, identity );
 }
 
 template< typename Real,
@@ -549,7 +549,7 @@ reduceRows( IndexType begin, IndexType end, Fetch& fetch, const Reduce& reduce,
       }
       return identity;
    };
-   this->segments.segmentsReduction( begin, end, fetch_, reduce, keep, identity );
+   this->segments.reduceSegments( begin, end, fetch_, reduce, keep, identity );
 }
 
 template< typename Real,
@@ -594,12 +594,16 @@ forElements( IndexType begin, IndexType end, Function& function ) const
    const auto columns_view = this->columnIndexes.getConstView();
    const auto values_view = this->values.getConstView();
    //const IndexType paddingIndex_ = this->getPaddingIndex();
-   auto f = [=] __cuda_callable__ ( IndexType rowIdx, IndexType localIdx, IndexType globalIdx, bool& compute ) mutable -> bool {
-      if( isBinary() )
-         function( rowIdx, localIdx, columns_view[ globalIdx ], 1, compute );
-      else
-         function( rowIdx, localIdx, columns_view[ globalIdx ], values_view[ globalIdx ], compute );
-      return true;
+   auto columns = this->getColumns();
+   auto f = [=] __cuda_callable__ ( IndexType rowIdx, IndexType localIdx, IndexType globalIdx ) mutable {
+      if( localIdx < columns )
+      {
+         if( isBinary() )
+            function( rowIdx, localIdx, columns_view[ globalIdx ], 1 );
+         else
+            function( rowIdx, localIdx, columns_view[ globalIdx ], values_view[ globalIdx ] );
+      }
+      //return true;
    };
    this->segments.forElements( begin, end, f );
 }
@@ -618,14 +622,18 @@ forElements( IndexType begin, IndexType end, Function& function )
    auto columns_view = this->columnIndexes.getView();
    auto values_view = this->values.getView();
    const IndexType paddingIndex_ = this->getPaddingIndex();
-   auto f = [=] __cuda_callable__ ( IndexType rowIdx, IndexType localIdx, IndexType globalIdx, bool& compute ) mutable {
-      if( isBinary() )
+   auto columns = this->getColumns();
+   auto f = [=] __cuda_callable__ ( IndexType rowIdx, IndexType localIdx, IndexType globalIdx ) mutable {
+      if( localIdx < columns )
       {
-         RealType one( columns_view[ globalIdx ] != paddingIndex_ );
-         function( rowIdx, localIdx, columns_view[ globalIdx ], one, compute );
+         if( isBinary() )
+         {
+            RealType one( columns_view[ globalIdx ] != paddingIndex_ );
+            function( rowIdx, localIdx, columns_view[ globalIdx ], one );
+         }
+         else
+            function( rowIdx, localIdx, columns_view[ globalIdx ], values_view[ globalIdx ] );
       }
-      else
-         function( rowIdx, localIdx, columns_view[ globalIdx ], values_view[ globalIdx ], compute );
    };
    this->segments.forElements( begin, end, f );
 }
@@ -862,14 +870,12 @@ SparseMatrixView< Real, Device, Index, MatrixType, SegmentsView, ComputeReal >::
 operator==( const Matrix& m ) const
 {
    const auto& view1 = *this;
-   // FIXME: getConstView does not work
-   //const auto view2 = m.getConstView();
-   const auto view2 = m.getView();
+   const auto view2 = m.getConstView();
    auto fetch = [=] __cuda_callable__ ( const IndexType i ) -> bool
    {
       return view1.getRow( i ) == view2.getRow( i );
    };
-   return Algorithms::reduce< DeviceType >( (IndexType) 0, this->getRows(), fetch, std::logical_and<>{}, true );
+   return Algorithms::reduce< DeviceType >( ( IndexType ) 0, this->getRows(), fetch, std::logical_and<>{}, true );
 }
 
 template< typename Real,
@@ -896,7 +902,7 @@ void
 SparseMatrixView< Real, Device, Index, MatrixType, SegmentsView, ComputeReal >::
 save( File& file ) const
 {
-   MatrixView< RealType, DeviceType, IndexType >::save( file );
+   MatrixView< Real, Device, Index >::save( file );
    file << this->columnIndexes;
    this->segments.save( file );
 }
diff --git a/src/TNL/Matrices/TridiagonalMatrix.h b/src/TNL/Matrices/TridiagonalMatrix.h
index b74e0dcb9d8e3d609c593a40f3ac5986f3dd8ea5..45e6d132fecd289df3ba1ccf608eebf39a1342d7 100644
--- a/src/TNL/Matrices/TridiagonalMatrix.h
+++ b/src/TNL/Matrices/TridiagonalMatrix.h
@@ -493,16 +493,28 @@ class TridiagonalMatrix : public Matrix< Real, Device, Index, RealAllocator >
        * \brief Method for performing general reduction on matrix rows.
        *
        * \tparam Fetch is a type of lambda function for data fetch declared as
-       *          `fetch( IndexType rowIdx, IndexType& columnIdx, RealType& elementValue ) -> FetchValue`.
-       *          The return type of this lambda can be any non void.
+       *
+       * ```
+       * auto fetch = [=] __cuda_callable__ ( IndexType rowIdx, IndexType& columnIdx, RealType& elementValue ) -> FetchValue { ... };
+       * ```
+       *
+       * The return type of this lambda can be any non-void type.
        * \tparam Reduce is a type of lambda function for reduction declared as
-       *          `reduce( const FetchValue& v1, const FetchValue& v2 ) -> FetchValue`.
-       * \tparam Keep is a type of lambda function for storing results of reduction in each row.
-       *          It is declared as `keep( const IndexType rowIdx, const double& value )`.
+       *
+       * ```
+       * auto reduce = [=] __cuda_callable__ ( const FetchValue& v1, const FetchValue& v2 ) -> FetchValue { ... };
+       * ```
+       *
+       * \tparam Keep is a type of lambda function for storing results of reduction in each row. It is declared as
+       *
+       * ```
+       * auto keep = [=] __cuda_callable__ ( const IndexType rowIdx, const double& value ) { ... };
+       * ```
+       *
        * \tparam FetchValue is type returned by the Fetch lambda function.
        *
-       * \param begin defines beginning of the range [begin,end) of rows to be processed.
-       * \param end defines ending of the range [begin,end) of rows to be processed.
+       * \param begin defines beginning of the range [ \e begin, \e end ) of rows to be processed.
+       * \param end defines ending of the range [ \e begin, \e end ) of rows to be processed.
        * \param fetch is an instance of lambda function for data fetch.
        * \param reduce is an instance of lambda function for reduction.
        * \param keep in an instance of lambda function for storing results.
@@ -522,16 +534,28 @@ class TridiagonalMatrix : public Matrix< Real, Device, Index, RealAllocator >
        * \brief Method for performing general reduction on matrix rows of constant matrix instances.
        *
        * \tparam Fetch is a type of lambda function for data fetch declared as
-       *          `fetch( IndexType rowIdx, IndexType& columnIdx, RealType& elementValue ) -> FetchValue`.
-       *          The return type of this lambda can be any non void.
+       *
+       * ```
+       * auto fetch = [=] __cuda_callable__ ( IndexType rowIdx, IndexType& columnIdx, RealType& elementValue ) -> FetchValue { ... };
+       * ```
+       *
+       * The return type of this lambda can be any non-void type.
        * \tparam Reduce is a type of lambda function for reduction declared as
-       *          `reduce( const FetchValue& v1, const FetchValue& v2 ) -> FetchValue`.
-       * \tparam Keep is a type of lambda function for storing results of reduction in each row.
-       *          It is declared as `keep( const IndexType rowIdx, const double& value )`.
+       *
+       * ```
+       * auto reduce = [=] __cuda_callable__ ( const FetchValue& v1, const FetchValue& v2 ) -> FetchValue { ... };
+       * ```
+       *
+       * \tparam Keep is a type of lambda function for storing results of reduction in each row. It is declared as
+       *
+       * ```
+       * auto keep = [=] __cuda_callable__ ( const IndexType rowIdx, const double& value ) { ... };
+       * ```
+       *
        * \tparam FetchValue is type returned by the Fetch lambda function.
        *
-       * \param begin defines beginning of the range [begin,end) of rows to be processed.
-       * \param end defines ending of the range [begin,end) of rows to be processed.
+       * \param begin defines beginning of the range [ \e begin, \e end ) of rows to be processed.
+       * \param end defines ending of the range [ \e begin, \e end ) of rows to be processed.
        * \param fetch is an instance of lambda function for data fetch.
        * \param reduce is an instance of lambda function for reduction.
        * \param keep in an instance of lambda function for storing results.
@@ -551,16 +575,28 @@ class TridiagonalMatrix : public Matrix< Real, Device, Index, RealAllocator >
        * \brief Method for performing general reduction on all matrix rows.
        *
        * \tparam Fetch is a type of lambda function for data fetch declared as
-       *          `fetch( IndexType rowIdx, IndexType& columnIdx, RealType& elementValue ) -> FetchValue`.
-       *          The return type of this lambda can be any non void.
+       *
+       * ```
+       * auto fetch = [=] __cuda_callable__ ( IndexType rowIdx, IndexType& columnIdx, RealType& elementValue ) -> FetchValue { ... };
+       * ```
+       *
+       * The return type of this lambda can be any non-void type.
        * \tparam Reduce is a type of lambda function for reduction declared as
-       *          `reduce( const FetchValue& v1, const FetchValue& v2 ) -> FetchValue`.
-       * \tparam Keep is a type of lambda function for storing results of reduction in each row.
-       *          It is declared as `keep( const IndexType rowIdx, const double& value )`.
+       *
+       * ```
+       * auto reduce = [=] __cuda_callable__ ( const FetchValue& v1, const FetchValue& v2 ) -> FetchValue { ... };
+       * ```
+       *
+       * \tparam Keep is a type of lambda function for storing results of reduction in each row.  It is declared as
+       *
+       * ```
+       * auto keep = [=] __cuda_callable__ ( const IndexType rowIdx, const double& value ) { ... };
+       * ```
+       *
        * \tparam FetchValue is type returned by the Fetch lambda function.
        *
-       * \param begin defines beginning of the range [begin,end) of rows to be processed.
-       * \param end defines ending of the range [begin,end) of rows to be processed.
+       * \param begin defines beginning of the range [ \e begin, \e end ) of rows to be processed.
+       * \param end defines ending of the range [ \e begin, \e end ) of rows to be processed.
        * \param fetch is an instance of lambda function for data fetch.
        * \param reduce is an instance of lambda function for reduction.
        * \param keep in an instance of lambda function for storing results.
@@ -580,16 +616,28 @@ class TridiagonalMatrix : public Matrix< Real, Device, Index, RealAllocator >
        * \brief Method for performing general reduction on all matrix rows of constant matrix instances.
        *
        * \tparam Fetch is a type of lambda function for data fetch declared as
-       *          `fetch( IndexType rowIdx, IndexType& columnIdx, RealType& elementValue ) -> FetchValue`.
-       *          The return type of this lambda can be any non void.
+       *
+       * ```
+       * auto fetch = [=] __cuda_callable__ ( IndexType rowIdx, IndexType& columnIdx, RealType& elementValue ) -> FetchValue { ... };
+       * ```
+       *
+       * The return type of this lambda can be any non-void type.
        * \tparam Reduce is a type of lambda function for reduction declared as
-       *          `reduce( const FetchValue& v1, const FetchValue& v2 ) -> FetchValue`.
-       * \tparam Keep is a type of lambda function for storing results of reduction in each row.
-       *          It is declared as `keep( const IndexType rowIdx, const double& value )`.
+       *
+       * ```
+       * auto reduce = [=] __cuda_callable__ ( const FetchValue& v1, const FetchValue& v2 ) -> FetchValue { ... };
+       * ```
+       *
+       * \tparam Keep is a type of lambda function for storing results of reduction in each row. It is declared as
+       *
+       * ```
+       * auto keep = [=] __cuda_callable__ ( const IndexType rowIdx, const double& value ) { ... };
+       * ```
+       *
        * \tparam FetchValue is type returned by the Fetch lambda function.
        *
-       * \param begin defines beginning of the range [begin,end) of rows to be processed.
-       * \param end defines ending of the range [begin,end) of rows to be processed.
+       * \param begin defines beginning of the range [ \e begin, \e end ) of rows to be processed.
+       * \param end defines ending of the range [ \e begin, \e end ) of rows to be processed.
        * \param fetch is an instance of lambda function for data fetch.
        * \param reduce is an instance of lambda function for reduction.
        * \param keep in an instance of lambda function for storing results.
@@ -608,15 +656,16 @@ class TridiagonalMatrix : public Matrix< Real, Device, Index, RealAllocator >
       /**
        * \brief Method for iteration over matrix rows for constant instances.
        *
-       * \tparam Function is type of lambda function that will operate on matrix elements.
-       *    It is should have form like
-       *  `function( IndexType rowIdx, IndexType localIdx, IndexType columnIdx, const RealType& value, bool& compute )`.
+       * \tparam Function is a type of lambda function that will operate on matrix elements. It should have the following form:
+       *
+       * ```
+       * auto function = [=] __cuda_callable__ ( IndexType rowIdx, IndexType localIdx, IndexType columnIdx, const RealType& value ) { ... };
+       * ```
+       *
        *  The \e localIdx parameter is a rank of the non-zero element in given row.
-       *  If the 'compute' variable is set to false the iteration over the row can
-       *  be interrupted.
        *
-       * \param begin defines beginning of the range [begin,end) of rows to be processed.
-       * \param end defines ending of the range [begin,end) of rows to be processed.
+       * \param begin defines beginning of the range [ \e begin, \e end ) of rows to be processed.
+       * \param end defines ending of the range [ \e begin, \e end ) of rows to be processed.
        * \param function is an instance of the lambda function to be called in each row.
        *
        * \par Example
@@ -630,12 +679,13 @@ class TridiagonalMatrix : public Matrix< Real, Device, Index, RealAllocator >
       /**
        * \brief Method for iteration over matrix rows for non-constant instances.
        *
-       * \tparam Function is type of lambda function that will operate on matrix elements.
-       *    It is should have form like
-       *  `function( IndexType rowIdx, IndexType localIdx, IndexType columnIdx, const RealType& value, bool& compute )`.
+       * \tparam Function is a type of lambda function that will operate on matrix elements. It should have the following form:
+       *
+       * ```
+       * auto function = [=] __cuda_callable__ ( IndexType rowIdx, IndexType localIdx, IndexType columnIdx, const RealType& value ) { ... };
+       * ```
+       *
        *  The \e localIdx parameter is a rank of the non-zero element in given row.
-       *  If the 'compute' variable is set to false the iteration over the row can
-       *  be interrupted.
        *
        * \param begin defines beginning of the range [begin,end) of rows to be processed.
        * \param end defines ending of the range [begin,end) of rows to be processed.
@@ -652,15 +702,16 @@ class TridiagonalMatrix : public Matrix< Real, Device, Index, RealAllocator >
       /**
        * \brief Method for iteration over all matrix rows for constant instances.
        *
-       * \tparam Function is type of lambda function that will operate on matrix elements.
-       *    It is should have form like
-       *  `function( IndexType rowIdx, IndexType localIdx, IndexType columnIdx, const RealType& value, bool& compute )`.
+       * \tparam Function is a type of lambda function that will operate on matrix elements. It should have the following form:
+       *
+       * ```
+       * auto function = [=] __cuda_callable__ ( IndexType rowIdx, IndexType localIdx, IndexType columnIdx, const RealType& value ) { ... };
+       * ```
+       *
        *  The \e localIdx parameter is a rank of the non-zero element in given row.
-       *  If the 'compute' variable is set to false the iteration over the row can
-       *  be interrupted.
        *
-       * \param begin defines beginning of the range [begin,end) of rows to be processed.
-       * \param end defines ending of the range [begin,end) of rows to be processed.
+       * \param begin defines beginning of the range [ \e begin, \e end ) of rows to be processed.
+       * \param end defines ending of the range [ \e begin, \e end ) of rows to be processed.
        * \param function is an instance of the lambda function to be called in each row.
        *
        * \par Example
@@ -674,12 +725,13 @@ class TridiagonalMatrix : public Matrix< Real, Device, Index, RealAllocator >
       /**
        * \brief Method for iteration over all matrix rows for non-constant instances.
        *
-       * \tparam Function is type of lambda function that will operate on matrix elements.
-       *    It is should have form like
-       *  `function( IndexType rowIdx, IndexType localIdx, IndexType columnIdx, const RealType& value, bool& compute )`.
+       * \tparam Function is a type of lambda function that will operate on matrix elements. It should have the following form:
+       *
+       * ```
+       * auto function = [=] __cuda_callable__ ( IndexType rowIdx, IndexType localIdx, IndexType columnIdx, const RealType& value ) { ... };
+       * ```
+       *
        *  The \e localIdx parameter is a rank of the non-zero element in given row.
-       *  If the 'compute' variable is set to false the iteration over the row can
-       *  be interrupted.
        *
        * \param begin defines beginning of the range [begin,end) of rows to be processed.
        * \param end defines ending of the range [begin,end) of rows to be processed.
@@ -796,12 +848,13 @@ class TridiagonalMatrix : public Matrix< Real, Device, Index, RealAllocator >
       /**
        * \brief Method for sequential iteration over all matrix rows for constant instances.
        *
-       * \tparam Function is type of lambda function that will operate on matrix elements.
-       *    It is should have form like
-       *  `function( IndexType rowIdx, IndexType columnIdx, IndexType columnIdx_, const RealType& value, bool& compute )`.
-       *  The column index repeats twice only for compatibility with sparse matrices.
-       *  If the 'compute' variable is set to false the iteration over the row can
-       *  be interrupted.
+       * \tparam Function is a type of lambda function that will operate on matrix elements. It should have the following form:
+       *
+       * ```
+       * auto function = [] __cuda_callable__ ( RowView& row ) { ... };
+       * ```
+       *
+       * \e RowView represents matrix row - see \ref TNL::Matrices::TridiagonalMatrix::RowView.
        *
        * \param begin defines beginning of the range [begin,end) of rows to be processed.
        * \param end defines ending of the range [begin,end) of rows to be processed.
@@ -813,12 +866,13 @@ class TridiagonalMatrix : public Matrix< Real, Device, Index, RealAllocator >
       /**
        * \brief Method for sequential iteration over all matrix rows for non-constant instances.
        *
-       * \tparam Function is type of lambda function that will operate on matrix elements.
-       *    It is should have form like
-       *  `function( IndexType rowIdx, IndexType columnIdx, IndexType columnIdx_, RealType& value, bool& compute )`.
-       *  The column index repeats twice only for compatibility with sparse matrices.
-       *  If the 'compute' variable is set to false the iteration over the row can
-       *  be interrupted.
+       * \tparam Function is a type of lambda function that will operate on matrix elements. It should have the following form:
+       *
+       * ```
+       * auto function = [] __cuda_callable__ ( RowView& row ) { ... };
+       * ```
+       *
+       * \e RowView represents matrix row - see \ref TNL::Matrices::TridiagonalMatrix::RowView.
        *
        * \param begin defines beginning of the range [begin,end) of rows to be processed.
        * \param end defines ending of the range [begin,end) of rows to be processed.
@@ -854,7 +908,9 @@ class TridiagonalMatrix : public Matrix< Real, Device, Index, RealAllocator >
        *
        * More precisely, it computes:
        *
-       * `outVector = matrixTriplicator * ( * this ) * inVector + outVectorTriplicator * outVector`
+       * ```
+       * outVector = matrixMultiplicator * ( * this ) * inVector + outVectorMultiplicator * outVector
+       * ```
        *
        * \tparam InVector is type of input vector.  It can be \ref Vector,
        *     \ref VectorView, \ref Array, \ref ArraView or similar container.
diff --git a/src/TNL/Matrices/TridiagonalMatrix.hpp b/src/TNL/Matrices/TridiagonalMatrix.hpp
index a76ae3ef92484eb6ec0d48cc872d1d9fbb476d91..93ebcd8b39e68ba0f70079497a8b3cb64d5e5e43 100644
--- a/src/TNL/Matrices/TridiagonalMatrix.hpp
+++ b/src/TNL/Matrices/TridiagonalMatrix.hpp
@@ -717,7 +717,7 @@ operator=( const TridiagonalMatrix< Real_, Device_, Index_, Organization_, RealA
       if( std::is_same< Device, Device_ >::value )
       {
          const auto matrix_view = matrix.getView();
-         auto f = [=] __cuda_callable__ ( const IndexType& rowIdx, const IndexType& localIdx, const IndexType& column, Real& value, bool& compute ) mutable {
+         auto f = [=] __cuda_callable__ ( const IndexType& rowIdx, const IndexType& localIdx, const IndexType& column, Real& value ) mutable {
             value = matrix_view.getValues()[ matrix_view.getIndexer().getGlobalIndex( rowIdx, localIdx ) ];
          };
          this->forAllElements( f );
@@ -727,7 +727,7 @@ operator=( const TridiagonalMatrix< Real_, Device_, Index_, Organization_, RealA
          TridiagonalMatrix< Real, Device, Index, Organization_ > auxMatrix;
          auxMatrix = matrix;
          const auto matrix_view = auxMatrix.getView();
-         auto f = [=] __cuda_callable__ ( const IndexType& rowIdx, const IndexType& localIdx, const IndexType& column, Real& value, bool& compute ) mutable {
+         auto f = [=] __cuda_callable__ ( const IndexType& rowIdx, const IndexType& localIdx, const IndexType& column, Real& value ) mutable {
             value = matrix_view.getValues()[ matrix_view.getIndexer().getGlobalIndex( rowIdx, localIdx ) ];
          };
          this->forAllElements( f );
diff --git a/src/TNL/Matrices/TridiagonalMatrixView.h b/src/TNL/Matrices/TridiagonalMatrixView.h
index e05a8b05971b2fed4492a58b169474aec6dcdf78..c8e0ecdcad2e6f9b4fc45f2b56e09a0070866c5a 100644
--- a/src/TNL/Matrices/TridiagonalMatrixView.h
+++ b/src/TNL/Matrices/TridiagonalMatrixView.h
@@ -179,11 +179,6 @@ class TridiagonalMatrixView : public MatrixView< Real, Device, Index >
       template< typename Vector >
       void getCompressedRowLengths( Vector& rowLengths ) const;
 
-      //[[deprecated]]
-      //IndexType getRowLength( const IndexType row ) const;
-
-      //IndexType getMaxRowLength() const;
-
       /**
        * \brief Returns number of non-zero matrix elements.
        *
@@ -350,16 +345,28 @@ class TridiagonalMatrixView : public MatrixView< Real, Device, Index >
        * \brief Method for performing general reduction on matrix rows for constant instances.
        *
        * \tparam Fetch is a type of lambda function for data fetch declared as
-       *          `fetch( IndexType rowIdx, IndexType& columnIdx, RealType& elementValue ) -> FetchValue`.
-       *          The return type of this lambda can be any non void.
+       *
+       * ```
+       * auto fetch = [=] __cuda_callable__ ( IndexType rowIdx, IndexType& columnIdx, RealType& elementValue ) -> FetchValue { ... };
+       * ```
+       *
+       * The return type of this lambda can be any non-void type.
        * \tparam Reduce is a type of lambda function for reduction declared as
-       *          `reduce( const FetchValue& v1, const FetchValue& v2 ) -> FetchValue`.
-       * \tparam Keep is a type of lambda function for storing results of reduction in each row.
-       *          It is declared as `keep( const IndexType rowIdx, const double& value )`.
+       *
+       * ```
+       * auto reduce = [=] __cuda_callable__ ( const FetchValue& v1, const FetchValue& v2 ) -> FetchValue { ... };
+       * ```
+       *
+       * \tparam Keep is a type of lambda function for storing results of reduction in each row. It is declared as
+       *
+       * ```
+       * auto keep = [=] __cuda_callable__ ( const IndexType rowIdx, const double& value ) { ... };
+       * ```
+       *
        * \tparam FetchValue is type returned by the Fetch lambda function.
        *
-       * \param begin defines beginning of the range [begin,end) of rows to be processed.
-       * \param end defines ending of the range [begin,end) of rows to be processed.
+       * \param begin defines beginning of the range [ \e begin, \e end ) of rows to be processed.
+       * \param end defines ending of the range [ \e begin, \e end ) of rows to be processed.
        * \param fetch is an instance of lambda function for data fetch.
        * \param reduce is an instance of lambda function for reduction.
        * \param keep in an instance of lambda function for storing results.
@@ -373,22 +380,34 @@ class TridiagonalMatrixView : public MatrixView< Real, Device, Index >
        * \include TridiagonalMatrixViewExample_reduceRows.out
        */
       template< typename Fetch, typename Reduce, typename Keep, typename FetchReal >
-      void reduceRows( IndexType first, IndexType last, Fetch& fetch, Reduce& reduce, Keep& keep, const FetchReal& identity ) const;
+      void reduceRows( IndexType begin, IndexType end, Fetch& fetch, Reduce& reduce, Keep& keep, const FetchReal& zero ) const;
 
       /**
        * \brief Method for performing general reduction on matrix rows.
        *
        * \tparam Fetch is a type of lambda function for data fetch declared as
-       *          `fetch( IndexType rowIdx, IndexType& columnIdx, RealType& elementValue ) -> FetchValue`.
-       *          The return type of this lambda can be any non void.
+       *
+       * ```
+       * auto fetch = [=] __cuda_callable__ ( IndexType rowIdx, IndexType& columnIdx, RealType& elementValue ) -> FetchValue { ... };
+       * ```
+       *
+       * The return type of this lambda can be any non-void type.
        * \tparam Reduce is a type of lambda function for reduction declared as
-       *          `reduce( const FetchValue& v1, const FetchValue& v2 ) -> FetchValue`.
-       * \tparam Keep is a type of lambda function for storing results of reduction in each row.
-       *          It is declared as `keep( const IndexType rowIdx, const double& value )`.
+       *
+       * ```
+       * auto reduce = [=] __cuda_callable__ ( const FetchValue& v1, const FetchValue& v2 ) -> FetchValue { ... };
+       * ```
+       *
+       * \tparam Keep is a type of lambda function for storing results of reduction in each row. It is declared as
+       *
+       * ```
+       * auto keep = [=] __cuda_callable__ ( const IndexType rowIdx, const double& value ) { ... };
+       * ```
+       *
        * \tparam FetchValue is type returned by the Fetch lambda function.
        *
-       * \param begin defines beginning of the range [begin,end) of rows to be processed.
-       * \param end defines ending of the range [begin,end) of rows to be processed.
+       * \param begin defines beginning of the range [ \e begin, \e end ) of rows to be processed.
+       * \param end defines ending of the range [ \e begin, \e end ) of rows to be processed.
        * \param fetch is an instance of lambda function for data fetch.
        * \param reduce is an instance of lambda function for reduction.
        * \param keep in an instance of lambda function for storing results.
@@ -402,18 +421,30 @@ class TridiagonalMatrixView : public MatrixView< Real, Device, Index >
        * \include TridiagonalMatrixViewExample_reduceRows.out
        */
       template< typename Fetch, typename Reduce, typename Keep, typename FetchReal >
-      void reduceRows( IndexType first, IndexType last, Fetch& fetch, Reduce& reduce, Keep& keep, const FetchReal& identity );
+      void reduceRows( IndexType begin, IndexType end, Fetch& fetch, Reduce& reduce, Keep& keep, const FetchReal& zero );
 
       /**
        * \brief Method for performing general reduction on all matrix rows for constant instances.
        *
        * \tparam Fetch is a type of lambda function for data fetch declared as
-       *          `fetch( IndexType rowIdx, IndexType& columnIdx, RealType& elementValue ) -> FetchValue`.
-       *          The return type of this lambda can be any non void.
+       *
+       * ```
+       * auto fetch = [=] __cuda_callable__ ( IndexType rowIdx, IndexType& columnIdx, RealType& elementValue ) -> FetchValue { ... };
+       * ```
+       *
+       * The return type of this lambda can be any non-void type.
        * \tparam Reduce is a type of lambda function for reduction declared as
-       *          `reduce( const FetchValue& v1, const FetchValue& v2 ) -> FetchValue`.
-       * \tparam Keep is a type of lambda function for storing results of reduction in each row.
-       *          It is declared as `keep( const IndexType rowIdx, const double& value )`.
+       *
+       * ```
+       * auto reduce = [=] __cuda_callable__ ( const FetchValue& v1, const FetchValue& v2 ) -> FetchValue { ... };
+       * ```
+       *
+       * \tparam Keep is a type of lambda function for storing results of reduction in each row. It is declared as
+       *
+       * ```
+       * auto keep = [=] __cuda_callable__ ( const IndexType rowIdx, const double& value ) { ... };
+       * ```
+       *
        * \tparam FetchValue is type returned by the Fetch lambda function.
        *
        * \param fetch is an instance of lambda function for data fetch.
@@ -435,12 +466,24 @@ class TridiagonalMatrixView : public MatrixView< Real, Device, Index >
        * \brief Method for performing general reduction on all matrix rows.
        *
        * \tparam Fetch is a type of lambda function for data fetch declared as
-       *          `fetch( IndexType rowIdx, IndexType& columnIdx, RealType& elementValue ) -> FetchValue`.
-       *          The return type of this lambda can be any non void.
+       *
+       * ```
+       * auto fetch = [=] __cuda_callable__ ( IndexType rowIdx, IndexType& columnIdx, RealType& elementValue ) -> FetchValue { ... };
+       * ```
+       *
+       * The return type of this lambda can be any non-void type.
        * \tparam Reduce is a type of lambda function for reduction declared as
-       *          `reduce( const FetchValue& v1, const FetchValue& v2 ) -> FetchValue`.
-       * \tparam Keep is a type of lambda function for storing results of reduction in each row.
-       *          It is declared as `keep( const IndexType rowIdx, const double& value )`.
+       *
+       * ```
+       * auto reduce = [=] __cuda_callable__ ( const FetchValue& v1, const FetchValue& v2 ) -> FetchValue { ... };
+       * ```
+       *
+       * \tparam Keep is a type of lambda function for storing results of reduction in each row. It is declared as
+       *
+       * ```
+       * auto keep = [=] __cuda_callable__ ( const IndexType rowIdx, const double& value ) { ... };
+       * ```
+       *
        * \tparam FetchValue is type returned by the Fetch lambda function.
        *
        * \param fetch is an instance of lambda function for data fetch.
@@ -461,15 +504,16 @@ class TridiagonalMatrixView : public MatrixView< Real, Device, Index >
       /**
        * \brief Method for iteration over all matrix rows for constant instances.
        *
-       * \tparam Function is type of lambda function that will operate on matrix elements.
-       *    It is should have form like
-       *  `function( IndexType rowIdx, IndexType localIdx, IndexType columnIdx, const RealType& value, bool& compute )`.
-       *  The \e localIdx parameter is a rank of the non-zero element in given row.
-       *  If the 'compute' variable is set to false the iteration over the row can
-       *  be interrupted.
+       * \tparam Function is the type of the lambda function that will operate on matrix elements. It should have the following form
+       *
+       * ```
+       * auto function = [=] __cuda_callable__ ( IndexType rowIdx, IndexType localIdx, IndexType columnIdx, const RealType& value ) { ... };
+       * ```
+       *
+       * The \e localIdx parameter is a rank of the non-zero element in given row.
        *
-       * \param begin defines beginning of the range [begin,end) of rows to be processed.
-       * \param end defines ending of the range [begin,end) of rows to be processed.
+       * \param begin defines beginning of the range [ \e begin, \e end ) of rows to be processed.
+       * \param end defines ending of the range [ \e begin, \e end ) of rows to be processed.
        * \param function is an instance of the lambda function to be called in each row.
        *
        * \par Example
@@ -478,20 +522,21 @@ class TridiagonalMatrixView : public MatrixView< Real, Device, Index >
        * \include TridiagonalMatrixViewExample_forRows.out
        */
       template< typename Function >
-      void forElements( IndexType first, IndexType last, Function& function ) const;
+      void forElements( IndexType begin, IndexType end, Function& function ) const;
 
       /**
        * \brief Method for iteration over all matrix rows for non-constant instances.
        *
-       * \tparam Function is type of lambda function that will operate on matrix elements.
-       *    It is should have form like
-       *  `function( IndexType rowIdx, IndexType localIdx, IndexType columnIdx, const RealType& value, bool& compute )`.
+       * \tparam Function is the type of the lambda function that will operate on matrix elements. It should have the following form
+       *
+       * ```
+       * auto function = [=] __cuda_callable__ ( IndexType rowIdx, IndexType localIdx, IndexType columnIdx, const RealType& value ) { ... };
+       * ```
+       *
        *  The \e localIdx parameter is a rank of the non-zero element in given row.
-       *  If the 'compute' variable is set to false the iteration over the row can
-       *  be interrupted.
        *
-       * \param begin defines beginning of the range [begin,end) of rows to be processed.
-       * \param end defines ending of the range [begin,end) of rows to be processed.
+       * \param begin defines beginning of the range [ \e begin, \e end ) of rows to be processed.
+       * \param end defines ending of the range [ \e begin, \e end ) of rows to be processed.
        * \param function is an instance of the lambda function to be called in each row.
        *
        * \par Example
@@ -500,7 +545,7 @@ class TridiagonalMatrixView : public MatrixView< Real, Device, Index >
        * \include TridiagonalMatrixViewExample_forRows.out
        */
       template< typename Function >
-      void forElements( IndexType first, IndexType last, Function& function );
+      void forElements( IndexType begin, IndexType end, Function& function );
 
       /**
        * \brief This method calls \e forElements for all matrix rows (for constant instances).
@@ -637,15 +682,16 @@ class TridiagonalMatrixView : public MatrixView< Real, Device, Index >
       /**
        * \brief Method for sequential iteration over all matrix rows for constant instances.
        *
-       * \tparam Function is type of lambda function that will operate on matrix elements.
-       *    It is should have form like
-       *  `function( IndexType rowIdx, IndexType columnIdx, IndexType columnIdx_, const RealType& value, bool& compute )`.
-       *  The column index repeats twice only for compatibility with sparse matrices.
-       *  If the 'compute' variable is set to false the iteration over the row can
-       *  be interrupted.
+       * \tparam Function is the type of the lambda function that will operate on matrix rows. It should have the following form
+       *
+       * ```
+       * auto function = [] __cuda_callable__ ( RowView& row ) { ... };
+       * ```
+       *
+       * \e RowView represents matrix row - see \ref TNL::Matrices::TridiagonalMatrixView::RowView.
        *
-       * \param begin defines beginning of the range [begin,end) of rows to be processed.
-       * \param end defines ending of the range [begin,end) of rows to be processed.
+       * \param begin defines beginning of the range [ \e begin, \e end ) of rows to be processed.
+       * \param end defines ending of the range [ \e begin, \e end ) of rows to be processed.
        * \param function is an instance of the lambda function to be called in each row.
        */
       template< typename Function >
@@ -654,15 +700,16 @@ class TridiagonalMatrixView : public MatrixView< Real, Device, Index >
       /**
        * \brief Method for sequential iteration over all matrix rows for non-constant instances.
        *
-       * \tparam Function is type of lambda function that will operate on matrix elements.
-       *    It is should have form like
-       *  `function( IndexType rowIdx, IndexType columnIdx, IndexType columnIdx_, RealType& value, bool& compute )`.
-       *  The column index repeats twice only for compatibility with sparse matrices.
-       *  If the 'compute' variable is set to false the iteration over the row can
-       *  be interrupted.
+       * \tparam Function is the type of the lambda function that will operate on matrix rows. It should have the following form
        *
-       * \param begin defines beginning of the range [begin,end) of rows to be processed.
-       * \param end defines ending of the range [begin,end) of rows to be processed.
+       * ```
+       * auto function = [] __cuda_callable__ ( RowView& row ) { ... };
+       * ```
+       *
+       * \e RowView represents matrix row - see \ref TNL::Matrices::TridiagonalMatrixView::RowView.
+       *
+       * \param begin defines beginning of the range [ \e begin, \e end ) of rows to be processed.
+       * \param end defines ending of the range [ \e begin, \e end ) of rows to be processed.
        * \param function is an instance of the lambda function to be called in each row.
        */
       template< typename Function >
@@ -695,7 +742,9 @@ class TridiagonalMatrixView : public MatrixView< Real, Device, Index >
        *
        * More precisely, it computes:
        *
-       * `outVector = matrixMultiplicator * ( * this ) * inVector + outVectorMultiplicator * outVector`
+       * ```
+       * outVector = matrixMultiplicator * ( * this ) * inVector + outVectorMultiplicator * outVector
+       * ```
        *
        * \tparam InVector is type of input vector.  It can be \ref Vector,
        *     \ref VectorView, \ref Array, \ref ArraView or similar container.
diff --git a/src/TNL/Matrices/TridiagonalMatrixView.hpp b/src/TNL/Matrices/TridiagonalMatrixView.hpp
index 0ebad8cb64e627dcdc37549658c9b10fa3e6a68d..c67073381047ef8a282bae91fcb5e470f5a484d1 100644
--- a/src/TNL/Matrices/TridiagonalMatrixView.hpp
+++ b/src/TNL/Matrices/TridiagonalMatrixView.hpp
@@ -393,26 +393,25 @@ forElements( IndexType first, IndexType last, Function& function ) const
 {
    const auto values_view = this->values.getConstView();
    const auto indexer = this->indexer;
-   bool compute( true );
    auto f = [=] __cuda_callable__ ( IndexType rowIdx ) mutable {
       if( rowIdx == 0 )
       {
-         function( 0, 1, 0, values_view[ indexer.getGlobalIndex( 0, 1 ) ], compute );
-         function( 0, 2, 1, values_view[ indexer.getGlobalIndex( 0, 2 ) ], compute );
+         function( 0, 1, 0, values_view[ indexer.getGlobalIndex( 0, 1 ) ] );
+         function( 0, 2, 1, values_view[ indexer.getGlobalIndex( 0, 2 ) ] );
       }
       else if( rowIdx + 1 < indexer.getColumns() )
       {
-         function( rowIdx, 0, rowIdx - 1, values_view[ indexer.getGlobalIndex( rowIdx, 0 ) ], compute );
-         function( rowIdx, 1, rowIdx,     values_view[ indexer.getGlobalIndex( rowIdx, 1 ) ], compute );
-         function( rowIdx, 2, rowIdx + 1, values_view[ indexer.getGlobalIndex( rowIdx, 2 ) ], compute );
+         function( rowIdx, 0, rowIdx - 1, values_view[ indexer.getGlobalIndex( rowIdx, 0 ) ] );
+         function( rowIdx, 1, rowIdx,     values_view[ indexer.getGlobalIndex( rowIdx, 1 ) ] );
+         function( rowIdx, 2, rowIdx + 1, values_view[ indexer.getGlobalIndex( rowIdx, 2 ) ] );
       }
       else if( rowIdx < indexer.getColumns() )
       {
-         function( rowIdx, 0, rowIdx - 1, values_view[ indexer.getGlobalIndex( rowIdx, 0 ) ], compute );
-         function( rowIdx, 1, rowIdx,     values_view[ indexer.getGlobalIndex( rowIdx, 1 ) ], compute );
+         function( rowIdx, 0, rowIdx - 1, values_view[ indexer.getGlobalIndex( rowIdx, 0 ) ] );
+         function( rowIdx, 1, rowIdx,     values_view[ indexer.getGlobalIndex( rowIdx, 1 ) ] );
       }
       else
-         function( rowIdx, 0, rowIdx, values_view[ indexer.getGlobalIndex( rowIdx, 0 ) ], compute );
+         function( rowIdx, 0, rowIdx, values_view[ indexer.getGlobalIndex( rowIdx, 0 ) ] );
    };
    Algorithms::ParallelFor< DeviceType >::exec( first, last, f );
 }
@@ -428,26 +427,25 @@ forElements( IndexType first, IndexType last, Function& function )
 {
    auto values_view = this->values.getView();
    const auto indexer = this->indexer;
-   bool compute( true );
    auto f = [=] __cuda_callable__ ( IndexType rowIdx ) mutable {
       if( rowIdx == 0 )
       {
-         function( 0, 1, 0, values_view[ indexer.getGlobalIndex( 0, 1 ) ], compute );
-         function( 0, 2, 1, values_view[ indexer.getGlobalIndex( 0, 2 ) ], compute );
+         function( 0, 1, 0, values_view[ indexer.getGlobalIndex( 0, 1 ) ] );
+         function( 0, 2, 1, values_view[ indexer.getGlobalIndex( 0, 2 ) ] );
       }
       else if( rowIdx + 1 < indexer.getColumns() )
       {
-         function( rowIdx, 0, rowIdx - 1, values_view[ indexer.getGlobalIndex( rowIdx, 0 ) ], compute );
-         function( rowIdx, 1, rowIdx,     values_view[ indexer.getGlobalIndex( rowIdx, 1 ) ], compute );
-         function( rowIdx, 2, rowIdx + 1, values_view[ indexer.getGlobalIndex( rowIdx, 2 ) ], compute );
+         function( rowIdx, 0, rowIdx - 1, values_view[ indexer.getGlobalIndex( rowIdx, 0 ) ] );
+         function( rowIdx, 1, rowIdx,     values_view[ indexer.getGlobalIndex( rowIdx, 1 ) ] );
+         function( rowIdx, 2, rowIdx + 1, values_view[ indexer.getGlobalIndex( rowIdx, 2 ) ] );
       }
       else if( rowIdx < indexer.getColumns() )
       {
-         function( rowIdx, 0, rowIdx - 1, values_view[ indexer.getGlobalIndex( rowIdx, 0 ) ], compute );
-         function( rowIdx, 1, rowIdx,     values_view[ indexer.getGlobalIndex( rowIdx, 1 ) ], compute );
+         function( rowIdx, 0, rowIdx - 1, values_view[ indexer.getGlobalIndex( rowIdx, 0 ) ] );
+         function( rowIdx, 1, rowIdx,     values_view[ indexer.getGlobalIndex( rowIdx, 1 ) ] );
       }
       else
-         function( rowIdx, 0, rowIdx, values_view[ indexer.getGlobalIndex( rowIdx, 0 ) ], compute );
+         function( rowIdx, 0, rowIdx, values_view[ indexer.getGlobalIndex( rowIdx, 0 ) ] );
    };
    Algorithms::ParallelFor< DeviceType >::exec( first, last, f );
 }
@@ -663,13 +661,13 @@ addMatrix( const TridiagonalMatrixView< Real_, Device_, Index_, Organization_ >&
       const auto matrix_view = matrix;
       const auto matrixMult = matrixMultiplicator;
       const auto thisMult = thisMatrixMultiplicator;
-      auto add0 = [=] __cuda_callable__ ( const IndexType& rowIdx, const IndexType& localIdx, const IndexType& column, Real& value, bool& compute ) mutable {
+      auto add0 = [=] __cuda_callable__ ( const IndexType& rowIdx, const IndexType& localIdx, const IndexType& column, Real& value ) mutable {
          value = matrixMult * matrix.getValues()[ matrix.getIndexer().getGlobalIndex( rowIdx, localIdx ) ];
       };
-      auto add1 = [=] __cuda_callable__ ( const IndexType& rowIdx, const IndexType& localIdx, const IndexType& column, Real& value, bool& compute ) mutable {
+      auto add1 = [=] __cuda_callable__ ( const IndexType& rowIdx, const IndexType& localIdx, const IndexType& column, Real& value ) mutable {
          value += matrixMult * matrix.getValues()[ matrix.getIndexer().getGlobalIndex( rowIdx, localIdx ) ];
       };
-      auto addGen = [=] __cuda_callable__ ( const IndexType& rowIdx, const IndexType& localIdx, const IndexType& column, Real& value, bool& compute ) mutable {
+      auto addGen = [=] __cuda_callable__ ( const IndexType& rowIdx, const IndexType& localIdx, const IndexType& column, Real& value ) mutable {
          value = thisMult * value + matrixMult * matrix.getValues()[ matrix.getIndexer().getGlobalIndex( rowIdx, localIdx ) ];
       };
       if( thisMult == 0.0 )
diff --git a/src/TNL/Matrices/details/SparseMatrixRowViewValueGetter.h b/src/TNL/Matrices/details/SparseMatrixRowViewValueGetter.h
new file mode 100644
index 0000000000000000000000000000000000000000..ce696e56e85cd0a5d567f7972cd5b95f3ca11186
--- /dev/null
+++ b/src/TNL/Matrices/details/SparseMatrixRowViewValueGetter.h
@@ -0,0 +1,79 @@
+
+
+/***************************************************************************
+                          SparseMatrixRowViewValueGetter.h  -  description
+                             -------------------
+    begin                : May 4, 2021
+    copyright            : (C) 2021 by Tomas Oberhuber
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+#pragma once
+
+namespace TNL {
+   namespace Matrices {
+      namespace details {
+
+
+template< typename SegmentView,
+          typename ValuesView,
+          typename ColumnsIndexesView,
+          typename Real = std::remove_const_t<typename ValuesView::RealType >,
+          bool isBinary_ = std::is_same< std::remove_const_t<typename ValuesView::RealType >, bool >::value >
+struct SparseMatrixRowViewValueGetter {};
+
+template< typename SegmentView,
+          typename ValuesView,
+          typename ColumnsIndexesView,
+          typename Real >
+struct SparseMatrixRowViewValueGetter< SegmentView, ValuesView, ColumnsIndexesView, Real, true >
+{
+   using RealType = typename ValuesView::RealType;
+
+   using IndexType = typename ColumnsIndexesView::IndexType;
+
+   using ResultType = bool;
+
+   using ConstResultType = bool;
+
+   __cuda_callable__
+   static bool getValue( const IndexType& globalIdx, const ValuesView& values, const ColumnsIndexesView& columnIndexes, const IndexType& paddingIndex )
+   {
+      if( columnIndexes[ globalIdx ] != paddingIndex )
+         return true;
+      return false;
+   };
+};
+
+template< typename SegmentView,
+          typename ValuesView,
+          typename ColumnsIndexesView,
+          typename Real >
+struct SparseMatrixRowViewValueGetter< SegmentView, ValuesView, ColumnsIndexesView, Real, false >
+{
+   using RealType = typename ValuesView::RealType;
+
+   using IndexType = typename ColumnsIndexesView::IndexType;
+
+   using ResultType = RealType&;
+
+   using ConstResultType = const RealType&;
+
+   __cuda_callable__
+   static const RealType& getValue( const IndexType& globalIdx, const ValuesView& values, const ColumnsIndexesView& columnIndexes, const IndexType& paddingIndex )
+   {
+      return values[ globalIdx ];
+   };
+
+   __cuda_callable__
+   static RealType& getValue( const IndexType& globalIdx, ValuesView& values, ColumnsIndexesView& columnIndexes, const IndexType& paddingIndex )
+   {
+      return values[ globalIdx ];
+   };
+};
+
+      } //namespace details
+   } //namespace Matrices
+} //namespace TNL
diff --git a/src/UnitTests/Algorithms/Segments/SegmentsTest.hpp b/src/UnitTests/Algorithms/Segments/SegmentsTest.hpp
index 7073bdb8ab4dc2aa5aefa6013b6867d5e2a6746b..f6d5a9c8c100d0e8b419ca4d02c0567f6c047d49 100644
--- a/src/UnitTests/Algorithms/Segments/SegmentsTest.hpp
+++ b/src/UnitTests/Algorithms/Segments/SegmentsTest.hpp
@@ -112,7 +112,7 @@ void test_SetSegmentsSizes_EqualSizes_EllpackOnly()
 }
 
 template< typename Segments >
-void test_AllReduction_MaximumInSegments()
+void test_reduceAllSegments_MaximumInSegments()
 {
    using DeviceType = typename Segments::DeviceType;
    using IndexType = typename Segments::IndexType;
@@ -128,7 +128,7 @@ void test_AllReduction_MaximumInSegments()
    TNL::Containers::Vector< IndexType, DeviceType, IndexType > v( segments.getStorageSize() );
 
    auto view = v.getView();
-   auto init = [=] __cuda_callable__ ( const IndexType segmentIdx, const IndexType localIdx, const IndexType globalIdx, bool& compute ) mutable -> bool {
+   auto init = [=] __cuda_callable__ ( const IndexType segmentIdx, const IndexType localIdx, const IndexType globalIdx ) mutable -> bool {
       view[ globalIdx ] =  segmentIdx * 5 + localIdx + 1;
       return true;
    };
@@ -147,13 +147,13 @@ void test_AllReduction_MaximumInSegments()
    auto keep = [=] __cuda_callable__ ( const IndexType i, const IndexType a ) mutable {
       result_view[ i ] = a;
    };
-   segments.allReduction( fetch, reduce, keep, std::numeric_limits< IndexType >::min() );
+   segments.reduceAllSegments( fetch, reduce, keep, std::numeric_limits< IndexType >::min() );
 
    for( IndexType i = 0; i < segmentsCount; i++ )
       EXPECT_EQ( result.getElement( i ), ( i + 1 ) * segmentSize );
 
    result_view = 0;
-   segments.getView().allReduction( fetch, reduce, keep, std::numeric_limits< IndexType >::min() );
+   segments.getView().reduceAllSegments( fetch, reduce, keep, std::numeric_limits< IndexType >::min() );
    for( IndexType i = 0; i < segmentsCount; i++ )
       EXPECT_EQ( result.getElement( i ), ( i + 1 ) * segmentSize );
 }
diff --git a/src/UnitTests/Algorithms/Segments/SegmentsTest_CSR.h b/src/UnitTests/Algorithms/Segments/SegmentsTest_CSR.h
index 74219f7db8c27bc768006cb569b1edaa3df5556d..b1b58771229c8ce567da07343ffbc4a724836c6b 100644
--- a/src/UnitTests/Algorithms/Segments/SegmentsTest_CSR.h
+++ b/src/UnitTests/Algorithms/Segments/SegmentsTest_CSR.h
@@ -44,11 +44,11 @@ TYPED_TEST( CSRSegmentsTest, setSegmentsSizes_EqualSizes )
     test_SetSegmentsSizes_EqualSizes< CSRSegmentsType >();
 }
 
-TYPED_TEST( CSRSegmentsTest, allReduction_MaximumInSegments )
+TYPED_TEST( CSRSegmentsTest, reduceAllSegments_MaximumInSegments )
 {
     using CSRSegmentsType = typename TestFixture::CSRSegmentsType;
 
-    test_AllReduction_MaximumInSegments< CSRSegmentsType >();
+    test_reduceAllSegments_MaximumInSegments< CSRSegmentsType >();
 }
 
 #endif
diff --git a/src/UnitTests/Algorithms/Segments/SegmentsTest_Ellpack.h b/src/UnitTests/Algorithms/Segments/SegmentsTest_Ellpack.h
index 262ddce6d77246803ea5e3ef826f4c72b8444c7c..af9816ee486068e79cdb72ff961951968d14436c 100644
--- a/src/UnitTests/Algorithms/Segments/SegmentsTest_Ellpack.h
+++ b/src/UnitTests/Algorithms/Segments/SegmentsTest_Ellpack.h
@@ -51,11 +51,11 @@ TYPED_TEST( EllpackSegmentsTest, setSegmentsSizes_EqualSizes_EllpackOnly )
     test_SetSegmentsSizes_EqualSizes_EllpackOnly< EllpackSegmentsType >();
 }
 
-TYPED_TEST( EllpackSegmentsTest, allReduction_MaximumInSegments )
+TYPED_TEST( EllpackSegmentsTest, reduceAllSegments_MaximumInSegments )
 {
     using EllpackSegmentsType = typename TestFixture::EllpackSegmentsType;
 
-    test_AllReduction_MaximumInSegments< EllpackSegmentsType >();
+    test_reduceAllSegments_MaximumInSegments< EllpackSegmentsType >();
 }
 
 #endif
diff --git a/src/UnitTests/Algorithms/Segments/SegmentsTest_SlicedEllpack.h b/src/UnitTests/Algorithms/Segments/SegmentsTest_SlicedEllpack.h
index 42a9e76525ca44dc6c7bebb1731a5b5c1d5eab6b..2cd9fcd1cbe8f557fca30c781bd87dfed5329631 100644
--- a/src/UnitTests/Algorithms/Segments/SegmentsTest_SlicedEllpack.h
+++ b/src/UnitTests/Algorithms/Segments/SegmentsTest_SlicedEllpack.h
@@ -44,11 +44,11 @@ TYPED_TEST( SlicedEllpackSegmentsTest, setSegmentsSizes_EqualSizes )
     test_SetSegmentsSizes_EqualSizes< SlicedEllpackSegmentsType >();
 }
 
-TYPED_TEST( SlicedEllpackSegmentsTest, allReduction_MaximumInSegments )
+TYPED_TEST( SlicedEllpackSegmentsTest, reduceAllSegments_MaximumInSegments )
 {
     using SlicedEllpackSegmentsType = typename TestFixture::SlicedEllpackSegmentsType;
 
-    test_AllReduction_MaximumInSegments< SlicedEllpackSegmentsType >();
+    test_reduceAllSegments_MaximumInSegments< SlicedEllpackSegmentsType >();
 }
 
 #endif
diff --git a/src/UnitTests/Matrices/CMakeLists.txt b/src/UnitTests/Matrices/CMakeLists.txt
index ef639fc440f71af822eb5822209fe802bb590d1b..2fe0f39ee7ef3a5190a86fd38e9e08768614f9e4 100644
--- a/src/UnitTests/Matrices/CMakeLists.txt
+++ b/src/UnitTests/Matrices/CMakeLists.txt
@@ -8,11 +8,21 @@ set( COMMON_TESTS
             SparseMatrixTest_CSRScalar
             SparseMatrixTest_CSRVector
             SparseMatrixTest_CSRHybrid
+            SparseMatrixTest_CSRLight
             SparseMatrixTest_CSRAdaptive
             SparseMatrixTest_Ellpack
             SparseMatrixTest_SlicedEllpack
             SparseMatrixTest_ChunkedEllpack
             SparseMatrixTest_BiEllpack
+            SparseMatrixVectorProductTest_CSRScalar
+            SparseMatrixVectorProductTest_CSRVector
+            SparseMatrixVectorProductTest_CSRHybrid
+            SparseMatrixVectorProductTest_CSRLight
+            SparseMatrixVectorProductTest_CSRAdaptive
+            SparseMatrixVectorProductTest_Ellpack
+            SparseMatrixVectorProductTest_SlicedEllpack
+            SparseMatrixVectorProductTest_ChunkedEllpack
+            SparseMatrixVectorProductTest_BiEllpack
             SparseMatrixCopyTest
             BinarySparseMatrixTest_CSR
             BinarySparseMatrixTest_Ellpack
@@ -20,6 +30,9 @@ set( COMMON_TESTS
             BinarySparseMatrixCopyTest
             SymmetricSparseMatrixTest_CSR
             LambdaMatrixTest
+            SparseMatrixTest_SandboxMatrix
+            SparseMatrixVectorProductTest_SandboxMatrix
+            MatrixWrappingTest
 )
 
 set( CPP_TESTS
diff --git a/src/UnitTests/Matrices/DenseMatrixTest.h b/src/UnitTests/Matrices/DenseMatrixTest.h
index ceb7ae358855a35d3dba2848375a76f8994fe88b..4558387e470dab33b41afbd1b9b4aecbda62dfc1 100644
--- a/src/UnitTests/Matrices/DenseMatrixTest.h
+++ b/src/UnitTests/Matrices/DenseMatrixTest.h
@@ -791,7 +791,7 @@ void test_ForElements()
    const IndexType rows = 8;
 
    Matrix m( rows, cols  );
-   m.forAllElements( [] __cuda_callable__ ( IndexType rowIdx, IndexType localIdx, const IndexType& columnIdx, RealType& value, bool compute ) mutable {
+   m.forAllElements( [] __cuda_callable__ ( IndexType rowIdx, IndexType localIdx, const IndexType& columnIdx, RealType& value ) mutable {
       value = rowIdx + 1.0;
    } );
 
@@ -996,6 +996,48 @@ void test_VectorProduct()
     EXPECT_EQ( outVector.getElement( 4 ), 148 );
 }
 
+template< typename Matrix >
+void test_LargeVectorProduct()
+{
+   using RealType = typename Matrix::RealType;
+   using DeviceType = typename Matrix::DeviceType;
+   using IndexType = typename Matrix::IndexType;
+
+   if( std::is_same< IndexType, short >::value )
+      return;
+
+   const IndexType rows = 5000;
+   const IndexType cols = 5000;
+
+   Matrix m( rows, cols );
+   m.forAllElements(
+      [] __cuda_callable__ ( IndexType rowIdx, IndexType columnIdx, IndexType columnIdx_, RealType& value ) {
+         value = columnIdx + 1.0;
+      }
+   );
+
+
+   using VectorType = TNL::Containers::Vector< RealType, DeviceType, IndexType >;
+
+   VectorType inVector( cols );
+   inVector.forAllElements( [] __cuda_callable__ ( IndexType i, RealType& value ) {
+      value = 1.0;
+   } );
+
+   VectorType outVector( rows, 0.0 );
+
+   m.vectorProduct( inVector, outVector);
+
+   for( IndexType i = 0; i < rows; i++ )
+   {
+      //RealType diag = ( i % 2 == 1 ? cols - 1 : -cols + 1 );
+      //RealType non_diag = ( cols % 2 == 0 ? 0.0 : 1.0 );
+      RealType rcols = cols;
+      EXPECT_EQ( outVector.getElement( i ),  ( 0.5 * rcols ) * ( rcols + 1.0 ) );
+   }
+}
+
+
 template< typename Matrix >
 void test_AddMatrix()
 {
@@ -1622,6 +1664,13 @@ TYPED_TEST( MatrixTest, vectorProductTest )
     test_VectorProduct< MatrixType >();
 }
 
+TYPED_TEST( MatrixTest, largeVectorProductTest )
+{
+    using MatrixType = typename TestFixture::MatrixType;
+
+    test_LargeVectorProduct< MatrixType >();
+}
+
 TYPED_TEST( MatrixTest, addMatrixTest )
 {
     using MatrixType = typename TestFixture::MatrixType;
diff --git a/src/UnitTests/Matrices/MatrixWrappingTest.cpp b/src/UnitTests/Matrices/MatrixWrappingTest.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..db87ce48232aedad8c57dd2e20129fa522b9b7e4
--- /dev/null
+++ b/src/UnitTests/Matrices/MatrixWrappingTest.cpp
@@ -0,0 +1,11 @@
+/***************************************************************************
+                          MatrixWrappingTest.cpp -  description
+                             -------------------
+    begin                : Mar 4, 2021
+    copyright            : (C) 2021 by Tomas Oberhuber et al.
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+#include "MatrixWrappingTest.h"
diff --git a/src/UnitTests/Matrices/MatrixWrappingTest.cu b/src/UnitTests/Matrices/MatrixWrappingTest.cu
new file mode 100644
index 0000000000000000000000000000000000000000..8dd0849dc17dd0f8e40d7fc042d01970f807c8b0
--- /dev/null
+++ b/src/UnitTests/Matrices/MatrixWrappingTest.cu
@@ -0,0 +1,11 @@
+/***************************************************************************
+                          MatrixWrappingTest.cu -  description
+                             -------------------
+    begin                : Mar 4, 2021
+    copyright            : (C) 2021 by Tomas Oberhuber et al.
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+#include "MatrixWrappingTest.h"
diff --git a/src/UnitTests/Matrices/MatrixWrappingTest.h b/src/UnitTests/Matrices/MatrixWrappingTest.h
new file mode 100644
index 0000000000000000000000000000000000000000..9da8421d5df8e6f0184c6fa763cdc9ab8e867792
--- /dev/null
+++ b/src/UnitTests/Matrices/MatrixWrappingTest.h
@@ -0,0 +1,113 @@
+/***************************************************************************
+                          MatrixWrappingTest.h -  description
+                             -------------------
+    begin                : Mar 21, 2020
+    copyright            : (C) 2020 by Tomas Oberhuber et al.
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+#pragma once
+
+#include <TNL/Containers/Vector.h>
+#include <TNL/Containers/VectorView.h>
+#include <TNL/Math.h>
+#include <TNL/Matrices/DenseMatrix.h>
+#include <TNL/Matrices/SparseMatrix.h>
+#include <TNL/Matrices/MatrixWrapping.h>
+#include <TNL/Algorithms/Segments/Ellpack.h>
+#include <iostream>
+#include <sstream>
+
+#ifdef HAVE_GTEST
+#include <gtest/gtest.h>
+
+template< typename Device_, typename Index_, typename IndexAllocator_ >
+using RowMajorEllpack = TNL::Algorithms::Segments::Ellpack< Device_, Index_, IndexAllocator_, TNL::Algorithms::Segments::RowMajorOrder, 1 >;
+
+// test fixture for typed tests
+template< typename Matrix >
+class MatrixTest : public ::testing::Test
+{
+protected:
+   using MatrixType = Matrix;
+};
+
+
+// types for which MatrixTest is instantiated
+// (Host variants always; Cuda variants only when HAVE_CUDA is defined)
+using MatrixTypes = ::testing::Types
+<
+    TNL::Matrices::DenseMatrix< int,    TNL::Devices::Host, short >,
+    TNL::Matrices::DenseMatrix< long,   TNL::Devices::Host, short >,
+    TNL::Matrices::DenseMatrix< float,  TNL::Devices::Host, short >,
+    TNL::Matrices::DenseMatrix< double, TNL::Devices::Host, short >,
+    TNL::Matrices::DenseMatrix< int,    TNL::Devices::Host, int >,
+    TNL::Matrices::DenseMatrix< long,   TNL::Devices::Host, int >,
+    TNL::Matrices::DenseMatrix< float,  TNL::Devices::Host, int >,
+    TNL::Matrices::DenseMatrix< double, TNL::Devices::Host, int >,
+    TNL::Matrices::DenseMatrix< int,    TNL::Devices::Host, long >,
+    TNL::Matrices::DenseMatrix< long,   TNL::Devices::Host, long >,
+    TNL::Matrices::DenseMatrix< float,  TNL::Devices::Host, long >,
+    TNL::Matrices::DenseMatrix< double, TNL::Devices::Host, long >
+#ifdef HAVE_CUDA
+    ,TNL::Matrices::DenseMatrix< int,    TNL::Devices::Cuda, short >,
+    TNL::Matrices::DenseMatrix< long,   TNL::Devices::Cuda, short >,
+    TNL::Matrices::DenseMatrix< float,  TNL::Devices::Cuda, short >,
+    TNL::Matrices::DenseMatrix< double, TNL::Devices::Cuda, short >,
+    TNL::Matrices::DenseMatrix< int,    TNL::Devices::Cuda, int >,
+    TNL::Matrices::DenseMatrix< long,   TNL::Devices::Cuda, int >,
+    TNL::Matrices::DenseMatrix< float,  TNL::Devices::Cuda, int >,
+    TNL::Matrices::DenseMatrix< double, TNL::Devices::Cuda, int >,
+    TNL::Matrices::DenseMatrix< int,    TNL::Devices::Cuda, long >,
+    TNL::Matrices::DenseMatrix< long,   TNL::Devices::Cuda, long >,
+    TNL::Matrices::DenseMatrix< float,  TNL::Devices::Cuda, long >,
+    TNL::Matrices::DenseMatrix< double, TNL::Devices::Cuda, long >
+#endif
+>;
+
+
+TYPED_TEST_SUITE( MatrixTest, MatrixTypes);
+
+TYPED_TEST( MatrixTest, WrapMatrix )
+{
+   using DenseMatrix = typename TestFixture::MatrixType;
+   using RealType  = typename DenseMatrix::RealType;
+   using DeviceType  = typename DenseMatrix::DeviceType;
+   using IndexType  = typename DenseMatrix::IndexType;
+   using CSRMatrix = TNL::Matrices::SparseMatrix< RealType, DeviceType, IndexType,  TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRScalar >;
+   using EllpackMatrix = TNL::Matrices::SparseMatrix< RealType, DeviceType, IndexType, TNL::Matrices::GeneralMatrix, RowMajorEllpack >;
+   // Reference 4x4 matrix; no row holds more than two nonzero entries.
+   DenseMatrix denseMatrix{
+    { 1,  2,  0,  0 },
+    { 0,  6,  0,  0 },
+    { 9,  0,  0,  0 },
+    { 0,  0, 15, 16 } };
+   IndexType rows( 4 ), columns( 4 );
+   CSRMatrix csrMatrix;
+   EllpackMatrix ellpackMatrix;
+   csrMatrix = ellpackMatrix = denseMatrix;
+   // Fetch, via getData(), the internal arrays of the dense, CSR and Ellpack matrices.
+   auto denseMatrixValues  = denseMatrix.getValues().getData();
+
+   auto csrMatrixValues = csrMatrix.getValues().getData();
+   auto csrMatrixColumnIndexes = csrMatrix.getColumnIndexes().getData();
+   auto csrMatrixRowPointers = csrMatrix.getSegments().getOffsets().getData();
+
+   auto ellpackMatrixValues = ellpackMatrix.getValues().getData();
+   auto ellpackMatrixColumnIndexes = ellpackMatrix.getColumnIndexes().getData();
+   // Wrap those arrays again; the literal 2 is presumably the Ellpack slots per row (max nonzeros above) - TODO confirm.
+   auto wrappedDenseMatrix   = TNL::Matrices::wrapDenseMatrix< DeviceType >( rows, columns, denseMatrixValues );
+   auto wrappedCSRMatrix     = TNL::Matrices::wrapCSRMatrix< DeviceType >( rows, columns, csrMatrixRowPointers, csrMatrixValues, csrMatrixColumnIndexes );
+   auto wrappedEllpackMatrix = TNL::Matrices::wrapEllpackMatrix< DeviceType, TNL::Algorithms::Segments::RowMajorOrder >( rows, columns, ( IndexType ) 2, ellpackMatrixValues, ellpackMatrixColumnIndexes );
+   // Each wrapped matrix is expected to compare equal to the matrix its arrays came from.
+   EXPECT_EQ( denseMatrix, wrappedDenseMatrix );
+   EXPECT_EQ( csrMatrix, wrappedCSRMatrix );
+   EXPECT_EQ( ellpackMatrix, wrappedEllpackMatrix );
+}
+
+
+#include "../main.h"
+
+#endif
diff --git a/src/UnitTests/Matrices/SparseMatrixTest.h b/src/UnitTests/Matrices/SparseMatrixTest.h
index 1ae0fda8a0115d1e49e24c32dffdfd5c9c222a5a..68d7bedb025a1a389417b8eed89d87ded531f9bd 100644
--- a/src/UnitTests/Matrices/SparseMatrixTest.h
+++ b/src/UnitTests/Matrices/SparseMatrixTest.h
@@ -88,13 +88,6 @@ TYPED_TEST( MatrixTest, addElementTest )
     test_AddElement< MatrixType >();
 }
 
-TYPED_TEST( MatrixTest, vectorProductTest )
-{
-    using MatrixType = typename TestFixture::MatrixType;
-
-    test_VectorProduct< MatrixType >();
-}
-
 TYPED_TEST( MatrixTest, forElements )
 {
     using MatrixType = typename TestFixture::MatrixType;
diff --git a/src/UnitTests/Matrices/SparseMatrixTest.hpp b/src/UnitTests/Matrices/SparseMatrixTest.hpp
index f906adfbf1285f72f16a51e58de4bcd4b4222fb8..1716b0ab894d506fe8eed2f13eb29821cb2266cf 100644
--- a/src/UnitTests/Matrices/SparseMatrixTest.hpp
+++ b/src/UnitTests/Matrices/SparseMatrixTest.hpp
@@ -1026,371 +1026,6 @@ void test_AddElement()
    EXPECT_EQ( m.getElement( 5, 4 ), 20 );
 }
 
-template< typename Matrix >
-void test_VectorProduct()
-{
-   using RealType = typename Matrix::RealType;
-   using DeviceType = typename Matrix::DeviceType;
-   using IndexType = typename Matrix::IndexType;
-   using VectorType = TNL::Containers::Vector< RealType, DeviceType, IndexType >;
-
-   /*
-    * Sets up the following 4x4 sparse matrix:
-    *
-    *    /  1  0  0  0 \
-    *    |  0  2  0  3 |
-    *    |  0  4  0  0 |
-    *    \  0  0  5  0 /
-    */
-
-   const IndexType m_rows_1 = 4;
-   const IndexType m_cols_1 = 4;
-
-   Matrix m_1;
-   m_1.reset();
-   m_1.setDimensions( m_rows_1, m_cols_1 );
-   typename Matrix::RowsCapacitiesType rowLengths_1{ 1, 2, 1, 1 };
-   m_1.setRowCapacities( rowLengths_1 );
-
-   RealType value_1 = 1;
-   m_1.setElement( 0, 0, value_1++ );      // 0th row
-
-   m_1.setElement( 1, 1, value_1++ );      // 1st row
-   m_1.setElement( 1, 3, value_1++ );
-
-   m_1.setElement( 2, 1, value_1++ );      // 2nd row
-
-   m_1.setElement( 3, 2, value_1++ );      // 3rd row
-
-   VectorType inVector_1;
-   inVector_1.setSize( m_cols_1 );
-   for( IndexType i = 0; i < inVector_1.getSize(); i++ )
-       inVector_1.setElement( i, 2 );
-
-   VectorType outVector_1;
-   outVector_1.setSize( m_rows_1 );
-   for( IndexType j = 0; j < outVector_1.getSize(); j++ )
-       outVector_1.setElement( j, 0 );
-
-   m_1.vectorProduct( inVector_1, outVector_1 );
-   EXPECT_EQ( outVector_1.getElement( 0 ),  2 );
-   EXPECT_EQ( outVector_1.getElement( 1 ), 10 );
-   EXPECT_EQ( outVector_1.getElement( 2 ),  8 );
-   EXPECT_EQ( outVector_1.getElement( 3 ), 10 );
-
-   /*
-    * Sets up the following 4x4 sparse matrix:
-    *
-    *    /  1  2  3  0 \
-    *    |  0  0  0  4 |
-    *    |  5  6  7  0 |
-    *    \  0  8  0  0 /
-    */
-
-   const IndexType m_rows_2 = 4;
-   const IndexType m_cols_2 = 4;
-
-   Matrix m_2( m_rows_2, m_cols_2 );
-   typename Matrix::RowsCapacitiesType rowLengths_2{ 3, 1, 3, 1 };
-   m_2.setRowCapacities( rowLengths_2 );
-
-   RealType value_2 = 1;
-   for( IndexType i = 0; i < 3; i++ )      // 0th row
-      m_2.setElement( 0, i, value_2++ );
-
-   m_2.setElement( 1, 3, value_2++ );      // 1st row
-
-   for( IndexType i = 0; i < 3; i++ )      // 2nd row
-      m_2.setElement( 2, i, value_2++ );
-
-   for( IndexType i = 1; i < 2; i++ )      // 3rd row
-      m_2.setElement( 3, i, value_2++ );
-
-   VectorType inVector_2;
-   inVector_2.setSize( m_cols_2 );
-   for( IndexType i = 0; i < inVector_2.getSize(); i++ )
-      inVector_2.setElement( i, 2 );
-
-   VectorType outVector_2;
-   outVector_2.setSize( m_rows_2 );
-   for( IndexType j = 0; j < outVector_2.getSize(); j++ )
-      outVector_2.setElement( j, 0 );
-
-   m_2.vectorProduct( inVector_2, outVector_2 );
-
-   EXPECT_EQ( outVector_2.getElement( 0 ), 12 );
-   EXPECT_EQ( outVector_2.getElement( 1 ),  8 );
-   EXPECT_EQ( outVector_2.getElement( 2 ), 36 );
-   EXPECT_EQ( outVector_2.getElement( 3 ), 16 );
-
-   /*
-    * Sets up the following 4x4 sparse matrix:
-    *
-    *    /  1  2  3  0 \
-    *    |  0  4  5  6 |
-    *    |  7  8  9  0 |
-    *    \  0 10 11 12 /
-    */
-
-   const IndexType m_rows_3 = 4;
-   const IndexType m_cols_3 = 4;
-
-   Matrix m_3( m_rows_3, m_cols_3 );
-   typename Matrix::RowsCapacitiesType rowLengths_3{ 3, 3, 3, 3 };
-   m_3.setRowCapacities( rowLengths_3 );
-
-   RealType value_3 = 1;
-   for( IndexType i = 0; i < 3; i++ )          // 0th row
-      m_3.setElement( 0, i, value_3++ );
-
-   for( IndexType i = 1; i < 4; i++ )
-      m_3.setElement( 1, i, value_3++ );      // 1st row
-
-   for( IndexType i = 0; i < 3; i++ )          // 2nd row
-      m_3.setElement( 2, i, value_3++ );
-
-   for( IndexType i = 1; i < 4; i++ )          // 3rd row
-      m_3.setElement( 3, i, value_3++ );
-
-   VectorType inVector_3;
-   inVector_3.setSize( m_cols_3 );
-   for( IndexType i = 0; i < inVector_3.getSize(); i++ )
-      inVector_3.setElement( i, 2 );
-
-   VectorType outVector_3;
-   outVector_3.setSize( m_rows_3 );
-   for( IndexType j = 0; j < outVector_3.getSize(); j++ )
-      outVector_3.setElement( j, 0 );
-
-   m_3.vectorProduct( inVector_3, outVector_3 );
-
-   EXPECT_EQ( outVector_3.getElement( 0 ), 12 );
-   EXPECT_EQ( outVector_3.getElement( 1 ), 30 );
-   EXPECT_EQ( outVector_3.getElement( 2 ), 48 );
-   EXPECT_EQ( outVector_3.getElement( 3 ), 66 );
-
-   /*
-    * Sets up the following 8x8 sparse matrix:
-    *
-    *    /  1  2  3  0  0  4  0  0 \
-    *    |  0  5  6  7  8  0  0  0 |
-    *    |  9 10 11 12 13  0  0  0 |
-    *    |  0 14 15 16 17  0  0  0 |
-    *    |  0  0 18 19 20 21  0  0 |
-    *    |  0  0  0 22 23 24 25  0 |
-    *    | 26 27 28 29 30  0  0  0 |
-    *    \ 31 32 33 34 35  0  0  0 /
-    */
-
-   const IndexType m_rows_4 = 8;
-   const IndexType m_cols_4 = 8;
-
-   Matrix m_4( m_rows_4, m_cols_4 );
-   typename Matrix::RowsCapacitiesType rowLengths_4{ 4, 4, 5, 4, 4, 4, 5, 5 };
-   m_4.setRowCapacities( rowLengths_4 );
-
-   RealType value_4 = 1;
-   for( IndexType i = 0; i < 3; i++ )       // 0th row
-      m_4.setElement( 0, i, value_4++ );
-
-   m_4.setElement( 0, 5, value_4++ );
-
-   for( IndexType i = 1; i < 5; i++ )       // 1st row
-      m_4.setElement( 1, i, value_4++ );
-
-   for( IndexType i = 0; i < 5; i++ )       // 2nd row
-      m_4.setElement( 2, i, value_4++ );
-
-   for( IndexType i = 1; i < 5; i++ )       // 3rd row
-      m_4.setElement( 3, i, value_4++ );
-
-   for( IndexType i = 2; i < 6; i++ )       // 4th row
-      m_4.setElement( 4, i, value_4++ );
-
-   for( IndexType i = 3; i < 7; i++ )       // 5th row
-      m_4.setElement( 5, i, value_4++ );
-
-   for( IndexType i = 0; i < 5; i++ )       // 6th row
-      m_4.setElement( 6, i, value_4++ );
-
-   for( IndexType i = 0; i < 5; i++ )       // 7th row
-      m_4.setElement( 7, i, value_4++ );
-
-   VectorType inVector_4;
-   inVector_4.setSize( m_cols_4 );
-   for( IndexType i = 0; i < inVector_4.getSize(); i++ )
-      inVector_4.setElement( i, 2 );
-
-   VectorType outVector_4;
-   outVector_4.setSize( m_rows_4 );
-   for( IndexType j = 0; j < outVector_4.getSize(); j++ )
-      outVector_4.setElement( j, 0 );
-
-   m_4.vectorProduct( inVector_4, outVector_4 );
-
-   EXPECT_EQ( outVector_4.getElement( 0 ),  20 );
-   EXPECT_EQ( outVector_4.getElement( 1 ),  52 );
-   EXPECT_EQ( outVector_4.getElement( 2 ), 110 );
-   EXPECT_EQ( outVector_4.getElement( 3 ), 124 );
-   EXPECT_EQ( outVector_4.getElement( 4 ), 156 );
-   EXPECT_EQ( outVector_4.getElement( 5 ), 188 );
-   EXPECT_EQ( outVector_4.getElement( 6 ), 280 );
-   EXPECT_EQ( outVector_4.getElement( 7 ), 330 );
-
-   /*
-    * Sets up the following 8x8 sparse matrix:
-    *
-    *    /  1  2  3  0  4  5  0  1 \   6
-    *    |  0  6  0  7  0  0  0  1 |   3
-    *    |  0  8  9  0 10  0  0  1 |   4
-    *    |  0 11 12 13 14  0  0  1 |   5
-    *    |  0 15  0  0  0  0  0  1 |   2
-    *    |  0 16 17 18 19 20 21  1 |   7
-    *    | 22 23 24 25 26 27 28  1 |   8
-    *    \ 29 30 31 32 33 34 35 36 /   8
-    */
-
-   const IndexType m_rows_5 = 8;
-   const IndexType m_cols_5 = 8;
-
-   Matrix m_5( m_rows_5, m_cols_5 );
-   typename Matrix::RowsCapacitiesType rowLengths_5{ 6, 3, 4, 5, 2, 7, 8, 8 };
-   m_5.setRowCapacities( rowLengths_5 );
-
-   RealType value_5 = 1;
-   for( IndexType i = 0; i < 3; i++ )   // 0th row
-      m_5.setElement( 0, i, value_5++ );
-
-   m_5.setElement( 0, 4, value_5++ );           // 0th row
-   m_5.setElement( 0, 5, value_5++ );
-
-   m_5.setElement( 1, 1, value_5++ );           // 1st row
-   m_5.setElement( 1, 3, value_5++ );
-
-   for( IndexType i = 1; i < 3; i++ )            // 2nd row
-      m_5.setElement( 2, i, value_5++ );
-
-   m_5.setElement( 2, 4, value_5++ );           // 2nd row
-
-   for( IndexType i = 1; i < 5; i++ )            // 3rd row
-      m_5.setElement( 3, i, value_5++ );
-
-   m_5.setElement( 4, 1, value_5++ );           // 4th row
-
-   for( IndexType i = 1; i < 7; i++ )            // 5th row
-      m_5.setElement( 5, i, value_5++ );
-
-   for( IndexType i = 0; i < 7; i++ )            // 6th row
-      m_5.setElement( 6, i, value_5++ );
-
-   for( IndexType i = 0; i < 8; i++ )            // 7th row
-      m_5.setElement( 7, i, value_5++ );
-
-   for( IndexType i = 0; i < 7; i++ )            // 1s at the end of rows
-      m_5.setElement( i, 7, 1);
-
-   VectorType inVector_5;
-   inVector_5.setSize( m_cols_5 );
-   for( IndexType i = 0; i < inVector_5.getSize(); i++ )
-       inVector_5.setElement( i, 2 );
-
-   VectorType outVector_5;
-   outVector_5.setSize( m_rows_5 );
-   for( IndexType j = 0; j < outVector_5.getSize(); j++ )
-       outVector_5.setElement( j, 0 );
-
-   m_5.vectorProduct( inVector_5, outVector_5 );
-
-   EXPECT_EQ( outVector_5.getElement( 0 ),  32 );
-   EXPECT_EQ( outVector_5.getElement( 1 ),  28 );
-   EXPECT_EQ( outVector_5.getElement( 2 ),  56 );
-   EXPECT_EQ( outVector_5.getElement( 3 ), 102 );
-   EXPECT_EQ( outVector_5.getElement( 4 ),  32 );
-   EXPECT_EQ( outVector_5.getElement( 5 ), 224 );
-   EXPECT_EQ( outVector_5.getElement( 6 ), 352 );
-   EXPECT_EQ( outVector_5.getElement( 7 ), 520 );
-
-   /////
-   // Large test
-   const IndexType size( 1051 );
-   //for( int size = 1; size < 1000; size++ )
-   {
-      //std::cerr << " size = " << size << std::endl;
-      // Test with large diagonal matrix
-      Matrix m1( size, size );
-      TNL::Containers::Vector< IndexType, DeviceType, IndexType > rowCapacities( size );
-      rowCapacities.forAllElements( [] __cuda_callable__ ( IndexType i, IndexType& value ) { value = 1; } );
-      m1.setRowCapacities( rowCapacities );
-      auto f1 = [=] __cuda_callable__ ( IndexType row, IndexType localIdx, IndexType& column, RealType& value, bool& compute ) {
-         if( localIdx == 0  )
-         {
-            value = row + 1;
-            column = row;
-         }
-      };
-      m1.forAllElements( f1 );
-      // check that the matrix was initialized
-      m1.getCompressedRowLengths( rowCapacities );
-      EXPECT_EQ( rowCapacities, 1 );
-
-      TNL::Containers::Vector< double, DeviceType, IndexType > in( size, 1.0 ), out( size, 0.0 );
-      m1.vectorProduct( in, out );
-      //std::cerr << out << std::endl;
-      for( IndexType i = 0; i < size; i++ )
-         EXPECT_EQ( out.getElement( i ), i + 1 );
-
-      // Test with large triangular matrix
-      const int rows( size ), columns( size );
-      Matrix m2( rows, columns );
-      rowCapacities.setSize( rows );
-      rowCapacities.forAllElements( [=] __cuda_callable__ ( IndexType i, IndexType& value ) { value = i + 1; } );
-      m2.setRowCapacities( rowCapacities );
-      auto f2 = [=] __cuda_callable__ ( IndexType row, IndexType localIdx, IndexType& column, RealType& value, bool& compute ) {
-         if( localIdx <= row )
-         {
-            value = localIdx + 1;
-            column = localIdx;
-         }
-      };
-      m2.forAllElements( f2 );
-      // check that the matrix was initialized
-      TNL::Containers::Vector< IndexType, DeviceType, IndexType > rowLengths( rows );
-      m2.getCompressedRowLengths( rowLengths );
-      EXPECT_EQ( rowLengths, rowCapacities );
-
-      out.setSize( rows );
-      out = 0.0;
-      m2.vectorProduct( in, out );
-      for( IndexType i = 0; i < rows; i++ )
-         EXPECT_EQ( out.getElement( i ), ( i + 1 ) * ( i + 2 ) / 2 );
-   }
-
-   /**
-    * Long row test
-    */
-   using MatrixSegmentsType = typename Matrix::SegmentsType;
-   constexpr TNL::Algorithms::Segments::ElementsOrganization organization = MatrixSegmentsType::getOrganization();
-   using ChunkedEllpackView_ = TNL::Algorithms::Segments::ChunkedEllpackView< DeviceType, IndexType, organization >;
-   if( ! std::is_same< typename Matrix::SegmentsViewType, ChunkedEllpackView_ >::value )
-   {
-      // TODO: Fix ChunkedEllpack for this test - seems that it allocates too much memory
-      const int columns = 3000;
-      const int rows = 1;
-      Matrix m3( rows, columns );
-      TNL::Containers::Vector< IndexType, DeviceType, IndexType > rowsCapacities( rows );
-      rowsCapacities = columns;
-      m3.setRowCapacities( rowsCapacities );
-      auto f = [] __cuda_callable__ ( IndexType row, IndexType localIdx, IndexType& column, RealType& value, bool& compute ) {
-         column = localIdx;
-         value = localIdx + 1;
-      };
-      m3.forAllElements( f );
-      TNL::Containers::Vector< double, DeviceType, IndexType > in( columns, 1.0 ), out( rows, 0.0 );
-      m3.vectorProduct( in, out );
-      EXPECT_EQ( out.getElement( 0 ), ( double ) columns * ( double ) (columns + 1 ) / 2.0 );
-   }
-}
-
 template< typename Matrix >
 void test_ForElements()
 {
@@ -1415,7 +1050,7 @@ void test_ForElements()
    const IndexType rows = 8;
 
    Matrix m( { 3, 3, 3, 3, 3, 3, 3, 3, 3 }, cols  );
-   m.forAllElements( [] __cuda_callable__ ( IndexType rowIdx, IndexType localIdx, IndexType& columnIdx, RealType& value, bool compute ) mutable {
+   m.forAllElements( [] __cuda_callable__ ( IndexType rowIdx, IndexType localIdx, IndexType& columnIdx, RealType& value ) mutable {
       value = rowIdx + 1.0;
       columnIdx = localIdx;
    } );
diff --git a/src/UnitTests/Matrices/SparseMatrixTest_CSRLight.cpp b/src/UnitTests/Matrices/SparseMatrixTest_CSRLight.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..d6a3a41cd3b3996ba93978fc0feea13c5b1d88b9
--- /dev/null
+++ b/src/UnitTests/Matrices/SparseMatrixTest_CSRLight.cpp
@@ -0,0 +1,11 @@
+/***************************************************************************
+                          SparseMatrixTest_CSRLight.cpp -  description
+                             -------------------
+    begin                : Jun 9, 2021
+    copyright            : (C) 2021 by Tomas Oberhuber et al.
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+#include "SparseMatrixTest_CSRLight.h"
diff --git a/src/UnitTests/Matrices/SparseMatrixTest_CSRLight.cu b/src/UnitTests/Matrices/SparseMatrixTest_CSRLight.cu
new file mode 120000
index 0000000000000000000000000000000000000000..e40135b9e94a9d65e7de1ba79d99bdd9380e067c
--- /dev/null
+++ b/src/UnitTests/Matrices/SparseMatrixTest_CSRLight.cu
@@ -0,0 +1 @@
+SparseMatrixTest_CSRLight.cpp
\ No newline at end of file
diff --git a/src/UnitTests/Matrices/SparseMatrixTest_CSRLight.h b/src/UnitTests/Matrices/SparseMatrixTest_CSRLight.h
new file mode 100644
index 0000000000000000000000000000000000000000..ddd956a52c06a178eb93da9e121c487bb58d5791
--- /dev/null
+++ b/src/UnitTests/Matrices/SparseMatrixTest_CSRLight.h
@@ -0,0 +1,46 @@
+/***************************************************************************
+                          SparseMatrixTest_CSRLight.h -  description
+                             -------------------
+    begin                : Jun 9, 2021
+    copyright            : (C) 2021 by Tomas Oberhuber et al.
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+#include <iostream>
+#include <TNL/Algorithms/Segments/CSR.h>
+#include <TNL/Matrices/SparseMatrix.h>
+
+#ifdef HAVE_GTEST
+#include <gtest/gtest.h>
+
+const char* saveAndLoadFileName = "test_SparseMatrixTest_CSRLight_segments";
+
+// types for which MatrixTest is instantiated
+using MatrixTypes = ::testing::Types
+<
+    TNL::Matrices::SparseMatrix< int,     TNL::Devices::Host, int,   TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRLight >,
+    TNL::Matrices::SparseMatrix< long,    TNL::Devices::Host, int,   TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRLight >,
+    TNL::Matrices::SparseMatrix< float,   TNL::Devices::Host, int,   TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRLight >,
+    TNL::Matrices::SparseMatrix< double,  TNL::Devices::Host, int,   TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRLight >,
+    TNL::Matrices::SparseMatrix< int,     TNL::Devices::Host, long,  TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRLight >,
+    TNL::Matrices::SparseMatrix< long,    TNL::Devices::Host, long,  TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRLight >,
+    TNL::Matrices::SparseMatrix< float,   TNL::Devices::Host, long,  TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRLight >,
+    TNL::Matrices::SparseMatrix< double,  TNL::Devices::Host, long,  TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRLight >
+#ifdef HAVE_CUDA
+   ,TNL::Matrices::SparseMatrix< int,     TNL::Devices::Cuda, int,   TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRLight >,
+    TNL::Matrices::SparseMatrix< long,    TNL::Devices::Cuda, int,   TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRLight >,
+    TNL::Matrices::SparseMatrix< float,   TNL::Devices::Cuda, int,   TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRLight >,
+    TNL::Matrices::SparseMatrix< double,  TNL::Devices::Cuda, int,   TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRLight >,
+    TNL::Matrices::SparseMatrix< int,     TNL::Devices::Cuda, long,  TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRLight >,
+    TNL::Matrices::SparseMatrix< long,    TNL::Devices::Cuda, long,  TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRLight >,
+    TNL::Matrices::SparseMatrix< float,   TNL::Devices::Cuda, long,  TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRLight >,
+    TNL::Matrices::SparseMatrix< double,  TNL::Devices::Cuda, long,  TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRLight >
+#endif
+>;
+
+#endif
+
+#include "SparseMatrixTest.h"
+#include "../main.h"
diff --git a/src/UnitTests/Matrices/SparseMatrixTest_Ellpack.h b/src/UnitTests/Matrices/SparseMatrixTest_Ellpack.h
index ef56ec63a46561dd2dffc723c780076ce7d671a6..b13a19c6a4e3eafa51d401091a267aea2cb18cd3 100644
--- a/src/UnitTests/Matrices/SparseMatrixTest_Ellpack.h
+++ b/src/UnitTests/Matrices/SparseMatrixTest_Ellpack.h
@@ -46,7 +46,15 @@ using MatrixTypes = ::testing::Types
     TNL::Matrices::SparseMatrix< int,     TNL::Devices::Cuda, long,  TNL::Matrices::GeneralMatrix, ColumnMajorEllpack >,
     TNL::Matrices::SparseMatrix< long,    TNL::Devices::Cuda, long,  TNL::Matrices::GeneralMatrix, ColumnMajorEllpack >,
     TNL::Matrices::SparseMatrix< float,   TNL::Devices::Cuda, long,  TNL::Matrices::GeneralMatrix, ColumnMajorEllpack >,
-    TNL::Matrices::SparseMatrix< double,  TNL::Devices::Cuda, long,  TNL::Matrices::GeneralMatrix, ColumnMajorEllpack >
+    TNL::Matrices::SparseMatrix< double,  TNL::Devices::Cuda, long,  TNL::Matrices::GeneralMatrix, ColumnMajorEllpack >,
+    TNL::Matrices::SparseMatrix< int,     TNL::Devices::Cuda, int,   TNL::Matrices::GeneralMatrix, RowMajorEllpack >,
+    TNL::Matrices::SparseMatrix< long,    TNL::Devices::Cuda, int,   TNL::Matrices::GeneralMatrix, RowMajorEllpack >,
+    TNL::Matrices::SparseMatrix< float,   TNL::Devices::Cuda, int,   TNL::Matrices::GeneralMatrix, RowMajorEllpack >,
+    TNL::Matrices::SparseMatrix< double,  TNL::Devices::Cuda, int,   TNL::Matrices::GeneralMatrix, RowMajorEllpack >,
+    TNL::Matrices::SparseMatrix< int,     TNL::Devices::Cuda, long,  TNL::Matrices::GeneralMatrix, RowMajorEllpack >,
+    TNL::Matrices::SparseMatrix< long,    TNL::Devices::Cuda, long,  TNL::Matrices::GeneralMatrix, RowMajorEllpack >,
+    TNL::Matrices::SparseMatrix< float,   TNL::Devices::Cuda, long,  TNL::Matrices::GeneralMatrix, RowMajorEllpack >,
+    TNL::Matrices::SparseMatrix< double,  TNL::Devices::Cuda, long,  TNL::Matrices::GeneralMatrix, RowMajorEllpack >
 #endif
 >;
 
diff --git a/src/UnitTests/Matrices/SparseMatrixTest_SandboxMatrix.cpp b/src/UnitTests/Matrices/SparseMatrixTest_SandboxMatrix.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..dc856310e58f2fabf336efbe5c6d950be2998f9f
--- /dev/null
+++ b/src/UnitTests/Matrices/SparseMatrixTest_SandboxMatrix.cpp
@@ -0,0 +1,11 @@
+/***************************************************************************
+                          SparseMatrixTest_SandboxMatrix.cpp -  description
+                             -------------------
+    begin                : Apr 19, 2021
+    copyright            : (C) 2021 by Tomas Oberhuber et al.
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+#include "SparseMatrixTest_SandboxMatrix.h"
diff --git a/src/UnitTests/Matrices/SparseMatrixTest_SandboxMatrix.cu b/src/UnitTests/Matrices/SparseMatrixTest_SandboxMatrix.cu
new file mode 120000
index 0000000000000000000000000000000000000000..27787fdf2474a56b79948644678b201a81259688
--- /dev/null
+++ b/src/UnitTests/Matrices/SparseMatrixTest_SandboxMatrix.cu
@@ -0,0 +1 @@
+SparseMatrixTest_SandboxMatrix.cpp
\ No newline at end of file
diff --git a/src/UnitTests/Matrices/SparseMatrixTest_SandboxMatrix.h b/src/UnitTests/Matrices/SparseMatrixTest_SandboxMatrix.h
new file mode 100644
index 0000000000000000000000000000000000000000..ad1a0c74d6d6f709a63aad6d500a5a103fb04292
--- /dev/null
+++ b/src/UnitTests/Matrices/SparseMatrixTest_SandboxMatrix.h
@@ -0,0 +1,45 @@
+/***************************************************************************
+                          SparseMatrixTest_SandboxMatrix.h -  description
+                             -------------------
+    begin                : Apr 19, 2021
+    copyright            : (C) 2021 by Tomas Oberhuber et al.
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+#include <iostream>
+#include <TNL/Matrices/Sandbox/SparseSandboxMatrix.h>
+
+#ifdef HAVE_GTEST
+#include <gtest/gtest.h>
+
+const char* saveAndLoadFileName = "test_SandboxMatrixTest_SandboxMatrix";
+
+// types for which MatrixTest is instantiated
+using MatrixTypes = ::testing::Types
+<
+    TNL::Matrices::Sandbox::SparseSandboxMatrix< int,     TNL::Devices::Host, int,   TNL::Matrices::GeneralMatrix >,
+    TNL::Matrices::Sandbox::SparseSandboxMatrix< long,    TNL::Devices::Host, int,   TNL::Matrices::GeneralMatrix >,
+    TNL::Matrices::Sandbox::SparseSandboxMatrix< float,   TNL::Devices::Host, int,   TNL::Matrices::GeneralMatrix >,
+    TNL::Matrices::Sandbox::SparseSandboxMatrix< double,  TNL::Devices::Host, int,   TNL::Matrices::GeneralMatrix >,
+    TNL::Matrices::Sandbox::SparseSandboxMatrix< int,     TNL::Devices::Host, long,  TNL::Matrices::GeneralMatrix >,
+    TNL::Matrices::Sandbox::SparseSandboxMatrix< long,    TNL::Devices::Host, long,  TNL::Matrices::GeneralMatrix >,
+    TNL::Matrices::Sandbox::SparseSandboxMatrix< float,   TNL::Devices::Host, long,  TNL::Matrices::GeneralMatrix >,
+    TNL::Matrices::Sandbox::SparseSandboxMatrix< double,  TNL::Devices::Host, long,  TNL::Matrices::GeneralMatrix >
+#ifdef HAVE_CUDA
+   ,TNL::Matrices::Sandbox::SparseSandboxMatrix< int,     TNL::Devices::Cuda, int,   TNL::Matrices::GeneralMatrix >,
+    TNL::Matrices::Sandbox::SparseSandboxMatrix< long,    TNL::Devices::Cuda, int,   TNL::Matrices::GeneralMatrix >,
+    TNL::Matrices::Sandbox::SparseSandboxMatrix< float,   TNL::Devices::Cuda, int,   TNL::Matrices::GeneralMatrix >,
+    TNL::Matrices::Sandbox::SparseSandboxMatrix< double,  TNL::Devices::Cuda, int,   TNL::Matrices::GeneralMatrix >,
+    TNL::Matrices::Sandbox::SparseSandboxMatrix< int,     TNL::Devices::Cuda, long,  TNL::Matrices::GeneralMatrix >,
+    TNL::Matrices::Sandbox::SparseSandboxMatrix< long,    TNL::Devices::Cuda, long,  TNL::Matrices::GeneralMatrix >,
+    TNL::Matrices::Sandbox::SparseSandboxMatrix< float,   TNL::Devices::Cuda, long,  TNL::Matrices::GeneralMatrix >,
+    TNL::Matrices::Sandbox::SparseSandboxMatrix< double,  TNL::Devices::Cuda, long,  TNL::Matrices::GeneralMatrix >
+#endif
+>;
+
+#endif
+
+#include "SparseMatrixTest.h"
+#include "../main.h"
diff --git a/src/UnitTests/Matrices/SparseMatrixVectorProductTest.h b/src/UnitTests/Matrices/SparseMatrixVectorProductTest.h
new file mode 100644
index 0000000000000000000000000000000000000000..3a77cbe1a3dac8c146de27538c8d9c42defbdf62
--- /dev/null
+++ b/src/UnitTests/Matrices/SparseMatrixVectorProductTest.h
@@ -0,0 +1,84 @@
+/***************************************************************************
+                          SparseMatrixVectorProductTest.h -  description
+                             -------------------
+    begin                : Mar 30, 2021
+    copyright            : (C) 2021 by Tomas Oberhuber et al.
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+#pragma once
+
+#include <TNL/Containers/Vector.h>
+#include <TNL/Containers/VectorView.h>
+#include <TNL/Math.h>
+#include <TNL/Algorithms/ParallelFor.h>
+#include <iostream>
+#include <sstream>
+
+#include "SparseMatrixVectorProductTest.hpp"
+
+#ifdef HAVE_GTEST
+#include <gtest/gtest.h>
+
+// test fixture for typed tests
+template< typename Matrix >
+class MatrixTest : public ::testing::Test
+{
+protected:
+   using MatrixType = Matrix;
+};
+
+TYPED_TEST_SUITE( MatrixTest, MatrixTypes);
+
+TYPED_TEST( MatrixTest, vectorProductTest_smallMatrix1 )
+{
+    using MatrixType = typename TestFixture::MatrixType;
+
+    test_VectorProduct_smallMatrix1< MatrixType >();
+}
+
+TYPED_TEST( MatrixTest, vectorProductTest_smallMatrix2 )
+{
+    using MatrixType = typename TestFixture::MatrixType;
+
+    test_VectorProduct_smallMatrix2< MatrixType >();
+}
+
+TYPED_TEST( MatrixTest, vectorProductTest_smallMatrix3 )
+{
+    using MatrixType = typename TestFixture::MatrixType;
+
+    test_VectorProduct_smallMatrix3< MatrixType >();
+}
+
+TYPED_TEST( MatrixTest, vectorProductTest_mediumSizeMatrix1 )
+{
+    using MatrixType = typename TestFixture::MatrixType;
+
+    test_VectorProduct_mediumSizeMatrix1< MatrixType >();
+}
+
+TYPED_TEST( MatrixTest, vectorProductTest_mediumSizeMatrix2 )
+{
+    using MatrixType = typename TestFixture::MatrixType;
+
+    test_VectorProduct_mediumSizeMatrix2< MatrixType >();
+}
+
+TYPED_TEST( MatrixTest, vectorProductTest_largeMatrix )
+{
+    using MatrixType = typename TestFixture::MatrixType;
+
+    test_VectorProduct_largeMatrix< MatrixType >();
+}
+
+TYPED_TEST( MatrixTest, vectorProductTest_longRowsMatrix )
+{
+    using MatrixType = typename TestFixture::MatrixType;
+
+    test_VectorProduct_longRowsMatrix< MatrixType >();
+}
+
+#endif
diff --git a/src/UnitTests/Matrices/SparseMatrixVectorProductTest.hpp b/src/UnitTests/Matrices/SparseMatrixVectorProductTest.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..dadecff560c66576651b38055c05731f029602ac
--- /dev/null
+++ b/src/UnitTests/Matrices/SparseMatrixVectorProductTest.hpp
@@ -0,0 +1,446 @@
+/***************************************************************************
+                          SparseMatrixVectorProductTest.hpp -  description
+                             -------------------
+    begin                : Mar 30, 2021
+    copyright            : (C) 2021 by Tomas Oberhuber et al.
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+#pragma once
+
+#include <functional>
+#include <TNL/Containers/Vector.h>
+#include <TNL/Containers/VectorView.h>
+#include <TNL/Math.h>
+#include <TNL/Algorithms/ParallelFor.h>
+#include <iostream>
+#include <sstream>
+
+// Just for ChunkedEllpack vectorProduct test exception
+#include <TNL/Algorithms/Segments/ChunkedEllpackView.h>
+
+#ifdef HAVE_GTEST
+#include <gtest/gtest.h>
+
+template< typename Matrix >
+void test_VectorProduct_smallMatrix1()
+{
+   using RealType = typename Matrix::RealType;
+   using DeviceType = typename Matrix::DeviceType;
+   using IndexType = typename Matrix::IndexType;
+   using VectorType = TNL::Containers::Vector< RealType, DeviceType, IndexType >;
+
+   /*
+    * Checks y = A * x for x = ( 2, 2, 2, 2 ) and the 4x4 sparse matrix A:
+    *
+    *    /  1  0  0  0 \
+    *    |  0  2  0  3 |
+    *    |  0  4  0  0 |
+    *    \  0  0  5  0 /
+    */
+
+   const IndexType rowCount = 4;
+   const IndexType columnCount = 4;
+
+   Matrix matrix;
+   matrix.reset();
+   matrix.setDimensions( rowCount, columnCount );
+   typename Matrix::RowsCapacitiesType capacities{ 1, 2, 1, 1 };
+   matrix.setRowCapacities( capacities );
+
+   // The non-zero entries take the consecutive values 1, 2, ..., 5.
+   RealType nextValue = 1;
+   matrix.setElement( 0, 0, nextValue++ );   // row 0
+   matrix.setElement( 1, 1, nextValue++ );   // row 1
+   matrix.setElement( 1, 3, nextValue++ );
+   matrix.setElement( 2, 1, nextValue++ );   // row 2
+   matrix.setElement( 3, 2, nextValue++ );   // row 3
+
+   // Input vector: every component equals 2.
+   VectorType x;
+   x.setSize( columnCount );
+   for( IndexType idx = 0; idx < x.getSize(); ++idx )
+      x.setElement( idx, 2 );
+
+   // Output vector: cleared before the product is computed.
+   VectorType y;
+   y.setSize( rowCount );
+   for( IndexType idx = 0; idx < y.getSize(); ++idx )
+      y.setElement( idx, 0 );
+
+   matrix.vectorProduct( x, y );
+   EXPECT_EQ( y.getElement( 0 ),  2 );
+   EXPECT_EQ( y.getElement( 1 ), 10 );
+   EXPECT_EQ( y.getElement( 2 ),  8 );
+   EXPECT_EQ( y.getElement( 3 ), 10 );
+}
+
+template< typename Matrix >
+void test_VectorProduct_smallMatrix2()
+{
+   using RealType = typename Matrix::RealType;
+   using DeviceType = typename Matrix::DeviceType;
+   using IndexType = typename Matrix::IndexType;
+   using VectorType = TNL::Containers::Vector< RealType, DeviceType, IndexType >;
+
+   /*
+    * Checks y = A * x for x = ( 2, 2, 2, 2 ) and the 4x4 sparse matrix A:
+    *
+    *    /  1  2  3  0 \
+    *    |  0  0  0  4 |
+    *    |  5  6  7  0 |
+    *    \  0  8  0  0 /
+    */
+
+   const IndexType rowCount = 4;
+   const IndexType columnCount = 4;
+
+   Matrix matrix( rowCount, columnCount );
+   typename Matrix::RowsCapacitiesType capacities{ 3, 1, 3, 1 };
+   matrix.setRowCapacities( capacities );
+
+   // Non-zeros are filled row by row with the consecutive values 1, 2, ..., 8.
+   RealType nextValue = 1;
+   for( IndexType column = 0; column < 3; ++column )   // row 0: columns 0..2
+      matrix.setElement( 0, column, nextValue++ );
+
+   matrix.setElement( 1, 3, nextValue++ );             // row 1: column 3
+
+   for( IndexType column = 0; column < 3; ++column )   // row 2: columns 0..2
+      matrix.setElement( 2, column, nextValue++ );
+
+   matrix.setElement( 3, 1, nextValue++ );             // row 3: column 1
+
+   // Input vector of twos; the output vector below starts out as zeros.
+   VectorType x;
+   x.setSize( columnCount );
+   for( IndexType idx = 0; idx < x.getSize(); ++idx )
+      x.setElement( idx, 2 );
+
+   VectorType y;
+   y.setSize( rowCount );
+   for( IndexType idx = 0; idx < y.getSize(); ++idx )
+      y.setElement( idx, 0 );
+
+   matrix.vectorProduct( x, y );
+   EXPECT_EQ( y.getElement( 0 ), 12 );
+   EXPECT_EQ( y.getElement( 1 ),  8 );
+   EXPECT_EQ( y.getElement( 2 ), 36 );
+   EXPECT_EQ( y.getElement( 3 ), 16 );
+}
+
+template< typename Matrix >
+void test_VectorProduct_smallMatrix3()
+{
+   using RealType = typename Matrix::RealType;
+   using DeviceType = typename Matrix::DeviceType;
+   using IndexType = typename Matrix::IndexType;
+   using VectorType = TNL::Containers::Vector< RealType, DeviceType, IndexType >;
+
+   /*
+    * Checks y = A * x for x = ( 2, 2, 2, 2 ) and the 4x4 sparse matrix A:
+    *
+    *    /  1  2  3  0 \
+    *    |  0  4  5  6 |
+    *    |  7  8  9  0 |
+    *    \  0 10 11 12 /
+    */
+
+   const IndexType rowCount = 4;
+   const IndexType columnCount = 4;
+
+   Matrix matrix( rowCount, columnCount );
+   typename Matrix::RowsCapacitiesType capacities{ 3, 3, 3, 3 };
+   matrix.setRowCapacities( capacities );
+
+   // Every row holds three consecutive non-zeros; even rows start at column 0,
+   // odd rows at column 1. Values 1, 2, ..., 12 are assigned in row-major order.
+   RealType nextValue = 1;
+   for( IndexType row = 0; row < rowCount; ++row )
+   {
+      const IndexType firstColumn = row % 2;
+      for( IndexType offset = 0; offset < 3; ++offset )
+         matrix.setElement( row, firstColumn + offset, nextValue++ );
+   }
+
+   // Input vector: every component equals 2.
+   VectorType x;
+   x.setSize( columnCount );
+   for( IndexType idx = 0; idx < x.getSize(); ++idx )
+      x.setElement( idx, 2 );
+
+   // Output vector: cleared before the product is computed.
+   VectorType y;
+   y.setSize( rowCount );
+   for( IndexType idx = 0; idx < y.getSize(); ++idx )
+      y.setElement( idx, 0 );
+
+   matrix.vectorProduct( x, y );
+
+   // Expected: twice the sum of the entries of each row.
+   EXPECT_EQ( y.getElement( 0 ), 12 );
+   EXPECT_EQ( y.getElement( 1 ), 30 );
+   EXPECT_EQ( y.getElement( 2 ), 48 );
+   EXPECT_EQ( y.getElement( 3 ), 66 );
+}
+
+template< typename Matrix >
+void test_VectorProduct_mediumSizeMatrix1()
+{
+   using RealType = typename Matrix::RealType;
+   using DeviceType = typename Matrix::DeviceType;
+   using IndexType = typename Matrix::IndexType;
+   using VectorType = TNL::Containers::Vector< RealType, DeviceType, IndexType >;
+
+   /*
+    * Multiplies the following 8x8 sparse matrix by the vector ( 2, ..., 2 )
+    * and expects each output component to be twice the sum of its row:
+    *    /  1  2  3  0  0  4  0  0 \
+    *    |  0  5  6  7  8  0  0  0 |
+    *    |  9 10 11 12 13  0  0  0 |
+    *    |  0 14 15 16 17  0  0  0 |
+    *    |  0  0 18 19 20 21  0  0 |
+    *    |  0  0  0 22 23 24 25  0 |
+    *    | 26 27 28 29 30  0  0  0 |
+    *    \ 31 32 33 34 35  0  0  0 /
+    */
+
+   const IndexType m_rows_4 = 8;
+   const IndexType m_cols_4 = 8;
+
+   Matrix m_4( m_rows_4, m_cols_4 );
+   typename Matrix::RowsCapacitiesType rowLengths_4{ 4, 4, 5, 4, 4, 4, 5, 5 };   // number of non-zeros in each row
+   m_4.setRowCapacities( rowLengths_4 );
+
+   RealType value_4 = 1;                    // next value to store; entries are numbered 1..35 row by row
+   for( IndexType i = 0; i < 3; i++ )       // row 0: columns 0..2
+      m_4.setElement( 0, i, value_4++ );
+
+   m_4.setElement( 0, 5, value_4++ );       // row 0: column 5
+
+   for( IndexType i = 1; i < 5; i++ )       // row 1: columns 1..4
+      m_4.setElement( 1, i, value_4++ );
+
+   for( IndexType i = 0; i < 5; i++ )       // row 2: columns 0..4
+      m_4.setElement( 2, i, value_4++ );
+
+   for( IndexType i = 1; i < 5; i++ )       // row 3: columns 1..4
+      m_4.setElement( 3, i, value_4++ );
+
+   for( IndexType i = 2; i < 6; i++ )       // row 4: columns 2..5
+      m_4.setElement( 4, i, value_4++ );
+
+   for( IndexType i = 3; i < 7; i++ )       // row 5: columns 3..6
+      m_4.setElement( 5, i, value_4++ );
+
+   for( IndexType i = 0; i < 5; i++ )       // row 6: columns 0..4
+      m_4.setElement( 6, i, value_4++ );
+
+   for( IndexType i = 0; i < 5; i++ )       // row 7: columns 0..4
+      m_4.setElement( 7, i, value_4++ );
+
+   VectorType inVector_4;
+   inVector_4.setSize( m_cols_4 );
+   for( IndexType i = 0; i < inVector_4.getSize(); i++ )
+      inVector_4.setElement( i, 2 );   // input vector of twos
+
+   VectorType outVector_4;
+   outVector_4.setSize( m_rows_4 );
+   for( IndexType j = 0; j < outVector_4.getSize(); j++ )
+      outVector_4.setElement( j, 0 );   // output is cleared before the product
+
+   m_4.vectorProduct( inVector_4, outVector_4 );
+
+   EXPECT_EQ( outVector_4.getElement( 0 ),  20 );   // 2 * (  1 + ... +  4 ) = 2 *  10
+   EXPECT_EQ( outVector_4.getElement( 1 ),  52 );   // 2 * (  5 + ... +  8 ) = 2 *  26
+   EXPECT_EQ( outVector_4.getElement( 2 ), 110 );   // 2 * (  9 + ... + 13 ) = 2 *  55
+   EXPECT_EQ( outVector_4.getElement( 3 ), 124 );   // 2 * ( 14 + ... + 17 ) = 2 *  62
+   EXPECT_EQ( outVector_4.getElement( 4 ), 156 );   // 2 * ( 18 + ... + 21 ) = 2 *  78
+   EXPECT_EQ( outVector_4.getElement( 5 ), 188 );   // 2 * ( 22 + ... + 25 ) = 2 *  94
+   EXPECT_EQ( outVector_4.getElement( 6 ), 280 );   // 2 * ( 26 + ... + 30 ) = 2 * 140
+   EXPECT_EQ( outVector_4.getElement( 7 ), 330 );   // 2 * ( 31 + ... + 35 ) = 2 * 165
+}
+
+template< typename Matrix >
+void test_VectorProduct_mediumSizeMatrix2()
+{
+   using RealType = typename Matrix::RealType;
+   using DeviceType = typename Matrix::DeviceType;
+   using IndexType = typename Matrix::IndexType;
+   using VectorType = TNL::Containers::Vector< RealType, DeviceType, IndexType >;
+
+   /*
+    * Multiplies the following 8x8 sparse matrix by the vector ( 2, ..., 2 ).
+    * The number to the right of each row is its count of non-zeros (its capacity):
+    *    /  1  2  3  0  4  5  0  1 \   6
+    *    |  0  6  0  7  0  0  0  1 |   3
+    *    |  0  8  9  0 10  0  0  1 |   4
+    *    |  0 11 12 13 14  0  0  1 |   5
+    *    |  0 15  0  0  0  0  0  1 |   2
+    *    |  0 16 17 18 19 20 21  1 |   7
+    *    | 22 23 24 25 26 27 28  1 |   8
+    *    \ 29 30 31 32 33 34 35 36 /   8
+    */
+
+   const IndexType m_rows_5 = 8;
+   const IndexType m_cols_5 = 8;
+
+   Matrix m_5( m_rows_5, m_cols_5 );
+   typename Matrix::RowsCapacitiesType rowLengths_5{ 6, 3, 4, 5, 2, 7, 8, 8 };
+   m_5.setRowCapacities( rowLengths_5 );
+
+   RealType value_5 = 1;                // next value to store; numbered entries run 1..36 row by row
+   for( IndexType i = 0; i < 3; i++ )   // row 0: columns 0..2
+      m_5.setElement( 0, i, value_5++ );
+
+   m_5.setElement( 0, 4, value_5++ );           // row 0: columns 4 and 5
+   m_5.setElement( 0, 5, value_5++ );
+
+   m_5.setElement( 1, 1, value_5++ );           // row 1: columns 1 and 3
+   m_5.setElement( 1, 3, value_5++ );
+
+   for( IndexType i = 1; i < 3; i++ )            // row 2: columns 1..2
+      m_5.setElement( 2, i, value_5++ );
+
+   m_5.setElement( 2, 4, value_5++ );           // row 2: column 4
+
+   for( IndexType i = 1; i < 5; i++ )            // row 3: columns 1..4
+      m_5.setElement( 3, i, value_5++ );
+
+   m_5.setElement( 4, 1, value_5++ );           // row 4: column 1
+
+   for( IndexType i = 1; i < 7; i++ )            // row 5: columns 1..6
+      m_5.setElement( 5, i, value_5++ );
+
+   for( IndexType i = 0; i < 7; i++ )            // row 6: columns 0..6
+      m_5.setElement( 6, i, value_5++ );
+
+   for( IndexType i = 0; i < 8; i++ )            // row 7: columns 0..7 (the last one receives 36)
+      m_5.setElement( 7, i, value_5++ );
+
+   for( IndexType i = 0; i < 7; i++ )            // rows 0..6 get a 1 in the last column; row 7 keeps its 36
+      m_5.setElement( i, 7, 1);
+
+   VectorType inVector_5;
+   inVector_5.setSize( m_cols_5 );
+   for( IndexType i = 0; i < inVector_5.getSize(); i++ )
+       inVector_5.setElement( i, 2 );   // input vector of twos
+
+   VectorType outVector_5;
+   outVector_5.setSize( m_rows_5 );
+   for( IndexType j = 0; j < outVector_5.getSize(); j++ )
+       outVector_5.setElement( j, 0 );   // output is cleared before the product
+
+   m_5.vectorProduct( inVector_5, outVector_5 );
+
+   EXPECT_EQ( outVector_5.getElement( 0 ),  32 );   // 2 * ( 1 + 2 + 3 + 4 + 5 + 1 )   = 2 *  16
+   EXPECT_EQ( outVector_5.getElement( 1 ),  28 );   // 2 * ( 6 + 7 + 1 )               = 2 *  14
+   EXPECT_EQ( outVector_5.getElement( 2 ),  56 );   // 2 * ( 8 + 9 + 10 + 1 )          = 2 *  28
+   EXPECT_EQ( outVector_5.getElement( 3 ), 102 );   // 2 * ( 11 + ... + 14 + 1 )       = 2 *  51
+   EXPECT_EQ( outVector_5.getElement( 4 ),  32 );   // 2 * ( 15 + 1 )                  = 2 *  16
+   EXPECT_EQ( outVector_5.getElement( 5 ), 224 );   // 2 * ( 16 + ... + 21 + 1 )       = 2 * 112
+   EXPECT_EQ( outVector_5.getElement( 6 ), 352 );   // 2 * ( 22 + ... + 28 + 1 )       = 2 * 176
+   EXPECT_EQ( outVector_5.getElement( 7 ), 520 );   // 2 * ( 29 + ... + 36 )           = 2 * 260
+}
+
+
+template< typename Matrix >
+void test_VectorProduct_largeMatrix()
+{
+   using RealType = typename Matrix::RealType;
+   using DeviceType = typename Matrix::DeviceType;
+   using IndexType = typename Matrix::IndexType;
+   using VectorType = TNL::Containers::Vector< RealType, DeviceType, IndexType >;   // NOTE(review): not used in this function
+
+   // Multiplies two 1051 x 1051 matrices (a diagonal one, then a lower triangular one)
+   // by a vector of ones and checks every component of the result.
+   const IndexType size( 1051 );
+   // Disabled sweep over sizes, kept for debugging: for( int size = 1; size < 1000; size++ )
+   {
+      //std::cerr << " size = " << size << std::endl;
+      // Diagonal matrix: capacity 1 per row; f1 stores ( row, row ) = row + 1 in slot 0
+      Matrix m1( size, size );
+      TNL::Containers::Vector< IndexType, DeviceType, IndexType > rowCapacities( size );
+      rowCapacities.forAllElements( [] __cuda_callable__ ( IndexType i, IndexType& value ) { value = 1; } );
+      m1.setRowCapacities( rowCapacities );
+      auto f1 = [=] __cuda_callable__ ( IndexType row, IndexType localIdx, IndexType& column, RealType& value ) {
+         if( localIdx == 0  )
+         {
+            value = row + 1;
+            column = row;
+         }
+      };
+      m1.forAllElements( f1 );
+      // check that the matrix was initialized; rowCapacities is reused to receive the compressed row lengths
+      m1.getCompressedRowLengths( rowCapacities );
+      EXPECT_EQ( rowCapacities, 1 );
+
+      TNL::Containers::Vector< double, DeviceType, IndexType > in( size, 1.0 ), out( size, 0.0 );
+      m1.vectorProduct( in, out );
+      //std::cerr << out << std::endl;
+      for( IndexType i = 0; i < size; i++ )
+         EXPECT_EQ( out.getElement( i ), i + 1 );   // diagonal entry i + 1 times 1.0
+
+      // Lower triangular matrix: row i has capacity i + 1; f2 stores ( i, k ) = k + 1 for k <= i
+      const int rows( size ), columns( size );   // NOTE(review): plain int although size is IndexType
+      Matrix m2( rows, columns );
+      rowCapacities.setSize( rows );
+      rowCapacities.forAllElements( [=] __cuda_callable__ ( IndexType i, IndexType& value ) { value = i + 1; } );
+      m2.setRowCapacities( rowCapacities );
+      auto f2 = [=] __cuda_callable__ ( IndexType row, IndexType localIdx, IndexType& column, RealType& value ) {
+         if( localIdx <= row )
+         {
+            value = localIdx + 1;
+            column = localIdx;
+         }
+      };
+      m2.forAllElements( f2 );
+      // check that the matrix was initialized: compressed row lengths must equal the requested capacities
+      TNL::Containers::Vector< IndexType, DeviceType, IndexType > rowLengths( rows );
+      m2.getCompressedRowLengths( rowLengths );
+      EXPECT_EQ( rowLengths, rowCapacities );
+
+      out.setSize( rows );
+      out = 0.0;
+      m2.vectorProduct( in, out );
+      for( IndexType i = 0; i < rows; i++ )
+         EXPECT_EQ( out.getElement( i ), ( i + 1 ) * ( i + 2 ) / 2 );   // 1 + 2 + ... + ( i + 1 )
+   }
+}
+
+template< typename Matrix >
+void test_VectorProduct_longRowsMatrix()
+{
+   using RealType = typename Matrix::RealType;
+   using DeviceType = typename Matrix::DeviceType;
+   using IndexType = typename Matrix::IndexType;
+
+   /**
+    * Long row test: for each column count below, a 33-row matrix whose rows
+    * have capacity `columns` is filled with entry ( row, j ) = j + row and
+    * multiplied by a vector of ones. The column counts are the powers of two
+    * from 64 to 2048, each followed by its successor, plus 3000. Component r
+    * of the result must equal columns * ( columns - 1 ) / 2 + columns * r.
+    */
+   for( auto columns : { 64, 65, 128, 129, 256, 257, 512, 513, 1024, 1025, 2048, 2049, 3000 } )
+   {
+      const int rows = 33;
+      Matrix m3( rows, columns );
+      TNL::Containers::Vector< IndexType, DeviceType, IndexType > rowsCapacities( rows );
+      rowsCapacities = columns;   // every row may hold as many entries as there are columns
+      m3.setRowCapacities( rowsCapacities );
+      auto f = [] __cuda_callable__ ( IndexType row, IndexType localIdx, IndexType& column, RealType& value ) {
+         column = localIdx;
+         value = localIdx + row;
+      };
+      m3.forAllElements( f );
+      TNL::Containers::Vector< double, DeviceType, IndexType > in( columns, 1.0 ), out( rows, 0.0 );
+      m3.vectorProduct( in, out );
+      for( IndexType rowIdx = 0; rowIdx < rows; rowIdx++ )
+         EXPECT_EQ( out.getElement( rowIdx ), ( double ) columns * ( double ) (columns - 1 ) / 2.0 + columns * rowIdx );
+   }
+}
+
+#endif
diff --git a/src/UnitTests/Matrices/SparseMatrixVectorProductTest_BiEllpack.cpp b/src/UnitTests/Matrices/SparseMatrixVectorProductTest_BiEllpack.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..319ed66053e11bee70bc0d5c98daec464c3a24a8
--- /dev/null
+++ b/src/UnitTests/Matrices/SparseMatrixVectorProductTest_BiEllpack.cpp
@@ -0,0 +1,11 @@
+/***************************************************************************
+                          SparseMatrixVectorProductTest_BiEllpack.cpp -  description
+                             -------------------
+    begin                : Mar 30, 2021
+    copyright            : (C) 2021 by Tomas Oberhuber et al.
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+#include "SparseMatrixVectorProductTest_BiEllpack.h"
diff --git a/src/UnitTests/Matrices/SparseMatrixVectorProductTest_BiEllpack.cu b/src/UnitTests/Matrices/SparseMatrixVectorProductTest_BiEllpack.cu
new file mode 120000
index 0000000000000000000000000000000000000000..f34bb20d59c6f9af52efe54c20550e1f98df051f
--- /dev/null
+++ b/src/UnitTests/Matrices/SparseMatrixVectorProductTest_BiEllpack.cu
@@ -0,0 +1 @@
+SparseMatrixVectorProductTest_BiEllpack.cpp
\ No newline at end of file
diff --git a/src/UnitTests/Matrices/SparseMatrixVectorProductTest_BiEllpack.h b/src/UnitTests/Matrices/SparseMatrixVectorProductTest_BiEllpack.h
new file mode 100644
index 0000000000000000000000000000000000000000..abdc11ca2fce134d4b03af4be67316ca93442779
--- /dev/null
+++ b/src/UnitTests/Matrices/SparseMatrixVectorProductTest_BiEllpack.h
@@ -0,0 +1,58 @@
+/***************************************************************************
+                          SparseMatrixVectorProductTest_BiEllpack.h -  description
+                             -------------------
+    begin                : Mar 30, 2021
+    copyright            : (C) 2021 by Tomas Oberhuber et al.
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+#include <iostream>
+#include <TNL/Algorithms/Segments/BiEllpack.h>
+#include <TNL/Matrices/SparseMatrix.h>
+
+#ifdef HAVE_GTEST
+#include <gtest/gtest.h>
+
+const char* saveAndLoadFileName = "test_SparseMatrixTest_BiEllpack_segments";   // NOTE(review): no use visible in the vector-product test sources shown here - confirm it is still needed
+
+// BiEllpack segments fixed to the RowMajorOrder element organization.
+// Used by all Host instantiations below except the first one, and by the first Cuda one.
+template< typename Device, typename Index, typename IndexAllocator >
+using RowMajorBiEllpack = TNL::Algorithms::Segments::BiEllpack< Device, Index, IndexAllocator, TNL::Algorithms::Segments::RowMajorOrder >;
+
+// BiEllpack segments fixed to the ColumnMajorOrder element organization.
+// Used by all Cuda instantiations below except the first one, and by the first Host one.
+template< typename Device, typename Index, typename IndexAllocator >
+using ColumnMajorBiEllpack = TNL::Algorithms::Segments::BiEllpack< Device, Index, IndexAllocator, TNL::Algorithms::Segments::ColumnMajorOrder >;
+
+// Types for which MatrixTest is instantiated: BiEllpack-based SparseMatrix, Host always, Cuda only with HAVE_CUDA.
+using MatrixTypes = ::testing::Types
+<   // per device: values { int, long, float, double } x indices { int, long }, plus one entry with the other organization
+    TNL::Matrices::SparseMatrix< double,  TNL::Devices::Host, int,   TNL::Matrices::GeneralMatrix, ColumnMajorBiEllpack >,   // column-major on the host
+    TNL::Matrices::SparseMatrix< int,     TNL::Devices::Host, int,   TNL::Matrices::GeneralMatrix, RowMajorBiEllpack >,
+    TNL::Matrices::SparseMatrix< long,    TNL::Devices::Host, int,   TNL::Matrices::GeneralMatrix, RowMajorBiEllpack >,
+    TNL::Matrices::SparseMatrix< float,   TNL::Devices::Host, int,   TNL::Matrices::GeneralMatrix, RowMajorBiEllpack >,
+    TNL::Matrices::SparseMatrix< double,  TNL::Devices::Host, int,   TNL::Matrices::GeneralMatrix, RowMajorBiEllpack >,
+    TNL::Matrices::SparseMatrix< int,     TNL::Devices::Host, long,  TNL::Matrices::GeneralMatrix, RowMajorBiEllpack >,
+    TNL::Matrices::SparseMatrix< long,    TNL::Devices::Host, long,  TNL::Matrices::GeneralMatrix, RowMajorBiEllpack >,
+    TNL::Matrices::SparseMatrix< float,   TNL::Devices::Host, long,  TNL::Matrices::GeneralMatrix, RowMajorBiEllpack >,
+    TNL::Matrices::SparseMatrix< double,  TNL::Devices::Host, long,  TNL::Matrices::GeneralMatrix, RowMajorBiEllpack >
+#ifdef HAVE_CUDA
+   ,TNL::Matrices::SparseMatrix< double,  TNL::Devices::Cuda, int,   TNL::Matrices::GeneralMatrix, RowMajorBiEllpack >,   // row-major on the GPU
+    TNL::Matrices::SparseMatrix< int,     TNL::Devices::Cuda, int,   TNL::Matrices::GeneralMatrix, ColumnMajorBiEllpack >,
+    TNL::Matrices::SparseMatrix< long,    TNL::Devices::Cuda, int,   TNL::Matrices::GeneralMatrix, ColumnMajorBiEllpack >,
+    TNL::Matrices::SparseMatrix< float,   TNL::Devices::Cuda, int,   TNL::Matrices::GeneralMatrix, ColumnMajorBiEllpack >,
+    TNL::Matrices::SparseMatrix< double,  TNL::Devices::Cuda, int,   TNL::Matrices::GeneralMatrix, ColumnMajorBiEllpack >,
+    TNL::Matrices::SparseMatrix< int,     TNL::Devices::Cuda, long,  TNL::Matrices::GeneralMatrix, ColumnMajorBiEllpack >,
+    TNL::Matrices::SparseMatrix< long,    TNL::Devices::Cuda, long,  TNL::Matrices::GeneralMatrix, ColumnMajorBiEllpack >,
+    TNL::Matrices::SparseMatrix< float,   TNL::Devices::Cuda, long,  TNL::Matrices::GeneralMatrix, ColumnMajorBiEllpack >,
+    TNL::Matrices::SparseMatrix< double,  TNL::Devices::Cuda, long,  TNL::Matrices::GeneralMatrix, ColumnMajorBiEllpack >
+#endif
+>;
+
+#endif
+
+#include "SparseMatrixVectorProductTest.h"
+#include "../main.h"
diff --git a/src/UnitTests/Matrices/SparseMatrixVectorProductTest_CSRAdaptive.cpp b/src/UnitTests/Matrices/SparseMatrixVectorProductTest_CSRAdaptive.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..bafa050d0a0b0e41b7e4d90a717f2cfffd6a947e
--- /dev/null
+++ b/src/UnitTests/Matrices/SparseMatrixVectorProductTest_CSRAdaptive.cpp
@@ -0,0 +1,11 @@
+/***************************************************************************
+                          SparseMatrixVectorProductTest_CSRAdaptive.cpp -  description
+                             -------------------
+    begin                : Mar 30, 2021
+    copyright            : (C) 2021 by Tomas Oberhuber et al.
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+#include "SparseMatrixVectorProductTest_CSRAdaptive.h"
diff --git a/src/UnitTests/Matrices/SparseMatrixVectorProductTest_CSRAdaptive.cu b/src/UnitTests/Matrices/SparseMatrixVectorProductTest_CSRAdaptive.cu
new file mode 120000
index 0000000000000000000000000000000000000000..28919f7457dd6d473c4b887c050069f221dec37c
--- /dev/null
+++ b/src/UnitTests/Matrices/SparseMatrixVectorProductTest_CSRAdaptive.cu
@@ -0,0 +1 @@
+SparseMatrixVectorProductTest_CSRAdaptive.cpp
\ No newline at end of file
diff --git a/src/UnitTests/Matrices/SparseMatrixVectorProductTest_CSRAdaptive.h b/src/UnitTests/Matrices/SparseMatrixVectorProductTest_CSRAdaptive.h
new file mode 100644
index 0000000000000000000000000000000000000000..93a0d79fb55f9789a2a4309704483a7f5d2e569e
--- /dev/null
+++ b/src/UnitTests/Matrices/SparseMatrixVectorProductTest_CSRAdaptive.h
@@ -0,0 +1,46 @@
+/***************************************************************************
+                          SparseMatrixVectorProductTest_CSRAdaptive.h -  description
+                             -------------------
+    begin                : Mar 30, 2021
+    copyright            : (C) 2021 by Tomas Oberhuber et al.
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+#include <iostream>
+#include <TNL/Algorithms/Segments/CSR.h>
+#include <TNL/Matrices/SparseMatrix.h>
+
+#ifdef HAVE_GTEST
+#include <gtest/gtest.h>
+
+const char* saveAndLoadFileName = "test_SparseMatrixTest_CSRAdaptive_segments";   // NOTE(review): no use visible in the vector-product test sources shown here - confirm it is still needed
+
+// Types for which MatrixTest is instantiated: SparseMatrix over CSRAdaptive segments, Host always, Cuda only with HAVE_CUDA.
+using MatrixTypes = ::testing::Types
+<   // per device: value types { int, long, float, double } x index types { int, long }
+    TNL::Matrices::SparseMatrix< int,     TNL::Devices::Host, int,   TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRAdaptive >,
+    TNL::Matrices::SparseMatrix< long,    TNL::Devices::Host, int,   TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRAdaptive >,
+    TNL::Matrices::SparseMatrix< float,   TNL::Devices::Host, int,   TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRAdaptive >,
+    TNL::Matrices::SparseMatrix< double,  TNL::Devices::Host, int,   TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRAdaptive >,
+    TNL::Matrices::SparseMatrix< int,     TNL::Devices::Host, long,  TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRAdaptive >,
+    TNL::Matrices::SparseMatrix< long,    TNL::Devices::Host, long,  TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRAdaptive >,
+    TNL::Matrices::SparseMatrix< float,   TNL::Devices::Host, long,  TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRAdaptive >,
+    TNL::Matrices::SparseMatrix< double,  TNL::Devices::Host, long,  TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRAdaptive >
+#ifdef HAVE_CUDA
+   ,TNL::Matrices::SparseMatrix< int,     TNL::Devices::Cuda, int,   TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRAdaptive >,
+    TNL::Matrices::SparseMatrix< long,    TNL::Devices::Cuda, int,   TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRAdaptive >,
+    TNL::Matrices::SparseMatrix< float,   TNL::Devices::Cuda, int,   TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRAdaptive >,
+    TNL::Matrices::SparseMatrix< double,  TNL::Devices::Cuda, int,   TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRAdaptive >,
+    TNL::Matrices::SparseMatrix< int,     TNL::Devices::Cuda, long,  TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRAdaptive >,
+    TNL::Matrices::SparseMatrix< long,    TNL::Devices::Cuda, long,  TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRAdaptive >,
+    TNL::Matrices::SparseMatrix< float,   TNL::Devices::Cuda, long,  TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRAdaptive >,
+    TNL::Matrices::SparseMatrix< double,  TNL::Devices::Cuda, long,  TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRAdaptive >
+#endif
+>;
+
+#endif
+
+#include "SparseMatrixVectorProductTest.h"
+#include "../main.h"
diff --git a/src/UnitTests/Matrices/SparseMatrixVectorProductTest_CSRHybrid.cpp b/src/UnitTests/Matrices/SparseMatrixVectorProductTest_CSRHybrid.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..a6795b4e1c59e5a022f5a9876d8485fc27ee9627
--- /dev/null
+++ b/src/UnitTests/Matrices/SparseMatrixVectorProductTest_CSRHybrid.cpp
@@ -0,0 +1,11 @@
+/***************************************************************************
+                          SparseMatrixVectorProductTest_CSRHybrid.cpp -  description
+                             -------------------
+    begin                : Mar 30, 2021
+    copyright            : (C) 2021 by Tomas Oberhuber et al.
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+#include "SparseMatrixVectorProductTest_CSRHybrid.h"
diff --git a/src/UnitTests/Matrices/SparseMatrixVectorProductTest_CSRHybrid.cu b/src/UnitTests/Matrices/SparseMatrixVectorProductTest_CSRHybrid.cu
new file mode 120000
index 0000000000000000000000000000000000000000..4c81adef3778e1dacc5ce48ead8f5d76d5fbeba9
--- /dev/null
+++ b/src/UnitTests/Matrices/SparseMatrixVectorProductTest_CSRHybrid.cu
@@ -0,0 +1 @@
+SparseMatrixVectorProductTest_CSRHybrid.cpp
\ No newline at end of file
diff --git a/src/UnitTests/Matrices/SparseMatrixVectorProductTest_CSRHybrid.h b/src/UnitTests/Matrices/SparseMatrixVectorProductTest_CSRHybrid.h
new file mode 100644
index 0000000000000000000000000000000000000000..99b5e440301188adb5d816d13c3d941132e67b5d
--- /dev/null
+++ b/src/UnitTests/Matrices/SparseMatrixVectorProductTest_CSRHybrid.h
@@ -0,0 +1,46 @@
+/***************************************************************************
+                          SparseMatrixVectorProductTest_CSRHybrid.h -  description
+                             -------------------
+    begin                : Mar 30, 2021
+    copyright            : (C) 2021 by Tomas Oberhuber et al.
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+#include <iostream>
+#include <TNL/Algorithms/Segments/CSR.h>
+#include <TNL/Matrices/SparseMatrix.h>
+
+#ifdef HAVE_GTEST
+#include <gtest/gtest.h>
+
+const char* saveAndLoadFileName = "test_SparseMatrixTest_CSRHybrid_segments";   // NOTE(review): no use visible in the vector-product test sources shown here - confirm it is still needed
+
+// Types for which MatrixTest is instantiated: SparseMatrix over CSRHybrid segments, Host always, Cuda only with HAVE_CUDA.
+using MatrixTypes = ::testing::Types
+<   // per device: value types { int, long, float, double } x index types { int, long }
+    TNL::Matrices::SparseMatrix< int,     TNL::Devices::Host, int,   TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRHybrid >,
+    TNL::Matrices::SparseMatrix< long,    TNL::Devices::Host, int,   TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRHybrid >,
+    TNL::Matrices::SparseMatrix< float,   TNL::Devices::Host, int,   TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRHybrid >,
+    TNL::Matrices::SparseMatrix< double,  TNL::Devices::Host, int,   TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRHybrid >,
+    TNL::Matrices::SparseMatrix< int,     TNL::Devices::Host, long,  TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRHybrid >,
+    TNL::Matrices::SparseMatrix< long,    TNL::Devices::Host, long,  TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRHybrid >,
+    TNL::Matrices::SparseMatrix< float,   TNL::Devices::Host, long,  TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRHybrid >,
+    TNL::Matrices::SparseMatrix< double,  TNL::Devices::Host, long,  TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRHybrid >
+#ifdef HAVE_CUDA
+   ,TNL::Matrices::SparseMatrix< int,     TNL::Devices::Cuda, int,   TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRHybrid >,
+    TNL::Matrices::SparseMatrix< long,    TNL::Devices::Cuda, int,   TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRHybrid >,
+    TNL::Matrices::SparseMatrix< float,   TNL::Devices::Cuda, int,   TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRHybrid >,
+    TNL::Matrices::SparseMatrix< double,  TNL::Devices::Cuda, int,   TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRHybrid >,
+    TNL::Matrices::SparseMatrix< int,     TNL::Devices::Cuda, long,  TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRHybrid >,
+    TNL::Matrices::SparseMatrix< long,    TNL::Devices::Cuda, long,  TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRHybrid >,
+    TNL::Matrices::SparseMatrix< float,   TNL::Devices::Cuda, long,  TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRHybrid >,
+    TNL::Matrices::SparseMatrix< double,  TNL::Devices::Cuda, long,  TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRHybrid >
+#endif
+>;
+
+#endif
+
+#include "SparseMatrixVectorProductTest.h"
+#include "../main.h"
diff --git a/src/UnitTests/Matrices/SparseMatrixVectorProductTest_CSRLight.cpp b/src/UnitTests/Matrices/SparseMatrixVectorProductTest_CSRLight.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..274fa20b5f1ba298ab28696cd2d8eac7d0735198
--- /dev/null
+++ b/src/UnitTests/Matrices/SparseMatrixVectorProductTest_CSRLight.cpp
@@ -0,0 +1,11 @@
+/***************************************************************************
+                          SparseMatrixVectorProductTest_CSRLight.cpp -  description
+                             -------------------
+    begin                : Jun 9, 2021
+    copyright            : (C) 2021 by Tomas Oberhuber et al.
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+// Pull in the CSRLight type list; the header in turn includes the shared vector-product tests.
+#include "SparseMatrixVectorProductTest_CSRLight.h"
diff --git a/src/UnitTests/Matrices/SparseMatrixVectorProductTest_CSRLight.cu b/src/UnitTests/Matrices/SparseMatrixVectorProductTest_CSRLight.cu
new file mode 120000
index 0000000000000000000000000000000000000000..68e56b2ee0f5f94cf7592966acdf21b01decfb21
--- /dev/null
+++ b/src/UnitTests/Matrices/SparseMatrixVectorProductTest_CSRLight.cu
@@ -0,0 +1 @@
+SparseMatrixVectorProductTest_CSRLight.cpp
\ No newline at end of file
diff --git a/src/UnitTests/Matrices/SparseMatrixVectorProductTest_CSRLight.h b/src/UnitTests/Matrices/SparseMatrixVectorProductTest_CSRLight.h
new file mode 100644
index 0000000000000000000000000000000000000000..eef049eacf78a21dac2f66e084e1d659b1b7fa47
--- /dev/null
+++ b/src/UnitTests/Matrices/SparseMatrixVectorProductTest_CSRLight.h
@@ -0,0 +1,46 @@
+/***************************************************************************
+                          SparseMatrixVectorProductTest_CSRLight.h -  description
+                             -------------------
+    begin                : Jun 9, 2021
+    copyright            : (C) 2021 by Tomas Oberhuber et al.
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+#include <iostream>
+#include <TNL/Algorithms/Segments/CSR.h>
+#include <TNL/Matrices/SparseMatrix.h>
+
+#ifdef HAVE_GTEST
+#include <gtest/gtest.h>
+
+const char* saveAndLoadFileName = "test_SparseMatrixTest_CSRLight_segments"; // NOTE(review): unused in this header; presumably read by SparseMatrixVectorProductTest.h - confirm
+
+// Matrix types for which the tests are instantiated: Host variants always, Cuda variants only when HAVE_CUDA is defined
+using MatrixTypes = ::testing::Types
+<
+     TNL::Matrices::SparseMatrix< int,     TNL::Devices::Host, int,   TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRLight >
+    ,TNL::Matrices::SparseMatrix< long,    TNL::Devices::Host, int,   TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRLight >
+    ,TNL::Matrices::SparseMatrix< float,   TNL::Devices::Host, int,   TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRLight >
+    ,TNL::Matrices::SparseMatrix< double,  TNL::Devices::Host, int,   TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRLight >
+    ,TNL::Matrices::SparseMatrix< int,     TNL::Devices::Host, long,  TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRLight >
+    ,TNL::Matrices::SparseMatrix< long,    TNL::Devices::Host, long,  TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRLight >
+    ,TNL::Matrices::SparseMatrix< float,   TNL::Devices::Host, long,  TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRLight >
+    ,TNL::Matrices::SparseMatrix< double,  TNL::Devices::Host, long,  TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRLight >
+#ifdef HAVE_CUDA
+    ,TNL::Matrices::SparseMatrix< int,     TNL::Devices::Cuda, int,   TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRLight >
+    ,TNL::Matrices::SparseMatrix< long,    TNL::Devices::Cuda, int,   TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRLight >
+    ,TNL::Matrices::SparseMatrix< float,   TNL::Devices::Cuda, int,   TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRLight >
+    ,TNL::Matrices::SparseMatrix< double,  TNL::Devices::Cuda, int,   TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRLight >
+    ,TNL::Matrices::SparseMatrix< int,     TNL::Devices::Cuda, long,  TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRLight >
+    ,TNL::Matrices::SparseMatrix< long,    TNL::Devices::Cuda, long,  TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRLight >
+    ,TNL::Matrices::SparseMatrix< float,   TNL::Devices::Cuda, long,  TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRLight >
+    ,TNL::Matrices::SparseMatrix< double,  TNL::Devices::Cuda, long,  TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRLight >
+#endif // HAVE_CUDA
+>;
+
+#endif
+
+#include "SparseMatrixVectorProductTest.h"
+#include "../main.h"
diff --git a/src/UnitTests/Matrices/SparseMatrixVectorProductTest_CSRScalar.cpp b/src/UnitTests/Matrices/SparseMatrixVectorProductTest_CSRScalar.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..bfa16c02b91c7cddb03a63763e994ade31bcb0f8
--- /dev/null
+++ b/src/UnitTests/Matrices/SparseMatrixVectorProductTest_CSRScalar.cpp
@@ -0,0 +1,11 @@
+/***************************************************************************
+                          SparseMatrixVectorProductTest_CSRScalar.cpp -  description
+                             -------------------
+    begin                : Mar 30, 2021
+    copyright            : (C) 2021 by Tomas Oberhuber et al.
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+#include "SparseMatrixVectorProductTest_CSRScalar.h"
diff --git a/src/UnitTests/Matrices/SparseMatrixVectorProductTest_CSRScalar.cu b/src/UnitTests/Matrices/SparseMatrixVectorProductTest_CSRScalar.cu
new file mode 120000
index 0000000000000000000000000000000000000000..024a31f15c0372c1d968c8182eb93439305556d7
--- /dev/null
+++ b/src/UnitTests/Matrices/SparseMatrixVectorProductTest_CSRScalar.cu
@@ -0,0 +1 @@
+SparseMatrixVectorProductTest_CSRScalar.cpp
\ No newline at end of file
diff --git a/src/UnitTests/Matrices/SparseMatrixVectorProductTest_CSRScalar.h b/src/UnitTests/Matrices/SparseMatrixVectorProductTest_CSRScalar.h
new file mode 100644
index 0000000000000000000000000000000000000000..b9586f66e74f77dfa3f929829604e17d304910ca
--- /dev/null
+++ b/src/UnitTests/Matrices/SparseMatrixVectorProductTest_CSRScalar.h
@@ -0,0 +1,46 @@
+/***************************************************************************
+                          SparseMatrixVectorProductTest_CSRScalar.h -  description
+                             -------------------
+    begin                : Mar 30, 2021
+    copyright            : (C) 2021 by Tomas Oberhuber et al.
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+#include <iostream>
+#include <TNL/Algorithms/Segments/CSR.h>
+#include <TNL/Matrices/SparseMatrix.h>
+
+#ifdef HAVE_GTEST
+#include <gtest/gtest.h>
+
+const char* saveAndLoadFileName = "test_SparseMatrixTest_CSRScalar_segments"; // NOTE(review): unused in this header; presumably read by SparseMatrixVectorProductTest.h - confirm
+
+// Matrix types for which the tests are instantiated: Host variants always, Cuda variants only when HAVE_CUDA is defined
+using MatrixTypes = ::testing::Types
+<
+     TNL::Matrices::SparseMatrix< int,     TNL::Devices::Host, int,   TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRScalar >
+    ,TNL::Matrices::SparseMatrix< long,    TNL::Devices::Host, int,   TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRScalar >
+    ,TNL::Matrices::SparseMatrix< float,   TNL::Devices::Host, int,   TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRScalar >
+    ,TNL::Matrices::SparseMatrix< double,  TNL::Devices::Host, int,   TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRScalar >
+    ,TNL::Matrices::SparseMatrix< int,     TNL::Devices::Host, long,  TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRScalar >
+    ,TNL::Matrices::SparseMatrix< long,    TNL::Devices::Host, long,  TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRScalar >
+    ,TNL::Matrices::SparseMatrix< float,   TNL::Devices::Host, long,  TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRScalar >
+    ,TNL::Matrices::SparseMatrix< double,  TNL::Devices::Host, long,  TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRScalar >
+#ifdef HAVE_CUDA
+    ,TNL::Matrices::SparseMatrix< int,     TNL::Devices::Cuda, int,   TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRScalar >
+    ,TNL::Matrices::SparseMatrix< long,    TNL::Devices::Cuda, int,   TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRScalar >
+    ,TNL::Matrices::SparseMatrix< float,   TNL::Devices::Cuda, int,   TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRScalar >
+    ,TNL::Matrices::SparseMatrix< double,  TNL::Devices::Cuda, int,   TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRScalar >
+    ,TNL::Matrices::SparseMatrix< int,     TNL::Devices::Cuda, long,  TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRScalar >
+    ,TNL::Matrices::SparseMatrix< long,    TNL::Devices::Cuda, long,  TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRScalar >
+    ,TNL::Matrices::SparseMatrix< float,   TNL::Devices::Cuda, long,  TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRScalar >
+    ,TNL::Matrices::SparseMatrix< double,  TNL::Devices::Cuda, long,  TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRScalar >
+#endif // HAVE_CUDA
+>;
+
+#endif
+
+#include "SparseMatrixVectorProductTest.h"
+#include "../main.h"
diff --git a/src/UnitTests/Matrices/SparseMatrixVectorProductTest_CSRVector.cpp b/src/UnitTests/Matrices/SparseMatrixVectorProductTest_CSRVector.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..68075da024e79ccc6cd47911e5de0232a8cc04e0
--- /dev/null
+++ b/src/UnitTests/Matrices/SparseMatrixVectorProductTest_CSRVector.cpp
@@ -0,0 +1,11 @@
+/***************************************************************************
+                          SparseMatrixVectorProductTest_CSRVector.cpp -  description
+                             -------------------
+    begin                : Mar 30, 2021
+    copyright            : (C) 2021 by Tomas Oberhuber et al.
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+#include "SparseMatrixVectorProductTest_CSRVector.h"
diff --git a/src/UnitTests/Matrices/SparseMatrixVectorProductTest_CSRVector.cu b/src/UnitTests/Matrices/SparseMatrixVectorProductTest_CSRVector.cu
new file mode 120000
index 0000000000000000000000000000000000000000..91409a4b4800e3f759aba3d19994ed2344fe99e4
--- /dev/null
+++ b/src/UnitTests/Matrices/SparseMatrixVectorProductTest_CSRVector.cu
@@ -0,0 +1 @@
+SparseMatrixVectorProductTest_CSRVector.cpp
\ No newline at end of file
diff --git a/src/UnitTests/Matrices/SparseMatrixVectorProductTest_CSRVector.h b/src/UnitTests/Matrices/SparseMatrixVectorProductTest_CSRVector.h
new file mode 100644
index 0000000000000000000000000000000000000000..0afe07e82ad9afff7246937220f7dcc28bca983f
--- /dev/null
+++ b/src/UnitTests/Matrices/SparseMatrixVectorProductTest_CSRVector.h
@@ -0,0 +1,46 @@
+/***************************************************************************
+                          SparseMatrixVectorProductTest_CSRVector.h -  description
+                             -------------------
+    begin                : Mar 30, 2021
+    copyright            : (C) 2021 by Tomas Oberhuber et al.
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+#include <iostream>
+#include <TNL/Algorithms/Segments/CSR.h>
+#include <TNL/Matrices/SparseMatrix.h>
+
+#ifdef HAVE_GTEST
+#include <gtest/gtest.h>
+
+const char* saveAndLoadFileName = "test_SparseMatrixTest_CSRVector_segments"; // NOTE(review): unused in this header; presumably read by SparseMatrixVectorProductTest.h - confirm
+
+// Matrix types for which the tests are instantiated: Host variants always, Cuda variants only when HAVE_CUDA is defined
+using MatrixTypes = ::testing::Types
+<
+     TNL::Matrices::SparseMatrix< int,     TNL::Devices::Host, int,   TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRVector >
+    ,TNL::Matrices::SparseMatrix< long,    TNL::Devices::Host, int,   TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRVector >
+    ,TNL::Matrices::SparseMatrix< float,   TNL::Devices::Host, int,   TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRVector >
+    ,TNL::Matrices::SparseMatrix< double,  TNL::Devices::Host, int,   TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRVector >
+    ,TNL::Matrices::SparseMatrix< int,     TNL::Devices::Host, long,  TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRVector >
+    ,TNL::Matrices::SparseMatrix< long,    TNL::Devices::Host, long,  TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRVector >
+    ,TNL::Matrices::SparseMatrix< float,   TNL::Devices::Host, long,  TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRVector >
+    ,TNL::Matrices::SparseMatrix< double,  TNL::Devices::Host, long,  TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRVector >
+#ifdef HAVE_CUDA
+    ,TNL::Matrices::SparseMatrix< int,     TNL::Devices::Cuda, int,   TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRVector >
+    ,TNL::Matrices::SparseMatrix< long,    TNL::Devices::Cuda, int,   TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRVector >
+    ,TNL::Matrices::SparseMatrix< float,   TNL::Devices::Cuda, int,   TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRVector >
+    ,TNL::Matrices::SparseMatrix< double,  TNL::Devices::Cuda, int,   TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRVector >
+    ,TNL::Matrices::SparseMatrix< int,     TNL::Devices::Cuda, long,  TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRVector >
+    ,TNL::Matrices::SparseMatrix< long,    TNL::Devices::Cuda, long,  TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRVector >
+    ,TNL::Matrices::SparseMatrix< float,   TNL::Devices::Cuda, long,  TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRVector >
+    ,TNL::Matrices::SparseMatrix< double,  TNL::Devices::Cuda, long,  TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRVector >
+#endif // HAVE_CUDA
+>;
+
+#endif
+
+#include "SparseMatrixVectorProductTest.h"
+#include "../main.h"
diff --git a/src/UnitTests/Matrices/SparseMatrixVectorProductTest_ChunkedEllpack.cpp b/src/UnitTests/Matrices/SparseMatrixVectorProductTest_ChunkedEllpack.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..1586d819192e5764b173d7d22566258f20eaf6b9
--- /dev/null
+++ b/src/UnitTests/Matrices/SparseMatrixVectorProductTest_ChunkedEllpack.cpp
@@ -0,0 +1,11 @@
+/***************************************************************************
+                          SparseMatrixVectorProductTest_ChunkedEllpack.cpp -  description
+                             -------------------
+    begin                : Mar 30, 2021
+    copyright            : (C) 2021 by Tomas Oberhuber et al.
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+#include "SparseMatrixVectorProductTest_ChunkedEllpack.h"
diff --git a/src/UnitTests/Matrices/SparseMatrixVectorProductTest_ChunkedEllpack.cu b/src/UnitTests/Matrices/SparseMatrixVectorProductTest_ChunkedEllpack.cu
new file mode 120000
index 0000000000000000000000000000000000000000..dea4491d64f98fd38c82e6cfa3b1b7b4d488ea05
--- /dev/null
+++ b/src/UnitTests/Matrices/SparseMatrixVectorProductTest_ChunkedEllpack.cu
@@ -0,0 +1 @@
+SparseMatrixVectorProductTest_ChunkedEllpack.cpp
\ No newline at end of file
diff --git a/src/UnitTests/Matrices/SparseMatrixVectorProductTest_ChunkedEllpack.h b/src/UnitTests/Matrices/SparseMatrixVectorProductTest_ChunkedEllpack.h
new file mode 100644
index 0000000000000000000000000000000000000000..d2cb049f6a7611ebeda132a1810f126f3686add1
--- /dev/null
+++ b/src/UnitTests/Matrices/SparseMatrixVectorProductTest_ChunkedEllpack.h
@@ -0,0 +1,57 @@
+/***************************************************************************
+                          SparseMatrixVectorProductTest_ChunkedEllpack.h -  description
+                             -------------------
+    begin                : Mar 30, 2021
+    copyright            : (C) 2021 by Tomas Oberhuber et al.
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+#include <iostream>
+#include <TNL/Algorithms/Segments/ChunkedEllpack.h>
+#include <TNL/Matrices/SparseMatrix.h>
+
+#ifdef HAVE_GTEST
+#include <gtest/gtest.h>
+
+const char* saveAndLoadFileName = "test_SparseMatrixTest_ChunkedEllpack_segments"; // NOTE(review): unused in this header; presumably read by SparseMatrixVectorProductTest.h - confirm
+
+////
+// ChunkedEllpack segments fixed to RowMajorOrder; used by the Host matrix types listed below
+template< typename DeviceType, typename IndexType, typename Allocator >
+using RowMajorChunkedEllpack = TNL::Algorithms::Segments::ChunkedEllpack< DeviceType, IndexType, Allocator, TNL::Algorithms::Segments::RowMajorOrder >;
+
+////
+// ChunkedEllpack segments fixed to ColumnMajorOrder; used by the Cuda matrix types and by one Host type listed below
+template< typename DeviceType, typename IndexType, typename Allocator >
+using ColumnMajorChunkedEllpack = TNL::Algorithms::Segments::ChunkedEllpack< DeviceType, IndexType, Allocator, TNL::Algorithms::Segments::ColumnMajorOrder >;
+
+// Matrix types for which the tests are instantiated: Host variants always, Cuda variants only when HAVE_CUDA is defined
+using MatrixTypes = ::testing::Types
+<
+     TNL::Matrices::SparseMatrix< int,     TNL::Devices::Host, int,   TNL::Matrices::GeneralMatrix, ColumnMajorChunkedEllpack > // the only column-major entry on Host
+    ,TNL::Matrices::SparseMatrix< int,     TNL::Devices::Host, int,   TNL::Matrices::GeneralMatrix, RowMajorChunkedEllpack >
+    ,TNL::Matrices::SparseMatrix< long,    TNL::Devices::Host, int,   TNL::Matrices::GeneralMatrix, RowMajorChunkedEllpack >
+    ,TNL::Matrices::SparseMatrix< float,   TNL::Devices::Host, int,   TNL::Matrices::GeneralMatrix, RowMajorChunkedEllpack >
+    ,TNL::Matrices::SparseMatrix< double,  TNL::Devices::Host, int,   TNL::Matrices::GeneralMatrix, RowMajorChunkedEllpack >
+    ,TNL::Matrices::SparseMatrix< int,     TNL::Devices::Host, long,  TNL::Matrices::GeneralMatrix, RowMajorChunkedEllpack >
+    ,TNL::Matrices::SparseMatrix< long,    TNL::Devices::Host, long,  TNL::Matrices::GeneralMatrix, RowMajorChunkedEllpack >
+    ,TNL::Matrices::SparseMatrix< float,   TNL::Devices::Host, long,  TNL::Matrices::GeneralMatrix, RowMajorChunkedEllpack >
+    ,TNL::Matrices::SparseMatrix< double,  TNL::Devices::Host, long,  TNL::Matrices::GeneralMatrix, RowMajorChunkedEllpack >
+#ifdef HAVE_CUDA
+    ,TNL::Matrices::SparseMatrix< int,     TNL::Devices::Cuda, int,   TNL::Matrices::GeneralMatrix, ColumnMajorChunkedEllpack >
+    ,TNL::Matrices::SparseMatrix< long,    TNL::Devices::Cuda, int,   TNL::Matrices::GeneralMatrix, ColumnMajorChunkedEllpack >
+    ,TNL::Matrices::SparseMatrix< float,   TNL::Devices::Cuda, int,   TNL::Matrices::GeneralMatrix, ColumnMajorChunkedEllpack >
+    ,TNL::Matrices::SparseMatrix< double,  TNL::Devices::Cuda, int,   TNL::Matrices::GeneralMatrix, ColumnMajorChunkedEllpack >
+    ,TNL::Matrices::SparseMatrix< int,     TNL::Devices::Cuda, long,  TNL::Matrices::GeneralMatrix, ColumnMajorChunkedEllpack >
+    ,TNL::Matrices::SparseMatrix< long,    TNL::Devices::Cuda, long,  TNL::Matrices::GeneralMatrix, ColumnMajorChunkedEllpack >
+    ,TNL::Matrices::SparseMatrix< float,   TNL::Devices::Cuda, long,  TNL::Matrices::GeneralMatrix, ColumnMajorChunkedEllpack >
+    ,TNL::Matrices::SparseMatrix< double,  TNL::Devices::Cuda, long,  TNL::Matrices::GeneralMatrix, ColumnMajorChunkedEllpack >
+#endif // HAVE_CUDA
+>;
+
+#endif
+
+#include "SparseMatrixVectorProductTest.h"
+#include "../main.h"
diff --git a/src/UnitTests/Matrices/SparseMatrixVectorProductTest_Ellpack.cpp b/src/UnitTests/Matrices/SparseMatrixVectorProductTest_Ellpack.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..9e2446c38652670607fbf4192434c0b8ace98619
--- /dev/null
+++ b/src/UnitTests/Matrices/SparseMatrixVectorProductTest_Ellpack.cpp
@@ -0,0 +1,11 @@
+/***************************************************************************
+                          SparseMatrixVectorProductTest_Ellpack.cpp -  description
+                             -------------------
+    begin                : Mar 30, 2021
+    copyright            : (C) 2021 by Tomas Oberhuber et al.
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+#include "SparseMatrixVectorProductTest_Ellpack.h"
diff --git a/src/UnitTests/Matrices/SparseMatrixVectorProductTest_Ellpack.cu b/src/UnitTests/Matrices/SparseMatrixVectorProductTest_Ellpack.cu
new file mode 120000
index 0000000000000000000000000000000000000000..d30bd03b85d9fcd0761ae96054d8f85adc1f9b44
--- /dev/null
+++ b/src/UnitTests/Matrices/SparseMatrixVectorProductTest_Ellpack.cu
@@ -0,0 +1 @@
+SparseMatrixVectorProductTest_Ellpack.cpp
\ No newline at end of file
diff --git a/src/UnitTests/Matrices/SparseMatrixVectorProductTest_Ellpack.h b/src/UnitTests/Matrices/SparseMatrixVectorProductTest_Ellpack.h
new file mode 100644
index 0000000000000000000000000000000000000000..c93aace755ec0e960ac080a9fb0a7c3323adc819
--- /dev/null
+++ b/src/UnitTests/Matrices/SparseMatrixVectorProductTest_Ellpack.h
@@ -0,0 +1,64 @@
+/***************************************************************************
+                          SparseMatrixVectorProductTest_Ellpack.h -  description
+                             -------------------
+    begin                : Mar 30, 2021
+    copyright            : (C) 2021 by Tomas Oberhuber et al.
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+#include <iostream>
+#include <TNL/Algorithms/Segments/Ellpack.h>
+#include <TNL/Matrices/SparseMatrix.h>
+
+#ifdef HAVE_GTEST
+#include <gtest/gtest.h>
+
+const char* saveAndLoadFileName = "test_SparseMatrixTest_Ellpack_segments"; // NOTE(review): unused in this header; presumably read by SparseMatrixVectorProductTest.h - confirm
+
+////
+// Ellpack segments fixed to RowMajorOrder (last template argument 32); used by the Host types and also by Cuda types below
+template< typename DeviceType, typename IndexType, typename Allocator >
+using RowMajorEllpack = TNL::Algorithms::Segments::Ellpack< DeviceType, IndexType, Allocator, TNL::Algorithms::Segments::RowMajorOrder, 32 >;
+
+////
+// Ellpack segments fixed to ColumnMajorOrder (last template argument 32); used only by Cuda types below
+template< typename DeviceType, typename IndexType, typename Allocator >
+using ColumnMajorEllpack = TNL::Algorithms::Segments::Ellpack< DeviceType, IndexType, Allocator, TNL::Algorithms::Segments::ColumnMajorOrder, 32 >;
+
+// Matrix types for which the tests are instantiated: Host variants always, Cuda variants (both orderings) only when HAVE_CUDA is defined
+using MatrixTypes = ::testing::Types
+<
+     TNL::Matrices::SparseMatrix< int,     TNL::Devices::Host, int,   TNL::Matrices::GeneralMatrix, RowMajorEllpack >
+    ,TNL::Matrices::SparseMatrix< long,    TNL::Devices::Host, int,   TNL::Matrices::GeneralMatrix, RowMajorEllpack >
+    ,TNL::Matrices::SparseMatrix< float,   TNL::Devices::Host, int,   TNL::Matrices::GeneralMatrix, RowMajorEllpack >
+    ,TNL::Matrices::SparseMatrix< double,  TNL::Devices::Host, int,   TNL::Matrices::GeneralMatrix, RowMajorEllpack >
+    ,TNL::Matrices::SparseMatrix< int,     TNL::Devices::Host, long,  TNL::Matrices::GeneralMatrix, RowMajorEllpack >
+    ,TNL::Matrices::SparseMatrix< long,    TNL::Devices::Host, long,  TNL::Matrices::GeneralMatrix, RowMajorEllpack >
+    ,TNL::Matrices::SparseMatrix< float,   TNL::Devices::Host, long,  TNL::Matrices::GeneralMatrix, RowMajorEllpack >
+    ,TNL::Matrices::SparseMatrix< double,  TNL::Devices::Host, long,  TNL::Matrices::GeneralMatrix, RowMajorEllpack >
+#ifdef HAVE_CUDA
+    ,TNL::Matrices::SparseMatrix< int,     TNL::Devices::Cuda, int,   TNL::Matrices::GeneralMatrix, ColumnMajorEllpack >
+    ,TNL::Matrices::SparseMatrix< long,    TNL::Devices::Cuda, int,   TNL::Matrices::GeneralMatrix, ColumnMajorEllpack >
+    ,TNL::Matrices::SparseMatrix< float,   TNL::Devices::Cuda, int,   TNL::Matrices::GeneralMatrix, ColumnMajorEllpack >
+    ,TNL::Matrices::SparseMatrix< double,  TNL::Devices::Cuda, int,   TNL::Matrices::GeneralMatrix, ColumnMajorEllpack >
+    ,TNL::Matrices::SparseMatrix< int,     TNL::Devices::Cuda, long,  TNL::Matrices::GeneralMatrix, ColumnMajorEllpack >
+    ,TNL::Matrices::SparseMatrix< long,    TNL::Devices::Cuda, long,  TNL::Matrices::GeneralMatrix, ColumnMajorEllpack >
+    ,TNL::Matrices::SparseMatrix< float,   TNL::Devices::Cuda, long,  TNL::Matrices::GeneralMatrix, ColumnMajorEllpack >
+    ,TNL::Matrices::SparseMatrix< double,  TNL::Devices::Cuda, long,  TNL::Matrices::GeneralMatrix, ColumnMajorEllpack >
+    ,TNL::Matrices::SparseMatrix< int,     TNL::Devices::Cuda, int,   TNL::Matrices::GeneralMatrix, RowMajorEllpack >
+    ,TNL::Matrices::SparseMatrix< long,    TNL::Devices::Cuda, int,   TNL::Matrices::GeneralMatrix, RowMajorEllpack >
+    ,TNL::Matrices::SparseMatrix< float,   TNL::Devices::Cuda, int,   TNL::Matrices::GeneralMatrix, RowMajorEllpack >
+    ,TNL::Matrices::SparseMatrix< double,  TNL::Devices::Cuda, int,   TNL::Matrices::GeneralMatrix, RowMajorEllpack >
+    ,TNL::Matrices::SparseMatrix< int,     TNL::Devices::Cuda, long,  TNL::Matrices::GeneralMatrix, RowMajorEllpack >
+    ,TNL::Matrices::SparseMatrix< long,    TNL::Devices::Cuda, long,  TNL::Matrices::GeneralMatrix, RowMajorEllpack >
+    ,TNL::Matrices::SparseMatrix< float,   TNL::Devices::Cuda, long,  TNL::Matrices::GeneralMatrix, RowMajorEllpack >
+    ,TNL::Matrices::SparseMatrix< double,  TNL::Devices::Cuda, long,  TNL::Matrices::GeneralMatrix, RowMajorEllpack >
+#endif // HAVE_CUDA
+>;
+
+#endif
+
+#include "SparseMatrixVectorProductTest.h"
+#include "../main.h"
diff --git a/src/UnitTests/Matrices/SparseMatrixVectorProductTest_SandboxMatrix.cpp b/src/UnitTests/Matrices/SparseMatrixVectorProductTest_SandboxMatrix.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..bfa16c02b91c7cddb03a63763e994ade31bcb0f8
--- /dev/null
+++ b/src/UnitTests/Matrices/SparseMatrixVectorProductTest_SandboxMatrix.cpp
@@ -0,0 +1,11 @@
+/***************************************************************************
+                          SparseMatrixVectorProductTest_SandboxMatrix.cpp -  description
+                             -------------------
+    begin                : Mar 30, 2021
+    copyright            : (C) 2021 by Tomas Oberhuber et al.
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+#include "SparseMatrixVectorProductTest_SandboxMatrix.h"
diff --git a/src/UnitTests/Matrices/SparseMatrixVectorProductTest_SandboxMatrix.cu b/src/UnitTests/Matrices/SparseMatrixVectorProductTest_SandboxMatrix.cu
new file mode 120000
index 0000000000000000000000000000000000000000..bd87e1ad0c244dd2117317355391a5d450f8de98
--- /dev/null
+++ b/src/UnitTests/Matrices/SparseMatrixVectorProductTest_SandboxMatrix.cu
@@ -0,0 +1 @@
+SparseMatrixVectorProductTest_SandboxMatrix.cpp
\ No newline at end of file
diff --git a/src/UnitTests/Matrices/SparseMatrixVectorProductTest_SandboxMatrix.h b/src/UnitTests/Matrices/SparseMatrixVectorProductTest_SandboxMatrix.h
new file mode 100644
index 0000000000000000000000000000000000000000..7b06af0f3a8049d22e81fa480c59c21d106696c0
--- /dev/null
+++ b/src/UnitTests/Matrices/SparseMatrixVectorProductTest_SandboxMatrix.h
@@ -0,0 +1,45 @@
+/***************************************************************************
+                          SparseMatrixVectorProductTest_SandboxMatrix.h -  description
+                             -------------------
+    begin                : Apr 22, 2021
+    copyright            : (C) 2021 by Tomas Oberhuber et al.
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+#include <iostream>
+#include <TNL/Matrices/Sandbox/SparseSandboxMatrix.h>
+
+#ifdef HAVE_GTEST
+#include <gtest/gtest.h>
+
+const char* saveAndLoadFileName = "test_SparseMatrixTest_SandboxMatrix_segments"; // own file name, distinct from the CSRScalar test's; NOTE(review): unused in this header, presumably read by SparseMatrixVectorProductTest.h - confirm
+
+// Matrix types for which the tests are instantiated: Host variants always, Cuda variants only when HAVE_CUDA is defined
+using MatrixTypes = ::testing::Types
+<
+     TNL::Matrices::Sandbox::SparseSandboxMatrix< int,     TNL::Devices::Host, int,   TNL::Matrices::GeneralMatrix >
+    ,TNL::Matrices::Sandbox::SparseSandboxMatrix< long,    TNL::Devices::Host, int,   TNL::Matrices::GeneralMatrix >
+    ,TNL::Matrices::Sandbox::SparseSandboxMatrix< float,   TNL::Devices::Host, int,   TNL::Matrices::GeneralMatrix >
+    ,TNL::Matrices::Sandbox::SparseSandboxMatrix< double,  TNL::Devices::Host, int,   TNL::Matrices::GeneralMatrix >
+    ,TNL::Matrices::Sandbox::SparseSandboxMatrix< int,     TNL::Devices::Host, long,  TNL::Matrices::GeneralMatrix >
+    ,TNL::Matrices::Sandbox::SparseSandboxMatrix< long,    TNL::Devices::Host, long,  TNL::Matrices::GeneralMatrix >
+    ,TNL::Matrices::Sandbox::SparseSandboxMatrix< float,   TNL::Devices::Host, long,  TNL::Matrices::GeneralMatrix >
+    ,TNL::Matrices::Sandbox::SparseSandboxMatrix< double,  TNL::Devices::Host, long,  TNL::Matrices::GeneralMatrix >
+#ifdef HAVE_CUDA
+    ,TNL::Matrices::Sandbox::SparseSandboxMatrix< int,     TNL::Devices::Cuda, int,   TNL::Matrices::GeneralMatrix >
+    ,TNL::Matrices::Sandbox::SparseSandboxMatrix< long,    TNL::Devices::Cuda, int,   TNL::Matrices::GeneralMatrix >
+    ,TNL::Matrices::Sandbox::SparseSandboxMatrix< float,   TNL::Devices::Cuda, int,   TNL::Matrices::GeneralMatrix >
+    ,TNL::Matrices::Sandbox::SparseSandboxMatrix< double,  TNL::Devices::Cuda, int,   TNL::Matrices::GeneralMatrix >
+    ,TNL::Matrices::Sandbox::SparseSandboxMatrix< int,     TNL::Devices::Cuda, long,  TNL::Matrices::GeneralMatrix >
+    ,TNL::Matrices::Sandbox::SparseSandboxMatrix< long,    TNL::Devices::Cuda, long,  TNL::Matrices::GeneralMatrix >
+    ,TNL::Matrices::Sandbox::SparseSandboxMatrix< float,   TNL::Devices::Cuda, long,  TNL::Matrices::GeneralMatrix >
+    ,TNL::Matrices::Sandbox::SparseSandboxMatrix< double,  TNL::Devices::Cuda, long,  TNL::Matrices::GeneralMatrix >
+#endif // HAVE_CUDA
+>;
+
+#endif
+
+#include "SparseMatrixVectorProductTest.h"
+#include "../main.h"
diff --git a/src/UnitTests/Matrices/SparseMatrixVectorProductTest_SlicedEllpack.cpp b/src/UnitTests/Matrices/SparseMatrixVectorProductTest_SlicedEllpack.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..0afb094fda217ecea4181438a9376d41a9be5b60
--- /dev/null
+++ b/src/UnitTests/Matrices/SparseMatrixVectorProductTest_SlicedEllpack.cpp
@@ -0,0 +1,11 @@
+/***************************************************************************
+                          SparseMatrixVectorProductTest_SlicedEllpack.cpp -  description
+                             -------------------
+    begin                : Mar 30, 2021
+    copyright            : (C) 2021 by Tomas Oberhuber et al.
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+#include "SparseMatrixVectorProductTest_SlicedEllpack.h"
diff --git a/src/UnitTests/Matrices/SparseMatrixVectorProductTest_SlicedEllpack.cu b/src/UnitTests/Matrices/SparseMatrixVectorProductTest_SlicedEllpack.cu
new file mode 120000
index 0000000000000000000000000000000000000000..6c3448930f3a12f8dac48f5638153db3883f94c6
--- /dev/null
+++ b/src/UnitTests/Matrices/SparseMatrixVectorProductTest_SlicedEllpack.cu
@@ -0,0 +1 @@
+SparseMatrixVectorProductTest_SlicedEllpack.cpp
\ No newline at end of file
diff --git a/src/UnitTests/Matrices/SparseMatrixVectorProductTest_SlicedEllpack.h b/src/UnitTests/Matrices/SparseMatrixVectorProductTest_SlicedEllpack.h
new file mode 100644
index 0000000000000000000000000000000000000000..5efa70d45eaf65e5ce46865838274f3ad883b8df
--- /dev/null
+++ b/src/UnitTests/Matrices/SparseMatrixVectorProductTest_SlicedEllpack.h
@@ -0,0 +1,57 @@
+/***************************************************************************
+                          SparseMatrixVectorProductTest_SlicedEllpack.h -  description
+                             -------------------
+    begin                : Mar 30, 2021
+    copyright            : (C) 2021 by Tomas Oberhuber et al.
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+#include <iostream>
+#include <TNL/Algorithms/Segments/SlicedEllpack.h>
+#include <TNL/Matrices/SparseMatrix.h>
+#include <TNL/Matrices/MatrixType.h>
+
+#ifdef HAVE_GTEST
+#include <gtest/gtest.h>
+
+const char* saveAndLoadFileName = "test_SparseMatrixTest_SlicedEllpack_segments"; // NOTE(review): literal still says SparseMatrixTest; presumably read by the shared test header included at the end of this file - TODO confirm the vector-product tests use it
+
+////
+// SlicedEllpack segments with RowMajorOrder and trailing argument 32 (presumably the slice size); used by every Host matrix type in MatrixTypes below
+template< typename Device, typename Index, typename IndexAllocator >
+using RowMajorSlicedEllpack = TNL::Algorithms::Segments::SlicedEllpack< Device, Index, IndexAllocator, TNL::Algorithms::Segments::RowMajorOrder, 32 >;
+
+////
+// SlicedEllpack segments with ColumnMajorOrder and the same trailing argument 32; used by every Cuda matrix type in MatrixTypes below (HAVE_CUDA builds only)
+template< typename Device, typename Index, typename IndexAllocator >
+using ColumnMajorSlicedEllpack = TNL::Algorithms::Segments::SlicedEllpack< Device, Index, IndexAllocator, TNL::Algorithms::Segments::ColumnMajorOrder, 32 >;
+
+// GoogleTest type list: SparseMatrix with first argument in {int, long, float, double} and third argument in {int, long} (presumably value and index types), on Host and, when HAVE_CUDA is defined, on Cuda
+using MatrixTypes = ::testing::Types
+<
+    TNL::Matrices::SparseMatrix< int,     TNL::Devices::Host, int,   TNL::Matrices::GeneralMatrix, RowMajorSlicedEllpack >,
+    TNL::Matrices::SparseMatrix< long,    TNL::Devices::Host, int,   TNL::Matrices::GeneralMatrix, RowMajorSlicedEllpack >,
+    TNL::Matrices::SparseMatrix< float,   TNL::Devices::Host, int,   TNL::Matrices::GeneralMatrix, RowMajorSlicedEllpack >,
+    TNL::Matrices::SparseMatrix< double,  TNL::Devices::Host, int,   TNL::Matrices::GeneralMatrix, RowMajorSlicedEllpack >,
+    TNL::Matrices::SparseMatrix< int,     TNL::Devices::Host, long,  TNL::Matrices::GeneralMatrix, RowMajorSlicedEllpack >,
+    TNL::Matrices::SparseMatrix< long,    TNL::Devices::Host, long,  TNL::Matrices::GeneralMatrix, RowMajorSlicedEllpack >,
+    TNL::Matrices::SparseMatrix< float,   TNL::Devices::Host, long,  TNL::Matrices::GeneralMatrix, RowMajorSlicedEllpack >,
+    TNL::Matrices::SparseMatrix< double,  TNL::Devices::Host, long,  TNL::Matrices::GeneralMatrix, RowMajorSlicedEllpack >
+#ifdef HAVE_CUDA
+   ,TNL::Matrices::SparseMatrix< int,     TNL::Devices::Cuda, int,   TNL::Matrices::GeneralMatrix, ColumnMajorSlicedEllpack >,
+    TNL::Matrices::SparseMatrix< long,    TNL::Devices::Cuda, int,   TNL::Matrices::GeneralMatrix, ColumnMajorSlicedEllpack >,
+    TNL::Matrices::SparseMatrix< float,   TNL::Devices::Cuda, int,   TNL::Matrices::GeneralMatrix, ColumnMajorSlicedEllpack >,
+    TNL::Matrices::SparseMatrix< double,  TNL::Devices::Cuda, int,   TNL::Matrices::GeneralMatrix, ColumnMajorSlicedEllpack >,
+    TNL::Matrices::SparseMatrix< int,     TNL::Devices::Cuda, long,  TNL::Matrices::GeneralMatrix, ColumnMajorSlicedEllpack >,
+    TNL::Matrices::SparseMatrix< long,    TNL::Devices::Cuda, long,  TNL::Matrices::GeneralMatrix, ColumnMajorSlicedEllpack >,
+    TNL::Matrices::SparseMatrix< float,   TNL::Devices::Cuda, long,  TNL::Matrices::GeneralMatrix, ColumnMajorSlicedEllpack >,
+    TNL::Matrices::SparseMatrix< double,  TNL::Devices::Cuda, long,  TNL::Matrices::GeneralMatrix, ColumnMajorSlicedEllpack >
+#endif
+>;
+
+#endif
+
+#include "SparseMatrixVectorProductTest.h"
+#include "../main.h"