diff --git a/scripts/eti.py b/scripts/eti.py
new file mode 100755
index 0000000000000000000000000000000000000000..89d78876dccdc6149e38fe74391bfccfcb468d2f
--- /dev/null
+++ b/scripts/eti.py
@@ -0,0 +1,74 @@
+#! /usr/bin/env python3
+
+import os.path
+import pathlib
+import re
+import sys
+
+# Validate the command line: exactly one argument naming an existing file.
+if len(sys.argv) != 2:
+    print(f"usage: {sys.argv[0]} FILE\n\nwhere FILE is a C++ source code or header file.", file=sys.stderr)
+    sys.exit(1)
+if not os.path.isfile(sys.argv[1]):
+    print(f"error: {sys.argv[1]} is not a valid file.", file=sys.stderr)
+    sys.exit(1)
+
+src = sys.argv[1]
+basename = os.path.splitext(os.path.basename(src))[0]
+# output directory "<basename>.templates", relative to the current working directory
+dirname = f"{basename}.templates"
+os.makedirs(dirname, exist_ok=True)  # no isdir()/mkdir() race; fine if it already exists
+
+def get_source_code(namespaces, extern_template_instantiation):
+    """Return the text of a source file that explicitly instantiates one template.
+
+    namespaces: names of the enclosing namespaces, outermost first.
+    extern_template_instantiation: an "extern template" line; its first "extern " is dropped.
+    """
+    eti = extern_template_instantiation.strip().replace("extern ", "", 1)
+    # Absolute src: include it verbatim (e.g. when called by CMake, because a relative
+    # include does not work with its separate build dir structure).
+    # Relative src: include it relative to dirname, where the generated files are placed.
+    include = src if src == os.path.abspath(src) else os.path.relpath(src, dirname)
+    parts = [f"#include \"{include}\"\n"]
+    parts += [f"namespace {ns} {{\n" for ns in namespaces]
+    parts.append(eti + "\n")
+    # close innermost first so each "// namespace" comment labels the brace it follows
+    parts += [f"}} // namespace {ns}\n" for ns in reversed(namespaces)]
+    return "".join(parts)
+
+def check_write(content, fname):
+    """Write content to fname unless the file already holds it (ignoring outer whitespace)."""
+    if os.path.isfile(fname):
+        # read through a context manager so the handle is closed before any rewrite
+        with open(fname, "r") as existing:
+            if existing.read().strip() == content.strip():
+                return  # unchanged: leave the existing file untouched
+
+    with open(fname, "w") as out:
+        out.write(content)
+
+i = 0  # index of the next instantiation; part of the generated file names
+namespaces = []  # stack of the currently open namespaces, outermost first
+file_names = set()  # every file generated during this run
+with open(src) as src_file:  # context manager closes the source file when the scan ends
+    for line in src_file:
+        # heuristics for namespaces: opening "namespace X {" and closing "} // namespace X"
+        ns_begin = re.search(r"^\s*namespace\s+(\w+)\s*\{$", line)
+        if ns_begin:
+            namespaces.append(ns_begin.group(1))
+        ns_end = re.search(r"^\s*\}\s*\/\/\s*namespace\s+(\w+)$", line)
+        # an unmatched closing comment must not abort the script with IndexError
+        if ns_end and namespaces:
+            namespaces.pop(-1)
+        if line.strip().startswith("extern template"):
+            source_code = get_source_code(namespaces, line)
+            for ext in ["cpp", "cu"]:
+                fname = f"{dirname}/{basename}.t{i}.{ext}"
+                check_write(source_code, fname)
+                file_names.add(fname)
+            i += 1
+# remove extraneous files from the target directory
+for path in pathlib.Path(dirname).iterdir():
+    if str(path) not in file_names:
+        path.unlink()
diff --git a/src/Benchmarks/BLAS/array-operations.h b/src/Benchmarks/BLAS/array-operations.h
index 38a58c4312b14bcc3231d3c097ff4ae6bd171e6b..271819de7faa5a3107f88e9c7cff9d68ebd2d281 100644
--- a/src/Benchmarks/BLAS/array-operations.h
+++ b/src/Benchmarks/BLAS/array-operations.h
@@ -14,8 +14,7 @@
 
 #include <cstring>
 
-#include "../Benchmarks.h"
-
+#include <TNL/Benchmarks/Benchmarks.h>
 #include <TNL/Containers/Array.h>
 
 namespace TNL {
@@ -116,10 +115,7 @@ benchmarkArrayOperations( Benchmark<> & benchmark,
       hostArray = hostArray2;
    };
    benchmark.setOperation( "copy (operator=)", 2 * datasetSize );
-   // copyBasetime is used later inside HAVE_CUDA guard, so the compiler will
-   // complain when compiling without CUDA
-   const double copyBasetime = benchmark.time< Devices::Host >( reset1, "CPU", copyAssignHostHost );
-   (void)copyBasetime;  // ignore unused variable
+   benchmark.time< Devices::Host >( reset1, "CPU", copyAssignHostHost );
 #ifdef HAVE_CUDA
    auto copyAssignCudaCuda = [&]() {
       deviceArray = deviceArray2;
@@ -135,7 +131,7 @@ benchmarkArrayOperations( Benchmark<> & benchmark,
    auto copyAssignCudaHost = [&]() {
       hostArray = deviceArray;
    };
-   benchmark.setOperation( "copy (operator=)", datasetSize, copyBasetime );
+   benchmark.setOperation( "copy (operator=)", datasetSize, benchmark.getBaseTime() );
    benchmark.time< Devices::Cuda >( reset1, "CPU->GPU", copyAssignHostCuda );
    benchmark.time< Devices::Cuda >( reset1, "GPU->CPU", copyAssignCudaHost );
 #endif
diff --git a/src/Benchmarks/BLAS/dense-mv.h b/src/Benchmarks/BLAS/gemv.h
similarity index 58%
rename from src/Benchmarks/BLAS/dense-mv.h
rename to src/Benchmarks/BLAS/gemv.h
index 1204257cce7f2b6fa354a045245129282992c1b3..93147d6baa7c1a04b9d6a037eeec4d0050e824c5 100644
--- a/src/Benchmarks/BLAS/dense-mv.h
+++ b/src/Benchmarks/BLAS/gemv.h
@@ -1,5 +1,5 @@
 /***************************************************************************
-                          dense-mv.h  -  description
+                          gemv.h  -  description
                              -------------------
     begin                : Jul 8, 2021
     copyright            : (C) 2021 by Tomas Oberhuber et al.
@@ -8,15 +8,14 @@
 
 /* See Copyright Notice in tnl/Copyright */
 
-// Implemented by: Jakub Klinkovsky
+// Implemented by: Jakub Klinkovsky, Tomas Oberhuber
 
 #pragma once
 
-#include "../Benchmarks.h"
+#include <TNL/Benchmarks/Benchmarks.h>
 #include "cublasWrappers.h"
 
 #include <TNL/Containers/Vector.h>
-#include <TNL/Pointers/DevicePointer.h>
 #include <TNL/Matrices/DenseMatrix.h>
 #include <TNL/Devices/Cuda.h>
 #include <TNL/Devices/Host.h>
@@ -27,16 +26,12 @@ namespace Benchmarks {
 template< typename Matrix >
 void setMatrix( Matrix& matrix )
 {
-   using RealType = typename Matrix::RealType;
-   using IndexType = typename Matrix::IndexType;
-   matrix.forAllElements( [] __cuda_callable__ ( IndexType rowIdx, IndexType columnIdx, IndexType columnIdx_, RealType& value ) {
-       value = 1.0; } );
+   matrix.setValue( 1.0 );
 }
 
 template< typename Real >
 void
-benchmarkDenseMVSynthetic( Benchmark<> & benchmark,
-                           const int & size )
+benchmarkGemv( Benchmark<> & benchmark, int rows, int columns )
 {
    using HostMatrix = TNL::Matrices::DenseMatrix< Real, TNL::Devices::Host >;
    using RowMajorCudaMatrix = TNL::Matrices::DenseMatrix< Real, TNL::Devices::Cuda, int, TNL::Algorithms::Segments::RowMajorOrder >;
@@ -50,20 +45,13 @@ benchmarkDenseMVSynthetic( Benchmark<> & benchmark,
    HostVector inHostVector, outHostVector;
    CudaVector inCudaVector, outCudaVector1, outCudaVector2;
 
-   // create benchmark group
-   const std::vector< String > parsedType = parseObjectType( getType< HostMatrix >() );
-#ifdef HAVE_CUDA
-   benchmark.createHorizontalGroup( parsedType[ 0 ], 2 );
-#else
-   benchmark.createHorizontalGroup( parsedType[ 0 ], 1 );
-#endif
-
-   hostMatrix.setDimensions( size, size );
-   inHostVector.setSize( size );
-   outHostVector.setSize( size );
+   hostMatrix.setDimensions( rows, columns );
+   inHostVector.setSize( columns );
+   outHostVector.setSize( rows );
 
    setMatrix< HostMatrix >( hostMatrix );
-   const double datasetSize = (double) ( size * size ) * ( 2 * sizeof( Real ) + sizeof( int ) ) / oneGB;
+   const double datasetSize = (double) ( rows * columns + rows + columns ) * sizeof(Real) / oneGB;
+   benchmark.setOperation( "gemv", datasetSize );
 
    // reset function
    auto reset = [&]() {
@@ -80,14 +68,13 @@ benchmarkDenseMVSynthetic( Benchmark<> & benchmark,
    auto spmvHost = [&]() {
       hostMatrix.vectorProduct( inHostVector, outHostVector );
    };
-   benchmark.setOperation( datasetSize );
    benchmark.time< Devices::Host >( reset, "CPU", spmvHost );
 
 #ifdef HAVE_CUDA
-   columnMajorCudaMatrix.setDimensions( size, size );
-   inCudaVector.setSize( size );
-   outCudaVector1.setSize( size );
-   outCudaVector2.setSize( size );
+   columnMajorCudaMatrix.setDimensions( rows, columns );
+   inCudaVector.setSize( columns );
+   outCudaVector1.setSize( rows );
+   outCudaVector2.setSize( rows );
    setMatrix< ColumnMajorCudaMatrix >( columnMajorCudaMatrix );
 
    auto columnMajorMvCuda = [&]() {
@@ -97,7 +84,7 @@ benchmarkDenseMVSynthetic( Benchmark<> & benchmark,
 
    columnMajorCudaMatrix.reset();
 
-   rowMajorCudaMatrix.setDimensions( size, size );
+   rowMajorCudaMatrix.setDimensions( rows, columns );
    setMatrix< RowMajorCudaMatrix >( rowMajorCudaMatrix );
 
    auto rowMajorMvCuda = [&]() {
@@ -109,7 +96,7 @@ benchmarkDenseMVSynthetic( Benchmark<> & benchmark,
    //std::cerr << outCudaVector1 << std::endl << outCudaVector2 << std::endl;
 
    rowMajorCudaMatrix.reset();
-   columnMajorCudaMatrix.setDimensions( size, size );
+   columnMajorCudaMatrix.setDimensions( rows, columns );
    setMatrix< ColumnMajorCudaMatrix >( columnMajorCudaMatrix );
 
    cublasHandle_t cublasHandle;
@@ -117,8 +104,8 @@ benchmarkDenseMVSynthetic( Benchmark<> & benchmark,
    auto mvCublas = [&] () {
       Real alpha = 1.0;
       Real beta = 0.0;
-      cublasGemv( cublasHandle, CUBLAS_OP_N, size, size, &alpha,
-                  columnMajorCudaMatrix.getValues().getData(), size,
+      cublasGemv( cublasHandle, CUBLAS_OP_N, rows, columns, &alpha,
+                  columnMajorCudaMatrix.getValues().getData(), rows,
                   inCudaVector.getData(), 1, &beta,
                   outCudaVector1.getData(), 1 );
    };
@@ -128,19 +115,5 @@ benchmarkDenseMVSynthetic( Benchmark<> & benchmark,
 #endif
 }
 
-/*template< typename Real = double,
-          typename Index = int >
-void
-benchmarkDenseSynthetic( Benchmark<> & benchmark,
-                         const int & size )
-{
-   // TODO: benchmark all formats from tnl-benchmark-spmv (different parameters of the base formats)
-   // NOTE: CSR is disabled because it is very slow on GPU
-   //benchmarkSpMV< Real, SparseMatrixLegacy_CSR_Scalar >( benchmark, size, elementsPerRow );
-   benchmarkSpMV< Real, Benchmarks::SpMV::ReferenceFormats::Legacy::Ellpack >( benchmark, size, elementsPerRow );
-   benchmarkSpMV< Real, SlicedEllpack >( benchmark, size, elementsPerRow );
-   benchmarkSpMV< Real, Benchmarks::SpMV::ReferenceFormats::Legacy::ChunkedEllpack >( benchmark, size, elementsPerRow );
-}*/
-
 } // namespace Benchmarks
 } // namespace TNL
diff --git a/src/Benchmarks/BLAS/spmv.h b/src/Benchmarks/BLAS/spmv.h
deleted file mode 100644
index 6cd669dc085f2bbfc74e75f8c1f93fb5fb6682a6..0000000000000000000000000000000000000000
--- a/src/Benchmarks/BLAS/spmv.h
+++ /dev/null
@@ -1,189 +0,0 @@
-/***************************************************************************
-                          spmv.h  -  description
-                             -------------------
-    begin                : Dec 30, 2015
-    copyright            : (C) 2015 by Tomas Oberhuber et al.
-    email                : tomas.oberhuber@fjfi.cvut.cz
- ***************************************************************************/
-
-/* See Copyright Notice in tnl/Copyright */
-
-// Implemented by: Jakub Klinkovsky
-
-#pragma once
-
-#include "../Benchmarks.h"
-
-#include <TNL/Pointers/DevicePointer.h>
-#include <Benchmarks/SpMV/ReferenceFormats/Legacy/CSR.h>
-#include <Benchmarks/SpMV/ReferenceFormats/Legacy/Ellpack.h>
-#include <Benchmarks/SpMV/ReferenceFormats/Legacy/SlicedEllpack.h>
-#include <Benchmarks/SpMV/ReferenceFormats/Legacy/ChunkedEllpack.h>
-
-namespace TNL {
-namespace Benchmarks {
-
-// silly alias to match the number of template parameters with other formats
-template< typename Real, typename Device, typename Index >
-using SlicedEllpack = SpMV::ReferenceFormats::Legacy::SlicedEllpack< Real, Device, Index >;
-
-// Legacy formats
-template< typename Real, typename Device, typename Index >
-using SparseMatrixLegacy_CSR_Scalar = SpMV::ReferenceFormats::Legacy::CSR< Real, Device, Index, SpMV::ReferenceFormats::Legacy::CSRScalar >;
-
-
-template< typename Matrix >
-int setHostTestMatrix( Matrix& matrix,
-                       const int elementsPerRow )
-{
-   const int size = matrix.getRows();
-   int elements( 0 );
-   for( int row = 0; row < size; row++ ) {
-      int col = row - elementsPerRow / 2;
-      for( int element = 0; element < elementsPerRow; element++ ) {
-         if( col + element >= 0 &&
-            col + element < size )
-         {
-            matrix.setElement( row, col + element, element + 1 );
-            elements++;
-         }
-      }
-   }
-   return elements;
-}
-
-#ifdef HAVE_CUDA
-template< typename Matrix >
-__global__ void setCudaTestMatrixKernel( Matrix* matrix,
-                                         const int elementsPerRow,
-                                         const int gridIdx )
-{
-   const int rowIdx = ( gridIdx * Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
-   if( rowIdx >= matrix->getRows() )
-      return;
-   int col = rowIdx - elementsPerRow / 2;
-   for( int element = 0; element < elementsPerRow; element++ ) {
-      if( col + element >= 0 &&
-         col + element < matrix->getColumns() )
-         matrix->setElementFast( rowIdx, col + element, element + 1 );
-   }
-}
-#endif
-
-template< typename Matrix >
-void setCudaTestMatrix( Matrix& matrix,
-                        const int elementsPerRow )
-{
-#ifdef HAVE_CUDA
-   typedef typename Matrix::IndexType IndexType;
-   typedef typename Matrix::RealType RealType;
-   Pointers::DevicePointer< Matrix > kernel_matrix( matrix );
-   dim3 cudaBlockSize( 256 ), cudaGridSize( Cuda::getMaxGridSize() );
-   const IndexType cudaBlocks = roundUpDivision( matrix.getRows(), cudaBlockSize.x );
-   const IndexType cudaGrids = roundUpDivision( cudaBlocks, Cuda::getMaxGridSize() );
-   for( IndexType gridIdx = 0; gridIdx < cudaGrids; gridIdx++ ) {
-      if( gridIdx == cudaGrids - 1 )
-         cudaGridSize.x = cudaBlocks % Cuda::getMaxGridSize();
-      setCudaTestMatrixKernel< Matrix >
-         <<< cudaGridSize, cudaBlockSize >>>
-         ( &kernel_matrix.template modifyData< Devices::Cuda >(), elementsPerRow, gridIdx );
-        TNL_CHECK_CUDA_DEVICE;
-   }
-#endif
-}
-
-
-// TODO: rename as benchmark_SpMV_synthetic and move to spmv-synthetic.h
-template< typename Real,
-          template< typename, typename, typename > class Matrix >
-void
-benchmarkSpMV( Benchmark<> & benchmark,
-               const int & size,
-               const int elementsPerRow = 5 )
-{
-   typedef Matrix< Real, Devices::Host, int > HostMatrix;
-   typedef Matrix< Real, Devices::Cuda, int > DeviceMatrix;
-   typedef Containers::Vector< Real, Devices::Host, int > HostVector;
-   typedef Containers::Vector< Real, Devices::Cuda, int > CudaVector;
-
-   HostMatrix hostMatrix;
-   DeviceMatrix deviceMatrix;
-   Containers::Vector< int, Devices::Host, int > hostRowLengths;
-   Containers::Vector< int, Devices::Cuda, int > deviceRowLengths;
-   HostVector hostVector, hostVector2;
-   CudaVector deviceVector, deviceVector2;
-
-   // create benchmark group
-   const std::vector< String > parsedType = parseObjectType( getType< HostMatrix >() );
-#ifdef HAVE_CUDA
-   benchmark.createHorizontalGroup( parsedType[ 0 ], 2 );
-#else
-   benchmark.createHorizontalGroup( parsedType[ 0 ], 1 );
-#endif
-
-   hostRowLengths.setSize( size );
-   hostMatrix.setDimensions( size, size );
-   hostVector.setSize( size );
-   hostVector2.setSize( size );
-#ifdef HAVE_CUDA
-   deviceRowLengths.setSize( size );
-   deviceMatrix.setDimensions( size, size );
-   deviceVector.setSize( size );
-   deviceVector2.setSize( size );
-#endif
-
-   hostRowLengths.setValue( elementsPerRow );
-#ifdef HAVE_CUDA
-   deviceRowLengths.setValue( elementsPerRow );
-#endif
-
-   hostMatrix.setCompressedRowLengths( hostRowLengths );
-#ifdef HAVE_CUDA
-   deviceMatrix.setCompressedRowLengths( deviceRowLengths );
-#endif
-
-   const int elements = setHostTestMatrix< HostMatrix >( hostMatrix, elementsPerRow );
-   setCudaTestMatrix< DeviceMatrix >( deviceMatrix, elementsPerRow );
-   const double datasetSize = (double) elements * ( 2 * sizeof( Real ) + sizeof( int ) ) / oneGB;
-
-   // reset function
-   auto reset = [&]() {
-      hostVector.setValue( 1.0 );
-      hostVector2.setValue( 0.0 );
-#ifdef HAVE_CUDA
-      deviceVector.setValue( 1.0 );
-      deviceVector2.setValue( 0.0 );
-#endif
-   };
-
-   // compute functions
-   auto spmvHost = [&]() {
-      hostMatrix.vectorProduct( hostVector, hostVector2 );
-   };
-   benchmark.setOperation( datasetSize );
-   benchmark.time< Devices::Host >( reset, "CPU", spmvHost );
-#ifdef HAVE_CUDA
-   auto spmvCuda = [&]() {
-      deviceMatrix.vectorProduct( deviceVector, deviceVector2 );
-   };
-   benchmark.time< Devices::Cuda >( reset, "GPU", spmvCuda );
-#endif
-}
-
-template< typename Real = double,
-          typename Index = int >
-void
-benchmarkSpmvSynthetic( Benchmark<> & benchmark,
-                        const int & size,
-                        const int & elementsPerRow )
-{
-   // TODO: benchmark all formats from tnl-benchmark-spmv (different parameters of the base formats)
-   // NOTE: CSR is disabled because it is very slow on GPU
-   //benchmarkSpMV< Real, SparseMatrixLegacy_CSR_Scalar >( benchmark, size, elementsPerRow );
-   benchmarkSpMV< Real, Benchmarks::SpMV::ReferenceFormats::Legacy::Ellpack >( benchmark, size, elementsPerRow );
-   benchmarkSpMV< Real, SlicedEllpack >( benchmark, size, elementsPerRow );
-   benchmarkSpMV< Real, Benchmarks::SpMV::ReferenceFormats::Legacy::ChunkedEllpack >( benchmark, size, elementsPerRow );
-}
-
-} // namespace Benchmarks
-} // namespace TNL
diff --git a/src/Benchmarks/BLAS/tnl-benchmark-blas.h b/src/Benchmarks/BLAS/tnl-benchmark-blas.h
index 9b061adf65a752116bc75090d86af8509e4e764a..ca9ffbb00aca570798c2e83dead535dc8ad1ca30 100644
--- a/src/Benchmarks/BLAS/tnl-benchmark-blas.h
+++ b/src/Benchmarks/BLAS/tnl-benchmark-blas.h
@@ -21,8 +21,7 @@
 #include "array-operations.h"
 #include "vector-operations.h"
 #include "triad.h"
-#include "spmv.h"
-#include "dense-mv.h"
+#include "gemv.h"
 
 
 using namespace TNL;
@@ -32,37 +31,39 @@ using namespace TNL::Benchmarks;
 template< typename Real >
 void
 runBlasBenchmarks( Benchmark<> & benchmark,
-                   Benchmark<>::MetadataMap metadata,
                    const std::size_t & minSize,
                    const std::size_t & maxSize,
-                   const double & sizeStepFactor,
-                   const int & elementsPerRow )
+                   const double & sizeStepFactor )
 {
-   const String precision = getType< Real >();
-   metadata["precision"] = precision;
+   benchmark.setMetadataWidths({
+      { "operation", 30 },
+      { "performer", 21 },
+      { "precision", 10 },
+   });
 
    // Array operations
-   benchmark.newBenchmark( String("Array operations (") + precision + ", host allocator = Host)",
-                           metadata );
+   std::cout << "\n== Array operations ==\n" << std::endl;
    for( std::size_t size = minSize; size <= maxSize; size *= 2 ) {
       benchmark.setMetadataColumns( Benchmark<>::MetadataColumns({
+         { "precision", getType< Real >() },
+         { "host allocator", "Host" },
          { "size", convertToString( size ) },
       } ));
       benchmarkArrayOperations< Real >( benchmark, size );
    }
 #ifdef HAVE_CUDA
-   benchmark.newBenchmark( String("Array operations (") + precision + ", host allocator = CudaHost)",
-                           metadata );
    for( std::size_t size = minSize; size <= maxSize; size *= 2 ) {
       benchmark.setMetadataColumns( Benchmark<>::MetadataColumns({
+         { "precision", getType< Real >() },
+         { "host allocator", "CudaHost" },
          { "size", convertToString( size ) },
       } ));
       benchmarkArrayOperations< Real, int, Allocators::CudaHost >( benchmark, size );
    }
-   benchmark.newBenchmark( String("Array operations (") + precision + ", host allocator = CudaManaged)",
-                           metadata );
    for( std::size_t size = minSize; size <= maxSize; size *= 2 ) {
       benchmark.setMetadataColumns( Benchmark<>::MetadataColumns({
+         { "precision", getType< Real >() },
+         { "host allocator", "CudaManaged" },
          { "size", convertToString( size ) },
       } ));
       benchmarkArrayOperations< Real, int, Allocators::CudaManaged >( benchmark, size );
@@ -70,10 +71,10 @@ runBlasBenchmarks( Benchmark<> & benchmark,
 #endif
 
    // Vector operations
-   benchmark.newBenchmark( String("Vector operations (") + precision + ")",
-                           metadata );
+   std::cout << "\n== Vector operations ==\n" << std::endl;
    for( std::size_t size = minSize; size <= maxSize; size *= sizeStepFactor ) {
       benchmark.setMetadataColumns( Benchmark<>::MetadataColumns({
+         { "precision", getType< Real >() },
          { "size", convertToString( size ) },
       } ));
       benchmarkVectorOperations< Real >( benchmark, size );
@@ -81,39 +82,30 @@ runBlasBenchmarks( Benchmark<> & benchmark,
 
    // Triad benchmark: copy from host, compute, copy to host
 #ifdef HAVE_CUDA
-   benchmark.newBenchmark( String("Triad benchmark (") + precision + ")",
-                           metadata );
+   std::cout << "\n== Triad ==\n" << std::endl;
    for( std::size_t size = minSize; size <= maxSize; size *= 2 ) {
       benchmark.setMetadataColumns( Benchmark<>::MetadataColumns({
+         { "precision", getType< Real >() },
          { "size", convertToString( size ) },
       } ));
       benchmarkTriad< Real >( benchmark, size );
    }
 #endif
 
-   // Sparse matrix-vector multiplication
-   benchmark.newBenchmark( String("Sparse matrix-vector multiplication (") + precision + ")",
-                           metadata );
-   for( std::size_t size = minSize; size <= maxSize; size *= 2 ) {
-      benchmark.setMetadataColumns( Benchmark<>::MetadataColumns({
-         { "rows", convertToString( size ) },
-         { "columns", convertToString( size ) },
-         { "elements per row", convertToString( elementsPerRow ) },
-      } ));
-      benchmarkSpmvSynthetic< Real >( benchmark, size, elementsPerRow );
-   }
-
    // Dense matrix-vector multiplication
-   benchmark.newBenchmark( String("Dense matrix-vector multiplication (") + precision + ")",
-                           metadata );
-   for( std::size_t size = 10; size <= 20000; size *= 2 ) {
-      benchmark.setMetadataColumns( Benchmark<>::MetadataColumns({
-         { "rows", convertToString( size ) },
-         { "columns", convertToString( size ) }
-      } ));
-      benchmarkDenseMVSynthetic< Real >( benchmark, size );
+   std::cout << "\n== Dense matrix-vector multiplication ==\n" << std::endl;
+   for( std::size_t rows = 10; rows <= 20000 * 20000; rows *= 2 ) {
+      for( std::size_t columns = 10; columns <= 20000 * 20000; columns *= 2 ) {
+         if( rows * columns > 20000 * 20000 )
+            break;
+         benchmark.setMetadataColumns( Benchmark<>::MetadataColumns({
+            { "precision", getType< Real >() },
+            { "rows", convertToString( rows ) },
+            { "columns", convertToString( columns ) }
+         } ));
+         benchmarkGemv< Real >( benchmark, rows, columns );
+      }
    }
-
 }
 
 void
@@ -132,7 +124,6 @@ setupConfig( Config::ConfigDescription & config )
    config.addEntry< int >( "max-size", "Minimum size of arrays/vectors used in the benchmark.", 10000000 );
    config.addEntry< int >( "size-step-factor", "Factor determining the size of arrays/vectors used in the benchmark. First size is min-size and each following size is stepFactor*previousSize, up to max-size.", 2 );
    config.addEntry< int >( "loops", "Number of iterations for every computation.", 10 );
-   config.addEntry< int >( "elements-per-row", "Number of elements per row of the sparse matrix used in the matrix-vector multiplication benchmark.", 5 );
    config.addEntry< int >( "verbose", "Verbose mode.", 1 );
 
    config.addDelimiter( "Device settings:" );
@@ -167,7 +158,6 @@ main( int argc, char* argv[] )
    const std::size_t maxSize = parameters.getParameter< int >( "max-size" );
    const int sizeStepFactor = parameters.getParameter< int >( "size-step-factor" );
    const int loops = parameters.getParameter< int >( "loops" );
-   const int elementsPerRow = parameters.getParameter< int >( "elements-per-row" );
    const int verbose = parameters.getParameter< int >( "verbose" );
 
    if( sizeStepFactor <= 1 ) {
@@ -179,23 +169,19 @@ main( int argc, char* argv[] )
    auto mode = std::ios::out;
    if( outputMode == "append" )
        mode |= std::ios::app;
-   std::ofstream logFile( logFileName.getString(), mode );
+   std::ofstream logFile( logFileName, mode );
 
-   // init benchmark and common metadata
-   Benchmark<> benchmark( loops, verbose );
+   // init benchmark and set parameters
+   Benchmark<> benchmark( logFile, loops, verbose );
 
-   // prepare global metadata
-   Benchmark<>::MetadataMap metadata = getHardwareMetadata< Logging >();
+   // write global metadata into a separate file
+   std::map< std::string, std::string > metadata = getHardwareMetadata();
+   writeMapAsJson( metadata, logFileName, ".metadata.json" );
 
    if( precision == "all" || precision == "float" )
-      runBlasBenchmarks< float >( benchmark, metadata, minSize, maxSize, sizeStepFactor, elementsPerRow );
+      runBlasBenchmarks< float >( benchmark, minSize, maxSize, sizeStepFactor );
    if( precision == "all" || precision == "double" )
-      runBlasBenchmarks< double >( benchmark, metadata, minSize, maxSize, sizeStepFactor, elementsPerRow );
-
-   if( ! benchmark.save( logFile ) ) {
-      std::cerr << "Failed to write the benchmark results to file '" << logFileName << "'." << std::endl;
-      return EXIT_FAILURE;
-   }
+      runBlasBenchmarks< double >( benchmark, minSize, maxSize, sizeStepFactor );
 
    return EXIT_SUCCESS;
 }
diff --git a/src/Benchmarks/BLAS/triad.h b/src/Benchmarks/BLAS/triad.h
index d2bdf12cf684268c4652171db2e53a518dbb7a11..b466e5652cd0018f4da8d79077cf75ecc59f81ff 100644
--- a/src/Benchmarks/BLAS/triad.h
+++ b/src/Benchmarks/BLAS/triad.h
@@ -12,7 +12,7 @@
 
 #pragma once
 
-#include "../Benchmarks.h"
+#include <TNL/Benchmarks/Benchmarks.h>
 
 #include <TNL/Containers/Array.h>
 #include <TNL/Allocators/CudaHost.h>
diff --git a/src/Benchmarks/BLAS/vector-operations.h b/src/Benchmarks/BLAS/vector-operations.h
index c2a3ceab321b879eec052d8df24f7091cf778d05..1d9937de80a6ba00d3c862e12b666b0ae2d1a78d 100644
--- a/src/Benchmarks/BLAS/vector-operations.h
+++ b/src/Benchmarks/BLAS/vector-operations.h
@@ -15,7 +15,7 @@
 #include <stdlib.h> // srand48
 #include <numeric>  // std::partial_sum
 
-#include "../Benchmarks.h"
+#include <TNL/Benchmarks/Benchmarks.h>
 
 #include <TNL/Containers/Vector.h>
 #include <TNL/Algorithms/scan.h>
diff --git a/src/Benchmarks/Benchmark.hpp b/src/Benchmarks/Benchmark.hpp
deleted file mode 100644
index e2357990a8285bdef417f738b317fde4e3942735..0000000000000000000000000000000000000000
--- a/src/Benchmarks/Benchmark.hpp
+++ /dev/null
@@ -1,312 +0,0 @@
-/***************************************************************************
-                          Benchmarks.hpp  -  description
-                             -------------------
-    begin                : Jun 7, 2021
-    copyright            : (C) 2021 by Tomas Oberhuber et al.
-    email                : tomas.oberhuber@fjfi.cvut.cz
- ***************************************************************************/
-
-/* See Copyright Notice in tnl/Copyright */
-
-// Implemented by: Jakub Klinkovsky,
-//                 Tomas Oberhuber
-
-#pragma once
-
-#include "FunctionTimer.h"
-#include "Logging.h"
-
-#include <iostream>
-#include <exception>
-#include <limits>
-
-#include <TNL/String.h>
-
-#include <TNL/Devices/Host.h>
-#include <TNL/SystemInfo.h>
-#include <TNL/Cuda/DeviceInfo.h>
-#include <TNL/Config/ConfigDescription.h>
-#include <TNL/MPI/Wrappers.h>
-
-namespace TNL {
-namespace Benchmarks {
-
-
-template< typename Logger >
-Benchmark< Logger >::
-Benchmark( int loops,
-           bool verbose,
-           String outputMode,
-           bool logFileAppend )
-: Logger(verbose, outputMode, logFileAppend), loops(loops)
-{}
-
-template< typename Logger >
-void
-Benchmark< Logger >::
-configSetup( Config::ConfigDescription& config )
-{
-   config.addEntry< int >( "loops", "Number of iterations for every computation.", 10 );
-   config.addEntry< bool >( "reset", "Call reset function between loops.", true );
-   config.addEntry< double >( "min-time", "Minimal real time in seconds for every computation.", 0.0 );
-   config.addEntry< int >( "verbose", "Verbose mode, the higher number the more verbosity.", 1 );
-}
-
-template< typename Logger >
-void
-Benchmark< Logger >::
-setup( const Config::ParameterContainer& parameters )
-{
-   this->loops = parameters.getParameter< int >( "loops" );
-   this->reset = parameters.getParameter< bool >( "reset" );
-   this->minTime = parameters.getParameter< double >( "min-time" );
-   const int verbose = parameters.getParameter< int >( "verbose" );
-   Logger::setVerbose( verbose );
-}
-
-template< typename Logger >
-void
-Benchmark< Logger >::
-setLoops( int loops )
-{
-   this->loops = loops;
-}
-
-template< typename Logger >
-void
-Benchmark< Logger >::
-setMinTime( const double& minTime )
-{
-   this->minTime = minTime;
-}
-
-template< typename Logger >
-void
-Benchmark< Logger >::
-newBenchmark( const String & title )
-{
-   Logger::closeTable();
-   Logger::writeTitle( title );
-}
-
-template< typename Logger >
-void
-Benchmark< Logger >::
-newBenchmark( const String & title,
-               MetadataMap metadata )
-{
-   Logger::closeTable();
-   Logger::writeTitle( title );
-   // add loops and reset flag to metadata
-   metadata["loops"] = convertToString(loops);
-   metadata["reset"] = convertToString( reset );
-   metadata["minimal test time"] = convertToString( minTime );
-   Logger::writeMetadata( metadata );
-}
-
-template< typename Logger >
-void
-Benchmark< Logger >::
-setMetadataColumns( const MetadataColumns & metadata )
-{
-   if( Logger::metadataColumns != metadata )
-      Logger::header_changed = true;
-   Logger::metadataColumns = metadata;
-}
-
-template< typename Logger >
-void
-Benchmark< Logger >::
-setOperation( const String & operation,
-              const double datasetSize,
-              const double baseTime )
-{
-   monitor.setStage( operation.getString() );
-   if( Logger::metadataColumns.size() > 0 && String(Logger::metadataColumns[ 0 ].first) == "operation" ) {
-      Logger::metadataColumns[ 0 ].second = operation;
-   }
-   else {
-      Logger::metadataColumns.insert( Logger::metadataColumns.begin(), {"operation", operation} );
-   }
-   setOperation( datasetSize, baseTime );
-   Logger::header_changed = true;
-}
-
-template< typename Logger >
-void
-Benchmark< Logger >::
-setOperation( const double datasetSize,
-              const double baseTime )
-{
-   this->datasetSize = datasetSize;
-   this->baseTime = baseTime;
-}
-
-template< typename Logger >
-void
-Benchmark< Logger >::
-createHorizontalGroup( const String & name,
-                       int subcolumns )
-{
-   if( Logger::horizontalGroups.size() == 0 ) {
-      Logger::horizontalGroups.push_back( {name, subcolumns} );
-   }
-   else {
-      auto & last = Logger::horizontalGroups.back();
-      if( last.first != name && last.second > 0 ) {
-         Logger::horizontalGroups.push_back( {name, subcolumns} );
-      }
-      else {
-         last.first = name;
-         last.second = subcolumns;
-      }
-   }
-}
-
-template< typename Logger >
-   template< typename Device,
-             typename ResetFunction,
-             typename ComputeFunction >
-double
-Benchmark< Logger >::
-time( ResetFunction reset,
-      const String & performer,
-      ComputeFunction & compute,
-      BenchmarkResult< Logger > & result )
-{
-   result.time = std::numeric_limits<double>::quiet_NaN();
-   result.stddev = std::numeric_limits<double>::quiet_NaN();
-   FunctionTimer< Device > functionTimer;
-   try {
-      if( Logger::verbose > 1 ) {
-         // run the monitor main loop
-         Solvers::SolverMonitorThread monitor_thread( monitor );
-         if( this->reset )
-            std::tie( result.time, result.stddev ) = functionTimer.timeFunction( compute, reset, loops, minTime, Logger::verbose, monitor );
-         else
-            std::tie( result.time, result.stddev ) = functionTimer.timeFunction( compute, loops, minTime, Logger::verbose, monitor );
-      }
-      else {
-         if( this->reset )
-            std::tie( result.time, result.stddev ) = functionTimer.timeFunction( compute, reset, loops, minTime, Logger::verbose, monitor );
-         else
-            std::tie( result.time, result.stddev ) = functionTimer.timeFunction( compute, loops, minTime, Logger::verbose, monitor );
-      }
-      this->performedLoops = functionTimer.getPerformedLoops();
-   }
-   catch ( const std::exception& e ) {
-      std::cerr << "timeFunction failed due to a C++ exception with description: " << e.what() << std::endl;
-   }
-
-   result.bandwidth = datasetSize / result.time;
-   result.speedup = this->baseTime / result.time;
-   if( this->baseTime == 0.0 )
-      this->baseTime = result.time;
-
-   Logger::writeTableHeader( performer, result.getTableHeader() );
-   Logger::writeTableRow( performer, result.getRowElements() );
-
-   return this->baseTime;
-}
-
-template< typename Logger >
-   template< typename Device,
-             typename ResetFunction,
-             typename ComputeFunction >
-inline double
-Benchmark< Logger >::
-time( ResetFunction reset,
-      const String& performer,
-      ComputeFunction& compute )
-{
-   BenchmarkResult< Logger > result;
-   return time< Device, ResetFunction, ComputeFunction >( reset, performer, compute, result );
-}
-
-template< typename Logger >
-   template< typename Device,
-             typename ComputeFunction >
-double
-Benchmark< Logger >::
-time( const String & performer,
-      ComputeFunction & compute,
-      BenchmarkResult< Logger > & result )
-{
-   result.time = std::numeric_limits<double>::quiet_NaN();
-   result.stddev = std::numeric_limits<double>::quiet_NaN();
-   FunctionTimer< Device > functionTimer;
-   try {
-      if( Logger::verbose > 1 ) {
-         // run the monitor main loop
-         Solvers::SolverMonitorThread monitor_thread( monitor );
-         std::tie( result.time, result.stddev ) = functionTimer.timeFunction( compute, loops, minTime, Logger::verbose, monitor );
-      }
-      else {
-         std::tie( result.time, result.stddev ) = functionTimer.timeFunction( compute, loops, minTime, Logger::verbose, monitor );
-      }
-   }
-   catch ( const std::exception& e ) {
-      std::cerr << "Function timer failed due to a C++ exception with description: " << e.what() << std::endl;
-   }
-
-   result.bandwidth = datasetSize / result.time;
-   result.speedup = this->baseTime / result.time;
-   if( this->baseTime == 0.0 )
-      this->baseTime = result.time;
-
-   Logger::writeTableHeader( performer, result.getTableHeader() );
-   Logger::writeTableRow( performer, result.getRowElements() );
-
-   return this->baseTime;
-}
-
-template< typename Logger >
-   template< typename Device,
-             typename ComputeFunction >
-inline double
-Benchmark< Logger >::
-time( const String & performer,
-      ComputeFunction & compute )
-{
-   BenchmarkResult< Logger > result;
-   return time< Device, ComputeFunction >( performer, compute, result );
-}
-
-template< typename Logger >
-void
-Benchmark< Logger >::
-addErrorMessage( const char* msg,
-                 int numberOfComputations )
-{
-   // each computation has 3 subcolumns
-   const int colspan = 3 * numberOfComputations;
-   Logger::writeErrorMessage( msg, colspan );
-   std::cerr << msg << std::endl;
-}
-
-template< typename Logger >
-auto
-Benchmark< Logger >::
-getMonitor() -> SolverMonitorType&
-{
-   return monitor;
-}
-
-template< typename Logger >
-int
-Benchmark< Logger >::
-getPerformedLoops() const
-{
-   return this->performedLoops;
-}
-
-template< typename Logger >
-bool
-Benchmark< Logger >::
-isResetingOn() const
-{
-   return reset;
-}
-
-} // namespace Benchmarks
-} // namespace TNL
diff --git a/src/Benchmarks/Benchmarks.h b/src/Benchmarks/Benchmarks.h
deleted file mode 100644
index 77fa9e47c897617b6ddee3d5ab94e1510bd5e777..0000000000000000000000000000000000000000
--- a/src/Benchmarks/Benchmarks.h
+++ /dev/null
@@ -1,256 +0,0 @@
-/***************************************************************************
-                          Benchmarks.h  -  description
-                             -------------------
-    begin                : Dec 30, 2015
-    copyright            : (C) 2015 by Tomas Oberhuber et al.
-    email                : tomas.oberhuber@fjfi.cvut.cz
- ***************************************************************************/
-
-/* See Copyright Notice in tnl/Copyright */
-
-// Implemented by: Jakub Klinkovsky,
-//                 Tomas Oberhuber
-
-#pragma once
-
-#include "FunctionTimer.h"
-#include "Logging.h"
-
-#include <iostream>
-#include <exception>
-#include <limits>
-
-#include <TNL/String.h>
-
-#include <TNL/Devices/Host.h>
-#include <TNL/SystemInfo.h>
-#include <TNL/Cuda/DeviceInfo.h>
-#include <TNL/Config/ConfigDescription.h>
-#include <TNL/MPI/Wrappers.h>
-
-namespace TNL {
-namespace Benchmarks {
-
-const double oneGB = 1024.0 * 1024.0 * 1024.0;
-
-
-template< typename Logger = Logging >
-struct BenchmarkResult
-{
-   using HeaderElements = typename Logger::HeaderElements;
-   using RowElements = typename Logger::RowElements;
-
-   double time = std::numeric_limits<double>::quiet_NaN();
-   double stddev = std::numeric_limits<double>::quiet_NaN();
-   double bandwidth = std::numeric_limits<double>::quiet_NaN();
-   double speedup = std::numeric_limits<double>::quiet_NaN();
-
-   virtual HeaderElements getTableHeader() const
-   {
-      return HeaderElements( {
-         std::pair< String, int >( "time", 8 ),
-         std::pair< String, int >( "stddev", 8 ),
-         std::pair< String, int >( "stddev/time", 8 ),
-         std::pair< String, int >( "bandwidth", 8 ),
-         std::pair< String, int >( "speedup", 8 ) } );
-   }
-
-   virtual RowElements getRowElements() const
-   {
-      RowElements elements;
-      elements << time << stddev << stddev / time << bandwidth;
-      if( speedup != 0 )
-         elements << speedup;
-      else
-         elements << "N/A";
-      return elements;
-   }
-};
-
-template< typename Logger = Logging >
-class Benchmark
-: protected Logger
-{
-   public:
-      using typename Logger::MetadataElement;
-      using typename Logger::MetadataMap;
-      using typename Logger::MetadataColumns;
-      using SolverMonitorType = Solvers::IterativeSolverMonitor< double, int >;
-
-      using typename Logger::CommonLogs;
-      using Logger::addCommonLogs;
-      using Logger::addLogsMetadata;
-      using Logger::writeHeader;
-
-      Benchmark( int loops = 10,
-               bool verbose = true,
-               String outputMode = "",
-               bool logFileAppend = false );
-
-      static void configSetup( Config::ConfigDescription& config );
-
-      void setup( const Config::ParameterContainer& parameters );
-
-      // TODO: ensure that this is not called in the middle of the benchmark
-      // (or just remove it completely?)
-      void setLoops( int loops );
-
-      void setMinTime( const double& minTime );
-
-      // Marks the start of a new benchmark
-      void newBenchmark( const String & title );
-
-      // Marks the start of a new benchmark (with custom metadata)
-      void newBenchmark( const String & title,
-                        MetadataMap metadata );
-
-      // Sets metadata columns -- values used for all subsequent rows until
-      // the next call to this function.
-      void setMetadataColumns( const MetadataColumns & metadata );
-
-      // TODO: maybe should be renamed to createVerticalGroup and ensured that vertical and horizontal groups are not used within the same "Benchmark"
-      // Sets current operation -- operations expand the table vertically
-      //  - baseTime should be reset to 0.0 for most operations, but sometimes
-      //    it is useful to override it
-      //  - Order of operations inside a "Benchmark" does not matter, rows can be
-      //    easily sorted while converting to HTML.)
-      void
-      setOperation( const String & operation,
-                  const double datasetSize = 0.0, // in GB
-                  const double baseTime = 0.0 );
-
-      void setOperation( const double datasetSize = 0.0,
-                        const double baseTime = 0.0 );
-
-      // Creates new horizontal groups inside a benchmark -- increases the number
-      // of columns in the "Benchmark", implies column spanning.
-      // (Useful e.g. for SpMV formats, different configurations etc.)
-      void
-      createHorizontalGroup( const String & name,
-                           int subcolumns );
-
-      // Times a single ComputeFunction. Subsequent calls implicitly split
-      // the current "horizontal group" into sub-columns identified by
-      // "performer", which are further split into "bandwidth", "time" and
-      // "speedup" columns.
-      // TODO: allow custom columns bound to lambda functions (e.g. for Gflops calculation)
-      // Also terminates the recursion of the following variadic template.
-      template< typename Device,
-               typename ResetFunction,
-               typename ComputeFunction >
-      double time( ResetFunction reset,
-                  const String & performer,
-                  ComputeFunction & compute,
-                  BenchmarkResult< Logger > & result );
-
-      template< typename Device,
-               typename ResetFunction,
-               typename ComputeFunction >
-      inline double time( ResetFunction reset,
-                        const String & performer,
-                        ComputeFunction & compute );
-      /*{
-         BenchmarkResult< Logger > result;
-         return time< Device, ResetFunction, ComputeFunction >( reset, performer, compute, result );
-      }*/
-
-      /****
-       * The same methods as above but without reset function
-       */
-      template< typename Device,
-               typename ComputeFunction >
-      double time( const String & performer,
-                  ComputeFunction & compute,
-                  BenchmarkResult< Logger > & result );
-
-      template< typename Device,
-               typename ComputeFunction >
-      inline double time( const String & performer,
-                        ComputeFunction & compute );
-
-      // Adds an error message to the log. Should be called in places where the
-      // "time" method could not be called (e.g. due to failed allocation).
-      void addErrorMessage( const char* msg,
-                           int numberOfComputations = 1 );
-
-      using Logger::save;
-
-      SolverMonitorType& getMonitor();
-
-      int getPerformedLoops() const;
-
-      bool isResetingOn() const;
-
-   protected:
-
-      int loops = 1, performedLoops = 0;
-
-      double minTime = 0.0;
-
-      double datasetSize = 0.0;
-
-      double baseTime = 0.0;
-
-      bool reset = true;
-
-      SolverMonitorType monitor;
-};
-
-
-template< typename Logger >
-inline typename Benchmark< Logger >::MetadataMap getHardwareMetadata()
-{
-   const int cpu_id = 0;
-   const CacheSizes cacheSizes = SystemInfo::getCPUCacheSizes( cpu_id );
-   String cacheInfo = convertToString( cacheSizes.L1data ) + ", "
-                       + convertToString( cacheSizes.L1instruction ) + ", "
-                       + convertToString( cacheSizes.L2 ) + ", "
-                       + convertToString( cacheSizes.L3 );
-#ifdef HAVE_CUDA
-   const int activeGPU = Cuda::DeviceInfo::getActiveDevice();
-   const String deviceArch = convertToString( Cuda::DeviceInfo::getArchitectureMajor( activeGPU ) ) + "." +
-                             convertToString( Cuda::DeviceInfo::getArchitectureMinor( activeGPU ) );
-#endif
-
-#ifdef HAVE_MPI
-   int nproc = 1;
-   // check if MPI was initialized (some benchmarks do not initialize MPI even when
-   // they are built with HAVE_MPI and thus MPI::GetSize() cannot be used blindly)
-   if( TNL::MPI::Initialized() )
-      nproc = TNL::MPI::GetSize();
-#endif
-
-   typename Benchmark< Logger >::MetadataMap metadata {
-       { "host name", SystemInfo::getHostname() },
-       { "architecture", SystemInfo::getArchitecture() },
-       { "system", SystemInfo::getSystemName() },
-       { "system release", SystemInfo::getSystemRelease() },
-       { "start time", SystemInfo::getCurrentTime() },
-#ifdef HAVE_MPI
-       { "number of MPI processes", convertToString( nproc ) },
-#endif
-       { "OpenMP enabled", convertToString( Devices::Host::isOMPEnabled() ) },
-       { "OpenMP threads", convertToString( Devices::Host::getMaxThreadsCount() ) },
-       { "CPU model name", SystemInfo::getCPUModelName( cpu_id ) },
-       { "CPU cores", convertToString( SystemInfo::getNumberOfCores( cpu_id ) ) },
-       { "CPU threads per core", convertToString( SystemInfo::getNumberOfThreads( cpu_id ) / SystemInfo::getNumberOfCores( cpu_id ) ) },
-       { "CPU max frequency (MHz)", convertToString( SystemInfo::getCPUMaxFrequency( cpu_id ) / 1e3 ) },
-       { "CPU cache sizes (L1d, L1i, L2, L3) (kiB)", cacheInfo },
-#ifdef HAVE_CUDA
-       { "GPU name", Cuda::DeviceInfo::getDeviceName( activeGPU ) },
-       { "GPU architecture", deviceArch },
-       { "GPU CUDA cores", convertToString( Cuda::DeviceInfo::getCudaCores( activeGPU ) ) },
-       { "GPU clock rate (MHz)", convertToString( (double) Cuda::DeviceInfo::getClockRate( activeGPU ) / 1e3 ) },
-       { "GPU global memory (GB)", convertToString( (double) Cuda::DeviceInfo::getGlobalMemory( activeGPU ) / 1e9 ) },
-       { "GPU memory clock rate (MHz)", convertToString( (double) Cuda::DeviceInfo::getMemoryClockRate( activeGPU ) / 1e3 ) },
-       { "GPU memory ECC enabled", convertToString( Cuda::DeviceInfo::getECCEnabled( activeGPU ) ) },
-#endif
-   };
-
-   return metadata;
-}
-
-} // namespace Benchmarks
-} // namespace TNL
-
-#include <Benchmarks/Benchmark.hpp>
diff --git a/src/Benchmarks/CMakeLists.txt b/src/Benchmarks/CMakeLists.txt
index 4e1961b3c1b39fad7cb794fd030cfe33e2f8dd0a..0fc8e0f023d8e4a08ee652cc0474643f8a32e1dc 100644
--- a/src/Benchmarks/CMakeLists.txt
+++ b/src/Benchmarks/CMakeLists.txt
@@ -7,11 +7,3 @@ add_subdirectory( LinearSolvers )
 add_subdirectory( ODESolvers )
 add_subdirectory( Sorting )
 add_subdirectory( Traversers )
-
-set( headers
-         Benchmarks.h
-         FunctionTimer.h
-         Logging.h
-)
-
-install( FILES ${headers} DESTINATION ${TNL_TARGET_INCLUDE_DIRECTORY}/Benchmarks )
diff --git a/src/Benchmarks/DistSpMV/tnl-benchmark-distributed-spmv.h b/src/Benchmarks/DistSpMV/tnl-benchmark-distributed-spmv.h
index b79d80ebf1e5bb9730357a690fe8824e7cc3864b..5329540cbb61ca1e3233f498d918c55284b1558c 100644
--- a/src/Benchmarks/DistSpMV/tnl-benchmark-distributed-spmv.h
+++ b/src/Benchmarks/DistSpMV/tnl-benchmark-distributed-spmv.h
@@ -27,7 +27,7 @@
 #include "Legacy/DistributedMatrix.h"
 #include <TNL/Matrices/SparseOperations.h>
 
-#include "../Benchmarks.h"
+#include <TNL/Benchmarks/Benchmarks.h>
 #include "ordering.h"
 
 #include <TNL/Matrices/SparseMatrix.h>
@@ -157,7 +157,6 @@ struct SpmvBenchmark
 
    static bool
    run( Benchmark<>& benchmark,
-        Benchmark<>::MetadataMap metadata,
         const Config::ParameterContainer& parameters )
    {
       MatrixType matrix;
@@ -169,10 +168,11 @@ struct SpmvBenchmark
       matrix.getCompressedRowLengths( rowLengths );
       const IndexType maxRowLength = max( rowLengths );
 
-      const String name = String( (TNL::MPI::GetSize() > 1) ? "DistSpMV" : "SpMV" )
-                          + " (" + parameters.getParameter< String >( "name" ) + "): ";
-      benchmark.newBenchmark( name, metadata );
+      const String title = (TNL::MPI::GetSize() > 1) ? "DistSpMV" : "SpMV";
+      std::cout << "\n== " << title << " ==\n" << std::endl;
+
       benchmark.setMetadataColumns( Benchmark<>::MetadataColumns({
+         { "matrix name", parameters.getParameter< String >( "name" ) },
          // TODO: strip the device
 //         { "matrix type", matrix.getType() },
          { "rows", convertToString( matrix.getRows() ) },
@@ -190,15 +190,15 @@ struct SpmvBenchmark
          MatrixType matrix_perm;
          Matrices::reorderSparseMatrix( matrix, matrix_perm, perm, iperm );
          if( TNL::MPI::GetSize() > 1 )
-            runDistributed( benchmark, metadata, parameters, matrix_perm, vector );
+            runDistributed( benchmark, parameters, matrix_perm, vector );
          else
-            runNonDistributed( benchmark, metadata, parameters, matrix_perm, vector );
+            runNonDistributed( benchmark, parameters, matrix_perm, vector );
       }
       else {
          if( TNL::MPI::GetSize() > 1 )
-            runDistributed( benchmark, metadata, parameters, matrix, vector );
+            runDistributed( benchmark, parameters, matrix, vector );
          else
-            runNonDistributed( benchmark, metadata, parameters, matrix, vector );
+            runNonDistributed( benchmark, parameters, matrix, vector );
       }
 
       return true;
@@ -206,7 +206,6 @@ struct SpmvBenchmark
 
    static void
    runNonDistributed( Benchmark<>& benchmark,
-                      Benchmark<>::MetadataMap metadata,
                       const Config::ParameterContainer& parameters,
                       MatrixType& matrix,
                       VectorType& vector )
@@ -219,7 +218,6 @@ struct SpmvBenchmark
 
    static void
    runDistributed( Benchmark<>& benchmark,
-                   Benchmark<>::MetadataMap metadata,
                    const Config::ParameterContainer& parameters,
                    MatrixType& matrix,
                    VectorType& vector )
@@ -331,31 +329,24 @@ main( int argc, char* argv[] )
        mode |= std::ios::app;
    std::ofstream logFile;
    if( rank == 0 )
-      logFile.open( logFileName.getString(), mode );
+      logFile.open( logFileName, mode );
 
-   // init benchmark and common metadata
-   Benchmark<> benchmark( loops, verbose );
+   // init benchmark and set parameters
+   Benchmark<> benchmark( logFile, loops, verbose );
 
-   // prepare global metadata
-   Benchmark<>::MetadataMap metadata = getHardwareMetadata< Logging >();
+   // write global metadata into a separate file
+   std::map< std::string, std::string > metadata = getHardwareMetadata();
+   writeMapAsJson( metadata, logFileName, ".metadata.json" );
 
    // TODO: implement resolveMatrixType
 //   return ! Matrices::resolveMatrixType< MainConfig,
 //                                         Devices::Host,
-//                                         SpmvBenchmark >( benchmark, metadata, parameters );
+//                                         SpmvBenchmark >( benchmark, parameters );
    using MatrixType = TNL::Matrices::SparseMatrix< double,
                                                    Devices::Host,
                                                    int,
                                                    TNL::Matrices::GeneralMatrix,
                                                    SegmentsType
                                                  >;
-   const bool status = SpmvBenchmark< MatrixType >::run( benchmark, metadata, parameters );
-
-   if( rank == 0 )
-      if( ! benchmark.save( logFile ) ) {
-         std::cerr << "Failed to write the benchmark results to file '" << parameters.getParameter< String >( "log-file" ) << "'." << std::endl;
-         return EXIT_FAILURE;
-      }
-
-   return ! status;
+   return ! SpmvBenchmark< MatrixType >::run( benchmark, parameters );
 }
diff --git a/src/Benchmarks/FunctionTimer.h b/src/Benchmarks/FunctionTimer.h
deleted file mode 100644
index 1edd6120476f5f50bf3eb714ca3ea1bd8a8ca4aa..0000000000000000000000000000000000000000
--- a/src/Benchmarks/FunctionTimer.h
+++ /dev/null
@@ -1,120 +0,0 @@
-/***************************************************************************
-                          FunctionTimer.h  -  description
-                             -------------------
-    begin                : Dec 25, 2018
-    copyright            : (C) 2018 by Tomas Oberhuber et al.
-    email                : tomas.oberhuber@fjfi.cvut.cz
- ***************************************************************************/
-
-/* See Copyright Notice in tnl/Copyright */
-
-// Implemented by: Jakub Klinkovsky,
-//                 Tomas Oberhuber
-
-#pragma once
-
-#include <type_traits>
-
-#include <TNL/Timer.h>
-#include <TNL/Devices/Cuda.h>
-#include <TNL/Containers/Vector.h>
-#include <TNL/Solvers/IterativeSolverMonitor.h>
-
-namespace TNL {
-namespace Benchmarks {
-
-template< typename Device >
-class FunctionTimer
-{
-public:
-   // returns a pair of (mean, stddev) where mean is the arithmetic mean of the
-   // computation times and stddev is the sample standard deviation
-   template< typename ComputeFunction,
-             typename ResetFunction,
-             typename Monitor = TNL::Solvers::IterativeSolverMonitor< double, int > >
-   std::pair< double, double >
-   timeFunction( ComputeFunction compute,
-                 ResetFunction reset,
-                 int maxLoops,
-                 const double& minTime,
-                 int verbose = 1,
-                 Monitor && monitor = Monitor() )
-   {
-      // the timer is constructed zero-initialized and stopped
-      Timer timer;
-
-      // set timer to the monitor
-      if( verbose > 1 )
-         monitor.setTimer( timer );
-
-      // warm up
-      reset();
-      compute();
-
-      Containers::Vector< double > results( maxLoops );
-      results.setValue( 0.0 );
-
-      for( loops = 0;
-           loops < maxLoops || sum( results ) < minTime;
-           loops++ )
-      {
-         // abuse the monitor's "time" for loops
-         monitor.setTime( loops + 1 );
-         reset();
-
-         // Explicit synchronization of the CUDA device
-#ifdef HAVE_CUDA
-         if( std::is_same< Device, Devices::Cuda >::value )
-            cudaDeviceSynchronize();
-#endif
-
-         // reset timer before each computation
-         timer.reset();
-         timer.start();
-         compute();
-#ifdef HAVE_CUDA
-         if( std::is_same< Device, Devices::Cuda >::value )
-            cudaDeviceSynchronize();
-#endif
-         timer.stop();
-
-         results[ loops ] = timer.getRealTime();
-      }
-
-      const double mean = sum( results ) / (double) loops;
-      if( loops > 1 ) {
-         const double stddev = 1.0 / std::sqrt( loops - 1 ) * l2Norm( results - mean );
-         return std::make_pair( mean, stddev );
-      }
-      else {
-         const double stddev = std::numeric_limits<double>::quiet_NaN();
-         return std::make_pair( mean, stddev );
-      }
-   }
-
-   // returns a pair of (mean, stddev) where mean is the arithmetic mean of the
-   // computation times and stddev is the sample standard deviation
-   template< typename ComputeFunction,
-             typename Monitor = TNL::Solvers::IterativeSolverMonitor< double, int > >
-   std::pair< double, double >
-   timeFunction( ComputeFunction compute,
-                 int maxLoops,
-                 const double& minTime,
-                 int verbose = 1,
-                 Monitor && monitor = Monitor() )
-   {
-      auto noReset = [] () {};
-      return timeFunction( compute, noReset, maxLoops, minTime, verbose, monitor );
-   }
-
-   int getPerformedLoops() const
-   {
-      return this->loops;
-   }
-
-protected:
-   int loops;
-};
-
-} // namespace Benchmarks
-} // namespace TNL
diff --git a/src/Benchmarks/JsonLogging.h b/src/Benchmarks/JsonLogging.h
deleted file mode 100644
index 7d9817c654a36f937063b3f27a18bcf0dfed7cc7..0000000000000000000000000000000000000000
--- a/src/Benchmarks/JsonLogging.h
+++ /dev/null
@@ -1,267 +0,0 @@
-/***************************************************************************
-                          JsonLogging.h  -  description
-                             -------------------
-    begin                : May 11, 2021
-    copyright            : (C) 2021 by Tomas Oberhuber et al.
-    email                : tomas.oberhuber@fjfi.cvut.cz
- ***************************************************************************/
-
-/* See Copyright Notice in tnl/Copyright */
-
-// Implemented by: Jakub Klinkovsky,
-//                 Tomas Oberhuber
-
-#pragma once
-
-#include <map>
-#include <vector>
-#include <iostream>
-#include <iomanip>
-#include <string>
-#include <sstream>
-
-#include <TNL/String.h>
-
-namespace TNL {
-namespace Benchmarks {
-
-class JsonLoggingRowElements
-{
-   public:
-
-      JsonLoggingRowElements()
-      {
-         stream << std::setprecision( 6 ) << std::fixed;
-      }
-
-      template< typename T >
-      JsonLoggingRowElements& operator << ( const T& b )
-      {
-         stream << b;
-         elements.push_back( stream.str() );
-         stream.str( std::string() );
-         return *this;
-      }
-
-      JsonLoggingRowElements& operator << ( decltype( std::setprecision( 2 ) )& setprec )
-      {
-         stream << setprec;
-         return *this;
-      }
-
-      JsonLoggingRowElements& operator << ( decltype( std::fixed )& setfixed ) // the same works also for std::scientific
-      {
-         stream << setfixed;
-         return *this;
-      }
-
-      // iterators
-      auto begin() noexcept { return elements.begin(); }
-
-      auto begin() const noexcept { return elements.begin(); }
-
-      auto cbegin() const noexcept { return elements.cbegin(); }
-
-      auto end() noexcept { return elements.end(); }
-
-      auto end() const noexcept { return elements.end(); }
-
-      auto cend() const noexcept { return elements.cend(); }
-
-      size_t size() const noexcept { return this->elements.size(); };
-   protected:
-      std::list< String > elements;
-
-      std::stringstream stream;
-};
-
-class JsonLogging
-{
-public:
-   using MetadataElement = std::pair< const char*, String >;
-   using MetadataMap = std::map< const char*, String >;
-   using MetadataColumns = std::vector<MetadataElement>;
-
-   using CommonLogs = std::vector< std::pair< const char*, String > >;
-   using LogsMetadata = std::vector< std::pair< String, int > >;
-
-   using HeaderElements = std::vector< std::pair< String, int > >;
-   using RowElements = JsonLoggingRowElements;
-
-   JsonLogging( int verbose = true,
-                String outputMode = "",
-                bool logFileAppend = false )
-   : verbose(verbose), outputMode( outputMode ), logFileAppend( logFileAppend )
-   {}
-
-   void
-   setVerbose( int verbose)
-   {
-      this->verbose = verbose;
-   }
-
-   void addCommonLogs( const CommonLogs& logs )
-   {
-      this->commonLogs = logs;
-      if( verbose )
-      {
-         std::cout << std::endl << "Benchmark setup:" << std::endl;
-         for( auto lg : logs )
-            std::cout << "   " << lg.first << " = " << lg.second << std::endl;
-         std::cout << std::endl;
-      }
-   };
-
-   void resetLogsMetada() { this->logsMetadata.clear(); };
-
-   void addLogsMetadata( const std::vector< std::pair< String, int > >& md )
-   {
-      this->logsMetadata.insert( this->logsMetadata.end(), md.begin(), md.end() );
-   }
-
-   void writeHeader()
-   {
-      if( verbose )
-      {
-         for( auto md : this->logsMetadata )
-            std::cout << std::setw( md.second ) << md.first;
-         std::cout << std::endl;
-      }
-   }
-
-   void writeRow( const RowElements& rowEls )
-   {
-      TNL_ASSERT_EQ( rowEls.size(), this->logsMetadata.size(), "" );
-      if( this->lineStarted )
-         log << "," << std::endl;
-
-      log << "      {" << std::endl;
-
-      // write common logs
-      int idx( 0 );
-      for( auto lg : this->commonLogs )
-      {
-         if( idx++ > 0 )
-            log << "," << std::endl;
-         log << "         \"" << lg.first << "\" : \"" << lg.second << "\"";
-      }
-
-      auto md = this->logsMetadata.begin();
-      for( auto el : rowEls )
-      {
-         if( verbose )
-            std::cout << std::setw( md->second ) << el;
-         if( idx++ > 0 )
-            log << "," << std::endl;
-         log << "         \"" << md++->first << "\" : \"" << el << "\"";
-      }
-      log << std::endl << "      }";
-      this->lineStarted = true;
-      if( verbose )
-         std::cout << std::endl;
-   }
-
-   void
-   writeTitle( const String & title )
-   {
-      if( outputMode == "append" )
-         return;
-
-      if( verbose )
-         std::cout << std::endl << "== " << title << " ==" << std::endl << std::endl;
-   }
-
-   void
-   writeMetadata( const MetadataMap & metadata )
-   {
-      if( outputMode == "append" )
-         return;
-
-      if( verbose )
-         std::cout << "properties:" << std::endl;
-
-      for( auto & it : metadata ) {
-         if( verbose )
-            std::cout << "   " << it.first << " = " << it.second << std::endl;
-      }
-
-      if( verbose )
-         std::cout << std::endl;
-   }
-
-   void
-   writeTableHeader( const String & spanningElement,
-                     const HeaderElements & subElements )
-   {
-   }
-
-   void
-   writeTableRow( const String & spanningElement,
-                  const RowElements & subElements )
-   {
-      writeRow( subElements );
-   }
-
-   void
-   writeErrorMessage( const char* msg,
-                      int colspan = 1 )
-   {
-      log << "\"error\" : \"" << msg << "\"" << std::endl;
-   }
-
-   void
-   closeTable()
-   {
-   }
-
-   bool save( std::ostream & logFile )
-   {
-      if( ! this->logFileAppend )
-      {
-         logFile << "{" << std::endl;
-         logFile << "   \"results\" : [ " << std::endl;
-      }
-      else
-         logFile << log.str();
-      if( logFile.good() ) {
-         log.str() = "";
-         return true;
-      }
-      return false;
-   }
-
-protected:
-   // manual double -> String conversion with fixed precision
-   static String
-   _to_string( double num, int precision = 0, bool fixed = false )
-   {
-      std::stringstream str;
-      if( fixed )
-         str << std::fixed;
-      if( precision )
-         str << std::setprecision( precision );
-      str << num;
-      return String( str.str().data() );
-   }
-
-   std::stringstream log;
-   std::string header_indent;
-   std::string body_indent;
-
-   int verbose;
-   MetadataColumns metadataColumns;
-   bool header_changed = true;
-   std::vector< std::pair< String, int > > horizontalGroups;
-
-   // new JSON implementation
-   LogsMetadata logsMetadata;
-   CommonLogs commonLogs;
-   String outputMode;
-
-   bool lineStarted = false;
-   bool resultsStarted = false;
-   bool logFileAppend = false;
-};
-
-} // namespace Benchmarks
-} // namespace TNL
diff --git a/src/Benchmarks/LinearSolvers/benchmarks.h b/src/Benchmarks/LinearSolvers/benchmarks.h
index 59d2ab3de327ced0beb8a986f44e938af4b4e5d0..33395b04d45a9143d374a451a8632ed4427f6f40 100644
--- a/src/Benchmarks/LinearSolvers/benchmarks.h
+++ b/src/Benchmarks/LinearSolvers/benchmarks.h
@@ -5,7 +5,7 @@
 #include <TNL/Solvers/IterativeSolverMonitor.h>
 #include <TNL/Matrices/DistributedMatrix.h>
 
-#include "../Benchmarks.h"
+#include <TNL/Benchmarks/Benchmarks.h>
 
 #ifdef HAVE_ARMADILLO
 #include <armadillo>
@@ -126,7 +126,7 @@ benchmarkSolver( Benchmark<>& benchmark,
 
    // subclass BenchmarkResult to add extra columns to the benchmark
    // (iterations, preconditioned residue, true residue)
-   struct MyBenchmarkResult : public BenchmarkResult<>
+   struct MyBenchmarkResult : public BenchmarkResult
    {
       using HeaderElements = BenchmarkResult::HeaderElements;
       using RowElements = BenchmarkResult::RowElements;
@@ -145,15 +145,7 @@ benchmarkSolver( Benchmark<>& benchmark,
 
       virtual HeaderElements getTableHeader() const override
       {
-         return HeaderElements( {
-            std::pair< String, int >( "time", 8 ),
-            std::pair< String, int >( "stddev", 8 ),
-            std::pair< String, int >( "stddev/time", 8 ),
-            std::pair< String, int >( "speedup", 8 ),
-            std::pair< String, int >( "converged", 8 ),
-            std::pair< String, int >( "iterations", 8 ),
-            std::pair< String, int >( "residue_precond", 8 ),
-            std::pair< String, int >( "residue_true", 8 ) } );
+         return HeaderElements({ "time", "stddev", "stddev/time", "speedup", "converged", "iterations", "residue_precond", "residue_true" });
       }
 
       virtual RowElements getRowElements() const override
diff --git a/src/Benchmarks/LinearSolvers/tnl-benchmark-linear-solvers.h b/src/Benchmarks/LinearSolvers/tnl-benchmark-linear-solvers.h
index 0c16513203dff54edf82564434b58fa07d2493f9..acb02a434c3e409c3a7198dab90c20c5e6e8e46c 100644
--- a/src/Benchmarks/LinearSolvers/tnl-benchmark-linear-solvers.h
+++ b/src/Benchmarks/LinearSolvers/tnl-benchmark-linear-solvers.h
@@ -40,7 +40,7 @@
 #include <TNL/Solvers/Linear/BICGStabL.h>
 #include <TNL/Solvers/Linear/UmfpackWrapper.h>
 
-#include "../Benchmarks.h"
+#include <TNL/Benchmarks/Benchmarks.h>
 #include "../DistSpMV/ordering.h"
 #include "benchmarks.h"
 
@@ -338,7 +338,6 @@ struct LinearSolversBenchmark
 
    static bool
    run( Benchmark<>& benchmark,
-        Benchmark<>::MetadataMap metadata,
         const Config::ParameterContainer& parameters )
    {
       const String file_matrix = parameters.getParameter< String >( "input-matrix" );
@@ -381,10 +380,11 @@ struct LinearSolversBenchmark
       matrixPointer->getCompressedRowLengths( rowLengths );
       const IndexType maxRowLength = max( rowLengths );
 
-      const String name = String( (TNL::MPI::GetSize() > 1) ? "Distributed linear solvers" : "Linear solvers" )
-                          + " (" + parameters.getParameter< String >( "name" ) + "): ";
-      benchmark.newBenchmark( name, metadata );
+      const String title = (TNL::MPI::GetSize() > 1) ? "Distributed linear solvers" : "Linear solvers";
+      std::cout << "\n== " << title << " ==\n" << std::endl;
+
       benchmark.setMetadataColumns( Benchmark<>::MetadataColumns({
+         { "matrix name", parameters.getParameter< String >( "name" ) },
          // TODO: strip the device
 //         { "matrix type", matrixPointer->getType() },
          { "rows", convertToString( matrixPointer->getRows() ) },
@@ -407,15 +407,15 @@ struct LinearSolversBenchmark
          Matrices::reorderArray( x0, x0_perm, perm );
          Matrices::reorderArray( b, b_perm, perm );
          if( TNL::MPI::GetSize() > 1 )
-            runDistributed( benchmark, metadata, parameters, matrix_perm, x0_perm, b_perm );
+            runDistributed( benchmark, parameters, matrix_perm, x0_perm, b_perm );
          else
-            runNonDistributed( benchmark, metadata, parameters, matrix_perm, x0_perm, b_perm );
+            runNonDistributed( benchmark, parameters, matrix_perm, x0_perm, b_perm );
       }
       else {
          if( TNL::MPI::GetSize() > 1 )
-            runDistributed( benchmark, metadata, parameters, matrixPointer, x0, b );
+            runDistributed( benchmark, parameters, matrixPointer, x0, b );
          else
-            runNonDistributed( benchmark, metadata, parameters, matrixPointer, x0, b );
+            runNonDistributed( benchmark, parameters, matrixPointer, x0, b );
       }
 
       return true;
@@ -423,7 +423,6 @@ struct LinearSolversBenchmark
 
    static void
    runDistributed( Benchmark<>& benchmark,
-                   Benchmark<>::MetadataMap metadata,
                    const Config::ParameterContainer& parameters,
                    const SharedPointer< MatrixType >& matrixPointer,
                    const VectorType& x0,
@@ -467,7 +466,6 @@ struct LinearSolversBenchmark
 
    static void
    runNonDistributed( Benchmark<>& benchmark,
-                      Benchmark<>::MetadataMap metadata,
                       const Config::ParameterContainer& parameters,
                       const SharedPointer< MatrixType >& matrixPointer,
                       const VectorType& x0,
@@ -611,31 +609,24 @@ main( int argc, char* argv[] )
        mode |= std::ios::app;
    std::ofstream logFile;
    if( rank == 0 )
-      logFile.open( logFileName.getString(), mode );
+      logFile.open( logFileName, mode );
 
-   // init benchmark and common metadata
-   Benchmark<> benchmark( loops, verbose );
+   // init benchmark and set parameters
+   Benchmark<> benchmark( logFile, loops, verbose );
 
-   // prepare global metadata
-   Benchmark<>::MetadataMap metadata = getHardwareMetadata< Logging >();
+   // write global metadata into a separate file
+   std::map< std::string, std::string > metadata = getHardwareMetadata();
+   writeMapAsJson( metadata, logFileName, ".metadata.json" );
 
    // TODO: implement resolveMatrixType
 //   return ! Matrices::resolveMatrixType< MainConfig,
 //                                         Devices::Host,
-//                                         LinearSolversBenchmark >( benchmark, metadata, parameters );
+//                                         LinearSolversBenchmark >( benchmark, parameters );
    using MatrixType = TNL::Matrices::SparseMatrix< double,
                                                    Devices::Host,
                                                    int,
                                                    TNL::Matrices::GeneralMatrix,
                                                    SegmentsType
                                                  >;
-   const bool status = LinearSolversBenchmark< MatrixType >::run( benchmark, metadata, parameters );
-
-   if( rank == 0 )
-      if( ! benchmark.save( logFile ) ) {
-         std::cerr << "Failed to write the benchmark results to file '" << parameters.getParameter< String >( "log-file" ) << "'." << std::endl;
-         return EXIT_FAILURE;
-      }
-
-   return ! status;
+   return ! LinearSolversBenchmark< MatrixType >::run( benchmark, parameters );
 }
diff --git a/src/Benchmarks/Logging.h b/src/Benchmarks/Logging.h
deleted file mode 100644
index 2c8262d21018473c2013bec8d8f1f13f9f9d4e77..0000000000000000000000000000000000000000
--- a/src/Benchmarks/Logging.h
+++ /dev/null
@@ -1,304 +0,0 @@
-/***************************************************************************
-                          Logging.h  -  description
-                             -------------------
-    begin                : Dec 25, 2018
-    copyright            : (C) 2018 by Tomas Oberhuber et al.
-    email                : tomas.oberhuber@fjfi.cvut.cz
- ***************************************************************************/
-
-/* See Copyright Notice in tnl/Copyright */
-
-// Implemented by: Jakub Klinkovsky,
-//                 Tomas Oberhuber
-
-#pragma once
-
-#include <map>
-#include <vector>
-#include <iostream>
-#include <iomanip>
-#include <string>
-#include <sstream>
-
-#include <TNL/String.h>
-
-namespace TNL {
-namespace Benchmarks {
-
-class LoggingRowElements
-{
-   public:
-
-      LoggingRowElements()
-      {
-         stream << std::setprecision( 6 ) << std::fixed;
-      }
-
-      template< typename T >
-      LoggingRowElements& operator << ( const T& b )
-      {
-         stream << b;
-         elements.push_back( stream.str() );
-         stream.str( std::string() );
-         return *this;
-      }
-
-      LoggingRowElements& operator << ( decltype( std::setprecision( 2 ) )& setprec )
-      {
-         stream << setprec;
-         return *this;
-      }
-
-      LoggingRowElements& operator << ( decltype( std::fixed )& setfixed ) // the same works also for std::scientific
-      {
-         stream << setfixed;
-         return *this;
-      }
-
-      // iterators
-      auto begin() noexcept { return elements.begin(); }
-
-      auto begin() const noexcept { return elements.begin(); }
-
-      auto cbegin() const noexcept { return elements.cbegin(); }
-
-      auto end() noexcept { return elements.end(); }
-
-      auto end() const noexcept { return elements.end(); }
-
-      auto cend() const noexcept { return elements.cend(); }
-
-   protected:
-      std::list< String > elements;
-
-      std::stringstream stream;
-};
-
-class Logging
-{
-public:
-   using MetadataElement = std::pair< const char*, String >;
-   using MetadataMap = std::map< const char*, String >;
-   using MetadataColumns = std::vector<MetadataElement>;
-
-   using CommonLogs = std::vector< std::pair< const char*, String > >;
-
-   using HeaderElements = std::vector< std::pair< String, int > >;
-   using RowElements = LoggingRowElements;
-
-   Logging( int verbose = true,
-            String outputMode = "",
-            bool logFileAppend = false )
-   : verbose(verbose), outputMode( outputMode )
-   {}
-
-   void
-   setVerbose( int verbose)
-   {
-      this->verbose = verbose;
-   }
-
-   void
-   writeTitle( const String & title )
-   {
-      if( verbose )
-         std::cout << std::endl << "== " << title << " ==" << std::endl << std::endl;
-      log << ": title = " << title << std::endl;
-   }
-
-   void addCommonLogs( const CommonLogs& logs )
-   {
-      for( auto log : logs )
-      {
-         if( verbose )
-            std::cout << log.first << " = " << log.second << std::endl;
-      }
-   };
-
-   void addLogsMetadata( const std::vector< String >& md ){};
-
-   void writeHeader(){};
-
-   void
-   writeMetadata( const MetadataMap & metadata )
-   {
-      if( verbose )
-         std::cout << "properties:" << std::endl;
-
-      for( auto & it : metadata ) {
-         if( verbose )
-            std::cout << "   " << it.first << " = " << it.second << std::endl;
-         log << ": " << it.first << " = " << it.second << std::endl;
-      }
-      if( verbose )
-         std::cout << std::endl;
-   }
-
-   void
-   writeTableHeader( const String & spanningElement,
-                     const HeaderElements & subElements )
-   {
-      if( verbose && header_changed ) {
-         for( auto & it : metadataColumns ) {
-            std::cout << std::setw( 20 ) << it.first;
-         }
-
-         // spanning element is printed as usual column to stdout,
-         // but is excluded from header
-         std::cout << std::setw( 15 ) << "";
-
-         for( auto & it : subElements ) {
-            std::cout << std::setw( 15 ) << it.first;
-         }
-         std::cout << std::endl;
-
-         header_changed = false;
-      }
-
-      // initial indent string
-      header_indent = "!";
-      log << std::endl;
-      for( auto & it : metadataColumns ) {
-         log << header_indent << " " << it.first << std::endl;
-      }
-
-      // dump stacked spanning columns
-      if( horizontalGroups.size() > 0 )
-         while( horizontalGroups.back().second <= 0 ) {
-            horizontalGroups.pop_back();
-            header_indent.pop_back();
-         }
-      for( size_t i = 0; i < horizontalGroups.size(); i++ ) {
-         if( horizontalGroups[ i ].second > 0 ) {
-            log << header_indent << " " << horizontalGroups[ i ].first << std::endl;
-            header_indent += "!";
-         }
-      }
-
-      log << header_indent << " " << spanningElement << std::endl;
-      for( auto & it : subElements ) {
-         log << header_indent << "! " << it.first << std::endl;
-      }
-
-      if( horizontalGroups.size() > 0 ) {
-         horizontalGroups.back().second--;
-         header_indent.pop_back();
-      }
-   }
-
-   void
-   writeTableRow( const String & spanningElement,
-                  const RowElements & subElements )
-   {
-      if( verbose ) {
-         for( auto & it : metadataColumns ) {
-            std::cout << std::setw( 20 ) << it.second;
-         }
-         // spanning element is printed as usual column to stdout
-         std::cout << std::setw( 15 ) << spanningElement;
-         for( auto & it : subElements ) {
-            std::cout << std::setw( 15 ) << it;
-         }
-         std::cout << std::endl;
-      }
-
-      // only when changed (the header has been already adjusted)
-      // print each element on separate line
-      for( auto & it : metadataColumns ) {
-         log << it.second << std::endl;
-      }
-
-      // benchmark data are indented
-      const String indent = "    ";
-      for( auto & it : subElements ) {
-         log << indent << it << std::endl;
-      }
-   }
-
-   void
-   writeErrorMessage( const char* msg,
-                      int colspan = 1 )
-   {
-      // initial indent string
-      header_indent = "!";
-      log << std::endl;
-      for( auto & it : metadataColumns ) {
-         log << header_indent << " " << it.first << std::endl;
-      }
-
-      // make sure there is a header column for the message
-      if( horizontalGroups.size() == 0 )
-         horizontalGroups.push_back( {"", 1} );
-
-      // dump stacked spanning columns
-      while( horizontalGroups.back().second <= 0 ) {
-         horizontalGroups.pop_back();
-         header_indent.pop_back();
-      }
-      for( size_t i = 0; i < horizontalGroups.size(); i++ ) {
-         if( horizontalGroups[ i ].second > 0 ) {
-            log << header_indent << " " << horizontalGroups[ i ].first << std::endl;
-            header_indent += "!";
-         }
-      }
-      if( horizontalGroups.size() > 0 ) {
-         horizontalGroups.back().second -= colspan;
-         header_indent.pop_back();
-      }
-
-      // only when changed (the header has been already adjusted)
-      // print each element on separate line
-      for( auto & it : metadataColumns ) {
-         log << it.second << std::endl;
-      }
-      log << msg << std::endl;
-   }
-
-   void
-   closeTable()
-   {
-      log << std::endl;
-      header_indent = body_indent = "";
-      header_changed = true;
-      horizontalGroups.clear();
-   }
-
-   bool save( std::ostream & logFile )
-   {
-      closeTable();
-      logFile << log.str();
-      if( logFile.good() ) {
-         log.str() = "";
-         return true;
-      }
-      return false;
-   }
-
-protected:
-   // manual double -> String conversion with fixed precision
-   static String
-   _to_string( double num, int precision = 0, bool fixed = false )
-   {
-      std::stringstream str;
-      if( fixed )
-         str << std::fixed;
-      if( precision )
-         str << std::setprecision( precision );
-      str << num;
-      return String( str.str().data() );
-   }
-
-   std::stringstream log;
-   std::string header_indent;
-   std::string body_indent;
-
-   int verbose;
-   MetadataColumns metadataColumns;
-   bool header_changed = true;
-   std::vector< std::pair< String, int > > horizontalGroups;
-
-   String outputMode;
-};
-
-} // namespace Benchmarks
-} // namespace TNL
diff --git a/src/Benchmarks/NDArray/tnl-benchmark-ndarray-boundary.h b/src/Benchmarks/NDArray/tnl-benchmark-ndarray-boundary.h
index f7a485aa13f955ea22b97c830b0f43f37f10f529..6e0e807e2dc264b537c79ec316a1a933bdc92dae 100644
--- a/src/Benchmarks/NDArray/tnl-benchmark-ndarray-boundary.h
+++ b/src/Benchmarks/NDArray/tnl-benchmark-ndarray-boundary.h
@@ -18,7 +18,7 @@
 
 #include <TNL/Containers/NDArray.h>
 
-#include "../Benchmarks.h"
+#include <TNL/Benchmarks/Benchmarks.h>
 
 using namespace TNL;
 using namespace TNL::Benchmarks;
@@ -440,13 +440,14 @@ int main( int argc, char* argv[] )
    auto mode = std::ios::out;
    if( outputMode == "append" )
        mode |= std::ios::app;
-   std::ofstream logFile( logFileName.getString(), mode );
+   std::ofstream logFile( logFileName, mode );
 
-   // init benchmark and common metadata
-   Benchmark<> benchmark( loops, verbose );
+   // init benchmark and set parameters
+   Benchmark<> benchmark( logFile, loops, verbose );
 
-   // prepare global metadata
-   Benchmark<>::MetadataMap metadata = getHardwareMetadata< Logging >();
+   // write global metadata into a separate file
+   std::map< std::string, std::string > metadata = getHardwareMetadata();
+   writeMapAsJson( metadata, logFileName, ".metadata.json" );
 
    const String devices = parameters.getParameter< String >( "devices" );
    if( devices == "all" || devices == "host" )
@@ -456,10 +457,5 @@ int main( int argc, char* argv[] )
       run_benchmarks< Devices::Cuda >( benchmark );
 #endif
 
-   if( ! benchmark.save( logFile ) ) {
-      std::cerr << "Failed to write the benchmark results to file '" << parameters.getParameter< String >( "log-file" ) << "'." << std::endl;
-      return EXIT_FAILURE;
-   }
-
    return EXIT_SUCCESS;
 }
diff --git a/src/Benchmarks/NDArray/tnl-benchmark-ndarray.h b/src/Benchmarks/NDArray/tnl-benchmark-ndarray.h
index 8d4ac8e7ace879496e9c18cc25469e83efde3a1b..f0d2e1a9e0c4e4135a127a95908dc94d9905001d 100644
--- a/src/Benchmarks/NDArray/tnl-benchmark-ndarray.h
+++ b/src/Benchmarks/NDArray/tnl-benchmark-ndarray.h
@@ -20,7 +20,7 @@
 #include <TNL/Containers/NDArray.h>
 #include <TNL/Containers/ndarray/Operations.h>
 
-#include "../Benchmarks.h"
+#include <TNL/Benchmarks/Benchmarks.h>
 
 using namespace TNL;
 using namespace TNL::Benchmarks;
@@ -428,13 +428,14 @@ int main( int argc, char* argv[] )
    auto mode = std::ios::out;
    if( outputMode == "append" )
        mode |= std::ios::app;
-   std::ofstream logFile( logFileName.getString(), mode );
+   std::ofstream logFile( logFileName, mode );
 
-   // init benchmark and common metadata
-   Benchmark<> benchmark( loops, verbose );
+   // init benchmark and set parameters
+   Benchmark<> benchmark( logFile, loops, verbose );
 
-   // prepare global metadata
-   Benchmark<>::MetadataMap metadata = getHardwareMetadata< Logging >();
+   // write global metadata into a separate file
+   std::map< std::string, std::string > metadata = getHardwareMetadata();
+   writeMapAsJson( metadata, logFileName, ".metadata.json" );
 
    const String devices = parameters.getParameter< String >( "devices" );
    if( devices == "all" || devices == "host" )
@@ -444,10 +445,5 @@ int main( int argc, char* argv[] )
       run_benchmarks< Devices::Cuda >( benchmark );
 #endif
 
-   if( ! benchmark.save( logFile ) ) {
-      std::cerr << "Failed to write the benchmark results to file '" << parameters.getParameter< String >( "log-file" ) << "'." << std::endl;
-      return EXIT_FAILURE;
-   }
-
    return EXIT_SUCCESS;
 }
diff --git a/src/Benchmarks/ODESolvers/benchmarks.h b/src/Benchmarks/ODESolvers/benchmarks.h
index f27d6962e33cb0cf5cd19373d734a65683f5e1c2..c901b669456712a394a86ccd619c932109f0980c 100644
--- a/src/Benchmarks/ODESolvers/benchmarks.h
+++ b/src/Benchmarks/ODESolvers/benchmarks.h
@@ -15,7 +15,7 @@
 #include <TNL/Pointers/SharedPointer.h>
 #include <TNL/Config/ParameterContainer.h>
 
-#include "../Benchmarks.h"
+#include <TNL/Benchmarks/Benchmarks.h>
 
 #include <stdexcept>  // std::runtime_error
 
diff --git a/src/Benchmarks/ODESolvers/tnl-benchmark-ode-solvers.h b/src/Benchmarks/ODESolvers/tnl-benchmark-ode-solvers.h
index afdf33d3a4ade89f9444ddbc0e7b45542278ece3..01b112c2cdfdf9ee27041fd750b88a0f6b233246 100644
--- a/src/Benchmarks/ODESolvers/tnl-benchmark-ode-solvers.h
+++ b/src/Benchmarks/ODESolvers/tnl-benchmark-ode-solvers.h
@@ -28,7 +28,7 @@
 #include <TNL/Solvers/ODE/Euler.h>
 #include <TNL/Solvers/ODE/Merson.h>
 
-#include "../Benchmarks.h"
+#include <TNL/Benchmarks/Benchmarks.h>
 #include "benchmarks.h"
 #include "SimpleProblem.h"
 #include "Euler.h"
@@ -108,67 +108,41 @@ struct ODESolversBenchmark
 
    static bool
    run( Benchmark<>& benchmark,
-        Benchmark<>::MetadataMap metadata,
         const Config::ParameterContainer& parameters )
    {
-      const String name = String( (TNL::MPI::GetSize() > 1) ? "Distributed ODE solvers" : "ODE solvers" );
-                          //+ " (" + parameters.getParameter< String >( "name" ) + "): ";
-      benchmark.newBenchmark( name, metadata );
+      const String title = (TNL::MPI::GetSize() > 1) ? "Distributed ODE solvers" : "ODE solvers";
+      std::cout << "\n== " << title << " ==\n" << std::endl;
+
       for( size_t dofs = 25; dofs <= 10000000; dofs *= 2 ) {
          benchmark.setMetadataColumns( Benchmark<>::MetadataColumns({
-            // TODO: strip the device
+            { "precision", getType< Real >() },
             { "DOFs", convertToString( dofs ) },
          } ));
 
-         if( TNL::MPI::GetSize() > 1 )
-            runDistributed( benchmark, metadata, parameters, dofs );
-         else
-            runNonDistributed( benchmark, metadata, parameters, dofs );
+         benchmarkODESolvers< Real, Index >( benchmark, parameters, dofs );
       }
       return true;
    }
-
-   static void
-   runDistributed( Benchmark<>& benchmark,
-                   Benchmark<>::MetadataMap metadata,
-                   const Config::ParameterContainer& parameters,
-                   size_t dofs )
-   {
-      std::cout << "Iterative solvers:" << std::endl;
-      benchmarkODESolvers< Real, Index >( benchmark, parameters, dofs );
-   }
-
-   static void
-   runNonDistributed( Benchmark<>& benchmark,
-                      Benchmark<>::MetadataMap metadata,
-                      const Config::ParameterContainer& parameters,
-                      size_t dofs )
-   {
-      std::cout << "Iterative solvers:" << std::endl;
-      benchmarkODESolvers< Real, Index >( benchmark, parameters, dofs );
-   }
 };
 
 template< typename Real >
 bool resolveIndexType( Benchmark<>& benchmark,
-   Benchmark<>::MetadataMap& metadata,
-   Config::ParameterContainer& parameters )
+                       Config::ParameterContainer& parameters )
 {
    const String& index = parameters.getParameter< String >( "index-type" );
-   if( index == "int" ) return ODESolversBenchmark< Real, int >::run( benchmark, metadata, parameters );
-   return ODESolversBenchmark< Real, long int >::run( benchmark, metadata, parameters );
+   if( index == "int" ) return ODESolversBenchmark< Real, int >::run( benchmark, parameters );
+   return ODESolversBenchmark< Real, long int >::run( benchmark, parameters );
 }
 
 bool resolveRealTypes( Benchmark<>& benchmark,
-   Benchmark<>::MetadataMap& metadata,
-   Config::ParameterContainer& parameters )
+                       Config::ParameterContainer& parameters )
 {
    const String& realType = parameters.getParameter< String >( "real-type" );
    if( ( realType == "float" || realType == "all" ) &&
-       ! resolveIndexType< float >( benchmark, metadata, parameters ) )
+       ! resolveIndexType< float >( benchmark, parameters ) )
       return false;
    if( ( realType == "double" || realType == "all" ) &&
-       ! resolveIndexType< double >( benchmark, metadata, parameters ) )
+       ! resolveIndexType< double >( benchmark, parameters ) )
       return false;
    return true;
 }
@@ -177,7 +151,7 @@ void
 configSetup( Config::ConfigDescription& config )
 {
    config.addDelimiter( "Benchmark settings:" );
-   config.addEntry< String >( "log-file", "Log file name.", "tnl-benchmark-linear-solvers.log");
+   config.addEntry< String >( "log-file", "Log file name.", "tnl-benchmark-ode-solvers.log");
    config.addEntry< String >( "output-mode", "Mode for opening the log file.", "overwrite" );
    config.addEntryEnum( "append" );
    config.addEntryEnum( "overwrite" );
@@ -242,21 +216,14 @@ main( int argc, char* argv[] )
        mode |= std::ios::app;
    std::ofstream logFile;
    if( rank == 0 )
-      logFile.open( logFileName.getString(), mode );
-
-   // init benchmark and common metadata
-   Benchmark<> benchmark( loops, verbose );
-
-   // prepare global metadata
-   Benchmark<>::MetadataMap metadata = getHardwareMetadata< Logging >();
+      logFile.open( logFileName, mode );
 
-   const bool status = resolveRealTypes( benchmark, metadata, parameters );
+   // init benchmark and set parameters
+   Benchmark<> benchmark( logFile, loops, verbose );
 
-   if( rank == 0 )
-      if( ! benchmark.save( logFile ) ) {
-         std::cerr << "Failed to write the benchmark results to file '" << parameters.getParameter< String >( "log-file" ) << "'." << std::endl;
-         return EXIT_FAILURE;
-      }
+   // write global metadata into a separate file
+   std::map< std::string, std::string > metadata = getHardwareMetadata();
+   writeMapAsJson( metadata, logFileName, ".metadata.json" );
 
-   return ! status;
+   return ! resolveRealTypes( benchmark, parameters );
 }
diff --git a/src/Benchmarks/SpMV/CMakeLists.txt b/src/Benchmarks/SpMV/CMakeLists.txt
index 93dccab0dc793ea2f3218eb6ba0cb9f326450425..4998088530017c8d4673d5cc97a05da4248f86ff 100644
--- a/src/Benchmarks/SpMV/CMakeLists.txt
+++ b/src/Benchmarks/SpMV/CMakeLists.txt
@@ -9,15 +9,17 @@
 #include( cmake/BuildCSR5.cmake )
 
 if( BUILD_CUDA )
-    cuda_include_directories( ${CXX_BENCHMARKS_INCLUDE_DIRS} )
-    message( STATUS ${CXX_BENCHMARKS_FLAGS} )
-    CUDA_ADD_EXECUTABLE( tnl-benchmark-spmv tnl-benchmark-spmv.cu OPTIONS ${CXX_BENCHMARKS_FLAGS} ${PETSC_CXX_FLAGS} )
-    TARGET_LINK_LIBRARIES( tnl-benchmark-spmv ${CUDA_cusparse_LIBRARY} ${CUDA_cudadevrt_LIBRARY} ${PETSC_LINKER_FLAGS})
+   file( GLOB EXPLICIT_TEMPLATES spmv.templates/*.cu )
+   cuda_include_directories( ${CXX_BENCHMARKS_INCLUDE_DIRS} )
+   cuda_add_executable( tnl-benchmark-spmv tnl-benchmark-spmv.cu ${EXPLICIT_TEMPLATES} ReferenceFormats/LightSpMV-1.0/SpMV.cu ReferenceFormats/LightSpMV-1.0/SpMVCSR.cu
+                        OPTIONS ${CXX_BENCHMARKS_FLAGS} ${PETSC_CXX_FLAGS} )
+   target_link_libraries( tnl-benchmark-spmv ${CUDA_cusparse_LIBRARY} ${CUDA_cudadevrt_LIBRARY} ${PETSC_LINKER_FLAGS})
 else()
-    ADD_EXECUTABLE( tnl-benchmark-spmv tnl-benchmark-spmv.cpp )
-    target_compile_options( tnl-benchmark-spmv  PRIVATE ${CXX_BENCHMARKS_FLAGS} ${PETSC_CXX_FLAGS} )
-    target_include_directories( tnl-benchmark-spmv PRIVATE ${CXX_BENCHMARKS_INCLUDE_DIRS} )
-    TARGET_LINK_LIBRARIES( tnl-benchmark-spmv ${PETSC_LINKER_FLAGS} )
+   file( GLOB EXPLICIT_TEMPLATES spmv.templates/*.cpp )
+   add_executable( tnl-benchmark-spmv tnl-benchmark-spmv.cpp ${EXPLICIT_TEMPLATES} )
+   target_compile_options( tnl-benchmark-spmv  PRIVATE ${CXX_BENCHMARKS_FLAGS} ${PETSC_CXX_FLAGS} )
+   target_include_directories( tnl-benchmark-spmv PRIVATE ${CXX_BENCHMARKS_INCLUDE_DIRS} )
+   target_link_libraries( tnl-benchmark-spmv ${PETSC_LINKER_FLAGS} )
 endif()
 
 install( TARGETS tnl-benchmark-spmv RUNTIME DESTINATION bin )
diff --git a/src/Benchmarks/SpMV/ReferenceFormats/LightSpMVBenchmark.h b/src/Benchmarks/SpMV/ReferenceFormats/LightSpMVBenchmark.h
index 7d6ffde49960a1e53b563222af8ab4861accba0f..1d6e343201e8897a62532a7cd502b8f623ef7476 100644
--- a/src/Benchmarks/SpMV/ReferenceFormats/LightSpMVBenchmark.h
+++ b/src/Benchmarks/SpMV/ReferenceFormats/LightSpMVBenchmark.h
@@ -17,8 +17,8 @@
 #pragma push
 #pragma diag_suppress = 1444
 #include "LightSpMV-1.0/SpMV.h"
-#include "LightSpMV-1.0/SpMV.cu"
-#include "LightSpMV-1.0/SpMVCSR.cu"
+//#include "LightSpMV-1.0/SpMV.cu"
+//#include "LightSpMV-1.0/SpMVCSR.cu"
 #pragma pop
 #endif
 #include <TNL/Matrices/SparseMatrix.h>
diff --git a/src/Benchmarks/SpMV/SpmvBenchmarkResult.h b/src/Benchmarks/SpMV/SpmvBenchmarkResult.h
index 61fae4f609601f6a35a08160c9132f471cd4d72c..128b0033485990229cc90502492a3c71cef52daf 100644
--- a/src/Benchmarks/SpMV/SpmvBenchmarkResult.h
+++ b/src/Benchmarks/SpMV/SpmvBenchmarkResult.h
@@ -10,7 +10,7 @@
 
 #pragma once
 
-#include "../Benchmarks.h"
+#include <TNL/Benchmarks/Benchmarks.h>
 
 namespace TNL {
 namespace Benchmarks {
@@ -21,7 +21,7 @@ template< typename Real,
           typename ResultReal = Real,
           typename Logger = JsonLogging >
 struct SpmvBenchmarkResult
-: public BenchmarkResult< Logger >
+: public BenchmarkResult
 {
    using RealType = Real;
    using DeviceType = Device;
@@ -29,36 +29,28 @@ struct SpmvBenchmarkResult
    using HostVector = Containers::Vector< Real, Devices::Host, Index >;
    using BenchmarkVector = Containers::Vector< ResultReal, Device, Index >;
 
-   using typename BenchmarkResult< Logger >::HeaderElements;
-   using typename BenchmarkResult< Logger >::RowElements;
-   using BenchmarkResult< Logger >::stddev;
-   using BenchmarkResult< Logger >::bandwidth;
-   using BenchmarkResult< Logger >::speedup;
-   using BenchmarkResult< Logger >::time;
+   using typename BenchmarkResult::HeaderElements;
+   using typename BenchmarkResult::RowElements;
+   using BenchmarkResult::stddev;
+   using BenchmarkResult::bandwidth;
+   using BenchmarkResult::speedup;
+   using BenchmarkResult::time;
 
 
-   SpmvBenchmarkResult( const String& format,
-                        const HostVector& csrResult,
-                        const BenchmarkVector& benchmarkResult,
-                        const IndexType nonzeros )
-   : format( format ), csrResult( csrResult ), benchmarkResult( benchmarkResult ), nonzeros( nonzeros ){};
+   // Binds the reference members to the given vectors (no copy is made), so both must outlive this object.
+   SpmvBenchmarkResult( const HostVector& csrReference, const BenchmarkVector& computedResult )
+   : csrResult( csrReference ), benchmarkResult( computedResult )
+   {}
 
    virtual HeaderElements getTableHeader() const override
    {
-      return HeaderElements( {
-         std::pair< String, int >( "format", 35 ),
-         std::pair< String, int >( "device", 12 ),
-         std::pair< String, int >( "non-zeros", 12 ),
-         std::pair< String, int >( "time", 12 ),
-         std::pair< String, int >( "stddev", 12 ),
-         std::pair< String, int >( "stddev/time", 14 ),
-         std::pair< String, int >( "bandwidth", 12 ),
-         std::pair< String, int >( "speedup", 12 ),
-         std::pair< String, int >( "CSR Diff.Max", 14 ),
-         std::pair< String, int >( "CSR Diff.L2", 14 ) } );
+      return HeaderElements({ "time", "stddev", "stddev/time", "loops", "bandwidth", "speedup", "CSR Diff.Max", "CSR Diff.L2" });
    }
 
-   void setFormat( const String& format ) { this->format = format; };
+   // Width hints for the table columns: 8 entries, the same number as the
+   // column names returned by getTableHeader().
+   virtual std::vector< int > getColumnWidthHints() const override
+   { return { 14, 14, 14, 6, 14, 14, 14, 14 }; }
 
    virtual RowElements getRowElements() const override
    {
@@ -66,20 +58,18 @@ struct SpmvBenchmarkResult
       benchmarkResultCopy = benchmarkResult;
       auto diff = csrResult - benchmarkResultCopy;
       RowElements elements;
-      elements << format
-               << ( std::is_same< Device, Devices::Host >::value ? "CPU" : "GPU" )
-               << nonzeros << time << stddev << stddev/time << bandwidth;
+      // write in scientific format to avoid precision loss
+      elements << std::scientific << time << stddev << stddev/time << loops << bandwidth;
       if( speedup != 0.0 )
          elements << speedup;
-      else elements << "N/A";
+      else
+         elements << "N/A";
       elements << max( abs( diff ) ) << lpNorm( diff, 2.0 );
       return elements;
    }
 
-   String format;
    const HostVector& csrResult;
    const BenchmarkVector& benchmarkResult;
-   const IndexType nonzeros;
 };
 
 } //namespace Benchmarks
diff --git a/src/Benchmarks/SpMV/eti.py b/src/Benchmarks/SpMV/eti.py
new file mode 120000
index 0000000000000000000000000000000000000000..c59a420e1efa07b7a627cc80511a14ad9e3d0111
--- /dev/null
+++ b/src/Benchmarks/SpMV/eti.py
@@ -0,0 +1 @@
+../../../scripts/eti.py
\ No newline at end of file
diff --git a/src/Benchmarks/SpMV/spmv.h b/src/Benchmarks/SpMV/spmv.h
index 559adadfff6078cd09d98daa006a21e8f52add8c..ff7fecfbb6ef59e6efcbf18cf6eda2b1cba53b38 100644
--- a/src/Benchmarks/SpMV/spmv.h
+++ b/src/Benchmarks/SpMV/spmv.h
@@ -16,8 +16,8 @@
 
 #include <cstdint>
 
-#include "../Benchmarks.h"
-#include "../JsonLogging.h"
+#include <TNL/Benchmarks/Benchmarks.h>
+#include <TNL/Benchmarks/JsonLogging.h>
 #include "SpmvBenchmarkResult.h"
 
 #include <TNL/Pointers/DevicePointer.h>
@@ -43,12 +43,6 @@
 #include <petscmat.h>
 #endif
 
-// Comment the following to turn off some groups of SpMV benchmarks and speed-up the compilation
-#define WITH_TNL_BENCHMARK_SPMV_GENERAL_MATRICES
-#define WITH_TNL_BENCHMARK_SPMV_SYMMETRIC_MATRICES
-#define WITH_TNL_BENCHMARK_SPMV_BINARY_MATRICES
-#define WITH_TNL_BENCHMARK_SPMV_LEGACY_FORMATS
-
 // Uncomment the following line to enable benchmarking the sandbox sparse matrix.
 //#define WITH_TNL_BENCHMARK_SPMV_SANDBOX_MATRIX
 #ifdef WITH_TNL_BENCHMARK_SPMV_SANDBOX_MATRIX
@@ -197,8 +191,7 @@ template< typename Real, typename Device, typename Index >
 using SlicedEllpackAlias = Benchmarks::SpMV::ReferenceFormats::Legacy::SlicedEllpack< Real, Device, Index >;
 
 template< typename Real,
-          template< typename, typename, typename > class Matrix,
-          template< typename, typename, typename, typename > class Vector = Containers::Vector >
+          template< typename, typename, typename > class Matrix >
 void
 benchmarkSpMVLegacy( BenchmarkType& benchmark,
                      const TNL::Containers::Vector< Real, Devices::Host, int >& csrResultVector,
@@ -211,6 +204,8 @@ benchmarkSpMVLegacy( BenchmarkType& benchmark,
    using HostVector = Containers::Vector< Real, Devices::Host, int >;
    using CudaVector = Containers::Vector< Real, Devices::Cuda, int >;
 
+   benchmark.setMetadataElement({ "format", MatrixInfo< HostMatrix >::getFormat() });
+
    HostMatrix hostMatrix;
    CudaMatrix cudaMatrix;
 
@@ -220,13 +215,13 @@ benchmarkSpMVLegacy( BenchmarkType& benchmark,
    }
    catch(const std::exception& e)
    {
-      std::cerr << "Unable to read the matrix: " << e.what() << std::endl;
+      benchmark.addErrorMessage( "Unable to read the matrix:" + String(e.what()) );
       return;
    }
 
-   const int elements = hostMatrix.getNonzeroElementsCount();
-   const double datasetSize = (double) elements * ( 2 * sizeof( Real ) + sizeof( int ) ) / oneGB;
-   benchmark.setOperation( datasetSize );
+   const int nonzeros = hostMatrix.getNonzeroElementsCount();
+   const double datasetSize = (double) nonzeros * ( 2 * sizeof( Real ) + sizeof( int ) ) / oneGB;
+   benchmark.setDatasetSize( datasetSize );
 
    /////
    // Benchmark SpMV on host
@@ -244,7 +239,7 @@ benchmarkSpMVLegacy( BenchmarkType& benchmark,
          hostMatrix.vectorProduct( hostInVector, hostOutVector );
 
       };
-      SpmvBenchmarkResult< Real, Devices::Host, int > hostBenchmarkResults( MatrixInfo< HostMatrix >::getFormat(), csrResultVector, hostOutVector, hostMatrix.getNonzeroElementsCount() );
+      SpmvBenchmarkResult< Real, Devices::Host, int > hostBenchmarkResults( csrResultVector, hostOutVector );
       benchmark.time< Devices::Host >( resetHostVectors, "CPU", spmvHost, hostBenchmarkResults );
    }
 
@@ -258,7 +253,7 @@ benchmarkSpMVLegacy( BenchmarkType& benchmark,
    }
    catch(const std::exception& e)
    {
-      std::cerr << "Unable to copy the matrix on GPU: " << e.what() << std::endl;
+      benchmark.addErrorMessage( "Unable to copy the matrix on GPU: " + String(e.what()) );
       return;
    }
 
@@ -272,15 +267,14 @@ benchmarkSpMVLegacy( BenchmarkType& benchmark,
    auto spmvCuda = [&]() {
       cudaMatrix.vectorProduct( cudaInVector, cudaOutVector );
    };
-   SpmvBenchmarkResult< Real, Devices::Cuda, int > cudaBenchmarkResults( MatrixInfo< HostMatrix >::getFormat(), csrResultVector, cudaOutVector, cudaMatrix.getNonzeroElementsCount() );
+   SpmvBenchmarkResult< Real, Devices::Cuda, int > cudaBenchmarkResults( csrResultVector, cudaOutVector );
    benchmark.time< Devices::Cuda >( resetCudaVectors, "GPU", spmvCuda, cudaBenchmarkResults );
  #endif
 }
 
 template< typename Real,
           typename InputMatrix,
-          template< typename, typename, typename > class Matrix,
-          template< typename, typename, typename, typename > class Vector = Containers::Vector >
+          template< typename, typename, typename > class Matrix >
 void
 benchmarkSpMV( BenchmarkType& benchmark,
                const InputMatrix& inputMatrix,
@@ -294,6 +288,8 @@ benchmarkSpMV( BenchmarkType& benchmark,
    using HostVector = Containers::Vector< Real, Devices::Host, int >;
    using CudaVector = Containers::Vector< Real, Devices::Cuda, int >;
 
+   benchmark.setMetadataElement({ "format", MatrixInfo< HostMatrix >::getFormat() });
+
    HostMatrix hostMatrix;
    try
    {
@@ -301,13 +297,13 @@ benchmarkSpMV( BenchmarkType& benchmark,
    }
    catch(const std::exception& e)
    {
-      std::cerr << "Unable to convert the matrix to the target format:"  << e.what() << std::endl;
+      benchmark.addErrorMessage( "Unable to convert the matrix to the target format:" + String(e.what()) );
       return;
    }
 
-   const int elements = hostMatrix.getNonzeroElementsCount();
-   const double datasetSize = (double) elements * ( 2 * sizeof( Real ) + sizeof( int ) ) / oneGB;
-   benchmark.setOperation( datasetSize );
+   const int nonzeros = hostMatrix.getNonzeroElementsCount();
+   const double datasetSize = (double) nonzeros * ( 2 * sizeof( Real ) + sizeof( int ) ) / oneGB;
+   benchmark.setDatasetSize( datasetSize );
 
    /////
    // Benchmark SpMV on host
@@ -325,7 +321,7 @@ benchmarkSpMV( BenchmarkType& benchmark,
          hostMatrix.vectorProduct( hostInVector, hostOutVector );
 
       };
-      SpmvBenchmarkResult< Real, Devices::Host, int > hostBenchmarkResults( MatrixInfo< HostMatrix >::getFormat(), csrResultVector, hostOutVector, hostMatrix.getNonzeroElementsCount() );
+      SpmvBenchmarkResult< Real, Devices::Host, int > hostBenchmarkResults( csrResultVector, hostOutVector );
       benchmark.time< Devices::Host >( resetHostVectors, "CPU", spmvHost, hostBenchmarkResults );
    }
 
@@ -340,7 +336,7 @@ benchmarkSpMV( BenchmarkType& benchmark,
    }
    catch(const std::exception& e)
    {
-      std::cerr << "Unable to copy the matrix on GPU:" << e.what() << std::endl;
+      benchmark.addErrorMessage( "Unable to copy the matrix on GPU: " + String(e.what()) );
       return;
    }
 
@@ -354,7 +350,7 @@ benchmarkSpMV( BenchmarkType& benchmark,
    auto spmvCuda = [&]() {
       cudaMatrix.vectorProduct( cudaInVector, cudaOutVector );
    };
-   SpmvBenchmarkResult< Real, Devices::Cuda, int > cudaBenchmarkResults( MatrixInfo< HostMatrix >::getFormat(), csrResultVector, cudaOutVector, cudaMatrix.getNonzeroElementsCount() );
+   SpmvBenchmarkResult< Real, Devices::Cuda, int > cudaBenchmarkResults( csrResultVector, cudaOutVector );
    benchmark.time< Devices::Cuda >( resetCudaVectors, "GPU", spmvCuda, cudaBenchmarkResults );
  #endif
 }
@@ -362,8 +358,7 @@ benchmarkSpMV( BenchmarkType& benchmark,
 template< typename Real,
           typename InputMatrix,
           template< typename, typename, typename > class Matrix,
-          typename TestReal = Real,
-          template< typename, typename, typename, typename > class Vector = Containers::Vector >
+          typename TestReal = Real >
 void
 benchmarkSpMVCSRLight( BenchmarkType& benchmark,
                        const InputMatrix& inputMatrix,
@@ -377,6 +372,8 @@ benchmarkSpMVCSRLight( BenchmarkType& benchmark,
    using HostVector = Containers::Vector< Real, Devices::Host, int >;
    using CudaVector = Containers::Vector< Real, Devices::Cuda, int >;
 
+   benchmark.setMetadataElement({ "format", MatrixInfo< HostMatrix >::getFormat() });
+
    HostMatrix hostMatrix;
    try
    {
@@ -384,13 +381,13 @@ benchmarkSpMVCSRLight( BenchmarkType& benchmark,
    }
    catch(const std::exception& e)
    {
-      std::cerr << "Unable to convert the matrix to the target format:"  << e.what() << std::endl;
+      benchmark.addErrorMessage( "Unable to convert the matrix to the target format:" + String(e.what()) );
       return;
    }
 
-   const int elements = hostMatrix.getNonzeroElementsCount();
-   const double datasetSize = (double) elements * ( 2 * sizeof( Real ) + sizeof( int ) ) / oneGB;
-   benchmark.setOperation( datasetSize );
+   const int nonzeros = hostMatrix.getNonzeroElementsCount();
+   const double datasetSize = (double) nonzeros * ( 2 * sizeof( Real ) + sizeof( int ) ) / oneGB;
+   benchmark.setDatasetSize( datasetSize );
 
    /////
    // Benchmark SpMV on host
@@ -408,7 +405,7 @@ benchmarkSpMVCSRLight( BenchmarkType& benchmark,
          hostMatrix.vectorProduct( hostInVector, hostOutVector );
 
       };
-      SpmvBenchmarkResult< Real, Devices::Host, int > hostBenchmarkResults( MatrixInfo< HostMatrix >::getFormat(), csrResultVector, hostOutVector, hostMatrix.getNonzeroElementsCount() );
+      SpmvBenchmarkResult< Real, Devices::Host, int > hostBenchmarkResults( csrResultVector, hostOutVector );
       benchmark.time< Devices::Host >( resetHostVectors, "CPU", spmvHost, hostBenchmarkResults );
    }
 
@@ -423,7 +420,7 @@ benchmarkSpMVCSRLight( BenchmarkType& benchmark,
    }
    catch(const std::exception& e)
    {
-      std::cerr << "Unable to copy the matrix on GPU:" << e.what() << std::endl;
+      benchmark.addErrorMessage( "Unable to copy the matrix on GPU: " + String(e.what()) );
       return;
    }
 
@@ -441,14 +438,18 @@ benchmarkSpMVCSRLight( BenchmarkType& benchmark,
    {
       cudaMatrix.getSegments().getKernel().setThreadsMapping( Algorithms::Segments::CSRLightAutomaticThreads );
       String format = MatrixInfo< HostMatrix >::getFormat() + " Automatic";
-      SpmvBenchmarkResult< Real, Devices::Cuda, int > cudaBenchmarkResults( format, csrResultVector, cudaOutVector, cudaMatrix.getNonzeroElementsCount() );
+      benchmark.setMetadataElement({ "format", format });
+
+      SpmvBenchmarkResult< Real, Devices::Cuda, int > cudaBenchmarkResults( csrResultVector, cudaOutVector );
       benchmark.time< Devices::Cuda >( resetCudaVectors, "GPU", spmvCuda, cudaBenchmarkResults );
    };
 
    {
       cudaMatrix.getSegments().getKernel().setThreadsMapping( Algorithms::Segments::CSRLightAutomaticThreadsLightSpMV );
       String format = MatrixInfo< HostMatrix >::getFormat() + " Automatic Light";
-      SpmvBenchmarkResult< Real, Devices::Cuda, int > cudaBenchmarkResults( format, csrResultVector, cudaOutVector, cudaMatrix.getNonzeroElementsCount() );
+      benchmark.setMetadataElement({ "format", format });
+
+      SpmvBenchmarkResult< Real, Devices::Cuda, int > cudaBenchmarkResults( csrResultVector, cudaOutVector );
       benchmark.time< Devices::Cuda >( resetCudaVectors, "GPU", spmvCuda, cudaBenchmarkResults );
    };
 
@@ -456,7 +457,9 @@ benchmarkSpMVCSRLight( BenchmarkType& benchmark,
    {
       cudaMatrix.getSegments().getKernel().setThreadsPerSegment( threadsPerRow );
       String format = MatrixInfo< HostMatrix >::getFormat() + " " + convertToString( threadsPerRow );
-      SpmvBenchmarkResult< Real, Devices::Cuda, int > cudaBenchmarkResults( format, csrResultVector, cudaOutVector, cudaMatrix.getNonzeroElementsCount() );
+      benchmark.setMetadataElement({ "format", format });
+
+      SpmvBenchmarkResult< Real, Devices::Cuda, int > cudaBenchmarkResults( csrResultVector, cudaOutVector );
       benchmark.time< Devices::Cuda >( resetCudaVectors, "GPU", spmvCuda, cudaBenchmarkResults );
    }*/
  #endif
@@ -465,8 +468,7 @@ benchmarkSpMVCSRLight( BenchmarkType& benchmark,
 
 template< typename Real,
           typename InputMatrix,
-          template< typename, typename, typename > class Matrix,
-          template< typename, typename, typename, typename > class Vector = Containers::Vector >
+          template< typename, typename, typename > class Matrix >
 void
 benchmarkBinarySpMV( BenchmarkType& benchmark,
                      const InputMatrix& inputMatrix,
@@ -480,6 +482,8 @@ benchmarkBinarySpMV( BenchmarkType& benchmark,
    using HostVector = Containers::Vector< Real, Devices::Host, int >;
    using CudaVector = Containers::Vector< Real, Devices::Cuda, int >;
 
+   benchmark.setMetadataElement({ "format", MatrixInfo< HostMatrix >::getFormat() });
+
    HostMatrix hostMatrix;
    try
    {
@@ -487,13 +491,13 @@ benchmarkBinarySpMV( BenchmarkType& benchmark,
    }
    catch(const std::exception& e)
    {
-      std::cerr << "Unable to convert the matrix to the target format:" << e.what() << std::endl;
+      benchmark.addErrorMessage( "Unable to convert the matrix to the target format:" + String(e.what()) );
       return;
    }
 
-   const int elements = hostMatrix.getNonzeroElementsCount();
-   const double datasetSize = (double) elements * ( 2 * sizeof( Real ) + sizeof( int ) ) / oneGB;
-   benchmark.setOperation( datasetSize );
+   const int nonzeros = hostMatrix.getNonzeroElementsCount();
+   const double datasetSize = (double) nonzeros * ( 2 * sizeof( Real ) + sizeof( int ) ) / oneGB;
+   benchmark.setDatasetSize( datasetSize );
 
    /////
    // Benchmark SpMV on host
@@ -511,7 +515,7 @@ benchmarkBinarySpMV( BenchmarkType& benchmark,
          hostMatrix.vectorProduct( hostInVector, hostOutVector );
 
       };
-      SpmvBenchmarkResult< Real, Devices::Host, int > hostBenchmarkResults( MatrixInfo< HostMatrix >::getFormat(), csrResultVector, hostOutVector, hostMatrix.getNonzeroElementsCount() );
+      SpmvBenchmarkResult< Real, Devices::Host, int > hostBenchmarkResults( csrResultVector, hostOutVector );
       benchmark.time< Devices::Host >( resetHostVectors, "CPU", spmvHost, hostBenchmarkResults );
    }
 
@@ -526,7 +530,7 @@ benchmarkBinarySpMV( BenchmarkType& benchmark,
    }
    catch(const std::exception& e)
    {
-      std::cerr << "Unable to copy the matrix on GPU:" << e.what() << std::endl;
+      benchmark.addErrorMessage( "Unable to copy the matrix on GPU: " + String(e.what()) );
       return;
    }
 
@@ -540,11 +544,143 @@ benchmarkBinarySpMV( BenchmarkType& benchmark,
    auto spmvCuda = [&]() {
       cudaMatrix.vectorProduct( cudaInVector, cudaOutVector );
    };
-   SpmvBenchmarkResult< Real, Devices::Cuda, int > cudaBenchmarkResults( MatrixInfo< HostMatrix >::getFormat(), csrResultVector, cudaOutVector, cudaMatrix.getNonzeroElementsCount() );
+   SpmvBenchmarkResult< Real, Devices::Cuda, int > cudaBenchmarkResults( csrResultVector, cudaOutVector );
    benchmark.time< Devices::Cuda >( resetCudaVectors, "GPU", spmvCuda, cudaBenchmarkResults );
  #endif
 }
 
+template< typename Real >  // Runs benchmarkSpMVLegacy once per enabled legacy reference format, in the order listed below.
+void
+dispatchLegacy( BenchmarkType& benchmark,  // passed by reference to every benchmark call
+                const TNL::Containers::Vector< Real, Devices::Host, int >& hostOutVector,  // received by benchmarkSpMVLegacy as csrResultVector
+                const String& inputFileName,  // forwarded unchanged to every call
+                bool allCpuTests,  // forwarded unchanged to every call
+                bool verboseMR )  // forwarded unchanged to every call
+{
+   using namespace Benchmarks::SpMV::ReferenceFormats;  // makes the Legacy::* names below resolvable
+   benchmarkSpMVLegacy< Real, SparseMatrixLegacy_CSR_Scalar             >( benchmark, hostOutVector, inputFileName, allCpuTests, verboseMR );
+   benchmarkSpMVLegacy< Real, SparseMatrixLegacy_CSR_Vector             >( benchmark, hostOutVector, inputFileName, allCpuTests, verboseMR );
+   //benchmarkSpMVLegacy< Real, SparseMatrixLegacy_CSR_Light              >( benchmark, hostOutVector, inputFileName, verboseMR );  // NOTE: the disabled CSR_Light* calls pass 4 arguments; the active calls pass 5 (allCpuTests)
+   //benchmarkSpMVLegacy< Real, SparseMatrixLegacy_CSR_Light2             >( benchmark, hostOutVector, inputFileName, verboseMR );
+   //benchmarkSpMVLegacy< Real, SparseMatrixLegacy_CSR_Light3             >( benchmark, hostOutVector, inputFileName, verboseMR );
+   //benchmarkSpMVLegacy< Real, SparseMatrixLegacy_CSR_Light4             >( benchmark, hostOutVector, inputFileName, verboseMR );
+   //benchmarkSpMVLegacy< Real, SparseMatrixLegacy_CSR_Light5             >( benchmark, hostOutVector, inputFileName, verboseMR );
+   //benchmarkSpMVLegacy< Real, SparseMatrixLegacy_CSR_Light6             >( benchmark, hostOutVector, inputFileName, verboseMR );
+   benchmarkSpMVLegacy< Real, SparseMatrixLegacy_CSR_Adaptive           >( benchmark, hostOutVector, inputFileName, allCpuTests, verboseMR );
+   benchmarkSpMVLegacy< Real, SparseMatrixLegacy_CSR_MultiVector        >( benchmark, hostOutVector, inputFileName, allCpuTests, verboseMR );
+   benchmarkSpMVLegacy< Real, SparseMatrixLegacy_CSR_LightWithoutAtomic >( benchmark, hostOutVector, inputFileName, allCpuTests, verboseMR );
+   benchmarkSpMVLegacy< Real, Legacy::Ellpack                           >( benchmark, hostOutVector, inputFileName, allCpuTests, verboseMR );
+   benchmarkSpMVLegacy< Real, SlicedEllpackAlias                        >( benchmark, hostOutVector, inputFileName, allCpuTests, verboseMR );
+   benchmarkSpMVLegacy< Real, Legacy::ChunkedEllpack                    >( benchmark, hostOutVector, inputFileName, allCpuTests, verboseMR );
+   benchmarkSpMVLegacy< Real, Legacy::BiEllpack                         >( benchmark, hostOutVector, inputFileName, allCpuTests, verboseMR );
+   // AdEllpack is broken, so its benchmark call stays disabled
+   //benchmarkSpMV< Real, Matrices::AdEllpack              >( benchmark, hostOutVector, inputFileName, verboseMR );
+}
+
+template< typename Real, typename HostMatrix >  // Runs the binary-matrix SpMV benchmarks on hostMatrix, one call per listed format.
+void
+dispatchBinary( BenchmarkType& benchmark,  // passed by reference to every benchmark call
+                const HostMatrix& hostMatrix,  // already loaded input matrix; received by the benchmarks as inputMatrix
+                const TNL::Containers::Vector< Real, Devices::Host, int >& hostOutVector,  // forwarded unchanged to every call
+                const String& inputFileName,  // forwarded unchanged to every call
+                bool allCpuTests,  // forwarded unchanged to every call
+                bool verboseMR )  // forwarded unchanged to every call
+{
+   benchmarkBinarySpMV< Real, HostMatrix, SparseMatrix_CSR_Scalar              >( benchmark, hostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR );
+   benchmarkBinarySpMV< Real, HostMatrix, SparseMatrix_CSR_Vector              >( benchmark, hostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR );
+   benchmarkSpMVCSRLight< Real, HostMatrix, SparseMatrix_CSR_Light, bool >( benchmark, hostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR );  // 4th template argument sets TestReal = bool
+   benchmarkBinarySpMV< Real, HostMatrix, SparseMatrix_CSR_Adaptive            >( benchmark, hostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR );
+   benchmarkBinarySpMV< Real, HostMatrix, SparseMatrix_Ellpack                 >( benchmark, hostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR );
+   benchmarkBinarySpMV< Real, HostMatrix, SparseMatrix_SlicedEllpack           >( benchmark, hostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR );
+   benchmarkBinarySpMV< Real, HostMatrix, SparseMatrix_ChunkedEllpack          >( benchmark, hostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR );
+   benchmarkBinarySpMV< Real, HostMatrix, SparseMatrix_BiEllpack               >( benchmark, hostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR );
+}
+
+template< typename Real >  // Loads inputFileName into a general host SparseMatrix, runs every listed format on it, then the binary variants.
+void
+dispatchSpMV( BenchmarkType& benchmark,  // passed by reference to every benchmark call
+              const TNL::Containers::Vector< Real, Devices::Host, int >& hostOutVector,  // forwarded unchanged to every call
+              const String& inputFileName,  // file handed to MatrixReader::readMtx
+              bool allCpuTests,  // forwarded unchanged to every call
+              bool verboseMR )  // passed to readMtx and forwarded to every call
+{
+   using HostMatrixType = TNL::Matrices::SparseMatrix< Real, TNL::Devices::Host >;
+   HostMatrixType hostMatrix;
+   TNL::Matrices::MatrixReader< HostMatrixType >::readMtx( inputFileName, hostMatrix, verboseMR );  // NOTE(review): no try/catch here, unlike dispatchSymmetric; an exception from the reader leaves this function
+   benchmarkSpMV< Real, HostMatrixType, SparseMatrix_CSR_Scalar                   >( benchmark, hostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR );
+   benchmarkSpMV< Real, HostMatrixType, SparseMatrix_CSR_Vector                   >( benchmark, hostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR );
+   //benchmarkSpMV< Real, HostMatrixType, SparseMatrix_CSR_Hybrid                   >( benchmark, hostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR );
+   benchmarkSpMVCSRLight< Real, HostMatrixType, SparseMatrix_CSR_Light            >( benchmark, hostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR );  // TestReal left at its default (Real)
+   benchmarkSpMV< Real, HostMatrixType, SparseMatrix_CSR_Adaptive                 >( benchmark, hostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR );
+   benchmarkSpMV< Real, HostMatrixType, SparseMatrix_Ellpack                      >( benchmark, hostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR );
+   benchmarkSpMV< Real, HostMatrixType, SparseMatrix_SlicedEllpack                >( benchmark, hostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR );
+   benchmarkSpMV< Real, HostMatrixType, SparseMatrix_ChunkedEllpack               >( benchmark, hostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR );
+   benchmarkSpMV< Real, HostMatrixType, SparseMatrix_BiEllpack                    >( benchmark, hostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR );
+   dispatchBinary< Real >( benchmark, hostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR );  // HostMatrix is deduced from hostMatrix; reuses the matrix loaded above
+#ifdef WITH_TNL_BENCHMARK_SPMV_SANDBOX_MATRIX
+   benchmarkSpMV< Real, HostMatrixType, SparseSandboxMatrix                       >( benchmark, hostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR );  // compiled only when the sandbox macro is defined
+#endif
+}
+
+template< typename Real, typename SymmetricInputMatrix >  // Runs the binary-matrix SpMV benchmarks on symmetricHostMatrix, one call per listed symmetric format.
+void
+dispatchSymmetricBinary( BenchmarkType& benchmark,  // passed by reference to every benchmark call
+                         const SymmetricInputMatrix& symmetricHostMatrix,  // already loaded input matrix; received by the benchmarks as inputMatrix
+                         const TNL::Containers::Vector< Real, Devices::Host, int >& hostOutVector,  // forwarded unchanged to every call
+                         const String& inputFileName,  // forwarded unchanged to every call
+                         bool allCpuTests,  // forwarded unchanged to every call
+                         bool verboseMR )  // forwarded unchanged to every call
+{
+   benchmarkBinarySpMV< Real, SymmetricInputMatrix, SymmetricSparseMatrix_CSR_Scalar              >( benchmark, symmetricHostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR );
+   benchmarkBinarySpMV< Real, SymmetricInputMatrix, SymmetricSparseMatrix_CSR_Vector              >( benchmark, symmetricHostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR );
+   //benchmarkBinarySpMV< Real, SymmetricInputMatrix, SymmetricSparseMatrix_CSR_Hybrid            >( benchmark, symmetricHostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR );
+   benchmarkSpMVCSRLight< Real, SymmetricInputMatrix, SymmetricSparseMatrix_CSR_Light, bool       >( benchmark, symmetricHostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR );  // 4th template argument sets TestReal = bool
+   benchmarkBinarySpMV< Real, SymmetricInputMatrix, SymmetricSparseMatrix_CSR_Adaptive            >( benchmark, symmetricHostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR );
+   benchmarkBinarySpMV< Real, SymmetricInputMatrix, SymmetricSparseMatrix_Ellpack                 >( benchmark, symmetricHostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR );
+   benchmarkBinarySpMV< Real, SymmetricInputMatrix, SymmetricSparseMatrix_SlicedEllpack           >( benchmark, symmetricHostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR );
+   benchmarkBinarySpMV< Real, SymmetricInputMatrix, SymmetricSparseMatrix_ChunkedEllpack          >( benchmark, symmetricHostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR );
+   benchmarkBinarySpMV< Real, SymmetricInputMatrix, SymmetricSparseMatrix_BiEllpack               >( benchmark, symmetricHostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR );
+}
+
+template< typename Real >  // Loads inputFileName as a symmetric host matrix and runs every listed symmetric format on it.
+void
+dispatchSymmetric( BenchmarkType& benchmark,  // also receives the error message if the read fails
+                   const TNL::Containers::Vector< Real, Devices::Host, int >& hostOutVector,  // forwarded unchanged to every benchmark call
+                   const String& inputFileName,  // file handed to MatrixReader::readMtx
+                   bool allCpuTests,  // forwarded unchanged to every benchmark call
+                   bool verboseMR )  // passed to readMtx and forwarded to every benchmark call
+{
+   using SymmetricInputMatrix = TNL::Matrices::SparseMatrix< Real, TNL::Devices::Host, int, TNL::Matrices::SymmetricMatrix >;
+   //using InputMatrix = TNL::Matrices::SparseMatrix< Real, TNL::Devices::Host, int >;  // only used by the disabled comparison below
+   SymmetricInputMatrix symmetricHostMatrix;
+   try
+   {
+      TNL::Matrices::MatrixReader< SymmetricInputMatrix >::readMtx( inputFileName, symmetricHostMatrix, verboseMR );
+   }
+   catch(const std::exception& e)
+   {
+      benchmark.addErrorMessage( "Unable to read the symmetric matrix: " + String(e.what()) );
+      return;  // nothing to benchmark without the matrix
+   }
+   // TODO: Comparison of symmetric and general matrix does not work yet. Until it does, the general matrix
+   // is not loaded: that read parsed the file a second time, was not guarded by try/catch, and its result was unused.
+   //InputMatrix hostMatrix;
+   //TNL::Matrices::MatrixReader< InputMatrix >::readMtx( inputFileName, hostMatrix, verboseMR );
+   //if( hostMatrix != symmetricHostMatrix )
+   //   std::cerr << "ERROR: Symmetric matrices do not match !!!" << std::endl;
+   // (restore the InputMatrix alias together with the statements above)
+   benchmarkSpMV< Real, SymmetricInputMatrix, SymmetricSparseMatrix_CSR_Scalar                    >( benchmark, symmetricHostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR );
+   benchmarkSpMV< Real, SymmetricInputMatrix, SymmetricSparseMatrix_CSR_Vector                    >( benchmark, symmetricHostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR );
+   //benchmarkSpMV< Real, SymmetricInputMatrix, SymmetricSparseMatrix_CSR_Hybrid                   >( benchmark, symmetricHostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR );
+   benchmarkSpMVCSRLight< Real, SymmetricInputMatrix, SymmetricSparseMatrix_CSR_Light             >( benchmark, symmetricHostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR );  // TestReal left at its default (Real)
+   benchmarkSpMV< Real, SymmetricInputMatrix, SymmetricSparseMatrix_CSR_Adaptive                  >( benchmark, symmetricHostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR );
+   benchmarkSpMV< Real, SymmetricInputMatrix, SymmetricSparseMatrix_Ellpack                       >( benchmark, symmetricHostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR );
+   benchmarkSpMV< Real, SymmetricInputMatrix, SymmetricSparseMatrix_SlicedEllpack                 >( benchmark, symmetricHostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR );
+   benchmarkSpMV< Real, SymmetricInputMatrix, SymmetricSparseMatrix_ChunkedEllpack                >( benchmark, symmetricHostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR );
+   benchmarkSpMV< Real, SymmetricInputMatrix, SymmetricSparseMatrix_BiEllpack                     >( benchmark, symmetricHostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR );
+   dispatchSymmetricBinary< Real >( benchmark, symmetricHostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR );  // SymmetricInputMatrix is deduced from symmetricHostMatrix
+}
+
 template< typename Real = double,
           typename Index = int >
 void
@@ -584,20 +720,26 @@ benchmarkSpmv( BenchmarkType& benchmark,
    // Set-up benchmark datasize
    //
    MatrixReader< CSRHostMatrix >::readMtx( inputFileName, csrHostMatrix, verboseMR );
-   const int elements = csrHostMatrix.getNonzeroElementsCount();
-   const double datasetSize = (double) elements * ( 2 * sizeof( Real ) + sizeof( int ) ) / oneGB;
-   benchmark.setOperation( datasetSize );
+   const int nonzeros = csrHostMatrix.getNonzeroElementsCount();
+   const double datasetSize = (double) nonzeros * ( 2 * sizeof( Real ) + sizeof( int ) ) / oneGB;
+   benchmark.setDatasetSize( datasetSize );
 
    ////
    // Perform benchmark on host with CSR as a reference CPU format
    //
-   auto nonzeros = csrHostMatrix.getNonzeroElementsCount();
-   benchmark.addCommonLogs( BenchmarkType::CommonLogs( {
-      { "matrix name", convertToString( inputFileName ) },
+   benchmark.setMetadataColumns({
+      { "matrix name", inputFileName },
+      { "precision", getType< Real >() },
       { "rows", convertToString( csrHostMatrix.getRows() ) },
       { "columns", convertToString( csrHostMatrix.getColumns() ) },
       { "nonzeros", convertToString( nonzeros ) },
-      { "nonzeros per row", convertToString( ( double ) nonzeros / ( double ) csrHostMatrix.getRows() ) } } ) );
+      // NOTE: this can be easily calculated with Pandas based on the other metadata
+      //{ "nonzeros per row", convertToString( ( double ) nonzeros / ( double ) csrHostMatrix.getRows() ) },
+   });
+   benchmark.setMetadataWidths({
+      { "matrix name", 32 },
+      { "format", 46 },
+   });
 
    HostVector hostInVector( csrHostMatrix.getRows() ), hostOutVector( csrHostMatrix.getRows() );
 
@@ -610,10 +752,9 @@ benchmarkSpmv( BenchmarkType& benchmark,
        csrHostMatrix.vectorProduct( hostInVector, hostOutVector );
    };
 
-   SpmvBenchmarkResult< Real, Devices::Host, int > csrBenchmarkResults( String( "CSR" ), hostOutVector, hostOutVector, csrHostMatrix.getNonzeroElementsCount() );
-   benchmark.addLogsMetadata( csrBenchmarkResults.getTableHeader() );
-   benchmark.writeHeader();
-   benchmark.time< Devices::Host >( resetHostVectors, "", spmvCSRHost, csrBenchmarkResults );
+   SpmvBenchmarkResult< Real, Devices::Host, int > csrBenchmarkResults( hostOutVector, hostOutVector );
+   benchmark.setMetadataElement({ "format", "CSR" });
+   benchmark.time< Devices::Host >( resetHostVectors, "CPU", spmvCSRHost, csrBenchmarkResults );
 
 #ifdef HAVE_PETSC
    Mat petscMatrix;
@@ -640,10 +781,9 @@ benchmarkSpmv( BenchmarkType& benchmark,
       MatMult( petscMatrix, inVector, outVector );
    };
 
-   SpmvBenchmarkResult< Real, Devices::Host, int > petscBenchmarkResults( String( "Petsc" ), hostOutVector, hostOutVector, csrHostMatrix.getNonzeroElementsCount() );
-   //benchmark.addLogsMetadata( petscBenchmarkResults.getTableHeader() );
-   //benchmark.writeHeader();
-   benchmark.time< Devices::Host >( resetPetscVectors, "", petscSpmvCSRHost, petscBenchmarkResults );
+   SpmvBenchmarkResult< Real, Devices::Host, int > petscBenchmarkResults( hostOutVector, hostOutVector );
+   benchmark.setMetadataElement({ "format", "Petsc" });
+   benchmark.time< Devices::Host >( resetPetscVectors, "CPU", petscSpmvCSRHost, petscBenchmarkResults );
 #endif
 
 
@@ -671,15 +811,14 @@ benchmarkSpmv( BenchmarkType& benchmark,
        cusparseMatrix.vectorProduct( cudaInVector, cudaOutVector );
    };
 
-   SpmvBenchmarkResult< Real, Devices::Cuda, int > cudaBenchmarkResults( String( "cusparse" ), hostOutVector, cudaOutVector, csrHostMatrix.getNonzeroElementsCount() );
+   SpmvBenchmarkResult< Real, Devices::Cuda, int > cudaBenchmarkResults( hostOutVector, cudaOutVector );
+   benchmark.setMetadataElement({ "format", "cusparse" });
    benchmark.time< Devices::Cuda >( resetCusparseVectors, "GPU", spmvCusparse, cudaBenchmarkResults );
 
 #ifdef HAVE_CSR5
    ////
    // Perform benchmark on CUDA device with CSR5 as a reference GPU format
    //
-   cudaBenchmarkResults.setFormat( String( "CSR5" ) );
-
    CudaVector cudaOutVector2( cudaOutVector );
    CSR5Benchmark::CSR5Benchmark< CSRCudaMatrix > csr5Benchmark( csrCudaMatrix, cudaInVector, cudaOutVector );
 
@@ -687,6 +826,7 @@ benchmarkSpmv( BenchmarkType& benchmark,
        csr5Benchmark.vectorProduct();
    };
 
+   benchmark.setMetadataElement({ "format", "CSR5" });
    benchmark.time< Devices::Cuda >( resetCusparseVectors, "GPU", csr5SpMV, cudaBenchmarkResults );
    std::cerr << "CSR5 error = " << max( abs( cudaOutVector - cudaOutVector2 ) ) << std::endl;
    csrCudaMatrix.reset();
@@ -695,8 +835,6 @@ benchmarkSpmv( BenchmarkType& benchmark,
    ////
    // Perform benchmark on CUDA device with LightSpMV as a reference GPU format
    //
-   cudaBenchmarkResults.setFormat( String( "LightSpMV Vector" ) );
-
    LightSpMVCSRHostMatrix lightSpMVCSRHostMatrix;
    lightSpMVCSRHostMatrix = csrHostMatrix;
    LightSpMVBenchmark< Real > lightSpMVBenchmark( lightSpMVCSRHostMatrix, LightSpMVBenchmarkKernelVector );
@@ -707,123 +845,70 @@ benchmarkSpmv( BenchmarkType& benchmark,
    auto spmvLightSpMV = [&]() {
        lightSpMVBenchmark.vectorProduct();
    };
+   benchmark.setMetadataElement({ "format", "LightSpMV Vector" });
    benchmark.time< Devices::Cuda >( resetLightSpMVVectors, "GPU", spmvLightSpMV, cudaBenchmarkResults );
 
-   cudaBenchmarkResults.setFormat( String( "LightSpMV Warp" ) );
    lightSpMVBenchmark.setKernelType( LightSpMVBenchmarkKernelWarp );
+   benchmark.setMetadataElement({ "format", "LightSpMV Warp" });
    benchmark.time< Devices::Cuda >( resetLightSpMVVectors, "GPU", spmvLightSpMV, cudaBenchmarkResults );
 #endif
    csrHostMatrix.reset();
 
    bool allCpuTests = parameters.getParameter< bool >( "with-all-cpu-tests" );
-#ifdef WITH_TNL_BENCHMARK_SPMV_LEGACY_FORMATS
    /////
    // Benchmarking of TNL legacy formats
    //
    if( parameters.getParameter< bool >("with-legacy-matrices") )
-   {
-      using namespace Benchmarks::SpMV::ReferenceFormats;
-      benchmarkSpMVLegacy< Real, SparseMatrixLegacy_CSR_Scalar             >( benchmark, hostOutVector, inputFileName, allCpuTests, verboseMR );
-      benchmarkSpMVLegacy< Real, SparseMatrixLegacy_CSR_Vector             >( benchmark, hostOutVector, inputFileName, allCpuTests, verboseMR );
-      //benchmarkSpMVLegacy< Real, SparseMatrixLegacy_CSR_Light              >( benchmark, hostOutVector, inputFileName, verboseMR );
-      //benchmarkSpMVLegacy< Real, SparseMatrixLegacy_CSR_Light2             >( benchmark, hostOutVector, inputFileName, verboseMR );
-      //benchmarkSpMVLegacy< Real, SparseMatrixLegacy_CSR_Light3             >( benchmark, hostOutVector, inputFileName, verboseMR );
-      //benchmarkSpMVLegacy< Real, SparseMatrixLegacy_CSR_Light4             >( benchmark, hostOutVector, inputFileName, verboseMR );
-      //benchmarkSpMVLegacy< Real, SparseMatrixLegacy_CSR_Light5             >( benchmark, hostOutVector, inputFileName, verboseMR );
-      //benchmarkSpMVLegacy< Real, SparseMatrixLegacy_CSR_Light6             >( benchmark, hostOutVector, inputFileName, verboseMR );
-      benchmarkSpMVLegacy< Real, SparseMatrixLegacy_CSR_Adaptive           >( benchmark, hostOutVector, inputFileName, allCpuTests, verboseMR );
-      benchmarkSpMVLegacy< Real, SparseMatrixLegacy_CSR_MultiVector        >( benchmark, hostOutVector, inputFileName, allCpuTests, verboseMR );
-      benchmarkSpMVLegacy< Real, SparseMatrixLegacy_CSR_LightWithoutAtomic >( benchmark, hostOutVector, inputFileName, allCpuTests, verboseMR );
-      benchmarkSpMVLegacy< Real, Legacy::Ellpack                           >( benchmark, hostOutVector, inputFileName, allCpuTests, verboseMR );
-      benchmarkSpMVLegacy< Real, SlicedEllpackAlias                        >( benchmark, hostOutVector, inputFileName, allCpuTests, verboseMR );
-      benchmarkSpMVLegacy< Real, Legacy::ChunkedEllpack                    >( benchmark, hostOutVector, inputFileName, allCpuTests, verboseMR );
-      benchmarkSpMVLegacy< Real, Legacy::BiEllpack                         >( benchmark, hostOutVector, inputFileName, allCpuTests, verboseMR );
-   }
-   // AdEllpack is broken
-   //benchmarkSpMV< Real, Matrices::AdEllpack              >( benchmark, hostOutVector, inputFileName, verboseMR );
-#endif
+      dispatchLegacy< Real >( benchmark, hostOutVector, inputFileName, allCpuTests, verboseMR );
 
-#ifdef WITH_TNL_BENCHMARK_SPMV_GENERAL_MATRICES
    /////
    // Benchmarking TNL formats
    //
-   using HostMatrixType = TNL::Matrices::SparseMatrix< Real, TNL::Devices::Host >;
-   HostMatrixType hostMatrix;
-   TNL::Matrices::MatrixReader< HostMatrixType >::readMtx( inputFileName, hostMatrix, verboseMR );
-   benchmarkSpMV< Real, HostMatrixType, SparseMatrix_CSR_Scalar                   >( benchmark, hostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR );
-   benchmarkSpMV< Real, HostMatrixType, SparseMatrix_CSR_Vector                   >( benchmark, hostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR );
-   //benchmarkSpMV< Real, HostMatrixType, SparseMatrix_CSR_Hybrid                   >( benchmark, hostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR );
-   benchmarkSpMVCSRLight< Real, HostMatrixType, SparseMatrix_CSR_Light            >( benchmark, hostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR );
-   benchmarkSpMV< Real, HostMatrixType, SparseMatrix_CSR_Adaptive                 >( benchmark, hostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR );
-   benchmarkSpMV< Real, HostMatrixType, SparseMatrix_Ellpack                      >( benchmark, hostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR );
-   benchmarkSpMV< Real, HostMatrixType, SparseMatrix_SlicedEllpack                >( benchmark, hostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR );
-   benchmarkSpMV< Real, HostMatrixType, SparseMatrix_ChunkedEllpack               >( benchmark, hostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR );
-   benchmarkSpMV< Real, HostMatrixType, SparseMatrix_BiEllpack                    >( benchmark, hostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR );
-#ifdef WITH_TNL_BENCHMARK_SPMV_BINARY_MATRICES
-   benchmarkBinarySpMV< Real, HostMatrixType, SparseMatrix_CSR_Scalar              >( benchmark, hostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR );
-   benchmarkBinarySpMV< Real, HostMatrixType, SparseMatrix_CSR_Vector              >( benchmark, hostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR );
-   benchmarkSpMVCSRLight< Real, HostMatrixType, SparseMatrix_CSR_Light, bool >( benchmark, hostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR );
-   benchmarkBinarySpMV< Real, HostMatrixType, SparseMatrix_CSR_Adaptive            >( benchmark, hostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR );
-   benchmarkBinarySpMV< Real, HostMatrixType, SparseMatrix_Ellpack                 >( benchmark, hostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR );
-   benchmarkBinarySpMV< Real, HostMatrixType, SparseMatrix_SlicedEllpack           >( benchmark, hostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR );
-   benchmarkBinarySpMV< Real, HostMatrixType, SparseMatrix_ChunkedEllpack          >( benchmark, hostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR );
-   benchmarkBinarySpMV< Real, HostMatrixType, SparseMatrix_BiEllpack               >( benchmark, hostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR );
-#endif
-#ifdef WITH_TNL_BENCHMARK_SPMV_SANDBOX_MATRIX
-   benchmarkSpMV< Real, HostMatrixType, SparseSandboxMatrix                       >( benchmark, hostMatrix, hostOutVector, inputFileName, verboseMR );
-#endif
-   hostMatrix.reset();
-#endif
+   dispatchSpMV< Real >( benchmark, hostOutVector, inputFileName, allCpuTests, verboseMR );
 
-#ifdef WITH_TNL_BENCHMARK_SPMV_SYMMETRIC_MATRICES
    /////
    // Benchmarking symmetric sparse matrices
    //
    if( parameters.getParameter< bool >("with-symmetric-matrices") )
-   {
-      using SymmetricInputMatrix = TNL::Matrices::SparseMatrix< Real, TNL::Devices::Host, int, TNL::Matrices::SymmetricMatrix >;
-      using InputMatrix = TNL::Matrices::SparseMatrix< Real, TNL::Devices::Host, int >;
-      SymmetricInputMatrix symmetricHostMatrix;
-      try
-      {
-         TNL::Matrices::MatrixReader< SymmetricInputMatrix >::readMtx( inputFileName, symmetricHostMatrix, verboseMR );
-      }
-      catch(const std::exception& e)
-      {
-         std::cerr << e.what() << " ... SKIPPING " << std::endl;
-         return;
-      }
-      InputMatrix hostMatrix;
-      TNL::Matrices::MatrixReader< InputMatrix >::readMtx( inputFileName, hostMatrix, verboseMR );
-      // TODO: Comparison of symmetric and general matrix does not work yet.
-      //if( hostMatrix != symmetricHostMatrix )
-      //{
-      //   std::cerr << "ERROR: Symmetric matrices do not match !!!" << std::endl;
-      //}
-      benchmarkSpMV< Real, SymmetricInputMatrix, SymmetricSparseMatrix_CSR_Scalar                    >( benchmark, symmetricHostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR );
-      benchmarkSpMV< Real, SymmetricInputMatrix, SymmetricSparseMatrix_CSR_Vector                    >( benchmark, symmetricHostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR );
-      //benchmarkSpMV< Real, SymmetricInputMatrix, SymmetricSparseMatrix_CSR_Hybrid                   >( benchmark, symmetricHostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR );
-      benchmarkSpMVCSRLight< Real, SymmetricInputMatrix, SymmetricSparseMatrix_CSR_Light             >( benchmark, symmetricHostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR );
-      benchmarkSpMV< Real, SymmetricInputMatrix, SymmetricSparseMatrix_CSR_Adaptive                  >( benchmark, symmetricHostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR );
-      benchmarkSpMV< Real, SymmetricInputMatrix, SymmetricSparseMatrix_Ellpack                       >( benchmark, symmetricHostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR );
-      benchmarkSpMV< Real, SymmetricInputMatrix, SymmetricSparseMatrix_SlicedEllpack                 >( benchmark, symmetricHostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR );
-      benchmarkSpMV< Real, SymmetricInputMatrix, SymmetricSparseMatrix_ChunkedEllpack                >( benchmark, symmetricHostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR );
-      benchmarkSpMV< Real, SymmetricInputMatrix, SymmetricSparseMatrix_BiEllpack                     >( benchmark, symmetricHostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR );
-#ifdef WITH_TNL_BENCHMARK_SPMV_BINARY_MATRICES
-      benchmarkBinarySpMV< Real, SymmetricInputMatrix, SymmetricSparseMatrix_CSR_Scalar              >( benchmark, symmetricHostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR );
-      benchmarkBinarySpMV< Real, SymmetricInputMatrix, SymmetricSparseMatrix_CSR_Vector              >( benchmark, symmetricHostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR );
-      //benchmarkBinarySpMV< Real, SymmetricInputMatrix, SymmetricSparseMatrix_CSR_Hybrid            >( benchmark, symmetricHostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR );
-      benchmarkSpMVCSRLight< Real, SymmetricInputMatrix, SymmetricSparseMatrix_CSR_Light, bool       >( benchmark, symmetricHostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR );
-      benchmarkBinarySpMV< Real, SymmetricInputMatrix, SymmetricSparseMatrix_CSR_Adaptive            >( benchmark, symmetricHostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR );
-      benchmarkBinarySpMV< Real, SymmetricInputMatrix, SymmetricSparseMatrix_Ellpack                 >( benchmark, symmetricHostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR );
-      benchmarkBinarySpMV< Real, SymmetricInputMatrix, SymmetricSparseMatrix_SlicedEllpack           >( benchmark, symmetricHostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR );
-      benchmarkBinarySpMV< Real, SymmetricInputMatrix, SymmetricSparseMatrix_ChunkedEllpack          >( benchmark, symmetricHostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR );
-      benchmarkBinarySpMV< Real, SymmetricInputMatrix, SymmetricSparseMatrix_BiEllpack               >( benchmark, symmetricHostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR );
-#endif
-   }
-#endif
+      dispatchSymmetric< Real >( benchmark, hostOutVector, inputFileName, allCpuTests, verboseMR );
 }
 
-      } // namespace SpMVLegacy
+// =============== EXPLICIT TEMPLATE INSTANTIATIONS ===============
+// The explicit template declarations (extern ...) are converted to definitions
+// in separate source files using the eti.py script. The developer should call
+// this script whenever the declarations are changed and commit the generated
+// definitions in the git repository.
+//
+// IMPORTANT:
+// - Each template instantiation must be written on exactly one line (the code
+//   generator script, eti.py, does not support parsing multiple lines).
+// - Make sure that all "dispatch*" functions that are called above are
+//   instantiated below.
+// - Also make sure that all functions that are explicitly instantiated below
+//   are actually used.
+// - Explicit template instantiations cannot be guarded by #ifdef (the code
+//   generator script, eti.py, does not support parsing macros).
+// - For optimum compilation performance, the explicitly instantiated functions
+//   should be as independent as possible. The compilation of each explicit
+//   instantiation should take about the same time so that the work load in a
+//   parallel build is balanced. Functions that are not instantiated explicitly
+//   will be compiled in the main unit that is compiled serially.
+
+extern template void dispatchLegacy< float >( BenchmarkType&, const Containers::Vector< float, Devices::Host, int >&, const String&, bool, bool );
+extern template void dispatchLegacy< double >( BenchmarkType&, const Containers::Vector< double, Devices::Host, int >&, const String&, bool, bool );
+
+extern template void dispatchBinary< float >( BenchmarkType&, const Matrices::SparseMatrix< float, Devices::Host >&, const Containers::Vector< float, Devices::Host, int >&, const String&, bool, bool );
+extern template void dispatchBinary< double >( BenchmarkType&, const Matrices::SparseMatrix< float, Devices::Host >&, const Containers::Vector< double, Devices::Host, int >&, const String&, bool, bool );
+
+extern template void dispatchSpMV< float >( BenchmarkType&, const Containers::Vector< float, Devices::Host, int >&, const String&, bool, bool );
+extern template void dispatchSpMV< double >( BenchmarkType&, const Containers::Vector< double, Devices::Host, int >&, const String&, bool, bool );
+
+extern template void dispatchSymmetric< float >( BenchmarkType&, const Containers::Vector< float, Devices::Host, int >&, const String&, bool, bool );
+extern template void dispatchSymmetric< double >( BenchmarkType&, const Containers::Vector< double, Devices::Host, int >&, const String&, bool, bool );
+
+extern template void dispatchSymmetricBinary< float >( BenchmarkType&, const Matrices::SparseMatrix< float, Devices::Host >&, const Containers::Vector< float, Devices::Host, int >&, const String&, bool, bool );
+extern template void dispatchSymmetricBinary< double >( BenchmarkType&, const Matrices::SparseMatrix< float, Devices::Host >&, const Containers::Vector< double, Devices::Host, int >&, const String&, bool, bool );
+
+      } // namespace SpMV
    } // namespace Benchmarks
 } // namespace TNL
diff --git a/src/Benchmarks/SpMV/spmv.templates/spmv.t0.cpp b/src/Benchmarks/SpMV/spmv.templates/spmv.t0.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..fafd4b2b01ce4094a897dabf8261b210b8fe088e
--- /dev/null
+++ b/src/Benchmarks/SpMV/spmv.templates/spmv.t0.cpp
@@ -0,0 +1,8 @@
+#include "../spmv.h"
+namespace TNL {
+namespace Benchmarks {
+namespace SpMV {
+template void dispatchLegacy< float >( BenchmarkType&, const Containers::Vector< float, Devices::Host, int >&, const String&, bool, bool );
+} // namespace TNL
+} // namespace Benchmarks
+} // namespace SpMV
diff --git a/src/Benchmarks/SpMV/spmv.templates/spmv.t0.cu b/src/Benchmarks/SpMV/spmv.templates/spmv.t0.cu
new file mode 100644
index 0000000000000000000000000000000000000000..fafd4b2b01ce4094a897dabf8261b210b8fe088e
--- /dev/null
+++ b/src/Benchmarks/SpMV/spmv.templates/spmv.t0.cu
@@ -0,0 +1,8 @@
+#include "../spmv.h"
+namespace TNL {
+namespace Benchmarks {
+namespace SpMV {
+template void dispatchLegacy< float >( BenchmarkType&, const Containers::Vector< float, Devices::Host, int >&, const String&, bool, bool );
+} // namespace TNL
+} // namespace Benchmarks
+} // namespace SpMV
diff --git a/src/Benchmarks/SpMV/spmv.templates/spmv.t1.cpp b/src/Benchmarks/SpMV/spmv.templates/spmv.t1.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..fbd1d792ab8af6c1a5083cfa6127ecdf072b666f
--- /dev/null
+++ b/src/Benchmarks/SpMV/spmv.templates/spmv.t1.cpp
@@ -0,0 +1,8 @@
+#include "../spmv.h"
+namespace TNL {
+namespace Benchmarks {
+namespace SpMV {
+template void dispatchLegacy< double >( BenchmarkType&, const Containers::Vector< double, Devices::Host, int >&, const String&, bool, bool );
+} // namespace TNL
+} // namespace Benchmarks
+} // namespace SpMV
diff --git a/src/Benchmarks/SpMV/spmv.templates/spmv.t1.cu b/src/Benchmarks/SpMV/spmv.templates/spmv.t1.cu
new file mode 100644
index 0000000000000000000000000000000000000000..fbd1d792ab8af6c1a5083cfa6127ecdf072b666f
--- /dev/null
+++ b/src/Benchmarks/SpMV/spmv.templates/spmv.t1.cu
@@ -0,0 +1,8 @@
+#include "../spmv.h"
+namespace TNL {
+namespace Benchmarks {
+namespace SpMV {
+template void dispatchLegacy< double >( BenchmarkType&, const Containers::Vector< double, Devices::Host, int >&, const String&, bool, bool );
+} // namespace TNL
+} // namespace Benchmarks
+} // namespace SpMV
diff --git a/src/Benchmarks/SpMV/spmv.templates/spmv.t2.cpp b/src/Benchmarks/SpMV/spmv.templates/spmv.t2.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..bf67f96e16ed354300913caff8478f3050405484
--- /dev/null
+++ b/src/Benchmarks/SpMV/spmv.templates/spmv.t2.cpp
@@ -0,0 +1,8 @@
+#include "../spmv.h"
+namespace TNL {
+namespace Benchmarks {
+namespace SpMV {
+template void dispatchBinary< float >( BenchmarkType&, const Matrices::SparseMatrix< float, Devices::Host >&, const Containers::Vector< float, Devices::Host, int >&, const String&, bool, bool );
+} // namespace TNL
+} // namespace Benchmarks
+} // namespace SpMV
diff --git a/src/Benchmarks/SpMV/spmv.templates/spmv.t2.cu b/src/Benchmarks/SpMV/spmv.templates/spmv.t2.cu
new file mode 100644
index 0000000000000000000000000000000000000000..bf67f96e16ed354300913caff8478f3050405484
--- /dev/null
+++ b/src/Benchmarks/SpMV/spmv.templates/spmv.t2.cu
@@ -0,0 +1,8 @@
+#include "../spmv.h"
+namespace TNL {
+namespace Benchmarks {
+namespace SpMV {
+template void dispatchBinary< float >( BenchmarkType&, const Matrices::SparseMatrix< float, Devices::Host >&, const Containers::Vector< float, Devices::Host, int >&, const String&, bool, bool );
+} // namespace TNL
+} // namespace Benchmarks
+} // namespace SpMV
diff --git a/src/Benchmarks/SpMV/spmv.templates/spmv.t3.cpp b/src/Benchmarks/SpMV/spmv.templates/spmv.t3.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..3d11252c95e93b1d2a183c12190bd08eb8ba9b0e
--- /dev/null
+++ b/src/Benchmarks/SpMV/spmv.templates/spmv.t3.cpp
@@ -0,0 +1,8 @@
+#include "../spmv.h"
+namespace TNL {
+namespace Benchmarks {
+namespace SpMV {
+template void dispatchBinary< double >( BenchmarkType&, const Matrices::SparseMatrix< float, Devices::Host >&, const Containers::Vector< double, Devices::Host, int >&, const String&, bool, bool );
+} // namespace TNL
+} // namespace Benchmarks
+} // namespace SpMV
diff --git a/src/Benchmarks/SpMV/spmv.templates/spmv.t3.cu b/src/Benchmarks/SpMV/spmv.templates/spmv.t3.cu
new file mode 100644
index 0000000000000000000000000000000000000000..3d11252c95e93b1d2a183c12190bd08eb8ba9b0e
--- /dev/null
+++ b/src/Benchmarks/SpMV/spmv.templates/spmv.t3.cu
@@ -0,0 +1,8 @@
+#include "../spmv.h"
+namespace TNL {
+namespace Benchmarks {
+namespace SpMV {
+template void dispatchBinary< double >( BenchmarkType&, const Matrices::SparseMatrix< float, Devices::Host >&, const Containers::Vector< double, Devices::Host, int >&, const String&, bool, bool );
+} // namespace TNL
+} // namespace Benchmarks
+} // namespace SpMV
diff --git a/src/Benchmarks/SpMV/spmv.templates/spmv.t4.cpp b/src/Benchmarks/SpMV/spmv.templates/spmv.t4.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..735f853e79dbca17406e69899e191471c75a009a
--- /dev/null
+++ b/src/Benchmarks/SpMV/spmv.templates/spmv.t4.cpp
@@ -0,0 +1,8 @@
+#include "../spmv.h"
+namespace TNL {
+namespace Benchmarks {
+namespace SpMV {
+template void dispatchSpMV< float >( BenchmarkType&, const Containers::Vector< float, Devices::Host, int >&, const String&, bool, bool );
+} // namespace TNL
+} // namespace Benchmarks
+} // namespace SpMV
diff --git a/src/Benchmarks/SpMV/spmv.templates/spmv.t4.cu b/src/Benchmarks/SpMV/spmv.templates/spmv.t4.cu
new file mode 100644
index 0000000000000000000000000000000000000000..735f853e79dbca17406e69899e191471c75a009a
--- /dev/null
+++ b/src/Benchmarks/SpMV/spmv.templates/spmv.t4.cu
@@ -0,0 +1,8 @@
+#include "../spmv.h"
+namespace TNL {
+namespace Benchmarks {
+namespace SpMV {
+template void dispatchSpMV< float >( BenchmarkType&, const Containers::Vector< float, Devices::Host, int >&, const String&, bool, bool );
+} // namespace TNL
+} // namespace Benchmarks
+} // namespace SpMV
diff --git a/src/Benchmarks/SpMV/spmv.templates/spmv.t5.cpp b/src/Benchmarks/SpMV/spmv.templates/spmv.t5.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..e880c7d586b1cc380b6aa012f6f9e79d21ab82f3
--- /dev/null
+++ b/src/Benchmarks/SpMV/spmv.templates/spmv.t5.cpp
@@ -0,0 +1,8 @@
+#include "../spmv.h"
+namespace TNL {
+namespace Benchmarks {
+namespace SpMV {
+template void dispatchSpMV< double >( BenchmarkType&, const Containers::Vector< double, Devices::Host, int >&, const String&, bool, bool );
+} // namespace TNL
+} // namespace Benchmarks
+} // namespace SpMV
diff --git a/src/Benchmarks/SpMV/spmv.templates/spmv.t5.cu b/src/Benchmarks/SpMV/spmv.templates/spmv.t5.cu
new file mode 100644
index 0000000000000000000000000000000000000000..e880c7d586b1cc380b6aa012f6f9e79d21ab82f3
--- /dev/null
+++ b/src/Benchmarks/SpMV/spmv.templates/spmv.t5.cu
@@ -0,0 +1,8 @@
+#include "../spmv.h"
+namespace TNL {
+namespace Benchmarks {
+namespace SpMV {
+template void dispatchSpMV< double >( BenchmarkType&, const Containers::Vector< double, Devices::Host, int >&, const String&, bool, bool );
+} // namespace TNL
+} // namespace Benchmarks
+} // namespace SpMV
diff --git a/src/Benchmarks/SpMV/spmv.templates/spmv.t6.cpp b/src/Benchmarks/SpMV/spmv.templates/spmv.t6.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..082fe7967c41b3ec9a1a90dc70e48556becd334c
--- /dev/null
+++ b/src/Benchmarks/SpMV/spmv.templates/spmv.t6.cpp
@@ -0,0 +1,8 @@
+#include "../spmv.h"
+namespace TNL {
+namespace Benchmarks {
+namespace SpMV {
+template void dispatchSymmetric< float >( BenchmarkType&, const Containers::Vector< float, Devices::Host, int >&, const String&, bool, bool );
+} // namespace TNL
+} // namespace Benchmarks
+} // namespace SpMV
diff --git a/src/Benchmarks/SpMV/spmv.templates/spmv.t6.cu b/src/Benchmarks/SpMV/spmv.templates/spmv.t6.cu
new file mode 100644
index 0000000000000000000000000000000000000000..082fe7967c41b3ec9a1a90dc70e48556becd334c
--- /dev/null
+++ b/src/Benchmarks/SpMV/spmv.templates/spmv.t6.cu
@@ -0,0 +1,8 @@
+#include "../spmv.h"
+namespace TNL {
+namespace Benchmarks {
+namespace SpMV {
+template void dispatchSymmetric< float >( BenchmarkType&, const Containers::Vector< float, Devices::Host, int >&, const String&, bool, bool );
+} // namespace TNL
+} // namespace Benchmarks
+} // namespace SpMV
diff --git a/src/Benchmarks/SpMV/spmv.templates/spmv.t7.cpp b/src/Benchmarks/SpMV/spmv.templates/spmv.t7.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..a1191d34dc9e0f61710abfede52d1e0f72ebf6b0
--- /dev/null
+++ b/src/Benchmarks/SpMV/spmv.templates/spmv.t7.cpp
@@ -0,0 +1,8 @@
+#include "../spmv.h"
+namespace TNL {
+namespace Benchmarks {
+namespace SpMV {
+template void dispatchSymmetric< double >( BenchmarkType&, const Containers::Vector< double, Devices::Host, int >&, const String&, bool, bool );
+} // namespace TNL
+} // namespace Benchmarks
+} // namespace SpMV
diff --git a/src/Benchmarks/SpMV/spmv.templates/spmv.t7.cu b/src/Benchmarks/SpMV/spmv.templates/spmv.t7.cu
new file mode 100644
index 0000000000000000000000000000000000000000..a1191d34dc9e0f61710abfede52d1e0f72ebf6b0
--- /dev/null
+++ b/src/Benchmarks/SpMV/spmv.templates/spmv.t7.cu
@@ -0,0 +1,8 @@
+#include "../spmv.h"
+namespace TNL {
+namespace Benchmarks {
+namespace SpMV {
+template void dispatchSymmetric< double >( BenchmarkType&, const Containers::Vector< double, Devices::Host, int >&, const String&, bool, bool );
+} // namespace TNL
+} // namespace Benchmarks
+} // namespace SpMV
diff --git a/src/Benchmarks/SpMV/spmv.templates/spmv.t8.cpp b/src/Benchmarks/SpMV/spmv.templates/spmv.t8.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..fea28e004019fc4aa767798e6030fed5cca41160
--- /dev/null
+++ b/src/Benchmarks/SpMV/spmv.templates/spmv.t8.cpp
@@ -0,0 +1,8 @@
+#include "../spmv.h"
+namespace TNL {
+namespace Benchmarks {
+namespace SpMV {
+template void dispatchSymmetricBinary< float >( BenchmarkType&, const Matrices::SparseMatrix< float, Devices::Host >&, const Containers::Vector< float, Devices::Host, int >&, const String&, bool, bool );
+} // namespace TNL
+} // namespace Benchmarks
+} // namespace SpMV
diff --git a/src/Benchmarks/SpMV/spmv.templates/spmv.t8.cu b/src/Benchmarks/SpMV/spmv.templates/spmv.t8.cu
new file mode 100644
index 0000000000000000000000000000000000000000..fea28e004019fc4aa767798e6030fed5cca41160
--- /dev/null
+++ b/src/Benchmarks/SpMV/spmv.templates/spmv.t8.cu
@@ -0,0 +1,8 @@
+#include "../spmv.h"
+namespace TNL {
+namespace Benchmarks {
+namespace SpMV {
+template void dispatchSymmetricBinary< float >( BenchmarkType&, const Matrices::SparseMatrix< float, Devices::Host >&, const Containers::Vector< float, Devices::Host, int >&, const String&, bool, bool );
+} // namespace TNL
+} // namespace Benchmarks
+} // namespace SpMV
diff --git a/src/Benchmarks/SpMV/spmv.templates/spmv.t9.cpp b/src/Benchmarks/SpMV/spmv.templates/spmv.t9.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..16c242a796e7a09012a746bf2a3eca9599fe372d
--- /dev/null
+++ b/src/Benchmarks/SpMV/spmv.templates/spmv.t9.cpp
@@ -0,0 +1,8 @@
+#include "../spmv.h"
+namespace TNL {
+namespace Benchmarks {
+namespace SpMV {
+template void dispatchSymmetricBinary< double >( BenchmarkType&, const Matrices::SparseMatrix< float, Devices::Host >&, const Containers::Vector< double, Devices::Host, int >&, const String&, bool, bool );
+} // namespace TNL
+} // namespace Benchmarks
+} // namespace SpMV
diff --git a/src/Benchmarks/SpMV/spmv.templates/spmv.t9.cu b/src/Benchmarks/SpMV/spmv.templates/spmv.t9.cu
new file mode 100644
index 0000000000000000000000000000000000000000..16c242a796e7a09012a746bf2a3eca9599fe372d
--- /dev/null
+++ b/src/Benchmarks/SpMV/spmv.templates/spmv.t9.cu
@@ -0,0 +1,8 @@
+#include "../spmv.h"
+namespace TNL {
+namespace Benchmarks {
+namespace SpMV {
+template void dispatchSymmetricBinary< double >( BenchmarkType&, const Matrices::SparseMatrix< float, Devices::Host >&, const Containers::Vector< double, Devices::Host, int >&, const String&, bool, bool );
+} // namespace TNL
+} // namespace Benchmarks
+} // namespace SpMV
diff --git a/src/Benchmarks/SpMV/tnl-benchmark-spmv.h b/src/Benchmarks/SpMV/tnl-benchmark-spmv.h
index c5ff2bb3fa90dd9cf88207d7b087767db8377825..dd2617d3bba16f1be9351e73719c7efe117bbdde 100644
--- a/src/Benchmarks/SpMV/tnl-benchmark-spmv.h
+++ b/src/Benchmarks/SpMV/tnl-benchmark-spmv.h
@@ -38,17 +38,10 @@ using namespace TNL::Benchmarks;
 template< typename Real >
 void
 runSpMVBenchmarks( TNL::Benchmarks::SpMV::BenchmarkType & benchmark,
-                   TNL::Benchmarks::SpMV::BenchmarkType::MetadataMap metadata,
                    const String & inputFileName,
                    const Config::ParameterContainer& parameters,
                    bool verboseMR = false )
 {
-   const String precision = getType< Real >();
-   metadata["precision"] = precision;
-
-   // Sparse matrix-vector multiplication
-   benchmark.newBenchmark( String("Sparse matrix-vector multiplication (") + precision + ")",
-                           metadata );
    // Start the actual benchmark in spmv.h
    try {
       TNL::Benchmarks::SpMV::benchmarkSpmv< Real >( benchmark, inputFileName, parameters, verboseMR );
@@ -66,7 +59,7 @@ std::string getCurrDateTime()
    char buffer[ 80 ];
    time( &rawtime );
    timeinfo = localtime( &rawtime );
-   strftime( buffer, sizeof( buffer ), "%d-%m-%Y--%H:%M:%S", timeinfo );
+   strftime( buffer, sizeof( buffer ), "%Y-%m-%d--%H:%M:%S", timeinfo );
    std::string curr_date_time( buffer );
    return curr_date_time;
 }
@@ -75,15 +68,14 @@ void
 setupConfig( Config::ConfigDescription & config )
 {
    config.addDelimiter( "Benchmark settings:" );
-   config.addEntry< String >( "input-file", "Input file name.", "" );
+   config.addRequiredEntry< String >( "input-file", "Input file name." );
    config.addEntry< bool >( "with-symmetric-matrices", "Perform benchmark even for symmetric matrix formats.", true );
    config.addEntry< bool >( "with-legacy-matrices", "Perform benchmark even for legacy TNL matrix formats.", true );
    config.addEntry< bool >( "with-all-cpu-tests", "All matrix formats are tested on both CPU and GPU. ", false );
    config.addEntry< String >( "log-file", "Log file name.", "tnl-benchmark-spmv::" + getCurrDateTime() + ".log");
-   config.addEntry< String >( "output-mode", "Mode for opening the log file - 'close' will only finalize the log file.", "append" );
+   config.addEntry< String >( "output-mode", "Mode for opening the log file.", "append" );
    config.addEntryEnum( "append" );
    config.addEntryEnum( "overwrite" );
-   config.addEntryEnum( "close" );
    config.addEntry< String >( "precision", "Precision of the arithmetics.", "double" );
    config.addEntryEnum( "float" );
    config.addEntryEnum( "double" );
@@ -134,22 +126,13 @@ main( int argc, char* argv[] )
    const int verboseMR = parameters.getParameter< int >( "verbose-MReader" );
 
    // open log file
-   if( outputMode == "close" )
-   {
-      std::fstream file;
-      file.open( logFileName.getString(), std::ios::out | std::ios::app );
-      file << std::endl << "   ]" << std::endl << "}";
-      return EXIT_SUCCESS;
-   }
    if( inputFileName == "" )
    {
       std::cerr << "ERROR: Input file name is required." << std::endl;
       return EXIT_FAILURE;
    }
-   bool logFileAppend( false );
    if( std::experimental::filesystem::exists(logFileName.getString()) )
    {
-      logFileAppend = true;
       std::cout << "Log file " << logFileName << " exists and ";
       if( outputMode == "append" )
          std::cout << "new logs will be appended." << std::endl;
@@ -160,26 +143,22 @@ main( int argc, char* argv[] )
    auto mode = std::ios::out;
    if( outputMode == "append" )
        mode |= std::ios::app;
-   std::ofstream logFile( logFileName.getString(), mode );
+   std::ofstream logFile( logFileName, mode );
 
-   // init benchmark and common metadata
-   TNL::Benchmarks::SpMV::BenchmarkType benchmark( loops, verbose, outputMode, logFileAppend );
+   // init benchmark and set parameters
+   TNL::Benchmarks::SpMV::BenchmarkType benchmark( logFile, loops, verbose );
 
-   // prepare global metadata
-   TNL::Benchmarks::SpMV::BenchmarkType::MetadataMap metadata = getHardwareMetadata< Logging >();
+   // write global metadata into a separate file
+   std::map< std::string, std::string > metadata = getHardwareMetadata();
+   writeMapAsJson( metadata, logFileName, ".metadata.json" );
 
    // Initiate setup of benchmarks
    if( precision == "all" || precision == "float" )
-      runSpMVBenchmarks< float >( benchmark, metadata, inputFileName, parameters, verboseMR );
+      runSpMVBenchmarks< float >( benchmark, inputFileName, parameters, verboseMR );
    if( precision == "all" || precision == "double" )
-      runSpMVBenchmarks< double >( benchmark, metadata, inputFileName, parameters, verboseMR );
-
-   if( ! benchmark.save( logFile ) ) {
-      std::cerr << "Failed to write the benchmark results to file '" << parameters.getParameter< String >( "log-file" ) << "'." << std::endl;
-      return EXIT_FAILURE;
-   }
+      runSpMVBenchmarks< double >( benchmark, inputFileName, parameters, verboseMR );
 
    // Confirm that the benchmark has finished
-   std::cout << "\n== BENCHMARK FINISHED ==" << std::endl;
+   std::cout << "\n==> BENCHMARK FINISHED" << std::endl;
    return EXIT_SUCCESS;
 }
diff --git a/src/Benchmarks/Traversers/BenchmarkTraverserUserData.h b/src/Benchmarks/Traversers/BenchmarkTraverserUserData.h
index 2ae00ec697452aad99163752bbd83fee4c5af1f1..35b08d993e2240dfb11ca8414ae1e4691975ce32 100644
--- a/src/Benchmarks/Traversers/BenchmarkTraverserUserData.h
+++ b/src/Benchmarks/Traversers/BenchmarkTraverserUserData.h
@@ -12,6 +12,8 @@
 
 #pragma once
 
+#include <TNL/Pointers/SharedPointer.h>
+
 namespace TNL {
    namespace Benchmarks {
       namespace Traversers {
@@ -25,7 +27,7 @@ class BenchmarkTraverserUserData
       using RealType = typename MeshType::RealType;
       using DeviceType = typename MeshType::DeviceType;
       using MeshFunctionPointer = Pointers::SharedPointer< MeshFunction >;
-      
+
       BenchmarkTraverserUserData( MeshFunctionPointer& f )
          : u( &f.template modifyData< DeviceType >() ), data( f->getData().getData() ){}
 
diff --git a/src/Benchmarks/Traversers/CMakeLists.txt b/src/Benchmarks/Traversers/CMakeLists.txt
index 5932d2606db852736f1ce665f4d52f53d0fa5d09..6b7712d2b3da3845d29a06b6ee2ba7d48be6a4b6 100644
--- a/src/Benchmarks/Traversers/CMakeLists.txt
+++ b/src/Benchmarks/Traversers/CMakeLists.txt
@@ -1,10 +1,9 @@
 # TODO: Split the benchmark into several files for faster build
 
-#if( BUILD_CUDA )
-#    CUDA_ADD_EXECUTABLE( tnl-benchmark-traversers tnl-benchmark-traversers.cu )
-#else()
-#    ADD_EXECUTABLE( tnl-benchmark-traversers tnl-benchmark-traversers.cpp )
-#endif()
-
-#install( TARGETS tnl-benchmark-traversers RUNTIME DESTINATION bin )
+if( BUILD_CUDA )
+    CUDA_ADD_EXECUTABLE( tnl-benchmark-traversers tnl-benchmark-traversers.cu )
+else()
+    ADD_EXECUTABLE( tnl-benchmark-traversers tnl-benchmark-traversers.cpp )
+endif()
 
+install( TARGETS tnl-benchmark-traversers RUNTIME DESTINATION bin )
diff --git a/src/Benchmarks/Traversers/GridTraverserBenchmarkHelper.h b/src/Benchmarks/Traversers/GridTraverserBenchmarkHelper.h
index 6da7ec09bb05d1591689688bf9fb85420fe5e820..13022458a5f0a153ae6bab14295c421cd5b8c183 100644
--- a/src/Benchmarks/Traversers/GridTraverserBenchmarkHelper.h
+++ b/src/Benchmarks/Traversers/GridTraverserBenchmarkHelper.h
@@ -12,10 +12,6 @@
 
 #pragma once
 
-#include "AddOneEntitiesProcessor.h"
-#include "BenchmarkTraverserUserData.h"
-#include "SimpleCell.h"
-
 namespace TNL {
    namespace Benchmarks {
       namespace Traversers {
@@ -23,7 +19,6 @@ namespace TNL {
 template< typename Grid >
 class GridTraverserBenchmarkHelper{};
 
-
       } // namespace Traversers
    } // namespace Benchmarks
 } // namespace TNL
diff --git a/src/Benchmarks/Traversers/GridTraverserBenchmarkHelper_1D.h b/src/Benchmarks/Traversers/GridTraverserBenchmarkHelper_1D.h
index e460a8bca4ac8edb77dcab823576349335de6a73..b2fc9bf9c86f0bfc0af43954d5b7489a54f72151 100644
--- a/src/Benchmarks/Traversers/GridTraverserBenchmarkHelper_1D.h
+++ b/src/Benchmarks/Traversers/GridTraverserBenchmarkHelper_1D.h
@@ -12,12 +12,15 @@
 
 #pragma once
 
+#include <TNL/Functions/MeshFunctionView.h>
+#include <TNL/Pointers/SharedPointer.h>
+#include <TNL/Meshes/Traverser.h>
+
 #include "GridTraverserBenchmarkHelper.h"
 #include "AddOneEntitiesProcessor.h"
 #include "BenchmarkTraverserUserData.h"
 #include "SimpleCell.h"
 
-
 namespace TNL {
    namespace Benchmarks {
       namespace Traversers {
@@ -40,10 +43,10 @@ _GridTraverser1D(
    typedef Index IndexType;
    typedef Meshes::Grid< 1, Real, Devices::Cuda, Index > GridType;
    //typename GridType::CoordinatesType coordinates;
- 
+
    GridEntity entity( *grid );
-   entity.getCoordinates().x() = begin.x() + ( gridIdx * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
-   //coordinates.x() = begin.x() + ( gridIdx * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
+   entity.getCoordinates().x() = begin.x() + ( gridIdx * Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
+   //coordinates.x() = begin.x() + ( gridIdx * Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
    if( entity.getCoordinates() <= end )
    {
       entity.refresh();
@@ -69,7 +72,7 @@ class GridTraverserBenchmarkHelper< Meshes::Grid< 1, Real, Devices::Host, Index
       using RealType = typename GridType::RealType;
       using IndexType = typename GridType::IndexType;
       using CoordinatesType = typename GridType::CoordinatesType;
-      using MeshFunction = Functions::MeshFunction< GridType >;
+      using MeshFunction = Functions::MeshFunctionView< GridType >;
       using MeshFunctionPointer = Pointers::SharedPointer< MeshFunction >;
       using CellType = typename GridType::template EntityType< Dimension, Meshes::GridEntityNoStencilStorage >;
       using SimpleCellType = SimpleCell< GridType >;
@@ -108,7 +111,7 @@ class GridTraverserBenchmarkHelper< Meshes::Grid< 1, Real, Devices::Cuda, Index
       using RealType = typename GridType::RealType;
       using IndexType = typename GridType::IndexType;
       using CoordinatesType = typename GridType::CoordinatesType;
-      using MeshFunction = Functions::MeshFunction< GridType >;
+      using MeshFunction = Functions::MeshFunctionView< GridType >;
       using MeshFunctionPointer = Pointers::SharedPointer< MeshFunction >;
       using CellType = typename GridType::template EntityType< Dimension, Meshes::GridEntityNoStencilStorage >;
       using SimpleCellType = SimpleCell< GridType >;
@@ -122,7 +125,7 @@ class GridTraverserBenchmarkHelper< Meshes::Grid< 1, Real, Devices::Cuda, Index
       {
 #ifdef HAVE_CUDA
             dim3 blockSize( 256 ), blocksCount, gridsCount;
-            Devices::Cuda::setupThreads(
+            Cuda::setupThreads(
                blockSize,
                blocksCount,
                gridsCount,
@@ -131,7 +134,7 @@ class GridTraverserBenchmarkHelper< Meshes::Grid< 1, Real, Devices::Cuda, Index
             for( gridIdx.x = 0; gridIdx.x < gridsCount.x; gridIdx.x++ )
             {
                dim3 gridSize;
-               Devices::Cuda::setupGrid(
+               Cuda::setupGrid(
                   blocksCount,
                   gridsCount,
                   gridIdx,
@@ -148,7 +151,7 @@ class GridTraverserBenchmarkHelper< Meshes::Grid< 1, Real, Devices::Cuda, Index
 #endif
       }
 };
-         
+
       } // namespace Traversers
    } // namespace Benchmarks
 } // namespace TNL
diff --git a/src/Benchmarks/Traversers/GridTraverserBenchmarkHelper_2D.h b/src/Benchmarks/Traversers/GridTraverserBenchmarkHelper_2D.h
index eca6c7fee0057e2689fa5f473a214d97266471e6..a3d1a04364f87311969e727413052dfde92e2d40 100644
--- a/src/Benchmarks/Traversers/GridTraverserBenchmarkHelper_2D.h
+++ b/src/Benchmarks/Traversers/GridTraverserBenchmarkHelper_2D.h
@@ -12,6 +12,10 @@
 
 #pragma once
 
+#include <TNL/Functions/MeshFunctionView.h>
+#include <TNL/Pointers/SharedPointer.h>
+#include <TNL/Meshes/Traverser.h>
+
 #include "GridTraverserBenchmarkHelper.h"
 #include "AddOneEntitiesProcessor.h"
 #include "BenchmarkTraverserUserData.h"
@@ -40,8 +44,8 @@ _GridTraverser2D(
    typedef Meshes::Grid< 2, Real, Devices::Cuda, Index > GridType;
 
    GridEntity entity( *grid );
-   entity.getCoordinates().x() = begin.x() + ( gridIdx.x * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
-   entity.getCoordinates().y() = begin.y() + ( gridIdx.y * Devices::Cuda::getMaxGridSize() + blockIdx.y ) * blockDim.y + threadIdx.y;
+   entity.getCoordinates().x() = begin.x() + ( gridIdx.x * Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
+   entity.getCoordinates().y() = begin.y() + ( gridIdx.y * Cuda::getMaxGridSize() + blockIdx.y ) * blockDim.y + threadIdx.y;
    if( entity.getCoordinates() <= end )
    {
       entity.refresh();
@@ -62,7 +66,7 @@ class GridTraverserBenchmarkHelper< Meshes::Grid< 2, Real, Devices::Host, Index
       using RealType = typename GridType::RealType;
       using IndexType = typename GridType::IndexType;
       using CoordinatesType = typename GridType::CoordinatesType;
-      using MeshFunction = Functions::MeshFunction< GridType >;
+      using MeshFunction = Functions::MeshFunctionView< GridType >;
       using MeshFunctionPointer = Pointers::SharedPointer< MeshFunction >;
       using CellType = typename GridType::template EntityType< Dimension, Meshes::GridEntityNoStencilStorage >;
       using SimpleCellType = SimpleCell< GridType >;
@@ -104,7 +108,7 @@ class GridTraverserBenchmarkHelper< Meshes::Grid< 2, Real, Devices::Cuda, Index
       using RealType = typename GridType::RealType;
       using IndexType = typename GridType::IndexType;
       using CoordinatesType = typename GridType::CoordinatesType;
-      using MeshFunction = Functions::MeshFunction< GridType >;
+      using MeshFunction = Functions::MeshFunctionView< GridType >;
       using MeshFunctionPointer = Pointers::SharedPointer< MeshFunction >;
       using CellType = typename GridType::template EntityType< Dimension, Meshes::GridEntityNoStencilStorage >;
       using SimpleCellType = SimpleCell< GridType >;
@@ -118,7 +122,7 @@ class GridTraverserBenchmarkHelper< Meshes::Grid< 2, Real, Devices::Cuda, Index
       {
 #ifdef HAVE_CUDA
             dim3 blockSize( 16, 16 ), blocksCount, gridsCount;
-            Devices::Cuda::setupThreads(
+            Cuda::setupThreads(
                blockSize,
                blocksCount,
                gridsCount,
@@ -129,7 +133,7 @@ class GridTraverserBenchmarkHelper< Meshes::Grid< 2, Real, Devices::Cuda, Index
                for( gridIdx.x = 0; gridIdx.x < gridsCount.x; gridIdx.x++ )
                {
                   dim3 gridSize;
-                  Devices::Cuda::setupGrid(
+                  Cuda::setupGrid(
                      blocksCount,
                      gridsCount,
                      gridIdx,
diff --git a/src/Benchmarks/Traversers/GridTraverserBenchmarkHelper_3D.h b/src/Benchmarks/Traversers/GridTraverserBenchmarkHelper_3D.h
index 4a5da6fd4b89c1cebc716ccfdff31d6ecaf96470..939d8a6815f4f8b4ef0439d8f61c3cc84cc01a19 100644
--- a/src/Benchmarks/Traversers/GridTraverserBenchmarkHelper_3D.h
+++ b/src/Benchmarks/Traversers/GridTraverserBenchmarkHelper_3D.h
@@ -12,6 +12,10 @@
 
 #pragma once
 
+#include <TNL/Functions/MeshFunctionView.h>
+#include <TNL/Pointers/SharedPointer.h>
+#include <TNL/Meshes/Traverser.h>
+
 #include "GridTraverserBenchmarkHelper.h"
 #include "AddOneEntitiesProcessor.h"
 #include "BenchmarkTraverserUserData.h"
@@ -38,12 +42,12 @@ _GridTraverser3D(
    typedef Real RealType;
    typedef Index IndexType;
    typedef Meshes::Grid< 3, Real, Devices::Cuda, Index > GridType;
- 
+
    GridEntity entity( *grid );
-   entity.getCoordinates().x() = begin.x() + ( gridIdx.x * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
-   entity.getCoordinates().y() = begin.y() + ( gridIdx.y * Devices::Cuda::getMaxGridSize() + blockIdx.y ) * blockDim.y + threadIdx.y;
-   entity.getCoordinates().z() = begin.z() + ( gridIdx.z * Devices::Cuda::getMaxGridSize() + blockIdx.z ) * blockDim.z + threadIdx.z;
-   
+   entity.getCoordinates().x() = begin.x() + ( gridIdx.x * Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
+   entity.getCoordinates().y() = begin.y() + ( gridIdx.y * Cuda::getMaxGridSize() + blockIdx.y ) * blockDim.y + threadIdx.y;
+   entity.getCoordinates().z() = begin.z() + ( gridIdx.z * Cuda::getMaxGridSize() + blockIdx.z ) * blockDim.z + threadIdx.z;
+
    if( entity.getCoordinates() <= end )
    {
       entity.refresh();
@@ -64,7 +68,7 @@ class GridTraverserBenchmarkHelper< Meshes::Grid< 3, Real, Devices::Host, Index
       using RealType = typename GridType::RealType;
       using IndexType = typename GridType::IndexType;
       using CoordinatesType = typename GridType::CoordinatesType;
-      using MeshFunction = Functions::MeshFunction< GridType >;
+      using MeshFunction = Functions::MeshFunctionView< GridType >;
       using MeshFunctionPointer = Pointers::SharedPointer< MeshFunction >;
       using CellType = typename GridType::template EntityType< Dimension, Meshes::GridEntityNoStencilStorage >;
       using SimpleCellType = SimpleCell< GridType >;
@@ -107,7 +111,7 @@ class GridTraverserBenchmarkHelper< Meshes::Grid< 3, Real, Devices::Cuda, Index
       using RealType = typename GridType::RealType;
       using IndexType = typename GridType::IndexType;
       using CoordinatesType = typename GridType::CoordinatesType;
-      using MeshFunction = Functions::MeshFunction< GridType >;
+      using MeshFunction = Functions::MeshFunctionView< GridType >;
       using MeshFunctionPointer = Pointers::SharedPointer< MeshFunction >;
       using CellType = typename GridType::template EntityType< Dimension, Meshes::GridEntityNoStencilStorage >;
       using SimpleCellType = SimpleCell< GridType >;
@@ -121,7 +125,7 @@ class GridTraverserBenchmarkHelper< Meshes::Grid< 3, Real, Devices::Cuda, Index
       {
 #ifdef HAVE_CUDA
             dim3 blockSize( 32, 4, 2 ), blocksCount, gridsCount;
-            Devices::Cuda::setupThreads(
+            Cuda::setupThreads(
                blockSize,
                blocksCount,
                gridsCount,
@@ -134,7 +138,7 @@ class GridTraverserBenchmarkHelper< Meshes::Grid< 3, Real, Devices::Cuda, Index
                   for( gridIdx.x = 0; gridIdx.x < gridsCount.x; gridIdx.x++ )
                   {
                      dim3 gridSize;
-                     Devices::Cuda::setupGrid(
+                     Cuda::setupGrid(
                         blocksCount,
                         gridsCount,
                         gridIdx,
diff --git a/src/Benchmarks/Traversers/GridTraversersBenchmark.h b/src/Benchmarks/Traversers/GridTraversersBenchmark.h
index 01590f1221f7a451270234044e180a16ff589e02..5ea87b6dcdc8a8917304826c1cef71185a65c915 100644
--- a/src/Benchmarks/Traversers/GridTraversersBenchmark.h
+++ b/src/Benchmarks/Traversers/GridTraversersBenchmark.h
@@ -12,28 +12,10 @@
 
 #pragma once
 
-#include <TNL/Algorithms/ParallelFor.h>
-#include <TNL/Devices/Host.h>
-#include <TNL/Devices/Cuda.h>
-#include <TNL/Containers/Vector.h>
-#include <TNL/Meshes/Grid.h>
-#include <TNL/Meshes/GridEntityConfig.h>
-#include <TNL/Meshes/Traverser.h>
-#include <TNL/Functions/MeshFunction.h>
-#include <TNL/Pointers/SharedPointer.h>
-
-#include "AddOneEntitiesProcessor.h"
-#include "AddTwoEntitiesProcessor.h"
-#include "GridTraverserBenchmarkHelper.h"
-#include "BenchmarkTraverserUserData.h"
-#include "cuda-kernels.h"
-
 namespace TNL {
    namespace Benchmarks {
       namespace Traversers {
 
-
-
 template< int Dimension,
           typename Device,
           typename Real,
diff --git a/src/Benchmarks/Traversers/GridTraversersBenchmark_1D.h b/src/Benchmarks/Traversers/GridTraversersBenchmark_1D.h
index 9820af39274cd5c8db310b583c0595d2b64bf252..6e8e0c37fc6452df57c76995af3e7cf6fe64cd63 100644
--- a/src/Benchmarks/Traversers/GridTraversersBenchmark_1D.h
+++ b/src/Benchmarks/Traversers/GridTraversersBenchmark_1D.h
@@ -13,23 +13,27 @@
 #pragma once
 
 #include <TNL/Algorithms/ParallelFor.h>
+#include <TNL/Algorithms/contains.h>
 #include <TNL/Devices/Host.h>
 #include <TNL/Devices/Cuda.h>
 #include <TNL/Containers/Vector.h>
 #include <TNL/Meshes/Grid.h>
 #include <TNL/Meshes/GridEntityConfig.h>
 #include <TNL/Meshes/Traverser.h>
-#include <TNL/Functions/MeshFunction.h>
+#include <TNL/Functions/MeshFunctionView.h>
 #include <TNL/Pointers/SharedPointer.h>
 #include "cuda-kernels.h"
+#include "AddOneEntitiesProcessor.h"
+#include "AddTwoEntitiesProcessor.h"
+#include "BenchmarkTraverserUserData.h"
 #include "GridTraversersBenchmark.h"
+#include "GridTraverserBenchmarkHelper.h"
 #include "SimpleCell.h"
 
 namespace TNL {
    namespace Benchmarks {
       namespace Traversers {
 
-
 template< typename Device,
           typename Real,
           typename Index >
@@ -41,7 +45,7 @@ class GridTraversersBenchmark< 1, Device, Real, Index >
       using GridType = Meshes::Grid< 1, Real, Device, Index >;
       using GridPointer = Pointers::SharedPointer< GridType >;
       using Coordinates = typename GridType::CoordinatesType;
-      using MeshFunction = Functions::MeshFunction< GridType >;
+      using MeshFunction = Functions::MeshFunctionView< GridType >;
       using MeshFunctionPointer = Pointers::SharedPointer< MeshFunction >;
       using CellType = typename GridType::template EntityType< 1, Meshes::GridEntityNoStencilStorage >;
       using SimpleCellType = SimpleCell< GridType >;
@@ -51,11 +55,13 @@ class GridTraversersBenchmark< 1, Device, Real, Index >
       using AddTwoEntitiesProcessorType = AddTwoEntitiesProcessor< UserDataType >;
 
       GridTraversersBenchmark( Index size )
-      :size( size ), v( size ), grid( size ), u( grid ),
+      :size( size ),
+       v( size ),
+       grid( size ),
        userData( this->u )
       {
          v_data = v.getData();
-         u->getData().bind( v );
+         u->bind( grid, v );
       }
 
       void reset()
@@ -74,7 +80,7 @@ class GridTraversersBenchmark< 1, Device, Real, Index >
          {
 #ifdef HAVE_CUDA
             dim3 blockSize( 256 ), blocksCount, gridsCount;
-            Devices::Cuda::setupThreads(
+            Cuda::setupThreads(
                blockSize,
                blocksCount,
                gridsCount,
@@ -83,7 +89,7 @@ class GridTraversersBenchmark< 1, Device, Real, Index >
             for( gridIdx.x = 0; gridIdx.x < gridsCount.x; gridIdx.x++ )
             {
                dim3 gridSize;
-               Devices::Cuda::setupGrid(
+               Cuda::setupGrid(
                   blocksCount,
                   gridsCount,
                   gridIdx,
@@ -100,7 +106,7 @@ class GridTraversersBenchmark< 1, Device, Real, Index >
          {
             data[ i ] += (Real) 1.0;
          };
-         ParallelFor< Device, AsynchronousMode >::exec( ( Index ) 0, size, f, v.getData() );
+         Algorithms::ParallelFor< Device, Algorithms::AsynchronousMode >::exec( ( Index ) 0, size, f, v.getData() );
       }
 
       void addOneUsingSimpleCell()
@@ -113,7 +119,7 @@ class GridTraversersBenchmark< 1, Device, Real, Index >
             entity.refresh();
             data[ entity.getIndex() ] += (Real) 1.0;
          };
-         ParallelFor< Device, AsynchronousMode >::exec( ( Index ) 0, size, f, v.getData() );*/
+         Algorithms::ParallelFor< Device, Algorithms::AsynchronousMode >::exec( ( Index ) 0, size, f, v.getData() );*/
          GridTraverserBenchmarkHelper< GridType >::simpleCellTest(
             grid,
             userData,
@@ -132,15 +138,15 @@ class GridTraversersBenchmark< 1, Device, Real, Index >
             _u->getData().getData()[ entity.getIndex() ] += (Real) 1.0;
             // ( *_u )( entity ) += (Real) 1.0;
          };
-         ParallelFor< Device, AsynchronousMode >::exec( ( Index ) 0, size, f );
+         Algorithms::ParallelFor< Device, Algorithms::AsynchronousMode >::exec( ( Index ) 0, size, f );
       }
 
       void addOneUsingTraverser()
       {
          using CoordinatesType = typename GridType::CoordinatesType;
-         traverser.template processAllEntities< UserDataType, AddOneEntitiesProcessorType >
+         traverser.template processAllEntities< AddOneEntitiesProcessorType >
             ( grid, userData );
-         
+
          /*GridTraverserBenchmarkHelper< GridType >::noBCTraverserTest(
             grid,
             userData,
@@ -151,8 +157,8 @@ class GridTraversersBenchmark< 1, Device, Real, Index >
       {
          std::cout << loops << " -> " << v << std::endl;
          if( reseting )
-            return v.containsOnlyValue( 1.0 );
-         return v.containsOnlyValue( ( Real ) loops );
+            return Algorithms::containsOnlyValue( v, 1.0 );
+         return Algorithms::containsOnlyValue( v, ( Real ) loops );
       }
 
       void traverseUsingPureC()
@@ -168,7 +174,7 @@ class GridTraversersBenchmark< 1, Device, Real, Index >
          {
 #ifdef HAVE_CUDA
             dim3 blockSize( 256 ), blocksCount, gridsCount;
-            Devices::Cuda::setupThreads(
+            Cuda::setupThreads(
                blockSize,
                blocksCount,
                gridsCount,
@@ -177,7 +183,7 @@ class GridTraversersBenchmark< 1, Device, Real, Index >
             for( gridIdx.x = 0; gridIdx.x < gridsCount.x; gridIdx.x++ )
             {
                dim3 gridSize;
-               Devices::Cuda::setupGrid(
+               Cuda::setupGrid(
                   blocksCount,
                   gridsCount,
                   gridIdx,
@@ -187,7 +193,7 @@ class GridTraversersBenchmark< 1, Device, Real, Index >
             for( gridIdx.x = 0; gridIdx.x < gridsCount.x; gridIdx.x++ )
             {
                dim3 gridSize;
-               Devices::Cuda::setupGrid(
+               Cuda::setupGrid(
                   blocksCount,
                   gridsCount,
                   gridIdx,
@@ -201,11 +207,11 @@ class GridTraversersBenchmark< 1, Device, Real, Index >
       void traverseUsingTraverser()
       {
          // TODO !!!!!!!!!!!!!!!!!!!!!!
-         //traverser.template processAllEntities< UserDataType, AddOneEntitiesProcessorType >
+         //traverser.template processAllEntities< AddOneEntitiesProcessorType >
 
-         traverser.template processBoundaryEntities< UserDataType, AddTwoEntitiesProcessorType >
+         traverser.template processBoundaryEntities< AddTwoEntitiesProcessorType >
             ( grid, userData );
-         traverser.template processInteriorEntities< UserDataType, AddOneEntitiesProcessorType >
+         traverser.template processInteriorEntities< AddOneEntitiesProcessorType >
             ( grid, userData );
       }
 
diff --git a/src/Benchmarks/Traversers/GridTraversersBenchmark_2D.h b/src/Benchmarks/Traversers/GridTraversersBenchmark_2D.h
index 0e9ae7f2f36a9d19a913a4437d988d5172592e5b..d310ec451751bf7b13c1b984ee5a2e0e2faa52b1 100644
--- a/src/Benchmarks/Traversers/GridTraversersBenchmark_2D.h
+++ b/src/Benchmarks/Traversers/GridTraversersBenchmark_2D.h
@@ -13,16 +13,21 @@
 #pragma once
 
 #include <TNL/Algorithms/ParallelFor.h>
+#include <TNL/Algorithms/contains.h>
 #include <TNL/Devices/Host.h>
 #include <TNL/Devices/Cuda.h>
 #include <TNL/Containers/Vector.h>
 #include <TNL/Meshes/Grid.h>
 #include <TNL/Meshes/GridEntityConfig.h>
 #include <TNL/Meshes/Traverser.h>
-#include <TNL/Functions/MeshFunction.h>
+#include <TNL/Functions/MeshFunctionView.h>
 #include <TNL/Pointers/SharedPointer.h>
 #include "cuda-kernels.h"
+#include "AddOneEntitiesProcessor.h"
+#include "AddTwoEntitiesProcessor.h"
+#include "BenchmarkTraverserUserData.h"
 #include "GridTraversersBenchmark.h"
+#include "GridTraverserBenchmarkHelper.h"
 #include "SimpleCell.h"
 
 namespace TNL {
@@ -35,12 +40,12 @@ template< typename Device,
 class GridTraversersBenchmark< 2, Device, Real, Index >
 {
    public:
-      
+
       using Vector = Containers::Vector< Real, Device, Index >;
       using GridType = Meshes::Grid< 2, Real, Device, Index >;
       using GridPointer = Pointers::SharedPointer< GridType >;
       using Coordinates = typename GridType::CoordinatesType;
-      using MeshFunction = Functions::MeshFunction< GridType >;
+      using MeshFunction = Functions::MeshFunctionView< GridType >;
       using MeshFunctionPointer = Pointers::SharedPointer< MeshFunction >;
       using CellType = typename GridType::template EntityType< 2, Meshes::GridEntityNoStencilStorage >;
       using SimpleCellType = SimpleCell< GridType >;
@@ -50,11 +55,13 @@ class GridTraversersBenchmark< 2, Device, Real, Index >
       using AddTwoEntitiesProcessorType = AddTwoEntitiesProcessor< UserDataType >;
 
       GridTraversersBenchmark( Index size )
-      :size( size ), v( size * size ), grid( size, size ), u( grid ),
+      :size( size ),
+       v( size * size ),
+       grid( size, size ),
        userData( u )
       {
          v_data = v.getData();
-         u->getData().bind( v );
+         u->bind( grid, v );
       }
 
       void reset()
@@ -74,7 +81,7 @@ class GridTraversersBenchmark< 2, Device, Real, Index >
          {
 #ifdef HAVE_CUDA
             dim3 blockSize( 16, 16 ), blocksCount, gridsCount;
-            Devices::Cuda::setupThreads(
+            Cuda::setupThreads(
                blockSize,
                blocksCount,
                gridsCount,
@@ -85,7 +92,7 @@ class GridTraversersBenchmark< 2, Device, Real, Index >
                for( gridIdx.x = 0; gridIdx.x < gridsCount.x; gridIdx.x++ )
                {
                   dim3 gridSize;
-                  Devices::Cuda::setupGrid(
+                  Cuda::setupGrid(
                      blocksCount,
                      gridsCount,
                      gridIdx,
@@ -103,8 +110,8 @@ class GridTraversersBenchmark< 2, Device, Real, Index >
          {
             data[ j * _size + i ] += (Real) 1.0;
          };
-         
-         ParallelFor2D< Device, AsynchronousMode >::exec(
+
+         Algorithms::ParallelFor2D< Device, Algorithms::AsynchronousMode >::exec(
             ( Index ) 0,
             ( Index ) 0,
             this->size,
@@ -123,8 +130,8 @@ class GridTraversersBenchmark< 2, Device, Real, Index >
             entity.refresh();
             data[ entity.getIndex() ] += (Real) 1.0;
          };
-         
-         ParallelFor2D< Device, AsynchronousMode >::exec(
+
+         Algorithms::ParallelFor2D< Device, Algorithms::AsynchronousMode >::exec(
             ( Index ) 0,
             ( Index ) 0,
             this->size,
@@ -134,7 +141,7 @@ class GridTraversersBenchmark< 2, Device, Real, Index >
             grid,
             userData,
             size );
-         
+
       }
 
       void addOneUsingParallelForAndMeshFunction()
@@ -150,8 +157,8 @@ class GridTraversersBenchmark< 2, Device, Real, Index >
             //( *_u )( entity ) += (Real) 1.0;
             _u->getData().getData()[ entity.getIndex() ] += (Real) 1.0;
          };
-         
-         ParallelFor2D< Device, AsynchronousMode >::exec(
+
+         Algorithms::ParallelFor2D< Device, Algorithms::AsynchronousMode >::exec(
             ( Index ) 0,
             ( Index ) 0,
             this->size,
@@ -163,9 +170,9 @@ class GridTraversersBenchmark< 2, Device, Real, Index >
       void addOneUsingTraverser()
       {
          using CoordinatesType = typename GridType::CoordinatesType;
-         traverser.template processAllEntities< UserDataType, AddOneEntitiesProcessorType >
+         traverser.template processAllEntities< AddOneEntitiesProcessorType >
             ( grid, userData );
-         
+
          /*Meshes::GridTraverser< Grid >::template processEntities< Cell, WriteOneEntitiesProcessorType, WriteOneTraverserUserDataType, false >(
            grid,
            CoordinatesType( 0 ),
@@ -188,8 +195,8 @@ class GridTraversersBenchmark< 2, Device, Real, Index >
       bool checkAddOne( int loops, bool reseting )
       {
          if( reseting )
-            return v.containsOnlyValue( 1.0 );
-         return v.containsOnlyValue( ( Real ) loops );
+            return Algorithms::containsOnlyValue( v, 1.0 );
+         return Algorithms::containsOnlyValue( v, ( Real ) loops );
       }
 
       void traverseUsingPureC()
@@ -215,7 +222,7 @@ class GridTraversersBenchmark< 2, Device, Real, Index >
          {
 #ifdef HAVE_CUDA
             dim3 blockSize( 32, 8 ), blocksCount, gridsCount;
-            Devices::Cuda::setupThreads(
+            Cuda::setupThreads(
                blockSize,
                blocksCount,
                gridsCount,
@@ -226,7 +233,7 @@ class GridTraversersBenchmark< 2, Device, Real, Index >
                for( gridIdx.x = 0; gridIdx.x < gridsCount.x; gridIdx.x++ )
                {
                   dim3 gridSize;
-                  Devices::Cuda::setupGrid(
+                  Cuda::setupGrid(
                      blocksCount,
                      gridsCount,
                      gridIdx,
@@ -237,7 +244,7 @@ class GridTraversersBenchmark< 2, Device, Real, Index >
                for( gridIdx.x = 0; gridIdx.x < gridsCount.x; gridIdx.x++ )
                {
                   dim3 gridSize;
-                  Devices::Cuda::setupGrid(
+                  Cuda::setupGrid(
                      blocksCount,
                      gridsCount,
                      gridIdx,
@@ -250,15 +257,15 @@ class GridTraversersBenchmark< 2, Device, Real, Index >
 
       void traverseUsingTraverser()
       {
-         //traverser.template processAllEntities< UserDataType, AddOneEntitiesProcessorType >
-         traverser.template processBoundaryEntities< UserDataType, AddTwoEntitiesProcessorType >
+         //traverser.template processAllEntities< AddOneEntitiesProcessorType >
+         traverser.template processBoundaryEntities< AddTwoEntitiesProcessorType >
             ( grid, userData );
-         traverser.template processInteriorEntities< UserDataType, AddOneEntitiesProcessorType >
+         traverser.template processInteriorEntities< AddOneEntitiesProcessorType >
             ( grid, userData );
       }
 
    protected:
-        
+
       Index size;
       Vector v;
       Real* v_data;
diff --git a/src/Benchmarks/Traversers/GridTraversersBenchmark_3D.h b/src/Benchmarks/Traversers/GridTraversersBenchmark_3D.h
index 26b6413e43edbc86a3a02490580afc834bb0bde6..8e352536665968e4832b7986ba3bfa3842c9f1a6 100644
--- a/src/Benchmarks/Traversers/GridTraversersBenchmark_3D.h
+++ b/src/Benchmarks/Traversers/GridTraversersBenchmark_3D.h
@@ -13,18 +13,21 @@
 #pragma once
 
 #include <TNL/Algorithms/ParallelFor.h>
+#include <TNL/Algorithms/contains.h>
 #include <TNL/Devices/Host.h>
 #include <TNL/Devices/Cuda.h>
 #include <TNL/Containers/Vector.h>
 #include <TNL/Meshes/Grid.h>
 #include <TNL/Meshes/GridEntityConfig.h>
 #include <TNL/Meshes/Traverser.h>
-#include <TNL/Functions/MeshFunction.h>
+#include <TNL/Functions/MeshFunctionView.h>
 #include <TNL/Pointers/SharedPointer.h>
 #include "cuda-kernels.h"
 #include "AddOneEntitiesProcessor.h"
+#include "AddTwoEntitiesProcessor.h"
 #include "BenchmarkTraverserUserData.h"
 #include "GridTraversersBenchmark.h"
+#include "GridTraverserBenchmarkHelper.h"
 #include "SimpleCell.h"
 
 namespace TNL {
@@ -42,7 +45,7 @@ class GridTraversersBenchmark< 3, Device, Real, Index >
       using GridType = Meshes::Grid< 3, Real, Device, Index >;
       using GridPointer = Pointers::SharedPointer< GridType >;
       using Coordinates = typename GridType::CoordinatesType;
-      using MeshFunction = Functions::MeshFunction< GridType >;
+      using MeshFunction = Functions::MeshFunctionView< GridType >;
       using MeshFunctionPointer = Pointers::SharedPointer< MeshFunction >;
       using CellType = typename GridType::template EntityType< 3, Meshes::GridEntityNoStencilStorage >;
       using SimpleCellType = SimpleCell< GridType >;
@@ -55,11 +58,10 @@ class GridTraversersBenchmark< 3, Device, Real, Index >
       : size( size ),
         v( size * size * size ),
         grid( size, size, size ),
-        u( grid ),
         userData( u )
       {
          v_data = v.getData();
-         u->getData().bind( v );
+         u->bind( grid, v );
       }
 
       void reset()
@@ -80,7 +82,7 @@ class GridTraversersBenchmark< 3, Device, Real, Index >
          {
 #ifdef HAVE_CUDA
             dim3 blockSize( 32, 4, 2 ), blocksCount, gridsCount;
-            Devices::Cuda::setupThreads(
+            Cuda::setupThreads(
                blockSize,
                blocksCount,
                gridsCount,
@@ -93,7 +95,7 @@ class GridTraversersBenchmark< 3, Device, Real, Index >
                   for( gridIdx.x = 0; gridIdx.x < gridsCount.x; gridIdx.x++ )
                   {
                      dim3 gridSize;
-                     Devices::Cuda::setupGrid(
+                     Cuda::setupGrid(
                         blocksCount,
                         gridsCount,
                         gridIdx,
@@ -111,8 +113,8 @@ class GridTraversersBenchmark< 3, Device, Real, Index >
          {
             data[ ( k * _size + j ) * _size + i ] += (Real) 1.0;
          };
-         
-         ParallelFor3D< Device, AsynchronousMode >::exec(
+
+         Algorithms::ParallelFor3D< Device, Algorithms::AsynchronousMode >::exec(
             ( Index ) 0,
             ( Index ) 0,
             ( Index ) 0,
@@ -135,7 +137,7 @@ class GridTraversersBenchmark< 3, Device, Real, Index >
             data[ entity.getIndex() ] += (Real) 1.0;
          };
 
-         ParallelFor3D< Device, AsynchronousMode >::exec(
+         Algorithms::ParallelFor3D< Device, Algorithms::AsynchronousMode >::exec(
             ( Index ) 0,
             ( Index ) 0,
             ( Index ) 0,
@@ -165,7 +167,7 @@ class GridTraversersBenchmark< 3, Device, Real, Index >
             _u->getData().getData()[ entity.getIndex() ] += (Real) 1.0;
          };
 
-         ParallelFor3D< Device, AsynchronousMode >::exec(
+         Algorithms::ParallelFor3D< Device, Algorithms::AsynchronousMode >::exec(
             ( Index ) 0,
             ( Index ) 0,
             ( Index ) 0,
@@ -177,15 +179,15 @@ class GridTraversersBenchmark< 3, Device, Real, Index >
 
       void addOneUsingTraverser()
       {
-         traverser.template processAllEntities< UserDataType, AddOneEntitiesProcessorType >
+         traverser.template processAllEntities< AddOneEntitiesProcessorType >
             ( grid, userData );
       }
 
       bool checkAddOne( int loops, bool reseting )
       {
          if( reseting )
-            return v.containsOnlyValue( 1.0 );
-         return v.containsOnlyValue( ( Real ) loops );
+            return Algorithms::containsOnlyValue( v, 1.0 );
+         return Algorithms::containsOnlyValue( v, ( Real ) loops );
       }
 
       void traverseUsingPureC()
@@ -221,7 +223,7 @@ class GridTraversersBenchmark< 3, Device, Real, Index >
          {
 #ifdef HAVE_CUDA
             dim3 blockSize( 32, 4, 2 ), blocksCount, gridsCount;
-            Devices::Cuda::setupThreads(
+            Cuda::setupThreads(
                blockSize,
                blocksCount,
                gridsCount,
@@ -234,7 +236,7 @@ class GridTraversersBenchmark< 3, Device, Real, Index >
                   for( gridIdx.x = 0; gridIdx.x < gridsCount.x; gridIdx.x++ )
                   {
                      dim3 gridSize;
-                     Devices::Cuda::setupGrid(
+                     Cuda::setupGrid(
                         blocksCount,
                         gridsCount,
                         gridIdx,
@@ -246,7 +248,7 @@ class GridTraversersBenchmark< 3, Device, Real, Index >
                   for( gridIdx.x = 0; gridIdx.x < gridsCount.x; gridIdx.x++ )
                   {
                      dim3 gridSize;
-                     Devices::Cuda::setupGrid(
+                     Cuda::setupGrid(
                         blocksCount,
                         gridsCount,
                         gridIdx,
@@ -260,16 +262,16 @@ class GridTraversersBenchmark< 3, Device, Real, Index >
       void traverseUsingTraverser()
       {
          // TODO !!!!!!!!!!!!!!!!!!!!!!
-         //traverser.template processAllEntities< UserDataType, AddOneEntitiesProcessorType >
+         //traverser.template processAllEntities< AddOneEntitiesProcessorType >
 
-         traverser.template processBoundaryEntities< UserDataType, AddTwoEntitiesProcessorType >
+         traverser.template processBoundaryEntities< AddTwoEntitiesProcessorType >
             ( grid, userData );
-         traverser.template processInteriorEntities< UserDataType, AddOneEntitiesProcessorType >
+         traverser.template processInteriorEntities< AddOneEntitiesProcessorType >
             ( grid, userData );
       }
 
    protected:
-      
+
       Index size;
       Vector v;
       Real* v_data;
diff --git a/src/Benchmarks/Traversers/cuda-kernels.h b/src/Benchmarks/Traversers/cuda-kernels.h
index a90baf5b02075d9c80d572bd51b93ed5cd97b391..d092925bf0bfec6192a770809e44cdecafa8abff 100644
--- a/src/Benchmarks/Traversers/cuda-kernels.h
+++ b/src/Benchmarks/Traversers/cuda-kernels.h
@@ -25,7 +25,7 @@ template< typename Real,
           typename Index >
 __global__ void fullGridTraverseKernel1D( const Index size, const dim3 gridIdx, Real* v_data  )
 {
-   const Index threadIdx_x = ( gridIdx.x * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
+   const Index threadIdx_x = ( gridIdx.x * Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
    if( threadIdx_x < size )
       v_data[ threadIdx_x ] += (Real) 1.0;
 }
@@ -34,8 +34,8 @@ template< typename Real,
           typename Index >
 __global__ void fullGridTraverseKernel2D( const Index size, const dim3 gridIdx, Real* v_data  )
 {
-   const Index threadIdx_x = ( gridIdx.x * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
-   const Index threadIdx_y = ( gridIdx.y * Devices::Cuda::getMaxGridSize() + blockIdx.y ) * blockDim.y + threadIdx.y;
+   const Index threadIdx_x = ( gridIdx.x * Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
+   const Index threadIdx_y = ( gridIdx.y * Cuda::getMaxGridSize() + blockIdx.y ) * blockDim.y + threadIdx.y;
    if( threadIdx_x < size && threadIdx_y < size )
       v_data[ threadIdx_y * size + threadIdx_x ] += (Real) 1.0;
 }
@@ -44,21 +44,21 @@ template< typename Real,
           typename Index >
 __global__ void fullGridTraverseKernel3D( const Index size, const dim3 gridIdx, Real* v_data  )
 {
-   const Index threadIdx_x = ( gridIdx.x * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
-   const Index threadIdx_y = ( gridIdx.y * Devices::Cuda::getMaxGridSize() + blockIdx.y ) * blockDim.y + threadIdx.y;
-   const Index threadIdx_z = ( gridIdx.z * Devices::Cuda::getMaxGridSize() + blockIdx.z ) * blockDim.z + threadIdx.z;
+   const Index threadIdx_x = ( gridIdx.x * Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
+   const Index threadIdx_y = ( gridIdx.y * Cuda::getMaxGridSize() + blockIdx.y ) * blockDim.y + threadIdx.y;
+   const Index threadIdx_z = ( gridIdx.z * Cuda::getMaxGridSize() + blockIdx.z ) * blockDim.z + threadIdx.z;
    if( threadIdx_x < size && threadIdx_y < size && threadIdx_z < size )
       v_data[ ( threadIdx_z * size + threadIdx_y ) * size + threadIdx_x ] += (Real) 1.0;
 }
 
 /****
- * Traversing interior cells 
+ * Traversing interior cells
  */
 template< typename Real,
           typename Index >
 __global__ void interiorTraverseKernel1D( const Index size, const dim3 gridIdx, Real* v_data  )
 {
-   const Index threadIdx_x = ( gridIdx.x * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
+   const Index threadIdx_x = ( gridIdx.x * Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
    if( threadIdx_x > 0 && threadIdx_x < size - 1 )
       v_data[ threadIdx_x ] += (Real) 1.0;
 }
@@ -67,9 +67,9 @@ template< typename Real,
           typename Index >
 __global__ void interiorTraverseKernel2D( const Index size, const dim3 gridIdx, Real* v_data  )
 {
-   const Index threadIdx_x = ( gridIdx.x * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
-   const Index threadIdx_y = ( gridIdx.y * Devices::Cuda::getMaxGridSize() + blockIdx.y ) * blockDim.y + threadIdx.y;
-   if( threadIdx_x > 0 && threadIdx_y > 0 && 
+   const Index threadIdx_x = ( gridIdx.x * Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
+   const Index threadIdx_y = ( gridIdx.y * Cuda::getMaxGridSize() + blockIdx.y ) * blockDim.y + threadIdx.y;
+   if( threadIdx_x > 0 && threadIdx_y > 0 &&
        threadIdx_x < size - 1 && threadIdx_y < size - 1 )
          v_data[ threadIdx_y * size + threadIdx_x ] += (Real) 1.0;
 }
@@ -78,9 +78,9 @@ template< typename Real,
           typename Index >
 __global__ void interiorTraverseKernel3D( const Index size, const dim3 gridIdx, Real* v_data  )
 {
-   const Index threadIdx_x = ( gridIdx.x * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
-   const Index threadIdx_y = ( gridIdx.y * Devices::Cuda::getMaxGridSize() + blockIdx.y ) * blockDim.y + threadIdx.y;
-   const Index threadIdx_z = ( gridIdx.z * Devices::Cuda::getMaxGridSize() + blockIdx.z ) * blockDim.z + threadIdx.z;
+   const Index threadIdx_x = ( gridIdx.x * Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
+   const Index threadIdx_y = ( gridIdx.y * Cuda::getMaxGridSize() + blockIdx.y ) * blockDim.y + threadIdx.y;
+   const Index threadIdx_z = ( gridIdx.z * Cuda::getMaxGridSize() + blockIdx.z ) * blockDim.z + threadIdx.z;
    if( threadIdx_x > 0 && threadIdx_y > 0 && threadIdx_z > 0 &&
        threadIdx_x < size - 1 && threadIdx_y < size - 1 && threadIdx_z < size - 1 )
       v_data[ ( threadIdx_z * size + threadIdx_y ) * size + threadIdx_x ] += (Real) 1.0;
@@ -93,7 +93,7 @@ template< typename Real,
           typename Index >
 __global__ void boundariesTraverseKernel1D( const Index size, const dim3 gridIdx, Real* v_data  )
 {
-   const Index threadIdx_x = ( gridIdx.x * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
+   const Index threadIdx_x = ( gridIdx.x * Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
    if( threadIdx_x == 0 || threadIdx_x == size - 1 )
       v_data[ threadIdx_x ] += (Real) 2.0;
 }
@@ -102,9 +102,9 @@ template< typename Real,
           typename Index >
 __global__ void boundariesTraverseKernel2D( const Index size, const dim3 gridIdx, Real* v_data  )
 {
-   const Index threadIdx_x = ( gridIdx.x * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
-   const Index threadIdx_y = ( gridIdx.y * Devices::Cuda::getMaxGridSize() + blockIdx.y ) * blockDim.y + threadIdx.y;
-   if( threadIdx_x > 0 && threadIdx_y > 0 && 
+   const Index threadIdx_x = ( gridIdx.x * Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
+   const Index threadIdx_y = ( gridIdx.y * Cuda::getMaxGridSize() + blockIdx.y ) * blockDim.y + threadIdx.y;
+   if( threadIdx_x > 0 && threadIdx_y > 0 &&
        threadIdx_x < size - 1 && threadIdx_y < size - 1 )
          v_data[ threadIdx_y * size + threadIdx_x ] += (Real) 2.0;
 }
@@ -113,9 +113,9 @@ template< typename Real,
           typename Index >
 __global__ void boundariesTraverseKernel3D( const Index size, const dim3 gridIdx, Real* v_data  )
 {
-   const Index threadIdx_x = ( gridIdx.x * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
-   const Index threadIdx_y = ( gridIdx.y * Devices::Cuda::getMaxGridSize() + blockIdx.y ) * blockDim.y + threadIdx.y;
-   const Index threadIdx_z = ( gridIdx.z * Devices::Cuda::getMaxGridSize() + blockIdx.z ) * blockDim.z + threadIdx.z;
+   const Index threadIdx_x = ( gridIdx.x * Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
+   const Index threadIdx_y = ( gridIdx.y * Cuda::getMaxGridSize() + blockIdx.y ) * blockDim.y + threadIdx.y;
+   const Index threadIdx_z = ( gridIdx.z * Cuda::getMaxGridSize() + blockIdx.z ) * blockDim.z + threadIdx.z;
    if( threadIdx_x == 0 || threadIdx_y == 0 || threadIdx_z == 0 ||
        threadIdx_x == size - 1 || threadIdx_y == size - 1 || threadIdx_z == size - 1 )
       v_data[ ( threadIdx_z * size + threadIdx_y ) * size + threadIdx_x ] += (Real) 2.0;
diff --git a/src/Benchmarks/Traversers/tnl-benchmark-traversers.h b/src/Benchmarks/Traversers/tnl-benchmark-traversers.h
index 30c364ac37f2bc32753a14c723ff2276332550c9..592098b9553d20f290ed111637820630c028ccbd 100644
--- a/src/Benchmarks/Traversers/tnl-benchmark-traversers.h
+++ b/src/Benchmarks/Traversers/tnl-benchmark-traversers.h
@@ -12,7 +12,7 @@
 
 #pragma once
 
-#include "../Benchmarks.h"
+#include <TNL/Benchmarks/Benchmarks.h>
 //#include "grid-traversing.h"
 #include "GridTraversersBenchmark.h"
 
@@ -20,25 +20,25 @@
 #include <TNL/Devices/Host.h>
 #include <TNL/Devices/Cuda.h>
 #include <TNL/Algorithms/ParallelFor.h>
-#include <TNL/Containers/List.h>
 
 using namespace TNL;
 using namespace TNL::Benchmarks;
 using namespace TNL::Benchmarks::Traversers;
 
 
+template< typename T, typename S >
+bool containsValue( const std::vector< T >& container, const S& value )
+{
+   return std::any_of( container.begin(), container.end(), [&]( const T& item ) { return item == value; } );
+}
+
 template< int Dimension,
           typename Real = float,
           typename Index = int >
 bool runBenchmark( const Config::ParameterContainer& parameters,
-                   Benchmark& benchmark,
-                   Benchmark::MetadataMap& metadata )
+                   Benchmark<>& benchmark )
 {
-   const Containers::List< String >& tests = parameters.getParameter< Containers::List< String > >( "tests" );
-   // FIXME: the --tests is just a string because list does not work with enums
-//   const Containers::List< String >& tests = parameters.getParameter< Containers::List< String > >( "tests" );
-   //Containers::List< String > tests;
-   //tests.Append( parameters.getParameter< String >( "tests" ) );
+   const std::vector< String >& tests = parameters.getParameter< std::vector< String > >( "tests" );
    // FIXME: getParameter< std::size_t >() does not work with parameters added with addEntry< int >(),
    // which have a default value. The workaround below works for int values, but it is not possible
    // to pass 64-bit integer values
@@ -49,15 +49,14 @@ bool runBenchmark( const Config::ParameterContainer& parameters,
    const bool withHost = parameters.getParameter< bool >( "with-host" );
 #ifdef HAVE_CUDA
    const bool withCuda = parameters.getParameter< bool >( "with-cuda" );
-#else
-   const bool withCuda = false;
+//#else
+//   const bool withCuda = false;
 #endif
    const bool check = parameters.getParameter< bool >( "check" );
 
    /****
     * Full grid traversing with no boundary conditions
     */
-   benchmark.newBenchmark( String("Traversing without boundary conditions" + convertToString( Dimension ) + "D" ), metadata );
    for( std::size_t size = minSize; size <= maxSize; size *= 2 )
    {
       GridTraversersBenchmark< Dimension, Devices::Host, Real, Index > hostTraverserBenchmark( size );
@@ -76,14 +75,16 @@ bool runBenchmark( const Config::ParameterContainer& parameters,
          cudaTraverserBenchmark.reset();
       };
 #endif
-      benchmark.setMetadataColumns(
-         Benchmark::MetadataColumns( 
-            {  {"size", convertToString( size ) }, } ) );
+      benchmark.setMetadataColumns({
+            { "dimension", convertToString( Dimension ) },
+            { "traverser", "without BC" },
+            { "size", convertToString( size ) },
+      });
 
       /****
        * Add one using pure C code
        */
-      if( tests.containsValue( "all" ) || tests.containsValue( "add-one-pure-c"  ) )
+      if( containsValue( tests, "all" ) || containsValue( tests, "add-one-pure-c"  ) )
       {
          benchmark.setOperation( "Pure C", 2 * pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB );
 
@@ -93,9 +94,9 @@ bool runBenchmark( const Config::ParameterContainer& parameters,
          };
          if( withHost )
          {
-            benchmark.time< Devices::Host >( hostReset, "CPU", hostWriteOneUsingPureC );
+            const BenchmarkResult result = benchmark.time< Devices::Host >( hostReset, "CPU", hostWriteOneUsingPureC );
             if( check && ! hostTraverserBenchmark.checkAddOne(
-                  benchmark.getPerformedLoops(),
+                  result.loops,
                   benchmark.isResetingOn() ) )
                benchmark.addErrorMessage( "Test results are not correct." );
          }
@@ -106,9 +107,9 @@ bool runBenchmark( const Config::ParameterContainer& parameters,
          };
          if( withCuda )
          {
-            benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaWriteOneUsingPureC );
+            const BenchmarkResult result = benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaWriteOneUsingPureC );
             if( check && ! cudaTraverserBenchmark.checkAddOne(
-                  benchmark.getPerformedLoops(),
+                  result.loops,
                   benchmark.isResetingOn() ) )
                benchmark.addErrorMessage( "Test results are not correct." );
          }
@@ -118,7 +119,7 @@ bool runBenchmark( const Config::ParameterContainer& parameters,
       /****
        * Add one using parallel for
        */
-      if( tests.containsValue( "all" ) || tests.containsValue( "add-one-parallel-for" ) )
+      if( containsValue( tests, "all" ) || containsValue( tests, "add-one-parallel-for" ) )
       {
          benchmark.setOperation( "parallel for", 2 * pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB );
 
@@ -128,9 +129,9 @@ bool runBenchmark( const Config::ParameterContainer& parameters,
          };
          if( withHost )
          {
-            benchmark.time< Devices::Host >( hostReset, "CPU", hostWriteOneUsingParallelFor );
-            if( check && ! hostTraverserBenchmark.checkAddOne( 
-                  benchmark.getPerformedLoops(),
+            const BenchmarkResult result = benchmark.time< Devices::Host >( hostReset, "CPU", hostWriteOneUsingParallelFor );
+            if( check && ! hostTraverserBenchmark.checkAddOne(
+                  result.loops,
                   benchmark.isResetingOn() ) )
                benchmark.addErrorMessage( "Test results are not correct." );
          }
@@ -142,9 +143,9 @@ bool runBenchmark( const Config::ParameterContainer& parameters,
          };
          if( withCuda )
          {
-            benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaWriteOneUsingParallelFor );
-            if( check && ! cudaTraverserBenchmark.checkAddOne( 
-                  benchmark.getPerformedLoops(),
+            const BenchmarkResult result = benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaWriteOneUsingParallelFor );
+            if( check && ! cudaTraverserBenchmark.checkAddOne(
+                  result.loops,
                   benchmark.isResetingOn() ) )
                benchmark.addErrorMessage( "Test results are not correct." );
          }
@@ -154,7 +155,7 @@ bool runBenchmark( const Config::ParameterContainer& parameters,
       /****
        * Add one using parallel for with grid entity
        */
-      if( tests.containsValue( "all" ) || tests.containsValue( "add-one-simple-cell" ) )
+      if( containsValue( tests, "all" ) || containsValue( tests, "add-one-simple-cell" ) )
       {
          auto hostAddOneUsingSimpleCell = [&] ()
          {
@@ -163,9 +164,9 @@ bool runBenchmark( const Config::ParameterContainer& parameters,
          benchmark.setOperation( "simple cell", 2 * pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB );
          if( withHost )
          {
-            benchmark.time< Devices::Host >( hostReset, "CPU", hostAddOneUsingSimpleCell );
-            if( check && ! hostTraverserBenchmark.checkAddOne( 
-                  benchmark.getPerformedLoops(),
+            const BenchmarkResult result = benchmark.time< Devices::Host >( hostReset, "CPU", hostAddOneUsingSimpleCell );
+            if( check && ! hostTraverserBenchmark.checkAddOne(
+                  result.loops,
                   benchmark.isResetingOn() ) )
                benchmark.addErrorMessage( "Test results are not correct." );
          }
@@ -176,9 +177,9 @@ bool runBenchmark( const Config::ParameterContainer& parameters,
          };
          if( withCuda )
          {
-            benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaAddOneUsingSimpleCell );
-            if( check && ! cudaTraverserBenchmark.checkAddOne( 
-                  benchmark.getPerformedLoops(),
+            const BenchmarkResult result = benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaAddOneUsingSimpleCell );
+            if( check && ! cudaTraverserBenchmark.checkAddOne(
+                  result.loops,
                   benchmark.isResetingOn() ) )
                benchmark.addErrorMessage( "Test results are not correct." );
          }
@@ -188,7 +189,7 @@ bool runBenchmark( const Config::ParameterContainer& parameters,
       /****
        * Add one using parallel for with mesh function
        */
-      if( tests.containsValue( "all" ) || tests.containsValue( "add-one-parallel-for-and-mesh-function" ) )
+      if( containsValue( tests, "all" ) || containsValue( tests, "add-one-parallel-for-and-mesh-function" ) )
       {
          auto hostAddOneUsingParallelForAndMeshFunction = [&] ()
          {
@@ -197,9 +198,9 @@ bool runBenchmark( const Config::ParameterContainer& parameters,
          benchmark.setOperation( "par.for+mesh fc.", 2 * pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB );
          if( withHost )
          {
-            benchmark.time< Devices::Host >( hostReset, "CPU", hostAddOneUsingParallelForAndMeshFunction );
-            if( check && ! hostTraverserBenchmark.checkAddOne( 
-                  benchmark.getPerformedLoops(),
+            const BenchmarkResult result = benchmark.time< Devices::Host >( hostReset, "CPU", hostAddOneUsingParallelForAndMeshFunction );
+            if( check && ! hostTraverserBenchmark.checkAddOne(
+                  result.loops,
                   benchmark.isResetingOn() ) )
                benchmark.addErrorMessage( "Test results are not correct." );
          }
@@ -210,9 +211,9 @@ bool runBenchmark( const Config::ParameterContainer& parameters,
          };
          if( withCuda )
          {
-            benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaAddOneUsingParallelForAndMeshFunction );
-            if( check && ! cudaTraverserBenchmark.checkAddOne( 
-                  benchmark.getPerformedLoops(),
+            const BenchmarkResult result = benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaAddOneUsingParallelForAndMeshFunction );
+            if( check && ! cudaTraverserBenchmark.checkAddOne(
+                  result.loops,
                   benchmark.isResetingOn() ) )
                benchmark.addErrorMessage( "Test results are not correct." );
          }
@@ -222,7 +223,7 @@ bool runBenchmark( const Config::ParameterContainer& parameters,
       /****
        * Add one using traverser
        */
-      if( tests.containsValue( "all" ) || tests.containsValue( "add-one-traverser" ) )
+      if( containsValue( tests, "all" ) || containsValue( tests, "add-one-traverser" ) )
       {
          benchmark.setOperation( "traverser", 2 * pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB );
          auto hostWriteOneUsingTraverser = [&] ()
@@ -231,9 +232,9 @@ bool runBenchmark( const Config::ParameterContainer& parameters,
          };
          if( withHost )
          {
-            benchmark.time< Devices::Host >( hostReset, "CPU", hostWriteOneUsingTraverser );
-            if( check && ! hostTraverserBenchmark.checkAddOne( 
-                  benchmark.getPerformedLoops(),
+            const BenchmarkResult result = benchmark.time< Devices::Host >( hostReset, "CPU", hostWriteOneUsingTraverser );
+            if( check && ! hostTraverserBenchmark.checkAddOne(
+                  result.loops,
                   benchmark.isResetingOn() ) )
                benchmark.addErrorMessage( "Test results are not correct." );
          }
@@ -245,9 +246,9 @@ bool runBenchmark( const Config::ParameterContainer& parameters,
          };
          if( withCuda )
          {
-            benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaWriteOneUsingTraverser );
-            if( check && ! cudaTraverserBenchmark.checkAddOne( 
-                  benchmark.getPerformedLoops(),
+            const BenchmarkResult result = benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaWriteOneUsingTraverser );
+            if( check && ! cudaTraverserBenchmark.checkAddOne(
+                  result.loops,
                   benchmark.isResetingOn() ) )
                benchmark.addErrorMessage( "Test results are not correct." );
          }
@@ -255,15 +256,10 @@ bool runBenchmark( const Config::ParameterContainer& parameters,
       }
       std::cout << "--------------------------------------------------------------------------------------------------------" << std::endl;
    }
-   return true;
-      }
-      std::cout << "--------------------------------------------------------------------------------------------------------" << std::endl;
-   }
 
    /****
     * Full grid traversing including boundary conditions
     */
-   benchmark.newBenchmark( String("Traversing with boundary conditions" + convertToString( Dimension ) + "D" ), metadata );
    for( std::size_t size = minSize; size <= maxSize; size *= 2 )
    {
       GridTraversersBenchmark< Dimension, Devices::Host, Real, Index > hostTraverserBenchmark( size );
@@ -281,9 +277,11 @@ bool runBenchmark( const Config::ParameterContainer& parameters,
       };
 #endif
 
-      benchmark.setMetadataColumns(
-         Benchmark::MetadataColumns(
-            {  {"size", convertToString( size ) }, } ) );
+      benchmark.setMetadataColumns({
+            { "dimension", convertToString( Dimension ) },
+            { "traverser", "with BC" },
+            { "size", convertToString( size ) },
+      });
 
       /****
        * Write one and two (as BC) using C for
@@ -300,7 +298,7 @@ bool runBenchmark( const Config::ParameterContainer& parameters,
       };
 #endif
 
-      if( tests.containsValue( "all" ) || tests.containsValue( "bc-pure-c" ) )
+      if( containsValue( tests, "all" ) || containsValue( tests, "bc-pure-c" ) )
       {
          benchmark.setOperation( "Pure C", 2 * pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB );
          if( withHost )
@@ -335,7 +333,7 @@ bool runBenchmark( const Config::ParameterContainer& parameters,
       };
 #endif
 
-      if( tests.containsValue( "all" ) || tests.containsValue( "bc-parallel-for" ) )
+      if( containsValue( tests, "all" ) || containsValue( tests, "bc-parallel-for" ) )
       {
          benchmark.setOperation( "parallel for", 2 * pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB );
          if( withHost )
@@ -364,7 +362,7 @@ bool runBenchmark( const Config::ParameterContainer& parameters,
 //         cudaTraverserBenchmark.addOneUsingParallelFor();
 //      };
 //
-//      if( tests.containsValue( "all" ) || tests.containsValue( "bc-parallel-for" ) )
+//      if( containsValue( tests, "all" ) || containsValue( tests, "bc-parallel-for" ) )
 //      {
 //         benchmark.setOperation( "parallel for", 2 * pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB );
 //         if( withHost )
@@ -393,8 +391,8 @@ bool runBenchmark( const Config::ParameterContainer& parameters,
          cudaTraverserBenchmark.addOneUsingTraverser();
       };
 #endif
-      
-      if( tests.containsValue( "all" ) || tests.containsValue( "bc-traverser" ) )
+
+      if( containsValue( tests, "all" ) || containsValue( tests, "bc-traverser" ) )
       {
          benchmark.setOperation( "traverser", 2 * pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB );
          if( withHost )
@@ -418,11 +416,9 @@ bool runBenchmark( const Config::ParameterContainer& parameters,
    return true;
 }
 
-void setupConfig( Config::ConfigDescription& config )
+void configSetup( Config::ConfigDescription& config )
 {
-   config.addList< String >( "tests", "Tests to be performed.", "all" );
-   // FIXME: addList does not work with addEntryEnum - ConfigDescription::addEntryEnum throws std::bad_cast
-   // config.addList< String >( "tests", "Tests to be performed.", "all" );
+   config.addList< String >( "tests", "Tests to be performed.", {"all"} );
    config.addEntryEnum( "all" );
    config.addEntryEnum( "add-one-pure-c" );
    config.addEntryEnum( "add-one-parallel-for" );
@@ -443,15 +439,15 @@ void setupConfig( Config::ConfigDescription& config )
    config.addEntryEnum( "append" );
    config.addEntryEnum( "overwrite" );
 
-   config.addEntry< String >( "precision", "Precision of the arithmetics.", "double" );
-   config.addEntryEnum( "float" );
-   config.addEntryEnum( "double" );
-   config.addEntryEnum( "all" );
+//   config.addEntry< String >( "precision", "Precision of the arithmetics.", "double" );
+//   config.addEntryEnum( "float" );
+//   config.addEntryEnum( "double" );
+//   config.addEntryEnum( "all" );
    config.addEntry< int >( "dimension", "Set the problem dimension. 0 means all dimensions 1,2 and 3.", 0 );
    config.addEntry< int >( "min-size", "Minimum size of arrays/vectors used in the benchmark.", 10 );
    config.addEntry< int >( "max-size", "Minimum size of arrays/vectors used in the benchmark.", 1000 );
-   config.addEntry< int >( "size-step-factor", "Factor determining the size of arrays/vectors used in the benchmark. First size is min-size and each following size is stepFactor*previousSize, up to max-size.", 2 );
-   Benchmark::configSetup( config );
+//   config.addEntry< int >( "size-step-factor", "Factor determining the size of arrays/vectors used in the benchmark. First size is min-size and each following size is stepFactor*previousSize, up to max-size.", 2 );
+   Benchmark<>::configSetup( config );
 
    config.addDelimiter( "Device settings:" );
    Devices::Host::configSetup( config );
@@ -463,24 +459,27 @@ bool setupBenchmark( const Config::ParameterContainer& parameters )
 {
    const String & logFileName = parameters.getParameter< String >( "log-file" );
    const String & outputMode = parameters.getParameter< String >( "output-mode" );
-   const String & precision = parameters.getParameter< String >( "precision" );
-   const unsigned sizeStepFactor = parameters.getParameter< unsigned >( "size-step-factor" );
-
-   Benchmark benchmark; //( loops, verbose );
-   benchmark.setup( parameters );
-   Benchmark::MetadataMap metadata = getHardwareMetadata();
-   runBenchmark< Dimension >( parameters, benchmark, metadata );
+//   const String & precision = parameters.getParameter< String >( "precision" );
+//   const unsigned sizeStepFactor = parameters.getParameter< unsigned >( "size-step-factor" );
 
    auto mode = std::ios::out;
    if( outputMode == "append" )
        mode |= std::ios::app;
-   std::ofstream logFile( logFileName.getString(), mode );
+   std::ofstream logFile( logFileName, mode );
+
+   // init benchmark and set parameters
+   Benchmark<> benchmark( logFile ); //( loops, verbose );
+   benchmark.setup( parameters );
+
+   // write global metadata into a separate file
+   std::map< std::string, std::string > metadata = getHardwareMetadata();
+   metadata["loops"] = convertToString( parameters.getParameter< int >( "loops" ) );
+   metadata["reset"] = convertToString( parameters.getParameter< bool >( "reset" ) );
+   metadata["minimal test time"] = convertToString( parameters.getParameter< double >( "min-time" ) );
+   writeMapAsJson( metadata, logFileName, ".metadata.json" );
+
+   runBenchmark< Dimension >( parameters, benchmark );
 
-   if( ! benchmark.save( logFile ) )
-   {
-      std::cerr << "Failed to write the benchmark results to file '" << parameters.getParameter< String >( "log-file" ) << "'." << std::endl;
-      return false;
-   }
    return true;
 }
 
@@ -488,15 +487,15 @@ int main( int argc, char* argv[] )
 {
    Config::ConfigDescription config;
    Config::ParameterContainer parameters;
-   
-   setupConfig( config );
+
+   configSetup( config );
    if( ! parseCommandLine( argc, argv, config, parameters ) )
       return EXIT_FAILURE;
 
    if( ! Devices::Host::setup( parameters ) ||
        ! Devices::Cuda::setup( parameters ) )
       return EXIT_FAILURE;
-   
+
    const int dimension = parameters.getParameter< int >( "dimension" );
    bool status( false );
    if( ! dimension )
@@ -520,7 +519,5 @@ int main( int argc, char* argv[] )
             break;
       }
    }
-   if( status == false )
-      return EXIT_FAILURE;
-   return EXIT_SUCCESS;
+   return ! status;
 }
diff --git a/src/Python/BenchmarkLogs.py b/src/Python/BenchmarkLogs.py
new file mode 100644
index 0000000000000000000000000000000000000000..8c92040988565d67015a5dd92f2ec958b7849b92
--- /dev/null
+++ b/src/Python/BenchmarkLogs.py
@@ -0,0 +1,116 @@
+#!/usr/bin/python3
+
+__all__ = [  # every name listed here must be defined in this module
+    "dict_to_html_table",
+    "get_benchmark_metadata",
+    "get_benchmark_dataframe", "gen_dataframes_per_operation",
+]
+
+import os.path
+import json
+import pandas
+
+def dict_to_html_table(data):
+    """Return an HTML table with one row per item of ``data``, ordered by key.
+
+    Keys and values are interpolated into the cells verbatim, without escaping.
+    """
+    rows = [f"\t<tr><td>{name}</td><td>{data[name]}</td></tr>\n" for name in sorted(data)]
+    table = ["<table border=1>\n", "<tbody>\n", *rows, "</tbody>\n", "</table>\n"]
+    return "".join(table)
+
+def get_benchmark_metadata(filename):
+    """Reads metadata of the benchmark in the given file.
+
+    :param str filename: path of the file with metadata or benchmark results.
+        - If it ends with ".metadata.json", metadata is read from that file.
+        - Otherwise, the extension is first replaced with ".metadata.json".
+    :returns: dict as returned by json.load, or None if the file does not exist.
+    """
+    if not filename.endswith(".metadata.json"):
+        filename = os.path.splitext(filename)[0] + ".metadata.json"
+    if not os.path.isfile(filename):
+        print(f"Metadata file {filename} does not exist")
+        return None
+    print(f"Parsing metadata from file {filename}")
+    with open(filename, "r") as metadata_file:  # closed even if parsing fails
+        return json.load(metadata_file)
+
+def get_benchmark_dataframe(logFile):
+    """Get pandas dataframe with benchmark results stored in the given log file.
+
+    :param logFile: path to the log file (one JSON record per line)
+    :returns: pandas.DataFrame instance
+    """
+    print(f"Parsing input file {logFile}")
+    with open(logFile, "r") as log:  # the handle is released right after parsing
+        df = pandas.read_json(log, orient="records", lines=True)
+
+    # convert "N/A" in the speedup column to nan
+    if "speedup" in df.columns:
+        df["speedup"] = pandas.to_numeric(df["speedup"], errors="coerce")
+
+    return df
+
+def gen_dataframes_per_operation(logFile, header_elements=None):
+    """Reads benchmark results stored in the given log file and splits them into
+    multiple dataframes according to the "operation" column.
+
+    Various post-processing steps are done on each partial dataframe:
+    - columns with only NaN values are removed
+    - the operation column is removed
+    - the "index" and "columns" of the dataframe are set:
+        - if header_elements are given, they are set as "columns" and everything
+          else is used for the index
+        - otherwise, all columns in the dataframe before "time" are used for
+          the index, and the remaining columns (starting with "time") stay as
+          "columns"
+    - the "performer" column is set as the last column of the index
+    - note that the index is not explicitly sorted, so data is ordered as in the
+      input file
+
+    :param logFile: path to the log file
+    :param header_elements: optional list with the names of the result columns
+    :yields: pairs of (str, pandas.DataFrame) object, where the str denotes the
+             particular operation name
+    """
+    main_df = get_benchmark_dataframe(logFile)
+
+    # check if there is at least one operation
+    if "operation" not in main_df.columns:
+        yield "Dummy operation", main_df
+        return
+
+    # extract all benchmark operations, preserve their order as found in the dataframe
+    operations = []
+    for op in main_df["operation"]:
+        if op not in operations:
+            operations.append(op)
+
+    # set operation as index
+    main_df = main_df.set_index("operation")
+
+    # if header_elements was not provided, we assume that "time" and all following columns
+    # are benchmark results, and all preceding columns are metadata columns that will be
+    # set as index of the dataframe
+    if header_elements is None:
+        header_elements = list(main_df.columns)
+        header_elements = header_elements[header_elements.index("time"):]
+        # FIXME: the "rows" and "columns" (in the gemv operation) are parsed after the correct header elements, because the preceding operations don't have these metadata columns
+        # TODO: each benchmark should record the header elements in the metadata file
+        header_elements = [e for e in header_elements if e not in ["rows", "columns"]]
+
+    # emit one df per operation
+    for op in operations:
+        df = main_df.loc[[op]]  # list selector: a DataFrame even when op has a single row
+        # remove columns with only NaNs
+        df = df.dropna(axis=1, how="all")
+        # remove the operation column (index)
+        df = df.reset_index(drop=True)
+        # prepare index_columns and make sure that performer is the last
+        index_columns = [c for c in df.columns if c not in header_elements and c != "performer"]
+        index_columns.append("performer")
+        # set new index for the df: all columns except header_elements
+        df = df.set_index(index_columns)
+        # emit a pair (op, df)
+        yield op, df
diff --git a/src/Python/BenchmarkPlots.py b/src/Python/BenchmarkPlots.py
new file mode 100644
index 0000000000000000000000000000000000000000..19a6e94f6cdb5090aee7b3cc62a0b7020aa2f8a4
--- /dev/null
+++ b/src/Python/BenchmarkPlots.py
@@ -0,0 +1,129 @@
+#!/usr/bin/python3
+
+__all__ = [
+    "plot_bandwidth_vs_size",
+    "heatmaps_bandwidth",
+    "get_image_html_tag",
+]
+
+import numpy
+import matplotlib.pyplot as plt
+from cycler import cycler
+import io
+import base64
+
+custom_cycler = cycler(linestyle=["-", "--", ":", "-."]) * cycler("color", ["#1f77b4", "#ff7f0e", "#2ca02c", "#d62728", "#9467bd", "#8c564b", "#e377c2", "#7f7f7f", "#bcbd22", "#17becf"])
+
+def plot_bandwidth_vs_size(df, size_name="size", prop_cycler=custom_cycler, **kwargs):
+    """Creates a bandwidth-size plot. The "size" data are expected in the index of
+    the dataframe, all other columns of the index are used for labels of the
+    graph lines.
+
+    :param df: a pandas.DataFrame instance
+    :param size_name: name of the "size" column in the index
+    :param prop_cycler:
+        property cycler for the graph lines, see the documentation for details:
+        https://matplotlib.org/stable/tutorials/intermediate/color_cycle.html
+    :param kwargs: optional keyword arguments passed to matplotlib's errorbar function
+    :returns: a tuple (fig, ax) as returned by plt.subplots()
+    """
+    # prepare the dataframe
+    assert "bandwidth" in df.columns
+    assert size_name in df.index.names
+    df = df.reset_index(level=size_name).sort_index()
+
+    # set default parameters for the plot
+    kwargs.setdefault("capsize", 4)
+
+    # plot the graph
+    fig, ax = plt.subplots()
+    ax.set_xlabel(size_name)
+    ax.set_ylabel("bandwidth [GiB/s]")
+    ax.set_prop_cycle(prop_cycler)
+    for idx in df.index.unique():
+        part = df.loc[[idx]]  # list selector: a DataFrame even for a single row
+        err = part["bandwidth"] * part["stddev/time"]
+        # idx is a tuple for a multi-level index and a scalar for a single level
+        label = ", ".join(map(str, idx)) if isinstance(idx, tuple) else str(idx)
+        ax.errorbar(part[size_name], part["bandwidth"], yerr=err, label=label, **kwargs)
+    # see https://stackoverflow.com/a/43439132
+    ax.legend(bbox_to_anchor=(1.04, 1), loc="upper left", borderaxespad=0.)
+
+    return fig, ax
+
+def heatmaps_bandwidth(df, x_name="columns", y_name="rows", *, cbar_kw=None, **kwargs):
+    """
+    Creates heatmaps of two-dimensional bandwidth data. The "size" data (i.e.
+    x_name and y_name) are expected in the index of the dataframe, all other
+    columns of the index are used to label the heatmaps. Heatmaps are generated
+    using the Python generator interface for each unique tuple of dataframe
+    index values.
+
+    :param df: a pandas.DataFrame instance
+    :param x_name: name of the column in the index to map along the x-axis
+    :param y_name: name of the column in the index to map along the y-axis
+    :param cbar_kw:
+        optional dict of arguments passed to matplotlib's colorbar function
+    :param kwargs:
+        optional keyword arguments passed to matplotlib's imshow function
+    :yields: a tuple (fig, ax) as returned by plt.subplots(), one per heatmap
+    """
+    # prepare the dataframe
+    assert "bandwidth" in df.columns
+    assert x_name in df.index.names
+    assert y_name in df.index.names
+    df = df.reset_index(level=[x_name, y_name]).sort_index()
+
+    if cbar_kw is None:
+        cbar_kw = {}
+
+    for idx in df.index.unique():
+        # drop the index (the list selector keeps a DataFrame even for one row)
+        part = df.loc[[idx]].reset_index(drop=True)
+        # get just the data we need
+        part = part[[x_name, y_name, "bandwidth"]].set_index([y_name, x_name])
+        # convert to a 2D array
+        bandwidth = part.stack().unstack(level=x_name)
+        # remove the column full of "bandwidth" from the index
+        bandwidth = bandwidth.reset_index(level=1, drop=True)
+
+        # figure setup (idx is a tuple only when several index levels remain)
+        fig, ax = plt.subplots()
+        ax.set_xlabel(x_name)
+        ax.set_ylabel(y_name)
+        label = ", ".join(map(str, idx)) if isinstance(idx, tuple) else str(idx)
+        ax.set_title(f"{label} bandwidth [GiB/s]")
+
+        # plot the heatmap and colorbar
+        im = ax.imshow(bandwidth, interpolation=None, **kwargs)
+        cbar = ax.figure.colorbar(im, ax=ax, **cbar_kw)
+        cbar.ax.set_ylabel("bandwidth", rotation=-90, va="bottom")
+
+        # set ticks and their labels (labels are passed as real sequences)
+        ax.set_xticks(numpy.arange(len(bandwidth.columns)))
+        ax.set_yticks(numpy.arange(len(bandwidth.index)))
+        ax.set_xticklabels([int(n) for n in bandwidth.columns])
+        ax.set_yticklabels([int(n) for n in bandwidth.index])
+
+        # rotate xtick labels and set their alignment
+        plt.setp(ax.get_xticklabels(), rotation=45, ha="right", rotation_mode="anchor")
+
+        yield fig, ax
+
+def get_image_html_tag(fig, format="svg"):
+    """
+    Renders the figure in memory and returns HTML markup that embeds the image.
+
+    :param fig: a matplotlib figure instance
+    :param format: output image format (passed to fig.savefig); "svg" yields the
+        SVG markup itself, any other format a base64 data URI in an <img> tag
+    """
+    buffer = io.BytesIO()
+    # bbox_inches="tight" grows the canvas so that a legend placed outside the
+    # axes is included, see https://stackoverflow.com/a/43439132
+    fig.savefig(buffer, format=format, bbox_inches="tight")
+    raw = buffer.getvalue()
+    if format != "svg":
+        encoded = base64.b64encode(raw).decode("utf-8")
+        return f"<img src=\"data:image/{format};base64,{encoded}\">"
+    return raw.decode("utf-8")
diff --git a/src/Python/CMakeLists.txt b/src/Python/CMakeLists.txt
index 505e5f19429d3b9ecf5c1e6010a3293ca347e3a4..87f2c9cc12f87d337c739aafda5c72c28bf58af8 100644
--- a/src/Python/CMakeLists.txt
+++ b/src/Python/CMakeLists.txt
@@ -6,6 +6,8 @@ set( PYTHON_SITE_PACKAGES_DIR lib/python${PYTHON_VERSION_MAJOR}.${PYTHON_VERSION
 if( PYTHONINTERP_FOUND )
    CONFIGURE_FILE( "__init__.py.in" "__init__.py" )
    INSTALL( FILES ${CMAKE_CURRENT_BINARY_DIR}/__init__.py
+                  BenchmarkLogs.py
+                  BenchmarkPlots.py
                   LogParser.py
             DESTINATION ${PYTHON_SITE_PACKAGES_DIR}/TNL )
 endif()
diff --git a/src/Python/LogParser.py b/src/Python/LogParser.py
index 0c327d71c89fa242cd869076beb0f19e531b8839..e3c9c672a82f8cbc0fc6afc0f10ee357436c6148 100644
--- a/src/Python/LogParser.py
+++ b/src/Python/LogParser.py
@@ -1,5 +1,10 @@
 #!/usr/bin/env python3
 
+import warnings
+warnings.warn("The CustomLogging format for TNL benchmarks is deprecated. Please switch your benchmark "
+              "to JsonLogging and use the tnl-benchmark-to-html.py script for post-processing.",
+              DeprecationWarning)
+
 import collections
 
 try:
diff --git a/src/TNL/Benchmarks/Benchmarks.h b/src/TNL/Benchmarks/Benchmarks.h
new file mode 100644
index 0000000000000000000000000000000000000000..4036fa6d46dc7a58fa7dfc6395a428e157532f2d
--- /dev/null
+++ b/src/TNL/Benchmarks/Benchmarks.h
@@ -0,0 +1,165 @@
+/***************************************************************************
+                          Benchmarks.h  -  description
+                             -------------------
+    begin                : Dec 30, 2015
+    copyright            : (C) 2015 by Tomas Oberhuber et al.
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+// Implemented by: Jakub Klinkovsky,
+//                 Tomas Oberhuber
+
+#pragma once
+
+#include "JsonLogging.h"
+
+#include <limits>
+
+#include <TNL/String.h>
+#include <TNL/Solvers/IterativeSolverMonitor.h>
+
+namespace TNL {
+namespace Benchmarks {
+
+const double oneGB = 1024.0 * 1024.0 * 1024.0;
+
+struct BenchmarkResult  // values of one timed computation and their representation as a table row
+{
+   using HeaderElements = typename Logging::HeaderElements;
+   using RowElements = typename Logging::RowElements;
+
+   int loops = 0;
+   double time = std::numeric_limits<double>::quiet_NaN();       // NaN until a value is assigned
+   double stddev = std::numeric_limits<double>::quiet_NaN();     // NaN until a value is assigned
+   double bandwidth = std::numeric_limits<double>::quiet_NaN();  // NaN until a value is assigned
+   double speedup = std::numeric_limits<double>::quiet_NaN();    // NaN until a value is assigned
+
+   virtual HeaderElements getTableHeader() const  // column names, in the order used by getRowElements()
+   {
+      return HeaderElements({ "time", "stddev", "stddev/time", "loops", "bandwidth", "speedup" });
+   }
+
+   virtual std::vector< int > getColumnWidthHints() const  // one width per column of getTableHeader()
+   {
+      return std::vector< int >({ 14, 14, 14, 6, 14, 14 });
+   }
+
+   virtual RowElements getRowElements() const  // values of the columns named by getTableHeader()
+   {
+      RowElements elements;
+      // write in scientific format to avoid precision loss
+      elements << std::scientific << time << stddev << stddev / time << loops << bandwidth;
+      if( speedup != 0 )  // a NaN speedup also takes this branch, only an exact zero is replaced
+         elements << speedup;
+      else
+         elements << "N/A";
+      return elements;
+   }
+};
+
+template< typename Logger = JsonLogging >
+class Benchmark
+{
+   public:
+      using MetadataElement = typename Logger::MetadataElement;
+      using MetadataColumns = typename Logger::MetadataColumns;
+      using SolverMonitorType = Solvers::IterativeSolverMonitor< double, int >;
+
+      Benchmark( std::ostream& output, int loops = 10, bool verbose = true );  // output and verbose go to the logger
+
+      static void configSetup( Config::ConfigDescription& config );  // adds "loops", "reset", "min-time", "verbose"
+
+      void setup( const Config::ParameterContainer& parameters );  // reads the entries added by configSetup
+
+      void setLoops( int loops );
+
+      void setMinTime( double minTime );
+
+      bool isResetingOn() const;  // value of the "reset" flag
+
+      // Sets metadata columns -- values used for all subsequent rows until
+      // the next call to this function.
+      void setMetadataColumns( const MetadataColumns & metadata );
+
+      // Sets the value of one metadata column -- useful for iteratively
+      // changing MetadataColumns that were set using the previous method.
+      void setMetadataElement( const typename MetadataColumns::value_type & element );
+
+      // Sets the width of metadata columns when printed to the terminal.
+      void setMetadataWidths( const std::map< std::string, int > & widths );
+
+      // Sets the dataset size and base time for the calculations of bandwidth
+      // and speedup in the benchmarks result.
+      void setDatasetSize( const double datasetSize = 0.0, // in GB
+                           const double baseTime = 0.0 );
+
+      // Sets current operation -- operations expand the table vertically
+      //  - baseTime should be reset to 0.0 for most operations, but sometimes
+      //    it is useful to override it
+      //  - Order of operations inside a "Benchmark" does not matter, rows can be
+      //    easily sorted while converting to HTML.
+      void
+      setOperation( const String & operation,
+                    const double datasetSize = 0.0, // in GB
+                    const double baseTime = 0.0 );
+
+      // Times a single ComputeFunction. Subsequent calls implicitly split
+      // the current operation into sub-columns identified by "performer",
+      // which are further split into "bandwidth", "time" and "speedup" columns.
+      template< typename Device,
+                typename ResetFunction,
+                typename ComputeFunction >
+      void time( ResetFunction reset,
+                 const String & performer,
+                 ComputeFunction & compute,
+                 BenchmarkResult & result );
+
+      template< typename Device,
+                typename ResetFunction,
+                typename ComputeFunction >
+      BenchmarkResult time( ResetFunction reset,  // same as above, the result is returned by value
+                            const String & performer,
+                            ComputeFunction & compute );
+
+      // The same methods as above but without the reset function
+      template< typename Device,
+                typename ComputeFunction >
+      void time( const String & performer,
+                 ComputeFunction & compute,
+                 BenchmarkResult & result );
+
+      template< typename Device,
+                typename ComputeFunction >
+      BenchmarkResult time( const String & performer,
+                            ComputeFunction & compute );
+
+      // Adds an error message to the log. Should be called in places where the
+      // "time" method could not be called (e.g. due to failed allocation).
+      void addErrorMessage( const std::string& message );
+
+      SolverMonitorType& getMonitor();
+
+      double getBaseTime() const;
+
+   protected:
+      Logger logger;
+
+      int loops = 1;  // replaced by the constructor argument (default 10), setLoops() and setup()
+
+      double minTime = 0.0;
+
+      double datasetSize = 0.0;  // numerator of the bandwidth computed in time()
+
+      double baseTime = 0.0;  // numerator of the speedup; 0.0 means "not measured yet" in time()
+
+      bool reset = true;  // when false, time() substitutes an empty lambda for the reset function
+
+      SolverMonitorType monitor;
+};
+
+} // namespace Benchmarks
+} // namespace TNL
+
+#include "Benchmarks.hpp"
diff --git a/src/TNL/Benchmarks/Benchmarks.hpp b/src/TNL/Benchmarks/Benchmarks.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..8aa3ae9c17f17493ba1ac1570e82aef8f4705128
--- /dev/null
+++ b/src/TNL/Benchmarks/Benchmarks.hpp
@@ -0,0 +1,234 @@
+/***************************************************************************
+                          Benchmarks.hpp  -  description
+                             -------------------
+    begin                : Jun 7, 2021
+    copyright            : (C) 2021 by Tomas Oberhuber et al.
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+// Implemented by: Jakub Klinkovsky,
+//                 Tomas Oberhuber
+
+#pragma once
+
+#include "Benchmarks.h"
+#include "Utils.h"
+
+#include <iostream>
+#include <exception>
+
+namespace TNL {
+namespace Benchmarks {
+
+// Definitions of the member functions of the Benchmark class template.
+template< typename Logger >
+Benchmark< Logger >::
+Benchmark( std::ostream& output, int loops, bool verbose )
+: logger(output, verbose), loops(loops)
+{}
+
+template< typename Logger >
+void
+Benchmark< Logger >::
+configSetup( Config::ConfigDescription& config )
+{  // registers the entries that setup() reads back, together with their defaults
+   config.addEntry< int >( "loops", "Number of iterations for every computation.", 10 );
+   config.addEntry< bool >( "reset", "Call reset function between loops.", true );
+   config.addEntry< double >( "min-time", "Minimal real time in seconds for every computation.", 0.0 );
+   config.addEntry< int >( "verbose", "Verbose mode, the higher number the more verbosity.", 1 );
+}
+
+template< typename Logger >
+void
+Benchmark< Logger >::
+setup( const Config::ParameterContainer& parameters )
+{  // copies the parsed parameters into the benchmark and forwards the verbosity to the logger
+   this->loops = parameters.getParameter< int >( "loops" );
+   this->reset = parameters.getParameter< bool >( "reset" );
+   this->minTime = parameters.getParameter< double >( "min-time" );
+   const int verbose = parameters.getParameter< int >( "verbose" );
+   logger.setVerbose( verbose );
+}
+
+template< typename Logger >
+void
+Benchmark< Logger >::
+setLoops( int loops )
+{
+   this->loops = loops;
+}
+
+template< typename Logger >
+void
+Benchmark< Logger >::
+setMinTime( double minTime )
+{
+   this->minTime = minTime;
+}
+
+template< typename Logger >
+bool
+Benchmark< Logger >::
+isResetingOn() const
+{
+   return reset;
+}
+
+template< typename Logger >
+void
+Benchmark< Logger >::
+setMetadataColumns( const MetadataColumns & metadata )
+{  // plain forwarding to the logger
+   logger.setMetadataColumns( metadata );
+}
+
+template< typename Logger >
+void
+Benchmark< Logger >::
+setMetadataElement( const typename MetadataColumns::value_type & element )
+{  // plain forwarding to the logger
+   logger.setMetadataElement( element );
+}
+
+template< typename Logger >
+void
+Benchmark< Logger >::
+setMetadataWidths( const std::map< std::string, int > & widths )
+{  // plain forwarding to the logger
+   logger.setMetadataWidths( widths );
+}
+
+template< typename Logger >
+void
+Benchmark< Logger >::
+setDatasetSize( const double datasetSize,
+                const double baseTime )
+{  // both values are used by time() to compute the bandwidth and the speedup
+   this->datasetSize = datasetSize;
+   this->baseTime = baseTime;
+}
+
+template< typename Logger >
+void
+Benchmark< Logger >::
+setOperation( const String & operation,
+              const double datasetSize,
+              const double baseTime )
+{  // the operation becomes the monitor stage and the "operation" metadata element
+   monitor.setStage( operation.getString() );
+   logger.setMetadataElement( {"operation", operation}, 0 );
+   setDatasetSize( datasetSize, baseTime );
+}
+
+template< typename Logger >
+   template< typename Device,
+             typename ResetFunction,
+             typename ComputeFunction >
+void
+Benchmark< Logger >::
+time( ResetFunction reset,
+      const String & performer,
+      ComputeFunction & compute,
+      BenchmarkResult & result )
+{
+   result.time = std::numeric_limits<double>::quiet_NaN();
+   result.stddev = std::numeric_limits<double>::quiet_NaN();
+   // NOTE(review): result.loops is not reset here, it keeps the caller's value when timeFunction throws
+   // run the monitor main loop
+   Solvers::SolverMonitorThread monitor_thread( monitor );
+   if( logger.getVerbose() <= 1 )
+      // stop the main loop when not verbose
+      monitor.stopMainLoop();
+
+   std::string errorMessage;
+   try {
+      if( this->reset )
+         std::tie( result.loops, result.time, result.stddev ) = timeFunction< Device >( compute, reset, loops, minTime, monitor );
+      else {
+         auto noReset = [] () {};  // stands in for the reset function when the "reset" flag is off
+         std::tie( result.loops, result.time, result.stddev ) = timeFunction< Device >( compute, noReset, loops, minTime, monitor );
+      }
+   }
+   catch ( const std::exception& e ) {  // the failure is reported and logged with the row, not rethrown
+      errorMessage = "timeFunction failed due to a C++ exception with description: " + std::string(e.what());
+      std::cerr << errorMessage << std::endl;
+   }
+
+   result.bandwidth = datasetSize / result.time;
+   result.speedup = this->baseTime / result.time;
+   if( this->baseTime == 0.0 )  // the first measurement becomes the base of the following speedups
+      this->baseTime = result.time;  // NOTE(review): this is NaN when the run above failed -- confirm intended
+
+   logger.logResult( performer, result.getTableHeader(), result.getRowElements(), result.getColumnWidthHints(), errorMessage );
+}
+
+template< typename Logger >
+   template< typename Device,
+             typename ResetFunction,
+             typename ComputeFunction >
+BenchmarkResult
+Benchmark< Logger >::
+time( ResetFunction reset,
+      const String& performer,
+      ComputeFunction& compute )
+{  // convenience overload: times into a fresh BenchmarkResult and returns it
+   BenchmarkResult result;
+   time< Device >( reset, performer, compute, result );
+   return result;
+}
+
+template< typename Logger >
+   template< typename Device,
+             typename ComputeFunction >
+void
+Benchmark< Logger >::
+time( const String & performer,
+      ComputeFunction & compute,
+      BenchmarkResult & result )
+{  // overload without a reset function: an empty lambda is passed instead
+   auto noReset = [] () {};
+   time< Device >( noReset, performer, compute, result );
+}
+
+template< typename Logger >
+   template< typename Device,
+             typename ComputeFunction >
+BenchmarkResult
+Benchmark< Logger >::
+time( const String & performer,
+      ComputeFunction & compute )
+{  // convenience overload: no reset function, the result is returned by value
+   BenchmarkResult result;
+   time< Device >( performer, compute, result );
+   return result;
+}
+
+template< typename Logger >
+void
+Benchmark< Logger >::
+addErrorMessage( const std::string& message )
+{  // the message goes both to the log and to the standard error output
+   logger.writeErrorMessage( message );
+   std::cerr << message << std::endl;
+}
+
+template< typename Logger >
+auto
+Benchmark< Logger >::
+getMonitor() -> SolverMonitorType&
+{
+   return monitor;
+}
+
+template< typename Logger >
+double
+Benchmark< Logger >::
+getBaseTime() const
+{
+   return baseTime;
+}
+
+} // namespace Benchmarks
+} // namespace TNL
diff --git a/src/TNL/Benchmarks/CustomLogging.h b/src/TNL/Benchmarks/CustomLogging.h
new file mode 100644
index 0000000000000000000000000000000000000000..d734ecbe15a66c69d632081a3d55fd465c20ba24
--- /dev/null
+++ b/src/TNL/Benchmarks/CustomLogging.h
@@ -0,0 +1,148 @@
+/***************************************************************************
+                          CustomLogging.h  -  description
+                             -------------------
+    begin                : May 11, 2021
+    copyright            : (C) 2021 by Tomas Oberhuber et al.
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+// Implemented by: Jakub Klinkovsky,
+//                 Tomas Oberhuber
+
+#pragma once
+
+#include "Logging.h"
+#include <TNL/Assert.h>
+
+namespace TNL {
+namespace Benchmarks {
+
+class CustomLogging
+: public Logging
+{
+public:
+   // inherit constructors
+   using Logging::Logging;
+
+   void
+   writeTableHeader( const std::string & spanningElement,
+                     const HeaderElements & subElements )
+   {  // echoes the header to stdout (when verbose and changed) and always writes it to the log
+      if( verbose && header_changed ) {
+         for( auto & it : metadataColumns ) {
+            const int width = (metadataWidths.count( it.first )) ? metadataWidths[ it.first ] : 15;
+            std::cout << std::setw( width ) << it.first;
+         }
+
+         // spanning element is printed as usual column to stdout,
+         // but is excluded from header
+         std::cout << std::setw( 15 ) << "";
+
+         for( auto & it : subElements ) {
+            std::cout << std::setw( 15 ) << it;
+         }
+         std::cout << std::endl;
+
+         header_changed = false;
+      }
+
+      // header block in the log: an empty line, then one "! " line per metadata key
+      log << std::endl;
+      for( auto & it : metadataColumns ) {
+         log << "! " << it.first << std::endl;
+      }
+
+      log << "! " << spanningElement << std::endl;
+      for( auto & it : subElements ) {
+         log << "!! " << it << std::endl;
+      }
+   }
+
+   void
+   writeTableRow( const std::string & spanningElement,
+                  const RowElements & subElements,
+                  const std::string & errorMessage )
+   {  // echoes the row to stdout when verbose and always writes it to the log
+      if( verbose ) {
+         for( auto & it : metadataColumns ) {
+            const int width = (metadataWidths.count( it.first )) ? metadataWidths[ it.first ] : 15;
+            std::cout << std::setw( width ) << it.second;
+         }
+         // spanning element is printed as usual column to stdout
+         std::cout << std::setw( 15 ) << spanningElement;
+         for( auto & it : subElements ) {
+            std::cout << std::setw( 15 ) << it;
+         }
+         std::cout << std::endl;
+      }
+
+      // metadata values are written to the log unconditionally,
+      // each element on a separate line
+      for( auto & it : metadataColumns ) {
+         log << it.second << std::endl;
+      }
+
+      if( errorMessage.empty() ) {
+         // benchmark data are indented
+         const std::string indent = "    ";
+         for( auto & it : subElements ) {
+            log << indent << it << std::endl;
+         }
+      }
+      else {
+         // write the message
+         log << errorMessage << std::endl;
+      }
+   }
+
+   virtual void
+   logResult( const std::string& performer,
+              const HeaderElements& headerElements,
+              const RowElements& rowElements,
+              const WidthHints& columnWidthHints,
+              const std::string& errorMessage = "" ) override
+   {  // checks the sizes, then writes the header followed by the row; the width hints are only checked
+      TNL_ASSERT_EQ( headerElements.size(), rowElements.size(), "elements must have equal sizes" );
+      TNL_ASSERT_EQ( headerElements.size(), columnWidthHints.size(), "elements must have equal sizes" );
+      writeTableHeader( performer, headerElements );
+      writeTableRow( performer, rowElements, errorMessage );
+   }
+
+   virtual void
+   writeErrorMessage( const std::string& message ) override
+   {
+      // an empty line followed by the metadata keys, each prefixed with "! "
+      log << std::endl;
+      for( auto & it : metadataColumns ) {
+         log << "! " << it.first << std::endl;
+      }
+
+      // metadata values follow the keys,
+      // each element on a separate line
+      for( auto & it : metadataColumns ) {
+         log << it.second << std::endl;
+      }
+
+      // write the message
+      log << message << std::endl;
+   }
+
+protected:
+   // manual double -> string conversion with optional precision and fixed notation
+   static std::string
+   _to_string( double num, int precision = 0, bool fixed = false )
+   {
+      std::stringstream str;
+      if( fixed )
+         str << std::fixed;
+      if( precision )  // a zero precision keeps the stream default
+         str << std::setprecision( precision );
+      str << num;
+      return std::string( str.str().data() );
+   }
+};
+
+} // namespace Benchmarks
+} // namespace TNL
diff --git a/src/TNL/Benchmarks/JsonLogging.h b/src/TNL/Benchmarks/JsonLogging.h
new file mode 100644
index 0000000000000000000000000000000000000000..d74ecbdf23660a0aa907b9e94307cfcf1e9c918f
--- /dev/null
+++ b/src/TNL/Benchmarks/JsonLogging.h
@@ -0,0 +1,137 @@
+/***************************************************************************
+                          JsonLogging.h  -  description
+                             -------------------
+    begin                : May 11, 2021
+    copyright            : (C) 2021 by Tomas Oberhuber et al.
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+// Implemented by: Jakub Klinkovsky,
+//                 Tomas Oberhuber
+
+#pragma once
+
+#include "Logging.h"
+#include <TNL/Assert.h>
+
+namespace TNL {
+namespace Benchmarks {
+
+class JsonLogging
+: public Logging
+{
+public:
+   // inherit constructors
+   using Logging::Logging;
+
+   // Prints the table header to stdout when verbose and the header has changed.
+   void writeHeader( const HeaderElements& headerElements, const WidthHints& widths )
+   {
+      TNL_ASSERT_EQ( headerElements.size(), widths.size(), "elements must have equal sizes" );
+      if( verbose && header_changed ) {
+         for( auto & lg : metadataColumns ) {
+            const int width = (metadataWidths.count( lg.first )) ? metadataWidths[ lg.first ] : 14;
+            std::cout << std::setw( width ) << lg.first;
+         }
+         for( std::size_t i = 0; i < headerElements.size(); i++ )
+            std::cout << std::setw( widths[ i ] ) << headerElements[ i ];
+         std::cout << std::endl;
+         header_changed = false;
+      }
+   }
+
+   // Writes one JSON object per log line and echoes the values to stdout when verbose.
+   void writeRow( const HeaderElements& headerElements,
+                  const RowElements& rowElements,
+                  const WidthHints& widths,
+                  const std::string& errorMessage )
+   {
+      TNL_ASSERT_EQ( headerElements.size(), rowElements.size(), "elements must have equal sizes" );
+      TNL_ASSERT_EQ( headerElements.size(), widths.size(), "elements must have equal sizes" );
+
+      log << "{";
+      // empty before the first key-value pair, ", " before each following one
+      const char* separator = "";
+      for( const auto& lg : this->metadataColumns ) {
+         if( verbose ) {
+            const int width = (metadataWidths.count( lg.first )) ? metadataWidths[ lg.first ] : 14;
+            std::cout << std::setw( width ) << lg.second;
+         }
+         log << separator << "\"" << escapeJson( lg.first ) << "\": \"" << escapeJson( lg.second ) << "\"";
+         separator = ", ";
+      }
+
+      std::size_t i = 0;
+      for( const auto& el : rowElements ) {
+         if( verbose )
+            std::cout << std::setw( widths[ i ] ) << el;
+         log << separator << "\"" << escapeJson( headerElements[ i ] ) << "\": \"" << escapeJson( el ) << "\"";
+         separator = ", ";
+         i++;
+      }
+      if( ! errorMessage.empty() )
+         log << separator << "\"error\": \"" << escapeJson( errorMessage ) << "\"";
+      log << "}" << std::endl;
+      if( verbose )
+         std::cout << std::endl;
+   }
+
+   // Logs one result; the performer is stored as a metadata element of the row.
+   virtual void
+   logResult( const std::string& performer,
+              const HeaderElements& headerElements,
+              const RowElements& rowElements,
+              const WidthHints& columnWidthHints,
+              const std::string& errorMessage = "" ) override
+   {
+      setMetadataElement({ "performer", performer });
+      writeHeader( headerElements, columnWidthHints );
+      writeRow( headerElements, rowElements, columnWidthHints, errorMessage );
+   }
+
+   // Writes a JSON object with the current metadata followed by the error message.
+   virtual void
+   writeErrorMessage( const std::string& message ) override
+   {
+      log << "{";
+      for( const auto& lg : this->metadataColumns )
+         log << "\"" << escapeJson( lg.first ) << "\": \"" << escapeJson( lg.second ) << "\", ";
+      log << "\"error\": \"" << escapeJson( message ) << "\"}" << std::endl;
+   }
+
+protected:
+   // Formats the value via operator<< and escapes quotes, backslashes and control characters.
+   template< typename T >
+   static std::string
+   escapeJson( const T& value )
+   {
+      std::stringstream input, output;
+      input << value;
+      for( const char c : input.str() ) {
+         if( c == '"' || c == '\\' )
+            output << '\\' << c;
+         else if( static_cast< unsigned char >( c ) < 0x20 )
+            output << "\\u" << std::hex << std::setw( 4 ) << std::setfill( '0' ) << static_cast< int >( c );
+         else
+            output << c;
+      }
+      return output.str();
+   }
+
+   // manual double -> string conversion with fixed precision
+   static std::string
+   _to_string( double num, int precision = 0, bool fixed = false )
+   {
+      std::stringstream str;
+      if( fixed )
+         str << std::fixed;
+      if( precision )
+         str << std::setprecision( precision );
+      str << num;
+      return std::string( str.str().data() );
+   }
+};
+
+} // namespace Benchmarks
+} // namespace TNL
diff --git a/src/TNL/Benchmarks/Logging.h b/src/TNL/Benchmarks/Logging.h
new file mode 100644
index 0000000000000000000000000000000000000000..693d02ee796556f4c852bfcaf19f92f921074939
--- /dev/null
+++ b/src/TNL/Benchmarks/Logging.h
@@ -0,0 +1,181 @@
+/***************************************************************************
+                          Logging.h  -  description
+                             -------------------
+    begin                : Dec 25, 2018
+    copyright            : (C) 2018 by Tomas Oberhuber et al.
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+// Implemented by: Jakub Klinkovsky,
+//                 Tomas Oberhuber
+
+#pragma once
+
+#include <list>
+#include <map>
+#include <vector>
+#include <iostream>
+#include <iomanip>
+#include <string>
+#include <fstream>
+
+namespace TNL {
+namespace Benchmarks {
+
+// Collects the elements of one row of a benchmark log as strings.
+//
+// Every value streamed with operator<< is formatted by an internal string
+// stream and appended as a new element. Stream manipulators (std::setprecision,
+// std::fixed, std::scientific, ...) only change the formatting of the values
+// streamed after them and do not create an element.
+class LoggingRowElements
+{
+   public:
+
+      // The default formatting is fixed-point notation with 6 digits.
+      LoggingRowElements()
+      {
+         stream << std::setprecision( 6 ) << std::fixed;
+      }
+
+      // Formats the value with the current stream settings and appends the
+      // result as a new element.
+      template< typename T >
+      LoggingRowElements& operator << ( const T& b )
+      {
+         stream << b;
+         elements.push_back( stream.str() );
+         // clear the buffer, the formatting flags are kept
+         stream.str( std::string() );
+         return *this;
+      }
+
+      // Applies std::setprecision without appending an element.
+      // The parameter is a const reference so that the temporary returned by
+      // std::setprecision( n ) binds to this overload; with a non-const
+      // reference the call was resolved to the generic template above, which
+      // appended a spurious empty element.
+      LoggingRowElements& operator << ( const decltype( std::setprecision( 2 ) )& setprec )
+      {
+         stream << setprec;
+         return *this;
+      }
+
+      // Applies a flag manipulator without appending an element.
+      LoggingRowElements& operator << ( decltype( std::fixed )& setfixed ) // the same works also for std::scientific
+      {
+         stream << setfixed;
+         return *this;
+      }
+
+      // Returns the number of collected elements.
+      std::size_t size() const noexcept { return elements.size(); }
+
+      // iterators over the collected elements
+      auto begin() noexcept { return elements.begin(); }
+
+      auto begin() const noexcept { return elements.begin(); }
+
+      auto cbegin() const noexcept { return elements.cbegin(); }
+
+      auto end() noexcept { return elements.end(); }
+
+      auto end() const noexcept { return elements.end(); }
+
+      auto cend() const noexcept { return elements.cend(); }
+
+   protected:
+      // formatted elements in the order of insertion
+      std::list< std::string > elements;
+
+      // formatting buffer, it keeps the precision and notation between values
+      std::stringstream stream;
+};
+
+// Abstract base class of the benchmark loggers.
+//
+// It keeps the output stream, the verbosity level and the metadata columns
+// (ordered key-value pairs) together with their width hints. Concrete loggers
+// implement logResult() and writeErrorMessage().
+class Logging
+{
+public:
+   using MetadataElement = std::pair< std::string, std::string >;
+   using MetadataColumns = std::vector< MetadataElement >;
+
+   using HeaderElements = std::vector< std::string >;
+   using RowElements = LoggingRowElements;
+   using WidthHints = std::vector< int >;
+
+   // log     - stream that receives the log; it must outlive this object
+   //           because only a reference is stored
+   // verbose - verbosity level
+   Logging( std::ostream& log, int verbose = true )
+   : log(log), verbose(verbose)
+   {
+      const auto errorMask = std::ostream::failbit | std::ostream::badbit | std::ostream::eofbit;
+
+      // check if we got a file (the pointer form of dynamic_cast yields nullptr
+      // instead of throwing std::bad_cast when the stream is not a file)
+      if( auto* file = dynamic_cast< std::ofstream* >( &log ) ) {
+         // enable exceptions, but only if we got an open file
+         // (under MPI, only the master rank typically opens the log file and thus
+         // logs from other ranks are ignored here)
+         if( file->is_open() )
+            file->exceptions( errorMask );
+      }
+      else {
+         // also enable exceptions if we did not get a file
+         log.exceptions( errorMask );
+      }
+   }
+
+   // Loggers are polymorphic, so they must be safely destructible through a
+   // pointer to this base class.
+   virtual ~Logging() = default;
+
+   void
+   setVerbose( int verbose )
+   {
+      this->verbose = verbose;
+   }
+
+   int getVerbose() const
+   {
+      return verbose;
+   }
+
+   // Replaces all metadata columns. The header is marked as changed when the
+   // number of columns or any of the keys differs from the current state.
+   virtual void setMetadataColumns( const MetadataColumns& elements )
+   {
+      // check if a header element changed (i.e. a first item of the pairs)
+      if( metadataColumns.size() != elements.size() )
+         header_changed = true;
+      else
+         for( std::size_t i = 0; i < metadataColumns.size(); i++ )
+            if( metadataColumns[ i ].first != elements[ i ].first ) {
+               header_changed = true;
+               break;
+            }
+      metadataColumns = elements;
+   }
+
+   // Sets the value of one metadata element.
+   //
+   // If the key already exists, only its value is updated. Otherwise the
+   // element is inserted and the header is marked as changed.
+   //
+   // element        - the key-value pair to set
+   // insertPosition - position of a newly inserted element; non-negative
+   //                  values count from the beginning, negative values from
+   //                  the end (-1 appends). Positions outside of the valid
+   //                  range are clamped to the nearest end.
+   virtual void
+   setMetadataElement( const typename MetadataColumns::value_type & element,
+                       int insertPosition = -1 /* negative values insert from the end */ )
+   {
+      for( auto & it : metadataColumns )
+         if( it.first == element.first ) {
+            it.second = element.second;
+            return;
+         }
+
+      // translate the position to an offset from the beginning and clamp it,
+      // because an iterator outside of [begin, end] is undefined behaviour
+      const int size = static_cast< int >( metadataColumns.size() );
+      int offset = ( insertPosition < 0 ) ? size + insertPosition + 1 : insertPosition;
+      if( offset < 0 )
+         offset = 0;
+      if( offset > size )
+         offset = size;
+
+      metadataColumns.insert( metadataColumns.begin() + offset, element );
+      header_changed = true;
+   }
+
+   // Merges the given width hints into the stored ones; widths of the keys
+   // that are already present are overwritten.
+   virtual void
+   setMetadataWidths( const std::map< std::string, int > & widths )
+   {
+      for( const auto & it : widths )
+         metadataWidths[ it.first ] = it.second;
+   }
+
+   // Logs one benchmark result; see the implementations in concrete loggers.
+   virtual void
+   logResult( const std::string& performer,
+              const HeaderElements& headerElements,
+              const RowElements& rowElements,
+              const WidthHints& columnWidthHints,
+              const std::string& errorMessage = "" ) = 0;
+
+   // Logs an error message; see the implementations in concrete loggers.
+   virtual void writeErrorMessage( const std::string& message ) = 0;
+
+protected:
+   // output stream of the log (not owned)
+   std::ostream& log;
+   int verbose = 0;
+
+   // ordered key-value pairs of metadata
+   MetadataColumns metadataColumns;
+   // width hints of the metadata columns, addressed by the key
+   std::map< std::string, int > metadataWidths;
+   // set whenever the set of metadata keys changes
+   bool header_changed = true;
+};
+
+} // namespace Benchmarks
+} // namespace TNL
diff --git a/src/TNL/Benchmarks/Utils.h b/src/TNL/Benchmarks/Utils.h
new file mode 100644
index 0000000000000000000000000000000000000000..e1e243437d70f435db9ca4ca66cda350595bf6fa
--- /dev/null
+++ b/src/TNL/Benchmarks/Utils.h
@@ -0,0 +1,186 @@
+/***************************************************************************
+                          Utils.h  -  description
+                             -------------------
+    begin                : Dec 25, 2018
+    copyright            : (C) 2018 by Tomas Oberhuber et al.
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+// Implemented by: Jakub Klinkovsky,
+//                 Tomas Oberhuber
+
+#pragma once
+
+#include <tuple>
+#include <map>
+#include <fstream>
+#include <experimental/filesystem>
+
+#include <TNL/Timer.h>
+#include <TNL/Devices/Cuda.h>
+#include <TNL/Containers/Vector.h>
+#include <TNL/Solvers/IterativeSolverMonitor.h>
+
+#include <TNL/Devices/Host.h>
+#include <TNL/SystemInfo.h>
+#include <TNL/Cuda/DeviceInfo.h>
+#include <TNL/Config/ConfigDescription.h>
+#include <TNL/MPI/Wrappers.h>
+
+namespace TNL {
+namespace Benchmarks {
+
+// Measures the execution time of the compute function.
+//
+// Returns a tuple of (loops, mean, stddev) where loops is the number of
+// performed loops (i.e. timing samples), mean is the arithmetic mean of the
+// computation times and stddev is the sample standard deviation (NaN when
+// fewer than two samples were taken).
+//
+// compute  - the measured function
+// reset    - function called before the warm-up and before each measured loop;
+//            it is not included in the measured time
+// maxLoops - the measurement performs at least this many loops
+// minTime  - the measurement continues until the total measured time reaches
+//            this value
+// monitor  - its timer is set to the internal timer and its "time" is used
+//            for reporting the current loop number
+template< typename Device,
+          typename ComputeFunction,
+          typename ResetFunction,
+          typename Monitor = TNL::Solvers::IterativeSolverMonitor< double, int > >
+std::tuple< int, double, double >
+timeFunction( ComputeFunction compute,
+              ResetFunction reset,
+              int maxLoops,
+              const double& minTime,
+              Monitor && monitor = Monitor() )
+{
+   // the timer is constructed zero-initialized and stopped
+   Timer timer;
+
+   // set timer to the monitor
+   monitor.setTimer( timer );
+
+   // warm up
+   reset();
+   compute();
+
+   // Running statistics of the samples. The number of loops is not bounded by
+   // maxLoops (the loop continues until minTime is reached), so the samples
+   // must not be stored in a buffer of maxLoops elements.
+   double total = 0.0;              // sum of all samples
+   double runningMean = 0.0;        // mean of the samples taken so far
+   double squaredDeviations = 0.0;  // sum of squared deviations from the mean
+
+   int loops;
+   for( loops = 0;
+        loops < maxLoops || total < minTime;
+        loops++ )
+   {
+      // abuse the monitor's "time" for loops
+      monitor.setTime( loops + 1 );
+      reset();
+
+      // Explicit synchronization of the CUDA device
+#ifdef HAVE_CUDA
+      if( std::is_same< Device, Devices::Cuda >::value )
+         cudaDeviceSynchronize();
+#endif
+
+      // reset timer before each computation
+      timer.reset();
+      timer.start();
+      compute();
+#ifdef HAVE_CUDA
+      if( std::is_same< Device, Devices::Cuda >::value )
+         cudaDeviceSynchronize();
+#endif
+      timer.stop();
+
+      const double sample = timer.getRealTime();
+      total += sample;
+
+      // Welford's update of the mean and of the sum of squared deviations
+      const double delta = sample - runningMean;
+      runningMean += delta / ( loops + 1 );
+      squaredDeviations += delta * ( sample - runningMean );
+   }
+
+   const double mean = total / (double) loops;
+   double stddev;
+   if( loops > 1 )
+      stddev = std::sqrt( squaredDeviations / ( loops - 1 ) );
+   else
+      stddev = std::numeric_limits<double>::quiet_NaN();
+   return std::make_tuple( loops, mean, stddev );
+}
+
+// Collects a description of the system that runs the benchmark (host, operating
+// system, CPU and, depending on the build configuration, MPI and GPU) as a map
+// of human-readable keys to string values.
+inline std::map< std::string, std::string > getHardwareMetadata()
+{
+   // all CPU queries below refer to this CPU
+   const int cpu_id = 0;
+
+   // comma-separated cache sizes in the order L1d, L1i, L2, L3
+   const CacheSizes caches = SystemInfo::getCPUCacheSizes( cpu_id );
+   std::string cacheInfo = std::to_string( caches.L1data );
+   cacheInfo += ", " + std::to_string( caches.L1instruction );
+   cacheInfo += ", " + std::to_string( caches.L2 );
+   cacheInfo += ", " + std::to_string( caches.L3 );
+
+#ifdef HAVE_CUDA
+   // "major.minor" architecture of the active GPU
+   const int activeGPU = Cuda::DeviceInfo::getActiveDevice();
+   std::string deviceArch = std::to_string( Cuda::DeviceInfo::getArchitectureMajor( activeGPU ) );
+   deviceArch += ".";
+   deviceArch += std::to_string( Cuda::DeviceInfo::getArchitectureMinor( activeGPU ) );
+#endif
+
+#ifdef HAVE_MPI
+   // check if MPI was initialized (some benchmarks do not initialize MPI even when
+   // they are built with HAVE_MPI and thus MPI::GetSize() cannot be used blindly)
+   const int nproc = TNL::MPI::Initialized() ? TNL::MPI::GetSize() : 1;
+#endif
+
+   std::map< std::string, std::string > metadata;
+
+   // host and operating system
+   metadata.insert( { "host name", SystemInfo::getHostname() } );
+   metadata.insert( { "architecture", SystemInfo::getArchitecture() } );
+   metadata.insert( { "system", SystemInfo::getSystemName() } );
+   metadata.insert( { "system release", SystemInfo::getSystemRelease() } );
+   metadata.insert( { "start time", SystemInfo::getCurrentTime() } );
+
+   // parallel environment
+#ifdef HAVE_MPI
+   metadata.insert( { "number of MPI processes", std::to_string( nproc ) } );
+#endif
+   metadata.insert( { "OpenMP enabled", std::to_string( Devices::Host::isOMPEnabled() ) } );
+   metadata.insert( { "OpenMP threads", std::to_string( Devices::Host::getMaxThreadsCount() ) } );
+
+   // CPU
+   metadata.insert( { "CPU model name", SystemInfo::getCPUModelName( cpu_id ) } );
+   metadata.insert( { "CPU cores", std::to_string( SystemInfo::getNumberOfCores( cpu_id ) ) } );
+   // NOTE(review): this division fails if getNumberOfCores() can return 0 - confirm
+   metadata.insert( { "CPU threads per core", std::to_string( SystemInfo::getNumberOfThreads( cpu_id ) / SystemInfo::getNumberOfCores( cpu_id ) ) } );
+   metadata.insert( { "CPU max frequency (MHz)", std::to_string( SystemInfo::getCPUMaxFrequency( cpu_id ) / 1e3 ) } );
+   metadata.insert( { "CPU cache sizes (L1d, L1i, L2, L3) (kiB)", cacheInfo } );
+
+   // GPU
+#ifdef HAVE_CUDA
+   metadata.insert( { "GPU name", Cuda::DeviceInfo::getDeviceName( activeGPU ) } );
+   metadata.insert( { "GPU architecture", deviceArch } );
+   metadata.insert( { "GPU CUDA cores", std::to_string( Cuda::DeviceInfo::getCudaCores( activeGPU ) ) } );
+   metadata.insert( { "GPU clock rate (MHz)", std::to_string( (double) Cuda::DeviceInfo::getClockRate( activeGPU ) / 1e3 ) } );
+   metadata.insert( { "GPU global memory (GB)", std::to_string( (double) Cuda::DeviceInfo::getGlobalMemory( activeGPU ) / 1e9 ) } );
+   metadata.insert( { "GPU memory clock rate (MHz)", std::to_string( (double) Cuda::DeviceInfo::getMemoryClockRate( activeGPU ) / 1e3 ) } );
+   metadata.insert( { "GPU memory ECC enabled", std::to_string( Cuda::DeviceInfo::getECCEnabled( activeGPU ) ) } );
+#endif
+
+   return metadata;
+}
+
+// Writes the map to the stream as a JSON object with one tab-indented
+// "key": "value" pair per line, and flushes the stream. Keys and values are
+// written verbatim, i.e. without any escaping.
+inline void writeMapAsJson( const std::map< std::string, std::string >& data,
+                            std::ostream& out )
+{
+   out << "{\n";
+
+   // the separator is written before every pair except the first one, so that
+   // no comma follows the last pair
+   const char* separator = "";
+   for( const auto& entry : data ) {
+      out << separator << "\t\"" << entry.first << "\": \"" << entry.second << "\"";
+      separator = ",\n";
+   }
+
+   // terminate the line of the last pair
+   if( ! data.empty() )
+      out << "\n";
+
+   out << "}\n" << std::flush;
+}
+
+// Writes the map as a JSON object to a file.
+//
+// data         - the key-value pairs to write
+// filename     - path of the output file
+// newExtension - when not empty, the file name is rebuilt as the parent
+//                directory of filename joined with its stem followed by
+//                newExtension (which is appended verbatim)
+//
+// Stream exceptions are enabled on the file, so I/O failures are reported
+// by exceptions.
+inline void writeMapAsJson( const std::map< std::string, std::string >& data,
+                            std::string filename,
+                            std::string newExtension = "" )
+{
+   namespace fs = std::experimental::filesystem;
+
+   if( ! newExtension.empty() ) {
+      const fs::path original( filename );
+      const fs::path renamed = original.parent_path() / ( original.stem().string() + newExtension );
+      filename = renamed.string();
+   }
+
+   std::ofstream file( filename );
+   // enable exceptions
+   const auto errorMask = std::ostream::failbit | std::ostream::badbit | std::ostream::eofbit;
+   file.exceptions( errorMask );
+   writeMapAsJson( data, file );
+}
+
+} // namespace Benchmarks
+} // namespace TNL
diff --git a/src/Tools/CMakeLists.txt b/src/Tools/CMakeLists.txt
index deb03b47562d32d4739dd99a95f69bff82becc3f..84a05dd9ca25620402d48d35b5981841025895ad 100644
--- a/src/Tools/CMakeLists.txt
+++ b/src/Tools/CMakeLists.txt
@@ -79,5 +79,6 @@ INSTALL( TARGETS tnl-init
          DESTINATION bin )
 
 INSTALL( PROGRAMS tnl-err2eoc
+                  tnl-benchmark-to-html.py
                   tnl-log-to-html.py
          DESTINATION bin )
diff --git a/src/Tools/tnl-benchmark-to-html.py b/src/Tools/tnl-benchmark-to-html.py
new file mode 100755
index 0000000000000000000000000000000000000000..70c79349129a318f7ae2f22893619eab15e2d417
--- /dev/null
+++ b/src/Tools/tnl-benchmark-to-html.py
@@ -0,0 +1,93 @@
+#!/usr/bin/python3
+
+import sys
+import os.path
+import matplotlib.pyplot as plt
+
+from TNL.BenchmarkLogs import *
+from TNL.BenchmarkPlots import *
+
+if len(sys.argv) < 2 or len(sys.argv) > 3:
+    print(f"""\
+usage: {sys.argv[0]} FILE.log [OUTPUT.html]
+
+where FILE.log contains one JSON record per line,
+and OUTPUT.html is the output file name (by default, OUTPUT=FILE).
+""", file=sys.stderr)
+    sys.exit(1)
+
+logFile = sys.argv[1]
+if len(sys.argv) > 2:
+    htmlFile = sys.argv[2]
+else:
+    htmlFile = os.path.splitext(logFile)[0] + ".html"
+
+
+metadata = get_benchmark_metadata(logFile)
+if metadata is not None and "title" in metadata:
+    title = metadata["title"]
+else:
+    title = os.path.splitext(os.path.basename(logFile))[0]
+dataframes = list(gen_dataframes_per_operation(logFile))
+
+print(f"Writing output to {htmlFile}")
+with open(htmlFile, 'w') as f:
+    print("<html>", file=f)
+    # add some basic style
+    print("""\
+<head>
+<meta charset="UTF-8">
+<style>
+    h1, h2 { border-bottom: solid 1px lightgray; }
+    table { border-collapse: collapse; }
+    table.benchmark td { text-align: end; }
+    th, td { padding: 2px; }
+</style>
+</head>
+<body>""", file=f)
+
+    print(f"<h1>{title}</h1>", file=f)
+    if metadata is not None:
+        print(dict_to_html_table(metadata), file=f)
+
+    # create a TOC
+    print(f"<h2>Table of contents</h2>", file=f)
+    print("<ol>", file=f)
+    for op, df in dataframes:
+        id = op.replace(" ", "_")
+        print(f"<li><a href=\"#{id}\">{op}</a></li>", file=f)
+    print("</ol>", file=f)
+
+    # formatters for specific columns of the table
+    formatters = {
+        "stddev": lambda value: f"{value:e}",
+        "bandwidth": lambda value: f"{value:.3f}",
+        "speedup": lambda value: f"{value:.3f}",
+    }
+
+    for op, df in dataframes:
+        # section heading
+        id = op.replace(" ", "_")
+        print(f"<h2 id=\"{id}\">{op}</h2>", file=f)
+        # table
+        print(df.to_html(classes="benchmark", formatters=formatters), file=f)
+
+        # graphs
+        size_name = None
+        if "size" in df.index.names:
+            size_name = "size"
+        elif "DOFs" in df.index.names:
+            size_name = "DOFs"
+        if size_name is not None:
+            fig, ax = plot_bandwidth_vs_size(df, size_name)
+            print(get_image_html_tag(fig, format="png"), file=f)
+            plt.close(fig)
+
+        # heatmaps
+        if "rows" in df.index.names and "columns" in df.index.names:
+            for fig, ax in heatmaps_bandwidth(df):
+                print(get_image_html_tag(fig, format="png"), file=f)
+                plt.close(fig)
+
+    print("</body>", file=f)
+    print("</html>", file=f)
diff --git a/src/Tools/tnl-log-to-html.py b/src/Tools/tnl-log-to-html.py
index bb9577298871c54464d14e111d21180e6738e71d..a42c417c2f15ab3d41af13ac14ddeaec9c6a36f8 100755
--- a/src/Tools/tnl-log-to-html.py
+++ b/src/Tools/tnl-log-to-html.py
@@ -1,5 +1,10 @@
 #!/usr/bin/env python3
 
+import warnings
+warnings.warn("The CustomLogging format for TNL benchmarks is deprecated. Please switch your benchmark "
+              "to JsonLogging and use the tnl-benchmark-to-html.py script for post-processing.",
+              DeprecationWarning)
+
 import sys
 
 from TNL.LogParser import LogParser