diff --git a/CMakeLists.txt b/CMakeLists.txt
index 851e28ec2e6db851e449ece0d037a3d40f41589c..b7540a169070b96faf9ad644d4faaf9f7f646bc5 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -29,6 +29,7 @@ option(WITH_TESTS "Build tests" ON)
 option(WITH_COVERAGE "Enable code coverage reports from unit tests" OFF)
 option(WITH_EXAMPLES "Compile the 'examples' directory" ON)
 option(WITH_TOOLS "Compile the 'src/Tools' directory" ON)
+option(WITH_BENCHMARKS "Compile the 'src/Benchmarks' directory" ON)
 option(WITH_PYTHON "Compile the Python bindings" ON)
 option(WITH_TEMPLATES_INSTANTIATION "Enable explicit template instantiation" OFF)
 
@@ -473,6 +474,7 @@ message( "   WITH_TESTS=${WITH_TESTS}" )
 message( "   WITH_COVERAGE=${WITH_COVERAGE}" )
 message( "   WITH_EXAMPLES=${WITH_EXAMPLES}" )
 message( "   WITH_TOOLS=${WITH_TOOLS}" )
+message( "   WITH_BENCHMARKS=${WITH_BENCHMARKS}" )
 message( "   WITH_PYTHON=${WITH_PYTHON}" )
 message( "   WITH_TEMPLATES_INSTANTIATION=${WITH_TEMPLATES_INSTANTIATION}" )
 # Print compiler options
diff --git a/build b/build
index 13929b20258bc8c9f5dbf7ad7cd7015ed0e35b19..e0c8dbb993e592c420aa62abd991c189fbff4870 100755
--- a/build
+++ b/build
@@ -26,6 +26,7 @@ WITH_COVERAGE="no"
 WITH_EXAMPLES="yes"
 WITH_PYTHON="yes"
 WITH_TOOLS="yes"
+WITH_BENCHMARKS="yes"
 
 WITH_TEMPLATE_INSTANTIATION="no"
 INSTANTIATE_LONG_INT="no"
@@ -60,6 +61,7 @@ do
         --with-coverage=*                ) WITH_COVERAGE="${option#*=}" ;;
         --with-examples=*                ) WITH_EXAMPLES="${option#*=}" ;;
         --with-tools=*                   ) WITH_TOOLS="${option#*=}" ;;
+        --with-benchmarks=*              ) WITH_BENCHMARKS="${option#*=}" ;;
         --with-python=*                  ) WITH_PYTHON="${option#*=}" ;;
         --with-templates-instantiation=* ) WITH_TEMPLATE_INSTANTIATION="${option#*=}" ;;
         --instantiate-long-int=*         ) INSTANTIATE_LONG_INT="${option#*=}" ;;
@@ -164,6 +166,7 @@ cmake_command=(
          -DWITH_COVERAGE=${WITH_COVERAGE}
          -DWITH_EXAMPLES=${WITH_EXAMPLES}
          -DWITH_TOOLS=${WITH_TOOLS}
+         -DWITH_BENCHMARKS=${WITH_BENCHMARKS}
          -DWITH_PYTHON=${WITH_PYTHON}
          -DDCMTK_DIR=${DCMTK_DIR}
          -DWITH_TEMPLATE_INSTANTIATION=${WITH_TEMPLATE_INSTANTIATION}
diff --git a/src/Benchmarks/BLAS/CMakeLists.txt b/src/Benchmarks/BLAS/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..dc396c46e60512b5cab5f5f7455756f609b76c47
--- /dev/null
+++ b/src/Benchmarks/BLAS/CMakeLists.txt
@@ -0,0 +1,10 @@
+if( BUILD_CUDA )
+    CUDA_ADD_EXECUTABLE( tnl-benchmark-blas tnl-benchmark-blas.cu )
+    CUDA_ADD_CUBLAS_TO_TARGET( tnl-benchmark-blas )
+    TARGET_LINK_LIBRARIES( tnl-benchmark-blas tnl )
+else()
+    ADD_EXECUTABLE( tnl-benchmark-blas tnl-benchmark-blas.cpp )
+    TARGET_LINK_LIBRARIES( tnl-benchmark-blas tnl )
+endif()
+
+install( TARGETS tnl-benchmark-blas RUNTIME DESTINATION bin )
diff --git a/src/Benchmarks/BLAS/array-operations.h b/src/Benchmarks/BLAS/array-operations.h
new file mode 100644
index 0000000000000000000000000000000000000000..aacdb9cc65315af377ddec49c167a9a197483bd5
--- /dev/null
+++ b/src/Benchmarks/BLAS/array-operations.h
@@ -0,0 +1,165 @@
+/***************************************************************************
+                          array-operations.h  -  description
+                             -------------------
+    begin                : Dec 30, 2015
+    copyright            : (C) 2015 by Tomas Oberhuber et al.
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+// Implemented by: Jakub Klinkovsky
+
+#pragma once
+
+#include "../Benchmarks.h"
+
+#include <TNL/Containers/Array.h>
+
+namespace TNL {
+namespace Benchmarks {
+
+template< typename Real = double,
+          typename Index = int >
+bool
+benchmarkArrayOperations( Benchmark & benchmark,
+                          const int & loops,
+                          const long & size )
+{
+   typedef Containers::Array< Real, Devices::Host, Index > HostArray;
+   typedef Containers::Array< Real, Devices::Cuda, Index > CudaArray;
+   using namespace std;
+
+   double datasetSize = ( double ) ( loops * size ) * sizeof( Real ) / oneGB;
+
+   HostArray hostArray, hostArray2;
+   CudaArray deviceArray, deviceArray2;
+   hostArray.setSize( size );
+   hostArray2.setSize( size );
+#ifdef HAVE_CUDA
+   deviceArray.setSize( size );
+   deviceArray2.setSize( size );
+#endif
+
+   Real resultHost, resultDevice;
+
+
+   // reset functions
+   auto reset1 = [&]() {
+      hostArray.setValue( 1.0 );
+#ifdef HAVE_CUDA
+      deviceArray.setValue( 1.0 );
+#endif
+   };
+   auto reset2 = [&]() {
+      hostArray2.setValue( 1.0 );
+#ifdef HAVE_CUDA
+      deviceArray2.setValue( 1.0 );
+#endif
+   };
+   auto reset12 = [&]() {
+      reset1();
+      reset2();
+   };
+
+
+   reset12();
+
+
+   auto compareHost = [&]() {
+      resultHost = (int) hostArray == hostArray2;
+   };
+   auto compareCuda = [&]() {
+      resultDevice = (int) deviceArray == deviceArray2;
+   };
+   benchmark.setOperation( "comparison (operator==)", 2 * datasetSize );
+   benchmark.time( reset1, "CPU", compareHost );
+#ifdef HAVE_CUDA
+   benchmark.time( reset1, "GPU", compareCuda );
+#endif
+
+
+   auto copyAssignHostHost = [&]() {
+      hostArray = hostArray2;
+   };
+   auto copyAssignCudaCuda = [&]() {
+      deviceArray = deviceArray2;
+   };
+   benchmark.setOperation( "copy (operator=)", 2 * datasetSize );
+   // copyBasetime is used later inside HAVE_CUDA guard, so the compiler will
+   // complain when compiling without CUDA
+   const double copyBasetime = benchmark.time( reset1, "CPU", copyAssignHostHost );
+#ifdef HAVE_CUDA
+   benchmark.time( reset1, "GPU", copyAssignCudaCuda );
+#endif
+
+
+   auto copyAssignHostCuda = [&]() {
+      deviceArray = hostArray;
+   };
+   auto copyAssignCudaHost = [&]() {
+      hostArray = deviceArray;
+   };
+#ifdef HAVE_CUDA
+   benchmark.setOperation( "copy (operator=)", datasetSize, copyBasetime );
+   benchmark.time( reset1,
+                   "CPU->GPU", copyAssignHostCuda,
+                   "GPU->CPU", copyAssignCudaHost );
+#endif
+
+
+   auto setValueHost = [&]() {
+      hostArray.setValue( 3.0 );
+   };
+   auto setValueCuda = [&]() {
+      deviceArray.setValue( 3.0 );
+   };
+   benchmark.setOperation( "setValue", datasetSize );
+   benchmark.time( reset1, "CPU", setValueHost );
+#ifdef HAVE_CUDA
+   benchmark.time( reset1, "GPU", setValueCuda );
+#endif
+
+
+   auto setSizeHost = [&]() {
+      hostArray.setSize( size );
+   };
+   auto setSizeCuda = [&]() {
+      deviceArray.setSize( size );
+   };
+   auto resetSize1 = [&]() {
+      hostArray.reset();
+#ifdef HAVE_CUDA
+      deviceArray.reset();
+#endif
+   };
+   benchmark.setOperation( "allocation (setSize)", datasetSize );
+   benchmark.time( resetSize1, "CPU", setSizeHost );
+#ifdef HAVE_CUDA
+   benchmark.time( resetSize1, "GPU", setSizeCuda );
+#endif
+
+
+   auto resetSizeHost = [&]() {
+      hostArray.reset();
+   };
+   auto resetSizeCuda = [&]() {
+      deviceArray.reset();
+   };
+   auto setSize1 = [&]() {
+      hostArray.setSize( size );
+#ifdef HAVE_CUDA
+      deviceArray.setSize( size );
+#endif
+   };
+   benchmark.setOperation( "deallocation (reset)", datasetSize );
+   benchmark.time( setSize1, "CPU", resetSizeHost );
+#ifdef HAVE_CUDA
+   benchmark.time( setSize1, "GPU", resetSizeCuda );
+#endif
+
+   return true;
+}
+
+} // namespace Benchmarks
+} // namespace TNL
diff --git a/tests/benchmarks/cublasWrappers.h b/src/Benchmarks/BLAS/cublasWrappers.h
similarity index 74%
rename from tests/benchmarks/cublasWrappers.h
rename to src/Benchmarks/BLAS/cublasWrappers.h
index 6b71a4ed7befc12664440cf5425f5975e6feec0c..1e63e139d6faa513706ed1b18a207e87ea1a079d 100644
--- a/tests/benchmarks/cublasWrappers.h
+++ b/src/Benchmarks/BLAS/cublasWrappers.h
@@ -8,14 +8,14 @@ inline cublasStatus_t
 cublasIgamax( cublasHandle_t handle, int n,
               const float           *x, int incx, int *result )
 {
-    return cublasIsamax( handle, n, x, incx, result );
+   return cublasIsamax( handle, n, x, incx, result );
 }
 
 inline cublasStatus_t
 cublasIgamax( cublasHandle_t handle, int n,
               const double          *x, int incx, int *result )
 {
-    return cublasIdamax( handle, n, x, incx, result );
+   return cublasIdamax( handle, n, x, incx, result );
 }
 
 
@@ -23,14 +23,14 @@ inline cublasStatus_t
 cublasIgamin( cublasHandle_t handle, int n,
               const float           *x, int incx, int *result )
 {
-    return cublasIsamin( handle, n, x, incx, result );
+   return cublasIsamin( handle, n, x, incx, result );
 }
 
 inline cublasStatus_t
 cublasIgamin( cublasHandle_t handle, int n,
               const double          *x, int incx, int *result )
 {
-    return cublasIdamin( handle, n, x, incx, result );
+   return cublasIdamin( handle, n, x, incx, result );
 }
 
 
@@ -38,14 +38,14 @@ inline cublasStatus_t
 cublasGasum( cublasHandle_t handle, int n,
              const float           *x, int incx, float  *result )
 {
-    return cublasSasum( handle, n, x, incx, result );
+   return cublasSasum( handle, n, x, incx, result );
 }
 
 inline cublasStatus_t
 cublasGasum( cublasHandle_t handle, int n,
              const double          *x, int incx, double *result )
 {
-    return cublasDasum( handle, n, x, incx, result );
+   return cublasDasum( handle, n, x, incx, result );
 }
 
 
@@ -55,7 +55,7 @@ cublasGaxpy( cublasHandle_t handle, int n,
              const float           *x, int incx,
              float                 *y, int incy )
 {
-    return cublasSaxpy( handle, n, alpha, x, incx, y, incy );
+   return cublasSaxpy( handle, n, alpha, x, incx, y, incy );
 }
 
 inline cublasStatus_t
@@ -64,7 +64,7 @@ cublasGaxpy( cublasHandle_t handle, int n,
              const double          *x, int incx,
              double                *y, int incy )
 {
-    return cublasDaxpy( handle, n, alpha, x, incx, y, incy );
+   return cublasDaxpy( handle, n, alpha, x, incx, y, incy );
 }
 
 
@@ -74,7 +74,7 @@ cublasGdot( cublasHandle_t handle, int n,
             const float        *y, int incy,
             float         *result )
 {
-    return cublasSdot( handle, n, x, incx, y, incy, result );
+   return cublasSdot( handle, n, x, incx, y, incy, result );
 }
 
 inline cublasStatus_t
@@ -83,7 +83,7 @@ cublasGdot( cublasHandle_t handle, int n,
             const double       *y, int incy,
             double        *result )
 {
-    return cublasDdot( handle, n, x, incx, y, incy, result );
+   return cublasDdot( handle, n, x, incx, y, incy, result );
 }
 
 
@@ -91,14 +91,14 @@ inline cublasStatus_t
 cublasGnrm2( cublasHandle_t handle, int n,
              const float           *x, int incx, float  *result )
 {
-    return cublasSnrm2( handle, n, x, incx, result );
+   return cublasSnrm2( handle, n, x, incx, result );
 }
 
 inline cublasStatus_t
 cublasGnrm2( cublasHandle_t handle, int n,
              const double          *x, int incx, double *result )
 {
-    return cublasDnrm2( handle, n, x, incx, result );
+   return cublasDnrm2( handle, n, x, incx, result );
 }
 
 
@@ -107,7 +107,7 @@ cublasGscal( cublasHandle_t handle, int n,
              const float           *alpha,
              float           *x, int incx )
 {
-    return cublasSscal( handle, n, alpha, x, incx );
+   return cublasSscal( handle, n, alpha, x, incx );
 }
 
 inline cublasStatus_t
@@ -115,7 +115,7 @@ cublasGscal( cublasHandle_t handle, int n,
              const double          *alpha,
              double          *x, int incx )
 {
-    return cublasDscal( handle, n, alpha, x, incx );
+   return cublasDscal( handle, n, alpha, x, incx );
 }
 
 #endif
diff --git a/src/Benchmarks/BLAS/spmv.h b/src/Benchmarks/BLAS/spmv.h
new file mode 100644
index 0000000000000000000000000000000000000000..a6840af9f8c1f1dedeee876fd2bca258a55c64ba
--- /dev/null
+++ b/src/Benchmarks/BLAS/spmv.h
@@ -0,0 +1,189 @@
+/***************************************************************************
+                          spmv.h  -  description
+                             -------------------
+    begin                : Dec 30, 2015
+    copyright            : (C) 2015 by Tomas Oberhuber et al.
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+// Implemented by: Jakub Klinkovsky
+
+#pragma once
+
+#include "../Benchmarks.h"
+
+#include <TNL/Containers/List.h>
+#include <TNL/Pointers/DevicePointer.h>
+#include <TNL/Matrices/CSR.h>
+#include <TNL/Matrices/Ellpack.h>
+#include <TNL/Matrices/SlicedEllpack.h>
+#include <TNL/Matrices/ChunkedEllpack.h>
+
+namespace TNL {
+namespace Benchmarks {
+
+// silly alias to match the number of template parameters with other formats
+template< typename Real, typename Device, typename Index >
+using SlicedEllpack = Matrices::SlicedEllpack< Real, Device, Index >;
+
+template< typename Matrix >
+int setHostTestMatrix( Matrix& matrix,
+                       const int elementsPerRow )
+{
+   const int size = matrix.getRows();
+   int elements( 0 );
+   for( int row = 0; row < size; row++ ) {
+      int col = row - elementsPerRow / 2;
+      for( int element = 0; element < elementsPerRow; element++ ) {
+         if( col + element >= 0 &&
+            col + element < size )
+         {
+            matrix.setElement( row, col + element, element + 1 );
+            elements++;
+         }
+      }
+   }
+   return elements;
+}
+
+#ifdef HAVE_CUDA
+template< typename Matrix >
+__global__ void setCudaTestMatrixKernel( Matrix* matrix,
+                                         const int elementsPerRow,
+                                         const int gridIdx )
+{
+   const int rowIdx = ( gridIdx * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
+   if( rowIdx >= matrix->getRows() )
+      return;
+   int col = rowIdx - elementsPerRow / 2;
+   for( int element = 0; element < elementsPerRow; element++ ) {
+      if( col + element >= 0 &&
+         col + element < matrix->getColumns() )
+         matrix->setElementFast( rowIdx, col + element, element + 1 );
+   }
+}
+#endif
+
+template< typename Matrix >
+void setCudaTestMatrix( Matrix& matrix,
+                        const int elementsPerRow )
+{
+#ifdef HAVE_CUDA
+   typedef typename Matrix::IndexType IndexType;
+   typedef typename Matrix::RealType RealType;
+   Pointers::DevicePointer< Matrix > kernel_matrix( matrix );
+   dim3 cudaBlockSize( 256 ), cudaGridSize( Devices::Cuda::getMaxGridSize() );
+   const IndexType cudaBlocks = roundUpDivision( matrix.getRows(), cudaBlockSize.x );
+   const IndexType cudaGrids = roundUpDivision( cudaBlocks, Devices::Cuda::getMaxGridSize() );
+   for( IndexType gridIdx = 0; gridIdx < cudaGrids; gridIdx++ ) {
+      if( gridIdx == cudaGrids - 1 )
+         cudaGridSize.x = cudaBlocks % Devices::Cuda::getMaxGridSize();
+      setCudaTestMatrixKernel< Matrix >
+         <<< cudaGridSize, cudaBlockSize >>>
+         ( &kernel_matrix.template modifyData< Devices::Cuda >(), elementsPerRow, gridIdx );
+        TNL_CHECK_CUDA_DEVICE;
+   }
+#endif
+}
+
+
+// TODO: rename as benchmark_SpMV_synthetic and move to spmv-synthetic.h
+template< typename Real,
+          template< typename, typename, typename > class Matrix,
+          template< typename, typename, typename > class Vector = Containers::Vector >
+bool
+benchmarkSpMV( Benchmark & benchmark,
+               const int & loops,
+               const int & size,
+               const int elementsPerRow = 5 )
+{
+   typedef Matrix< Real, Devices::Host, int > HostMatrix;
+   typedef Matrix< Real, Devices::Cuda, int > DeviceMatrix;
+   typedef Containers::Vector< Real, Devices::Host, int > HostVector;
+   typedef Containers::Vector< Real, Devices::Cuda, int > CudaVector;
+
+   HostMatrix hostMatrix;
+   DeviceMatrix deviceMatrix;
+   Containers::Vector< int, Devices::Host, int > hostRowLengths;
+   Containers::Vector< int, Devices::Cuda, int > deviceRowLengths;
+   HostVector hostVector, hostVector2;
+   CudaVector deviceVector, deviceVector2;
+
+   // create benchmark group
+   Containers::List< String > parsedType;
+   parseObjectType( HostMatrix::getType(), parsedType );
+   benchmark.createHorizontalGroup( parsedType[ 0 ], 2 );
+
+   hostRowLengths.setSize( size );
+   hostMatrix.setDimensions( size, size );
+   hostVector.setSize( size );
+   hostVector2.setSize( size );
+#ifdef HAVE_CUDA
+   deviceRowLengths.setSize( size );
+   deviceMatrix.setDimensions( size, size );
+   deviceVector.setSize( size );
+   deviceVector2.setSize( size );
+#endif
+
+   hostRowLengths.setValue( elementsPerRow );
+#ifdef HAVE_CUDA
+   deviceRowLengths.setValue( elementsPerRow );
+#endif
+
+   hostMatrix.setCompressedRowLengths( hostRowLengths );
+#ifdef HAVE_CUDA
+   deviceMatrix.setCompressedRowLengths( deviceRowLengths );
+#endif
+
+   const int elements = setHostTestMatrix< HostMatrix >( hostMatrix, elementsPerRow );
+   setCudaTestMatrix< DeviceMatrix >( deviceMatrix, elementsPerRow );
+   const double datasetSize = ( double ) loops * elements * ( 2 * sizeof( Real ) + sizeof( int ) ) / oneGB;
+
+   // reset function
+   auto reset = [&]() {
+      hostVector.setValue( 1.0 );
+      hostVector2.setValue( 0.0 );
+#ifdef HAVE_CUDA
+      deviceVector.setValue( 1.0 );
+      deviceVector2.setValue( 0.0 );
+#endif
+   };
+
+   // compute functions
+   auto spmvHost = [&]() {
+      hostMatrix.vectorProduct( hostVector, hostVector2 );
+   };
+   auto spmvCuda = [&]() {
+      deviceMatrix.vectorProduct( deviceVector, deviceVector2 );
+   };
+
+   benchmark.setOperation( datasetSize );
+   benchmark.time( reset, "CPU", spmvHost );
+#ifdef HAVE_CUDA
+   benchmark.time( reset, "GPU", spmvCuda );
+#endif
+
+   return true;
+}
+
+template< typename Real = double,
+          typename Index = int >
+bool
+benchmarkSpmvSynthetic( Benchmark & benchmark,
+                        const int & loops,
+                        const int & size,
+                        const int & elementsPerRow )
+{
+   bool result = true;
+   // TODO: benchmark all formats from tnl-benchmark-spmv (different parameters of the base formats)
+   result |= benchmarkSpMV< Real, Matrices::CSR >( benchmark, loops, size, elementsPerRow );
+   result |= benchmarkSpMV< Real, Matrices::Ellpack >( benchmark, loops, size, elementsPerRow );
+   result |= benchmarkSpMV< Real, SlicedEllpack >( benchmark, loops, size, elementsPerRow );
+   result |= benchmarkSpMV< Real, Matrices::ChunkedEllpack >( benchmark, loops, size, elementsPerRow );
+   return result;
+}
+
+} // namespace Benchmarks
+} // namespace TNL
diff --git a/tests/benchmarks/tnl-benchmark-blas.cpp b/src/Benchmarks/BLAS/tnl-benchmark-blas.cpp
similarity index 100%
rename from tests/benchmarks/tnl-benchmark-blas.cpp
rename to src/Benchmarks/BLAS/tnl-benchmark-blas.cpp
diff --git a/tests/benchmarks/tnl-benchmark-blas.cu b/src/Benchmarks/BLAS/tnl-benchmark-blas.cu
similarity index 100%
rename from tests/benchmarks/tnl-benchmark-blas.cu
rename to src/Benchmarks/BLAS/tnl-benchmark-blas.cu
diff --git a/src/Benchmarks/BLAS/tnl-benchmark-blas.h b/src/Benchmarks/BLAS/tnl-benchmark-blas.h
new file mode 100644
index 0000000000000000000000000000000000000000..73ea0b375a23440105ea6dfbae599c63fb108adb
--- /dev/null
+++ b/src/Benchmarks/BLAS/tnl-benchmark-blas.h
@@ -0,0 +1,192 @@
+/***************************************************************************
+                          tnl-benchmark-blas.h  -  description
+                             -------------------
+    begin                : Jan 27, 2010
+    copyright            : (C) 2010 by Tomas Oberhuber et al.
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+// Implemented by: Jakub Klinkovsky
+
+#pragma once
+
+#include <TNL/Devices/Host.h>
+#include <TNL/Devices/CudaDeviceInfo.h>
+#include <TNL/Devices/SystemInfo.h>
+#include <TNL/Config/ConfigDescription.h>
+#include <TNL/Config/ParameterContainer.h>
+
+#include "array-operations.h"
+#include "vector-operations.h"
+#include "spmv.h"
+
+using namespace TNL;
+using namespace TNL::Benchmarks;
+
+
+// TODO: should benchmarks check the result of the computation?
+
+
+template< typename Real >
+void
+runBlasBenchmarks( Benchmark & benchmark,
+                   Benchmark::MetadataMap metadata,
+                   const std::size_t & minSize,
+                   const std::size_t & maxSize,
+                   const double & sizeStepFactor,
+                   const unsigned & loops,
+                   const unsigned & elementsPerRow )
+{
+   const String precision = getType< Real >();
+   metadata["precision"] = precision;
+
+   // Array operations
+   benchmark.newBenchmark( String("Array operations (") + precision + ")",
+                           metadata );
+   for( std::size_t size = minSize; size <= maxSize; size *= 2 ) {
+      benchmark.setMetadataColumns( Benchmark::MetadataColumns({
+         {"size", size},
+      } ));
+      benchmarkArrayOperations< Real >( benchmark, loops, size );
+   }
+
+   // Vector operations
+   benchmark.newBenchmark( String("Vector operations (") + precision + ")",
+                           metadata );
+   for( std::size_t size = minSize; size <= maxSize; size *= sizeStepFactor ) {
+      benchmark.setMetadataColumns( Benchmark::MetadataColumns({
+         {"size", size},
+      } ));
+      benchmarkVectorOperations< Real >( benchmark, loops, size );
+   }
+
+   // Sparse matrix-vector multiplication
+   benchmark.newBenchmark( String("Sparse matrix-vector multiplication (") + precision + ")",
+                           metadata );
+   for( std::size_t size = minSize; size <= maxSize; size *= 2 ) {
+      benchmark.setMetadataColumns( Benchmark::MetadataColumns({
+         {"rows", size},
+         {"columns", size},
+         {"elements per row", elementsPerRow},
+      } ));
+      benchmarkSpmvSynthetic< Real >( benchmark, loops, size, elementsPerRow );
+   }
+}
+
+void
+setupConfig( Config::ConfigDescription & config )
+{
+   config.addDelimiter( "Benchmark settings:" );
+   config.addEntry< String >( "log-file", "Log file name.", "tnl-benchmark-blas.log");
+   config.addEntry< String >( "output-mode", "Mode for opening the log file.", "overwrite" );
+   config.addEntryEnum( "append" );
+   config.addEntryEnum( "overwrite" );
+   config.addEntry< String >( "precision", "Precision of the arithmetics.", "double" );
+   config.addEntryEnum( "float" );
+   config.addEntryEnum( "double" );
+   config.addEntryEnum( "all" );
+   config.addEntry< int >( "min-size", "Minimum size of arrays/vectors used in the benchmark.", 100000 );
+   config.addEntry< int >( "max-size", "Minimum size of arrays/vectors used in the benchmark.", 10000000 );
+   config.addEntry< int >( "size-step-factor", "Factor determining the size of arrays/vectors used in the benchmark. First size is min-size and each following size is stepFactor*previousSize, up to max-size.", 2 );
+   config.addEntry< int >( "loops", "Number of iterations for every computation.", 10 );
+   config.addEntry< int >( "elements-per-row", "Number of elements per row of the sparse matrix used in the matrix-vector multiplication benchmark.", 5 );
+   config.addEntry< int >( "verbose", "Verbose mode.", 1 );
+
+   config.addDelimiter( "Device settings:" );
+   Devices::Host::configSetup( config );
+   Devices::Cuda::configSetup( config );
+}
+
+int
+main( int argc, char* argv[] )
+{
+   Config::ParameterContainer parameters;
+   Config::ConfigDescription conf_desc;
+
+   setupConfig( conf_desc );
+
+   if( ! parseCommandLine( argc, argv, conf_desc, parameters ) ) {
+      conf_desc.printUsage( argv[ 0 ] );
+      return 1;
+   }
+
+   Devices::Host::setup( parameters );
+   Devices::Cuda::setup( parameters );
+
+   const String & logFileName = parameters.getParameter< String >( "log-file" );
+   const String & outputMode = parameters.getParameter< String >( "output-mode" );
+   const String & precision = parameters.getParameter< String >( "precision" );
+   // FIXME: getParameter< std::size_t >() does not work with parameters added with addEntry< int >(),
+   // which have a default value. The workaround below works for int values, but it is not possible
+   // to pass 64-bit integer values
+//   const std::size_t minSize = parameters.getParameter< std::size_t >( "min-size" );
+//   const std::size_t maxSize = parameters.getParameter< std::size_t >( "max-size" );
+   const std::size_t minSize = parameters.getParameter< int >( "min-size" );
+   const std::size_t maxSize = parameters.getParameter< int >( "max-size" );
+   const unsigned sizeStepFactor = parameters.getParameter< unsigned >( "size-step-factor" );
+   const unsigned loops = parameters.getParameter< unsigned >( "loops" );
+   const unsigned elementsPerRow = parameters.getParameter< unsigned >( "elements-per-row" );
+   const unsigned verbose = parameters.getParameter< unsigned >( "verbose" );
+
+   if( sizeStepFactor <= 1 ) {
+       std::cerr << "The value of --size-step-factor must be greater than 1." << std::endl;
+       return EXIT_FAILURE;
+   }
+
+   // open log file
+   auto mode = std::ios::out;
+   if( outputMode == "append" )
+       mode |= std::ios::app;
+   std::ofstream logFile( logFileName.getString(), mode );
+
+   // init benchmark and common metadata
+   Benchmark benchmark( loops, verbose );
+
+   // prepare global metadata
+   const int cpu_id = 0;
+   Devices::CacheSizes cacheSizes = Devices::SystemInfo::getCPUCacheSizes( cpu_id );
+   String cacheInfo = String( cacheSizes.L1data ) + ", "
+                       + String( cacheSizes.L1instruction ) + ", "
+                       + String( cacheSizes.L2 ) + ", "
+                       + String( cacheSizes.L3 );
+#ifdef HAVE_CUDA
+   const int activeGPU = Devices::CudaDeviceInfo::getActiveDevice();
+   const String deviceArch = String( Devices::CudaDeviceInfo::getArchitectureMajor( activeGPU ) ) + "." +
+                             String( Devices::CudaDeviceInfo::getArchitectureMinor( activeGPU ) );
+#endif
+   Benchmark::MetadataMap metadata {
+      { "host name", Devices::SystemInfo::getHostname() },
+      { "architecture", Devices::SystemInfo::getArchitecture() },
+      { "system", Devices::SystemInfo::getSystemName() },
+      { "system release", Devices::SystemInfo::getSystemRelease() },
+      { "start time", Devices::SystemInfo::getCurrentTime() },
+      { "CPU model name", Devices::SystemInfo::getCPUModelName( cpu_id ) },
+      { "CPU cores", Devices::SystemInfo::getNumberOfCores( cpu_id ) },
+      { "CPU threads per core", Devices::SystemInfo::getNumberOfThreads( cpu_id ) / Devices::SystemInfo::getNumberOfCores( cpu_id ) },
+      { "CPU max frequency (MHz)", Devices::SystemInfo::getCPUMaxFrequency( cpu_id ) / 1e3 },
+      { "CPU cache sizes (L1d, L1i, L2, L3) (kiB)", cacheInfo },
+#ifdef HAVE_CUDA
+      { "GPU name", Devices::CudaDeviceInfo::getDeviceName( activeGPU ) },
+      { "GPU architecture", deviceArch },
+      { "GPU CUDA cores", Devices::CudaDeviceInfo::getCudaCores( activeGPU ) },
+      { "GPU clock rate (MHz)", (double) Devices::CudaDeviceInfo::getClockRate( activeGPU ) / 1e3 },
+      { "GPU global memory (GB)", (double) Devices::CudaDeviceInfo::getGlobalMemory( activeGPU ) / 1e9 },
+      { "GPU memory clock rate (MHz)", (double) Devices::CudaDeviceInfo::getMemoryClockRate( activeGPU ) / 1e3 },
+      { "GPU memory ECC enabled", Devices::CudaDeviceInfo::getECCEnabled( activeGPU ) },
+#endif
+   };
+
+   if( precision == "all" || precision == "float" )
+      runBlasBenchmarks< float >( benchmark, metadata, minSize, maxSize, sizeStepFactor, loops, elementsPerRow );
+   if( precision == "all" || precision == "double" )
+      runBlasBenchmarks< double >( benchmark, metadata, minSize, maxSize, sizeStepFactor, loops, elementsPerRow );
+
+   if( ! benchmark.save( logFile ) ) {
+      std::cerr << "Failed to write the benchmark results to file '" << parameters.getParameter< String >( "log-file" ) << "'." << std::endl;
+      return EXIT_FAILURE;
+   }
+
+   return EXIT_SUCCESS;
+}
diff --git a/src/Benchmarks/BLAS/vector-operations.h b/src/Benchmarks/BLAS/vector-operations.h
new file mode 100644
index 0000000000000000000000000000000000000000..e65f8980b1066e042206e328d15b50e32c81432f
--- /dev/null
+++ b/src/Benchmarks/BLAS/vector-operations.h
@@ -0,0 +1,440 @@
+/***************************************************************************
+                          vector-operations.h  -  description
+                             -------------------
+    begin                : Dec 30, 2015
+    copyright            : (C) 2015 by Tomas Oberhuber et al.
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+// Implemented by: Jakub Klinkovsky
+
+#pragma once
+
+#include <stdlib.h> // srand48
+
+#include "../Benchmarks.h"
+
+#include <TNL/Containers/Vector.h>
+
+#ifdef HAVE_CUDA
+#include "cublasWrappers.h"
+#endif
+
+namespace TNL {
+namespace Benchmarks {
+
+template< typename Real = double,
+          typename Index = int >
+bool
+benchmarkVectorOperations( Benchmark & benchmark,
+                           const int & loops,
+                           const long & size )
+{
+   typedef Containers::Vector< Real, Devices::Host, Index > HostVector;
+   typedef Containers::Vector< Real, Devices::Cuda, Index > CudaVector;
+   using namespace std;
+
+   double datasetSize = ( double ) ( loops * size ) * sizeof( Real ) / oneGB;
+
+   HostVector hostVector, hostVector2;
+   CudaVector deviceVector, deviceVector2;
+   hostVector.setSize( size );
+   hostVector2.setSize( size );
+#ifdef HAVE_CUDA
+   deviceVector.setSize( size );
+   deviceVector2.setSize( size );
+#endif
+
+   Real resultHost, resultDevice;
+
+#ifdef HAVE_CUDA
+   cublasHandle_t cublasHandle;
+   cublasCreate( &cublasHandle );
+#endif
+
+
+   // reset functions
+   // (Make sure to always use some in benchmarks, even if it's not necessary
+   // to assure correct result - it helps to clear cache and avoid optimizations
+   // of the benchmark loop.)
+   auto reset1 = [&]() {
+      hostVector.setValue( 1.0 );
+#ifdef HAVE_CUDA
+      deviceVector.setValue( 1.0 );
+#endif
+      // A relatively harmless call to keep the compiler from realizing we
+      // don't actually do any useful work with the result of the reduciton.
+      srand48(resultHost);
+      resultHost = resultDevice = 0.0;
+   };
+   auto reset2 = [&]() {
+      hostVector2.setValue( 1.0 );
+#ifdef HAVE_CUDA
+      deviceVector2.setValue( 1.0 );
+#endif
+   };
+   auto reset12 = [&]() {
+      reset1();
+      reset2();
+   };
+
+
+   reset12();
+
+
+   auto maxHost = [&]() {
+      resultHost = hostVector.max();
+   };
+   auto maxHostGeneral = [&]() {
+      Real result( 0 );
+      Containers::Algorithms::ParallelReductionMax< Real > operation;
+      Containers::Algorithms::Reduction< Devices::Host >::reduce(
+              operation,
+              hostVector.getSize(),
+              hostVector.getData(),
+              ( Real* ) 0,
+              result );
+      return result;
+   };
+   auto maxCuda = [&]() {
+      resultDevice = deviceVector.max();
+   };
+   benchmark.setOperation( "max", datasetSize );
+   benchmark.time( reset1, "CPU", maxHost );
+   benchmark.time( reset1, "CPU (general)", maxHostGeneral );
+#ifdef HAVE_CUDA
+   benchmark.time( reset1, "GPU", maxCuda );
+#endif
+
+
+   auto minHost = [&]() {
+      resultHost = hostVector.min();
+   };
+   auto minHostGeneral = [&]() {
+      Real result( 0 );
+      Containers::Algorithms::ParallelReductionMin< Real > operation;
+      Containers::Algorithms::Reduction< Devices::Host >::reduce(
+              operation,
+              hostVector.getSize(),
+              hostVector.getData(),
+              ( Real* ) 0,
+              result );
+      return result;
+   };
+   auto minCuda = [&]() {
+      resultDevice = deviceVector.min();
+   };
+   benchmark.setOperation( "min", datasetSize );
+   benchmark.time( reset1, "CPU", minHost );
+   benchmark.time( reset1, "CPU (general)", minHostGeneral );
+#ifdef HAVE_CUDA
+   benchmark.time( reset1, "GPU", minCuda );
+#endif
+
+
+   auto absMaxHost = [&]() {
+      resultHost = hostVector.absMax();
+   };
+   auto absMaxHostGeneral = [&]() {
+      Real result( 0 );
+      Containers::Algorithms::ParallelReductionAbsMax< Real > operation;
+      Containers::Algorithms::Reduction< Devices::Host >::reduce(
+              operation,
+              hostVector.getSize(),
+              hostVector.getData(),
+              ( Real* ) 0,
+              result );
+      return result;
+   };
+   auto absMaxCuda = [&]() {
+      resultDevice = deviceVector.absMax();
+   };
+#ifdef HAVE_CUDA
+   auto absMaxCublas = [&]() {
+      int index = 0;
+      cublasIgamax( cublasHandle, size,
+                    deviceVector.getData(), 1,
+                    &index );
+      resultDevice = deviceVector.getElement( index );
+   };
+#endif
+   benchmark.setOperation( "absMax", datasetSize );
+   benchmark.time( reset1, "CPU", absMaxHost );
+   benchmark.time( reset1, "CPU (general)", absMaxHostGeneral );
+#ifdef HAVE_CUDA
+   benchmark.time( reset1, "GPU", absMaxCuda );
+   benchmark.time( reset1, "cuBLAS", absMaxCublas );
+#endif
+
+
+   auto absMinHost = [&]() {
+      resultHost = hostVector.absMin();
+   };
+   auto absMinHostGeneral = [&]() {
+      Real result( 0 );
+      Containers::Algorithms::ParallelReductionAbsMin< Real > operation;
+      Containers::Algorithms::Reduction< Devices::Host >::reduce(
+              operation,
+              hostVector.getSize(),
+              hostVector.getData(),
+              ( Real* ) 0,
+              result );
+      return result;
+   };
+   auto absMinCuda = [&]() {
+      resultDevice = deviceVector.absMin();
+   };
+#ifdef HAVE_CUDA
+   auto absMinCublas = [&]() {
+      int index = 0;
+      cublasIgamin( cublasHandle, size,
+                    deviceVector.getData(), 1,
+                    &index );
+      resultDevice = deviceVector.getElement( index );
+   };
+#endif
+   benchmark.setOperation( "absMin", datasetSize );
+   benchmark.time( reset1, "CPU", absMinHost );
+   benchmark.time( reset1, "CPU (general)", absMinHostGeneral );
+#ifdef HAVE_CUDA
+   benchmark.time( reset1, "GPU", absMinCuda );
+   benchmark.time( reset1, "cuBLAS", absMinCublas );
+#endif
+
+
+   auto sumHost = [&]() {
+      resultHost = hostVector.sum();
+   };
+   auto sumHostGeneral = [&]() {
+      Real result( 0 );
+      Containers::Algorithms::ParallelReductionSum< Real > operation;
+      Containers::Algorithms::Reduction< Devices::Host >::reduce(
+              operation,
+              hostVector.getSize(),
+              hostVector.getData(),
+              ( Real* ) 0,
+              result );
+      return result;
+   };
+   auto sumCuda = [&]() {
+      resultDevice = deviceVector.sum();
+   };
+   benchmark.setOperation( "sum", datasetSize );
+   benchmark.time( reset1, "CPU", sumHost );
+   benchmark.time( reset1, "CPU (general)", sumHostGeneral );
+#ifdef HAVE_CUDA
+   benchmark.time( reset1, "GPU", sumCuda );
+#endif
+
+
+   auto l1normHost = [&]() {
+      resultHost = hostVector.lpNorm( 1.0 );
+   };
+   auto l1normHostGeneral = [&]() {
+      Real result( 0 );
+      Containers::Algorithms::ParallelReductionAbsSum< Real > operation;
+      Containers::Algorithms::Reduction< Devices::Host >::reduce(
+              operation,
+              hostVector.getSize(),
+              hostVector.getData(),
+              ( Real* ) 0,
+              result );
+      return result;
+   };
+   auto l1normCuda = [&]() {
+      resultDevice = deviceVector.lpNorm( 1.0 );
+   };
+#ifdef HAVE_CUDA
+   auto l1normCublas = [&]() {
+      cublasGasum( cublasHandle, size,
+                   deviceVector.getData(), 1,
+                   &resultDevice );
+   };
+#endif
+   benchmark.setOperation( "l1 norm", datasetSize );
+   benchmark.time( reset1, "CPU", l1normHost );
+   benchmark.time( reset1, "CPU (general)", l1normHostGeneral );
+#ifdef HAVE_CUDA
+   benchmark.time( reset1, "GPU", l1normCuda );
+   benchmark.time( reset1, "cuBLAS", l1normCublas );
+#endif
+
+
+   auto l2normHost = [&]() {
+      resultHost = hostVector.lpNorm( 2.0 );
+   };
+   auto l2normHostGeneral = [&]() {
+      Real result( 0 );
+      Containers::Algorithms::ParallelReductionL2Norm< Real > operation;
+      Containers::Algorithms::Reduction< Devices::Host >::reduce(
+              operation,
+              hostVector.getSize(),
+              hostVector.getData(),
+              ( Real* ) 0,
+              result );
+      return result;
+   };
+   auto l2normCuda = [&]() {
+      resultDevice = deviceVector.lpNorm( 2.0 );
+   };
+#ifdef HAVE_CUDA
+   auto l2normCublas = [&]() {
+      cublasGnrm2( cublasHandle, size,
+                   deviceVector.getData(), 1,
+                   &resultDevice );
+   };
+#endif
+   benchmark.setOperation( "l2 norm", datasetSize );
+   benchmark.time( reset1, "CPU", l2normHost );
+   benchmark.time( reset1, "CPU (general)", l2normHostGeneral );
+#ifdef HAVE_CUDA
+   benchmark.time( reset1, "GPU", l2normCuda );
+   benchmark.time( reset1, "cuBLAS", l2normCublas );
+#endif
+
+
+   auto l3normHost = [&]() {
+      resultHost = hostVector.lpNorm( 3.0 );
+   };
+   auto l3normHostGeneral = [&]() {
+      Real result( 0 );
+      Containers::Algorithms::ParallelReductionLpNorm< Real > operation;
+      operation.setPower( 3.0 );
+      Containers::Algorithms::Reduction< Devices::Host >::reduce(
+              operation,
+              hostVector.getSize(),
+              hostVector.getData(),
+              ( Real* ) 0,
+              result );
+      return result;
+   };
+   auto l3normCuda = [&]() {
+      resultDevice = deviceVector.lpNorm( 3.0 );
+   };
+   benchmark.setOperation( "l3 norm", datasetSize );
+   benchmark.time( reset1, "CPU", l3normHost );
+   benchmark.time( reset1, "CPU (general)", l3normHostGeneral );
+#ifdef HAVE_CUDA
+   benchmark.time( reset1, "GPU", l3normCuda );
+#endif
+
+
+   auto scalarProductHost = [&]() {
+      resultHost = hostVector.scalarProduct( hostVector2 );
+   };
+   auto scalarProductHostGeneral = [&]() {
+      Real result( 0 );
+      Containers::Algorithms::ParallelReductionScalarProduct< Real, Real > operation;
+      Containers::Algorithms::Reduction< Devices::Host >::reduce(
+              operation,
+              hostVector.getSize(),
+              hostVector.getData(),
+              hostVector2.getData(),
+              result );
+      return result;
+   };
+   auto scalarProductCuda = [&]() {
+      resultDevice = deviceVector.scalarProduct( deviceVector2 );
+   };
+#ifdef HAVE_CUDA
+   auto scalarProductCublas = [&]() {
+      cublasGdot( cublasHandle, size,
+                  deviceVector.getData(), 1,
+                  deviceVector2.getData(), 1,
+                  &resultDevice );
+   };
+#endif
+   benchmark.setOperation( "scalar product", 2 * datasetSize );
+   benchmark.time( reset1, "CPU", scalarProductHost );
+   benchmark.time( reset1, "CPU (general)", scalarProductHostGeneral );
+#ifdef HAVE_CUDA
+   benchmark.time( reset1, "GPU", scalarProductCuda );
+   benchmark.time( reset1, "cuBLAS", scalarProductCublas );
+#endif
+
+   /*
+   std::cout << "Benchmarking prefix-sum:" << std::endl;
+   timer.reset();
+   timer.start();
+   hostVector.computePrefixSum();
+   timer.stop();
+   timeHost = timer.getTime();
+   bandwidth = 2 * datasetSize / loops / timer.getTime();
+   std::cout << "  CPU: bandwidth: " << bandwidth << " GB/sec, time: " << timer.getTime() << " sec." << std::endl;
+
+   timer.reset();
+   timer.start();
+   deviceVector.computePrefixSum();
+   timer.stop();
+   timeDevice = timer.getTime();
+   bandwidth = 2 * datasetSize / loops / timer.getTime();
+   std::cout << "  GPU: bandwidth: " << bandwidth << " GB/sec, time: " << timer.getTime() << " sec." << std::endl;
+   std::cout << "  CPU/GPU speedup: " << timeHost / timeDevice << std::endl;
+
+   HostVector auxHostVector;
+   auxHostVector.setLike( deviceVector );
+   auxHostVector = deviceVector;
+   for( int i = 0; i < size; i++ )
+      if( hostVector.getElement( i ) != auxHostVector.getElement( i ) )
+      {
+         std::cerr << "Error in prefix sum at position " << i << ":  " << hostVector.getElement( i ) << " != " << auxHostVector.getElement( i ) << std::endl;
+      }
+   */
+
+
+   auto multiplyHost = [&]() {
+      hostVector *= 0.5;
+   };
+   auto multiplyCuda = [&]() {
+      deviceVector *= 0.5;
+   };
+#ifdef HAVE_CUDA
+   auto multiplyCublas = [&]() {
+      const Real alpha = 0.5;
+      cublasGscal( cublasHandle, size,
+                   &alpha,
+                   deviceVector.getData(), 1 );
+   };
+#endif
+   benchmark.setOperation( "scalar multiplication", 2 * datasetSize );
+   benchmark.time( reset1, "CPU", multiplyHost );
+#ifdef HAVE_CUDA
+   benchmark.time( reset1, "GPU", multiplyCuda );
+   benchmark.time( reset1, "cuBLAS", multiplyCublas );
+#endif
+
+
+   auto addVectorHost = [&]() {
+      hostVector.addVector( hostVector2 );
+   };
+   auto addVectorCuda = [&]() {
+      deviceVector.addVector( deviceVector2 );
+   };
+#ifdef HAVE_CUDA
+   auto addVectorCublas = [&]() {
+      const Real alpha = 1.0;
+      cublasGaxpy( cublasHandle, size,
+                   &alpha,
+                   deviceVector2.getData(), 1,
+                   deviceVector.getData(), 1 );
+   };
+#endif
+   benchmark.setOperation( "vector addition", 3 * datasetSize );
+   benchmark.time( reset1, "CPU", addVectorHost );
+#ifdef HAVE_CUDA
+   benchmark.time( reset1, "GPU", addVectorCuda );
+   benchmark.time( reset1, "cuBLAS", addVectorCublas );
+#endif
+
+
+#ifdef HAVE_CUDA
+   cublasDestroy( cublasHandle );
+#endif
+
+   return true;
+}
+
+} // namespace Benchmarks
+} // namespace TNL
diff --git a/src/Benchmarks/Benchmarks.h b/src/Benchmarks/Benchmarks.h
new file mode 100644
index 0000000000000000000000000000000000000000..60decddc8c04c9422451b1581f8db044655a93ba
--- /dev/null
+++ b/src/Benchmarks/Benchmarks.h
@@ -0,0 +1,456 @@
+/***************************************************************************
+                          benchmarks.h  -  description
+                             -------------------
+    begin                : Dec 30, 2015
+    copyright            : (C) 2015 by Tomas Oberhuber et al.
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+// Implemented by: Jakub Klinkovsky
+
+#pragma once
+
+#include <iostream>
+#include <iomanip>
+#include <map>
+#include <vector>
+
+#include <TNL/Timer.h>
+#include <TNL/String.h>
+#include <TNL/Solvers/IterativeSolverMonitor.h>
+
+namespace TNL {
+namespace Benchmarks {
+
+const double oneGB = 1024.0 * 1024.0 * 1024.0;
+
+template< typename ComputeFunction,
+          typename ResetFunction,
+          typename Monitor = TNL::Solvers::IterativeSolverMonitor< double, int > >
+double
+timeFunction( ComputeFunction compute,
+              ResetFunction reset,
+              int loops,
+              Monitor && monitor = Monitor() )
+{
+   // the timer is constructed zero-initialized and stopped
+   Timer timer;
+
+   // set timer to the monitor
+   monitor.setTimer( timer );
+
+   // warm up
+   reset();
+   compute();
+
+   for(int i = 0; i < loops; ++i) {
+      // abuse the monitor's "time" for loops
+      monitor.setTime( i + 1 );
+
+      reset();
+
+      // Explicit synchronization of the CUDA device
+      // TODO: not necessary for host computations
+#ifdef HAVE_CUDA
+      cudaDeviceSynchronize();
+#endif
+      timer.start();
+      compute();
+#ifdef HAVE_CUDA
+      cudaDeviceSynchronize();
+#endif
+      timer.stop();
+   }
+
+   return timer.getRealTime();
+}
+
+
+class Logging
+{
+public:
+   using MetadataElement = std::pair< const char*, String >;
+   using MetadataMap = std::map< const char*, String >;
+   using MetadataColumns = std::vector<MetadataElement>;
+
+   using HeaderElements = std::initializer_list< String >;
+   using RowElements = std::initializer_list< double >;
+
+   Logging( bool verbose = true )
+   : verbose(verbose)
+   {}
+
+   void
+   writeTitle( const String & title )
+   {
+      if( verbose )
+         std::cout << std::endl << "== " << title << " ==" << std::endl << std::endl;
+      log << ": title = " << title << std::endl;
+   }
+
+   void
+   writeMetadata( const MetadataMap & metadata )
+   {
+      if( verbose )
+         std::cout << "properties:" << std::endl;
+
+      for( auto & it : metadata ) {
+         if( verbose )
+            std::cout << "   " << it.first << " = " << it.second << std::endl;
+         log << ": " << it.first << " = " << it.second << std::endl;
+      }
+      if( verbose )
+         std::cout << std::endl;
+   }
+
+   void
+   writeTableHeader( const String & spanningElement,
+                     const HeaderElements & subElements )
+   {
+      using namespace std;
+
+      if( verbose && header_changed ) {
+         for( auto & it : metadataColumns ) {
+            std::cout << std::setw( 20 ) << it.first;
+         }
+
+         // spanning element is printed as usual column to stdout,
+         // but is excluded from header
+         std::cout << std::setw( 15 ) << "";
+
+         for( auto & it : subElements ) {
+            std::cout << std::setw( 15 ) << it;
+         }
+         std::cout << std::endl;
+
+         header_changed = false;
+      }
+
+      // initial indent string
+      header_indent = "!";
+      log << std::endl;
+      for( auto & it : metadataColumns ) {
+         log << header_indent << " " << it.first << std::endl;
+      }
+
+      // dump stacked spanning columns
+      if( horizontalGroups.size() > 0 )
+         while( horizontalGroups.back().second <= 0 ) {
+            horizontalGroups.pop_back();
+            header_indent.pop_back();
+         }
+      for( size_t i = 0; i < horizontalGroups.size(); i++ ) {
+         if( horizontalGroups[ i ].second > 0 ) {
+            log << header_indent << " " << horizontalGroups[ i ].first << std::endl;
+            header_indent += "!";
+         }
+      }
+
+      log << header_indent << " " << spanningElement << std::endl;
+      for( auto & it : subElements ) {
+         log << header_indent << "! " << it << std::endl;
+      }
+
+      if( horizontalGroups.size() > 0 ) {
+         horizontalGroups.back().second--;
+         header_indent.pop_back();
+      }
+   }
+
+   void
+   writeTableRow( const String & spanningElement,
+                  const RowElements & subElements )
+   {
+      using namespace std;
+
+      if( verbose ) {
+         for( auto & it : metadataColumns ) {
+            std::cout << std::setw( 20 ) << it.second;
+         }
+         // spanning element is printed as usual column to stdout
+         std::cout << std::setw( 15 ) << spanningElement;
+         for( auto & it : subElements ) {
+            std::cout << std::setw( 15 );
+            if( it != 0.0 )std::cout << it;
+            else std::cout << "N/A";
+         }
+         std::cout << std::endl;
+      }
+
+      // only when changed (the header has been already adjusted)
+      // print each element on separate line
+      for( auto & it : metadataColumns ) {
+         log << it.second << std::endl;
+      }
+
+      // benchmark data are indented
+      const String indent = "    ";
+      for( auto & it : subElements ) {
+         if( it != 0.0 ) log << indent << it << std::endl;
+         else log << indent << "N/A" << std::endl;
+      }
+   }
+
+   void
+   writeErrorMessage( const char* msg,
+                      int colspan = 1 )
+   {
+      // initial indent string
+      header_indent = "!";
+      log << std::endl;
+      for( auto & it : metadataColumns ) {
+         log << header_indent << " " << it.first << std::endl;
+      }
+
+      // make sure there is a header column for the message
+      if( horizontalGroups.size() == 0 )
+         horizontalGroups.push_back( {"", 1} );
+
+      // dump stacked spanning columns
+      while( horizontalGroups.back().second <= 0 ) {
+         horizontalGroups.pop_back();
+         header_indent.pop_back();
+      }
+      for( size_t i = 0; i < horizontalGroups.size(); i++ ) {
+         if( horizontalGroups[ i ].second > 0 ) {
+            log << header_indent << " " << horizontalGroups[ i ].first << std::endl;
+            header_indent += "!";
+         }
+      }
+      if( horizontalGroups.size() > 0 ) {
+         horizontalGroups.back().second -= colspan;
+         header_indent.pop_back();
+      }
+
+      // only when changed (the header has been already adjusted)
+      // print each element on separate line
+      for( auto & it : metadataColumns ) {
+         log << it.second << std::endl;
+      }
+      log << msg << std::endl;
+   }
+
+   void
+   closeTable()
+   {
+      log << std::endl;
+      header_indent = body_indent = "";
+      header_changed = true;
+      horizontalGroups.clear();
+   }
+
+   bool save( std::ostream & logFile )
+   {
+      closeTable();
+      logFile << log.str();
+      if( logFile.good() ) {
+         log.str() = "";
+         return true;
+      }
+      return false;
+   }
+
+protected:
+
+   // manual double -> String conversion with fixed precision
+   static String
+   _to_string( double num, int precision = 0, bool fixed = false )
+   {
+      std::stringstream str;
+      if( fixed )
+         str << std::fixed;
+      if( precision )
+         str << std::setprecision( precision );
+      str << num;
+      return String( str.str().data() );
+   }
+
+   std::stringstream log;
+   std::string header_indent;
+   std::string body_indent;
+
+   bool verbose;
+   MetadataColumns metadataColumns;
+   bool header_changed = true;
+   std::vector< std::pair< String, int > > horizontalGroups;
+};
+
+
+class Benchmark
+: protected Logging
+{
+public:
+   using Logging::MetadataElement;
+   using Logging::MetadataMap;
+   using Logging::MetadataColumns;
+
+   Benchmark( int loops = 10,
+              bool verbose = true )
+   : Logging(verbose), loops(loops)
+   {}
+
+   // TODO: ensure that this is not called in the middle of the benchmark
+   // (or just remove it completely?)
+   void
+   setLoops( int loops )
+   {
+      this->loops = loops;
+   }
+
+   // Marks the start of a new benchmark
+   void
+   newBenchmark( const String & title )
+   {
+      closeTable();
+      writeTitle( title );
+      monitor.setStage( title.getString() );
+   }
+
+   // Marks the start of a new benchmark (with custom metadata)
+   void
+   newBenchmark( const String & title,
+                 MetadataMap metadata )
+   {
+      closeTable();
+      writeTitle( title );
+      monitor.setStage( title.getString() );
+      // add loops to metadata
+      metadata["loops"] = String(loops);
+      writeMetadata( metadata );
+   }
+
+   // Sets metadata columns -- values used for all subsequent rows until
+   // the next call to this function.
+   void
+   setMetadataColumns( const MetadataColumns & metadata )
+   {
+      if( metadataColumns != metadata )
+         header_changed = true;
+      metadataColumns = metadata;
+   }
+
+   // TODO: maybe should be renamed to createVerticalGroup and ensured that vertical and horizontal groups are not used within the same "Benchmark"
+   // Sets current operation -- operations expand the table vertically
+   //  - baseTime should be reset to 0.0 for most operations, but sometimes
+   //    it is useful to override it
+   //  - Order of operations inside a "Benchmark" does not matter, rows can be
+   //    easily sorted while converting to HTML.)
+   void
+   setOperation( const String & operation,
+                 const double datasetSize = 0.0, // in GB
+                 const double baseTime = 0.0 )
+   {
+      if( metadataColumns.size() > 0 && String(metadataColumns[ 0 ].first) == "operation" ) {
+         metadataColumns[ 0 ].second = operation;
+      }
+      else {
+         metadataColumns.insert( metadataColumns.begin(), {"operation", operation} );
+      }
+      setOperation( datasetSize, baseTime );
+      header_changed = true;
+   }
+
+   void
+   setOperation( const double datasetSize = 0.0,
+                 const double baseTime = 0.0 )
+   {
+      this->datasetSize = datasetSize;
+      this->baseTime = baseTime;
+   }
+
+   // Creates new horizontal groups inside a benchmark -- increases the number
+   // of columns in the "Benchmark", implies column spanning.
+   // (Useful e.g. for SpMV formats, different configurations etc.)
+   void
+   createHorizontalGroup( const String & name,
+                          int subcolumns )
+   {
+      if( horizontalGroups.size() == 0 ) {
+         horizontalGroups.push_back( {name, subcolumns} );
+      }
+      else {
+         auto & last = horizontalGroups.back();
+         if( last.first != name && last.second > 0 ) {
+            horizontalGroups.push_back( {name, subcolumns} );
+         }
+         else {
+            last.first = name;
+            last.second = subcolumns;
+         }
+      }
+   }
+
+   // Times a single ComputeFunction. Subsequent calls implicitly split
+   // the current "horizontal group" into sub-columns identified by
+   // "performer", which are further split into "bandwidth", "time" and
+   // "speedup" columns.
+   // TODO: allow custom columns bound to lambda functions (e.g. for Gflops calculation)
+   // Also terminates the recursion of the following variadic template.
+   template< typename ResetFunction,
+             typename ComputeFunction >
+   double
+   time( ResetFunction reset,
+         const String & performer,
+         ComputeFunction & compute )
+   {
+      double time;
+      if( verbose ) {
+         // run the monitor main loop
+         Solvers::SolverMonitorThread monitor_thread( monitor );
+         time = timeFunction( compute, reset, loops, monitor );
+      }
+      else {
+         time = timeFunction( compute, reset, loops, monitor );
+      }
+
+      const double bandwidth = datasetSize / time;
+      const double speedup = this->baseTime / time;
+      if( this->baseTime == 0.0 )
+         this->baseTime = time;
+
+      writeTableHeader( performer, HeaderElements({"bandwidth", "time", "speedup"}) );
+      writeTableRow( performer, RowElements({ bandwidth, time, speedup }) );
+
+      return this->baseTime;
+   }
+
+   // Recursive template function to deal with multiple computations with the
+   // same reset function.
+   template< typename ResetFunction,
+             typename ComputeFunction,
+             typename... NextComputations >
+   inline double
+   time( ResetFunction reset,
+         const String & performer,
+         ComputeFunction & compute,
+         NextComputations & ... nextComputations )
+   {
+      time( reset, performer, compute );
+      time( reset, nextComputations... );
+      return this->baseTime;
+   }
+
+   // Adds an error message to the log. Should be called in places where the
+   // "time" method could not be called (e.g. due to failed allocation).
+   void
+   addErrorMessage( const char* msg,
+                    int numberOfComputations = 1 )
+   {
+      // each computation has 3 subcolumns
+      const int colspan = 3 * numberOfComputations;
+      writeErrorMessage( msg, colspan );
+   }
+
+   using Logging::save;
+
+protected:
+   int loops;
+   double datasetSize = 0.0;
+   double baseTime = 0.0;
+   Solvers::IterativeSolverMonitor< double, int > monitor;
+};
+
+} // namespace Benchmarks
+} // namespace TNL
diff --git a/src/Benchmarks/CMakeLists.txt b/src/Benchmarks/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..e34ade5be305bdee495749a38db1b9af004a6a92
--- /dev/null
+++ b/src/Benchmarks/CMakeLists.txt
@@ -0,0 +1,10 @@
+add_subdirectory( HeatEquation )
+add_subdirectory( BLAS )
+add_subdirectory( SpMV )
+add_subdirectory( LinearSolvers )
+
+set( headers
+         Benchmarks.h
+)
+
+install( FILES ${headers} DESTINATION ${TNL_TARGET_INCLUDE_DIRECTORY}/Benchmarks )
diff --git a/tests/benchmarks/heat-equation-benchmark/BenchmarkLaplace.h b/src/Benchmarks/HeatEquation/BenchmarkLaplace.h
similarity index 100%
rename from tests/benchmarks/heat-equation-benchmark/BenchmarkLaplace.h
rename to src/Benchmarks/HeatEquation/BenchmarkLaplace.h
diff --git a/tests/benchmarks/heat-equation-benchmark/BenchmarkLaplace_impl.h b/src/Benchmarks/HeatEquation/BenchmarkLaplace_impl.h
similarity index 100%
rename from tests/benchmarks/heat-equation-benchmark/BenchmarkLaplace_impl.h
rename to src/Benchmarks/HeatEquation/BenchmarkLaplace_impl.h
diff --git a/tests/benchmarks/heat-equation-benchmark/CMakeLists.txt b/src/Benchmarks/HeatEquation/CMakeLists.txt
similarity index 100%
rename from tests/benchmarks/heat-equation-benchmark/CMakeLists.txt
rename to src/Benchmarks/HeatEquation/CMakeLists.txt
diff --git a/tests/benchmarks/heat-equation-benchmark/DirichletBoundaryConditions.h b/src/Benchmarks/HeatEquation/DirichletBoundaryConditions.h
similarity index 100%
rename from tests/benchmarks/heat-equation-benchmark/DirichletBoundaryConditions.h
rename to src/Benchmarks/HeatEquation/DirichletBoundaryConditions.h
diff --git a/tests/benchmarks/heat-equation-benchmark/HeatEquationBenchmarkBuildConfigTag.h b/src/Benchmarks/HeatEquation/HeatEquationBenchmarkBuildConfigTag.h
similarity index 100%
rename from tests/benchmarks/heat-equation-benchmark/HeatEquationBenchmarkBuildConfigTag.h
rename to src/Benchmarks/HeatEquation/HeatEquationBenchmarkBuildConfigTag.h
diff --git a/tests/benchmarks/heat-equation-benchmark/HeatEquationBenchmarkProblem.h b/src/Benchmarks/HeatEquation/HeatEquationBenchmarkProblem.h
similarity index 100%
rename from tests/benchmarks/heat-equation-benchmark/HeatEquationBenchmarkProblem.h
rename to src/Benchmarks/HeatEquation/HeatEquationBenchmarkProblem.h
diff --git a/tests/benchmarks/heat-equation-benchmark/HeatEquationBenchmarkProblem_impl.h b/src/Benchmarks/HeatEquation/HeatEquationBenchmarkProblem_impl.h
similarity index 100%
rename from tests/benchmarks/heat-equation-benchmark/HeatEquationBenchmarkProblem_impl.h
rename to src/Benchmarks/HeatEquation/HeatEquationBenchmarkProblem_impl.h
diff --git a/tests/benchmarks/heat-equation-benchmark/HeatEquationBenchmarkRhs.h b/src/Benchmarks/HeatEquation/HeatEquationBenchmarkRhs.h
similarity index 100%
rename from tests/benchmarks/heat-equation-benchmark/HeatEquationBenchmarkRhs.h
rename to src/Benchmarks/HeatEquation/HeatEquationBenchmarkRhs.h
diff --git a/tests/benchmarks/heat-equation-benchmark/TestGridEntity.h b/src/Benchmarks/HeatEquation/TestGridEntity.h
similarity index 100%
rename from tests/benchmarks/heat-equation-benchmark/TestGridEntity.h
rename to src/Benchmarks/HeatEquation/TestGridEntity.h
diff --git a/tests/benchmarks/heat-equation-benchmark/Tuning/ExplicitUpdater.h b/src/Benchmarks/HeatEquation/Tuning/ExplicitUpdater.h
similarity index 100%
rename from tests/benchmarks/heat-equation-benchmark/Tuning/ExplicitUpdater.h
rename to src/Benchmarks/HeatEquation/Tuning/ExplicitUpdater.h
diff --git a/tests/benchmarks/heat-equation-benchmark/Tuning/GridTraverser.h b/src/Benchmarks/HeatEquation/Tuning/GridTraverser.h
similarity index 100%
rename from tests/benchmarks/heat-equation-benchmark/Tuning/GridTraverser.h
rename to src/Benchmarks/HeatEquation/Tuning/GridTraverser.h
diff --git a/tests/benchmarks/heat-equation-benchmark/Tuning/GridTraverser_impl.h b/src/Benchmarks/HeatEquation/Tuning/GridTraverser_impl.h
similarity index 100%
rename from tests/benchmarks/heat-equation-benchmark/Tuning/GridTraverser_impl.h
rename to src/Benchmarks/HeatEquation/Tuning/GridTraverser_impl.h
diff --git a/tests/benchmarks/heat-equation-benchmark/Tuning/SimpleCell.h b/src/Benchmarks/HeatEquation/Tuning/SimpleCell.h
similarity index 100%
rename from tests/benchmarks/heat-equation-benchmark/Tuning/SimpleCell.h
rename to src/Benchmarks/HeatEquation/Tuning/SimpleCell.h
diff --git a/tests/benchmarks/heat-equation-benchmark/Tuning/Traverser_Grid2D.h b/src/Benchmarks/HeatEquation/Tuning/Traverser_Grid2D.h
similarity index 100%
rename from tests/benchmarks/heat-equation-benchmark/Tuning/Traverser_Grid2D.h
rename to src/Benchmarks/HeatEquation/Tuning/Traverser_Grid2D.h
diff --git a/tests/benchmarks/heat-equation-benchmark/Tuning/Traverser_Grid2D_impl.h b/src/Benchmarks/HeatEquation/Tuning/Traverser_Grid2D_impl.h
similarity index 100%
rename from tests/benchmarks/heat-equation-benchmark/Tuning/Traverser_Grid2D_impl.h
rename to src/Benchmarks/HeatEquation/Tuning/Traverser_Grid2D_impl.h
diff --git a/tests/benchmarks/heat-equation-benchmark/Tuning/tunning.h b/src/Benchmarks/HeatEquation/Tuning/tunning.h
similarity index 100%
rename from tests/benchmarks/heat-equation-benchmark/Tuning/tunning.h
rename to src/Benchmarks/HeatEquation/Tuning/tunning.h
diff --git a/tests/benchmarks/heat-equation-benchmark/pure-c-rhs.h b/src/Benchmarks/HeatEquation/pure-c-rhs.h
similarity index 100%
rename from tests/benchmarks/heat-equation-benchmark/pure-c-rhs.h
rename to src/Benchmarks/HeatEquation/pure-c-rhs.h
diff --git a/tests/benchmarks/heat-equation-benchmark/run-HeatEquationBenchmark b/src/Benchmarks/HeatEquation/run-HeatEquationBenchmark
similarity index 100%
rename from tests/benchmarks/heat-equation-benchmark/run-HeatEquationBenchmark
rename to src/Benchmarks/HeatEquation/run-HeatEquationBenchmark
diff --git a/tests/benchmarks/heat-equation-benchmark/tnl-benchmark-heat-equation.cpp b/src/Benchmarks/HeatEquation/tnl-benchmark-heat-equation.cpp
similarity index 100%
rename from tests/benchmarks/heat-equation-benchmark/tnl-benchmark-heat-equation.cpp
rename to src/Benchmarks/HeatEquation/tnl-benchmark-heat-equation.cpp
diff --git a/tests/benchmarks/heat-equation-benchmark/tnl-benchmark-heat-equation.cu b/src/Benchmarks/HeatEquation/tnl-benchmark-heat-equation.cu
similarity index 100%
rename from tests/benchmarks/heat-equation-benchmark/tnl-benchmark-heat-equation.cu
rename to src/Benchmarks/HeatEquation/tnl-benchmark-heat-equation.cu
diff --git a/tests/benchmarks/heat-equation-benchmark/tnl-benchmark-heat-equation.h b/src/Benchmarks/HeatEquation/tnl-benchmark-heat-equation.h
similarity index 100%
rename from tests/benchmarks/heat-equation-benchmark/tnl-benchmark-heat-equation.h
rename to src/Benchmarks/HeatEquation/tnl-benchmark-heat-equation.h
diff --git a/tests/benchmarks/heat-equation-benchmark/tnl-benchmark-simple-heat-equation-bug.cu b/src/Benchmarks/HeatEquation/tnl-benchmark-simple-heat-equation-bug.cu
similarity index 100%
rename from tests/benchmarks/heat-equation-benchmark/tnl-benchmark-simple-heat-equation-bug.cu
rename to src/Benchmarks/HeatEquation/tnl-benchmark-simple-heat-equation-bug.cu
diff --git a/tests/benchmarks/heat-equation-benchmark/tnl-benchmark-simple-heat-equation-bug.h b/src/Benchmarks/HeatEquation/tnl-benchmark-simple-heat-equation-bug.h
similarity index 100%
rename from tests/benchmarks/heat-equation-benchmark/tnl-benchmark-simple-heat-equation-bug.h
rename to src/Benchmarks/HeatEquation/tnl-benchmark-simple-heat-equation-bug.h
diff --git a/tests/benchmarks/heat-equation-benchmark/tnl-benchmark-simple-heat-equation.cpp b/src/Benchmarks/HeatEquation/tnl-benchmark-simple-heat-equation.cpp
similarity index 100%
rename from tests/benchmarks/heat-equation-benchmark/tnl-benchmark-simple-heat-equation.cpp
rename to src/Benchmarks/HeatEquation/tnl-benchmark-simple-heat-equation.cpp
diff --git a/tests/benchmarks/heat-equation-benchmark/tnl-benchmark-simple-heat-equation.cu b/src/Benchmarks/HeatEquation/tnl-benchmark-simple-heat-equation.cu
similarity index 100%
rename from tests/benchmarks/heat-equation-benchmark/tnl-benchmark-simple-heat-equation.cu
rename to src/Benchmarks/HeatEquation/tnl-benchmark-simple-heat-equation.cu
diff --git a/tests/benchmarks/heat-equation-benchmark/tnl-benchmark-simple-heat-equation.h b/src/Benchmarks/HeatEquation/tnl-benchmark-simple-heat-equation.h
similarity index 100%
rename from tests/benchmarks/heat-equation-benchmark/tnl-benchmark-simple-heat-equation.h
rename to src/Benchmarks/HeatEquation/tnl-benchmark-simple-heat-equation.h
diff --git a/tests/benchmarks/heat-equation-benchmark/tnlTestGrid2D.h b/src/Benchmarks/HeatEquation/tnlTestGrid2D.h
similarity index 100%
rename from tests/benchmarks/heat-equation-benchmark/tnlTestGrid2D.h
rename to src/Benchmarks/HeatEquation/tnlTestGrid2D.h
diff --git a/tests/benchmarks/heat-equation-benchmark/tnlTestGridEntity.h b/src/Benchmarks/HeatEquation/tnlTestGridEntity.h
similarity index 100%
rename from tests/benchmarks/heat-equation-benchmark/tnlTestGridEntity.h
rename to src/Benchmarks/HeatEquation/tnlTestGridEntity.h
diff --git a/tests/benchmarks/heat-equation-benchmark/tnlTestNeighbourGridEntitiesStorage.h b/src/Benchmarks/HeatEquation/tnlTestNeighbourGridEntitiesStorage.h
similarity index 100%
rename from tests/benchmarks/heat-equation-benchmark/tnlTestNeighbourGridEntitiesStorage.h
rename to src/Benchmarks/HeatEquation/tnlTestNeighbourGridEntitiesStorage.h
diff --git a/tests/benchmarks/heat-equation-benchmark/tnlTestNeighbourGridEntityGetter.h b/src/Benchmarks/HeatEquation/tnlTestNeighbourGridEntityGetter.h
similarity index 100%
rename from tests/benchmarks/heat-equation-benchmark/tnlTestNeighbourGridEntityGetter.h
rename to src/Benchmarks/HeatEquation/tnlTestNeighbourGridEntityGetter.h
diff --git a/tests/benchmarks/heat-equation-benchmark/tnlTestNeighbourGridEntityGetter2D_impl.h b/src/Benchmarks/HeatEquation/tnlTestNeighbourGridEntityGetter2D_impl.h
similarity index 100%
rename from tests/benchmarks/heat-equation-benchmark/tnlTestNeighbourGridEntityGetter2D_impl.h
rename to src/Benchmarks/HeatEquation/tnlTestNeighbourGridEntityGetter2D_impl.h
diff --git a/src/Benchmarks/LinearSolvers/CMakeLists.txt b/src/Benchmarks/LinearSolvers/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..1a95c92f791c7d7a0176415fd4968d8aa6bb8982
--- /dev/null
+++ b/src/Benchmarks/LinearSolvers/CMakeLists.txt
@@ -0,0 +1,9 @@
+if( BUILD_CUDA )
+    CUDA_ADD_EXECUTABLE( tnl-benchmark-linear-solvers tnl-benchmark-linear-solvers.cu )
+    TARGET_LINK_LIBRARIES( tnl-benchmark-linear-solvers tnl )
+else()
+    ADD_EXECUTABLE( tnl-benchmark-linear-solvers tnl-benchmark-linear-solvers.cpp )
+    TARGET_LINK_LIBRARIES( tnl-benchmark-linear-solvers tnl )
+endif()
+
+install( TARGETS tnl-benchmark-linear-solvers RUNTIME DESTINATION bin )
diff --git a/tests/benchmarks/tnl-benchmark-linear-solvers.cpp b/src/Benchmarks/LinearSolvers/tnl-benchmark-linear-solvers.cpp
similarity index 100%
rename from tests/benchmarks/tnl-benchmark-linear-solvers.cpp
rename to src/Benchmarks/LinearSolvers/tnl-benchmark-linear-solvers.cpp
diff --git a/tests/benchmarks/tnl-benchmark-linear-solvers.cu b/src/Benchmarks/LinearSolvers/tnl-benchmark-linear-solvers.cu
similarity index 100%
rename from tests/benchmarks/tnl-benchmark-linear-solvers.cu
rename to src/Benchmarks/LinearSolvers/tnl-benchmark-linear-solvers.cu
diff --git a/tests/benchmarks/tnl-benchmark-linear-solvers.h b/src/Benchmarks/LinearSolvers/tnl-benchmark-linear-solvers.h
similarity index 100%
rename from tests/benchmarks/tnl-benchmark-linear-solvers.h
rename to src/Benchmarks/LinearSolvers/tnl-benchmark-linear-solvers.h
diff --git a/src/Benchmarks/SpMV/CMakeLists.txt b/src/Benchmarks/SpMV/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..a73e6738cdb8bb4effd889960f3ffcd5b7255b90
--- /dev/null
+++ b/src/Benchmarks/SpMV/CMakeLists.txt
@@ -0,0 +1,9 @@
+if( BUILD_CUDA )
+    CUDA_ADD_EXECUTABLE( tnl-benchmark-spmv tnl-benchmark-spmv.cu )
+    TARGET_LINK_LIBRARIES( tnl-benchmark-spmv tnl ${CUDA_cusparse_LIBRARY} )
+else()
+    ADD_EXECUTABLE( tnl-benchmark-spmv tnl-benchmark-spmv.cpp )
+    TARGET_LINK_LIBRARIES( tnl-benchmark-spmv tnl )
+endif()
+
+install( TARGETS tnl-benchmark-spmv RUNTIME DESTINATION bin )
diff --git a/tests/benchmarks/tnl-benchmark-spmv.cpp b/src/Benchmarks/SpMV/tnl-benchmark-spmv.cpp
similarity index 100%
rename from tests/benchmarks/tnl-benchmark-spmv.cpp
rename to src/Benchmarks/SpMV/tnl-benchmark-spmv.cpp
diff --git a/tests/benchmarks/tnl-benchmark-spmv.cu b/src/Benchmarks/SpMV/tnl-benchmark-spmv.cu
similarity index 100%
rename from tests/benchmarks/tnl-benchmark-spmv.cu
rename to src/Benchmarks/SpMV/tnl-benchmark-spmv.cu
diff --git a/tests/benchmarks/tnl-benchmark-spmv.h b/src/Benchmarks/SpMV/tnl-benchmark-spmv.h
similarity index 100%
rename from tests/benchmarks/tnl-benchmark-spmv.h
rename to src/Benchmarks/SpMV/tnl-benchmark-spmv.h
diff --git a/tests/benchmarks/tnlCusparseCSRMatrix.h b/src/Benchmarks/SpMV/tnlCusparseCSRMatrix.h
similarity index 100%
rename from tests/benchmarks/tnlCusparseCSRMatrix.h
rename to src/Benchmarks/SpMV/tnlCusparseCSRMatrix.h
diff --git a/tests/benchmarks/share/CMakeLists.txt b/src/Benchmarks/scripts/CMakeLists.txt
similarity index 100%
rename from tests/benchmarks/share/CMakeLists.txt
rename to src/Benchmarks/scripts/CMakeLists.txt
diff --git a/tests/benchmarks/share/convert-matrices b/src/Benchmarks/scripts/convert-matrices
similarity index 100%
rename from tests/benchmarks/share/convert-matrices
rename to src/Benchmarks/scripts/convert-matrices
diff --git a/tests/benchmarks/share/cuda-profiler.conf b/src/Benchmarks/scripts/cuda-profiler.conf
similarity index 100%
rename from tests/benchmarks/share/cuda-profiler.conf
rename to src/Benchmarks/scripts/cuda-profiler.conf
diff --git a/tests/benchmarks/share/draw-matrices b/src/Benchmarks/scripts/draw-matrices
similarity index 100%
rename from tests/benchmarks/share/draw-matrices
rename to src/Benchmarks/scripts/draw-matrices
diff --git a/tests/benchmarks/share/florida-matrix-market b/src/Benchmarks/scripts/florida-matrix-market
similarity index 100%
rename from tests/benchmarks/share/florida-matrix-market
rename to src/Benchmarks/scripts/florida-matrix-market
diff --git a/tests/benchmarks/share/get-matrices b/src/Benchmarks/scripts/get-matrices
similarity index 100%
rename from tests/benchmarks/share/get-matrices
rename to src/Benchmarks/scripts/get-matrices
diff --git a/tests/benchmarks/share/matrix-market b/src/Benchmarks/scripts/matrix-market
similarity index 100%
rename from tests/benchmarks/share/matrix-market
rename to src/Benchmarks/scripts/matrix-market
diff --git a/tests/benchmarks/share/process-cuda-profile.pl b/src/Benchmarks/scripts/process-cuda-profile.pl
similarity index 100%
rename from tests/benchmarks/share/process-cuda-profile.pl
rename to src/Benchmarks/scripts/process-cuda-profile.pl
diff --git a/tests/benchmarks/share/run-matrix-solvers-benchmark b/src/Benchmarks/scripts/run-matrix-solvers-benchmark
similarity index 100%
rename from tests/benchmarks/share/run-matrix-solvers-benchmark
rename to src/Benchmarks/scripts/run-matrix-solvers-benchmark
diff --git a/tests/benchmarks/share/run-tnl-benchmark-linear-solvers b/src/Benchmarks/scripts/run-tnl-benchmark-linear-solvers
similarity index 100%
rename from tests/benchmarks/share/run-tnl-benchmark-linear-solvers
rename to src/Benchmarks/scripts/run-tnl-benchmark-linear-solvers
diff --git a/tests/benchmarks/share/run-tnl-benchmark-spmv b/src/Benchmarks/scripts/run-tnl-benchmark-spmv
similarity index 100%
rename from tests/benchmarks/share/run-tnl-benchmark-spmv
rename to src/Benchmarks/scripts/run-tnl-benchmark-spmv
diff --git a/tests/benchmarks/share/tnl-run-heat-equation-benchmark b/src/Benchmarks/scripts/tnl-run-heat-equation-benchmark
similarity index 100%
rename from tests/benchmarks/share/tnl-run-heat-equation-benchmark
rename to src/Benchmarks/scripts/tnl-run-heat-equation-benchmark
diff --git a/tests/benchmarks/share/tnl-run-spmv-benchmark b/src/Benchmarks/scripts/tnl-run-spmv-benchmark
similarity index 100%
rename from tests/benchmarks/share/tnl-run-spmv-benchmark
rename to src/Benchmarks/scripts/tnl-run-spmv-benchmark
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index a2086691dc4ccbe4c263fe5f11b40e361d957d73..280d40ad2168b2b99f5db998b6a819b23c1382ff 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -1,5 +1,9 @@
 ADD_SUBDIRECTORY( TNL )
 
+if( ${WITH_BENCHMARKS} )
+   ADD_SUBDIRECTORY( Benchmarks )
+endif()
+
 if( ${WITH_TOOLS} )
    ADD_SUBDIRECTORY( Tools )
 endif()
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index 33873402d8074b5fa304e84546e1188b57d9943c..93c15ce593a9db00fc240c6263706f9c10a46ac5 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -1,9 +1,4 @@
-set( ENABLE_CODECOVERAGE )
-
 ADD_SUBDIRECTORY( data )
-ADD_SUBDIRECTORY( benchmarks )
 #ADD_SUBDIRECTORY( unit-tests )
 ADD_SUBDIRECTORY( long-time-unit-tests )
 ADD_SUBDIRECTORY( mpi )
-
-unset( ENABLE_CODECOVERAGE )
diff --git a/tests/benchmarks/CMakeLists.txt b/tests/benchmarks/CMakeLists.txt
deleted file mode 100644
index e53ba6878070e6b8f8d3b830ab8f64afcccbcf77..0000000000000000000000000000000000000000
--- a/tests/benchmarks/CMakeLists.txt
+++ /dev/null
@@ -1,29 +0,0 @@
-ADD_SUBDIRECTORY( share )
-ADD_SUBDIRECTORY( heat-equation-benchmark )
-
-IF( BUILD_CUDA )
-    CUDA_ADD_EXECUTABLE( tnl-benchmark-blas tnl-benchmark-blas.cu )
-    CUDA_ADD_CUBLAS_TO_TARGET( tnl-benchmark-blas )
-    TARGET_LINK_LIBRARIES( tnl-benchmark-blas tnl )
-
-    CUDA_ADD_EXECUTABLE( tnl-benchmark-spmv tnl-benchmark-spmv.cu )
-    TARGET_LINK_LIBRARIES( tnl-benchmark-spmv tnl ${CUDA_cusparse_LIBRARY} )
-
-    CUDA_ADD_EXECUTABLE( tnl-benchmark-linear-solvers tnl-benchmark-linear-solvers.cu )
-    TARGET_LINK_LIBRARIES( tnl-benchmark-linear-solvers tnl )
-ELSE()
-    ADD_EXECUTABLE( tnl-benchmark-blas tnl-benchmark-blas.cpp )
-    TARGET_LINK_LIBRARIES( tnl-benchmark-blas tnl )
-
-    ADD_EXECUTABLE( tnl-benchmark-spmv tnl-benchmark-spmv.cpp )
-    TARGET_LINK_LIBRARIES( tnl-benchmark-spmv tnl )
-
-    ADD_EXECUTABLE( tnl-benchmark-linear-solvers tnl-benchmark-linear-solvers.cpp )
-    TARGET_LINK_LIBRARIES( tnl-benchmark-linear-solvers tnl )
-ENDIF()
-
-INSTALL( TARGETS
-            tnl-benchmark-blas
-            tnl-benchmark-spmv
-            tnl-benchmark-linear-solvers
-         RUNTIME DESTINATION bin )
diff --git a/tests/benchmarks/array-operations.h b/tests/benchmarks/array-operations.h
deleted file mode 100644
index 504dcc1da03a91fa87af913008f9355579e62930..0000000000000000000000000000000000000000
--- a/tests/benchmarks/array-operations.h
+++ /dev/null
@@ -1,167 +0,0 @@
-/***************************************************************************
-                          array-operations.h  -  description
-                             -------------------
-    begin                : Dec 30, 2015
-    copyright            : (C) 2015 by Tomas Oberhuber et al.
-    email                : tomas.oberhuber@fjfi.cvut.cz
- ***************************************************************************/
-
-/* See Copyright Notice in tnl/Copyright */
-
-// Implemented by: Jakub Klinkovsky
-
-#pragma once
-
-#include "benchmarks.h"
-
-#include <TNL/Containers/Array.h>
-
-namespace TNL
-{
-namespace benchmarks
-{
-
-template< typename Real = double,
-          typename Index = int >
-bool
-benchmarkArrayOperations( Benchmark & benchmark,
-                          const int & loops,
-                          const long & size )
-{
-    typedef Containers::Array< Real, Devices::Host, Index > HostArray;
-    typedef Containers::Array< Real, Devices::Cuda, Index > CudaArray;
-    using namespace std;
-
-    double datasetSize = ( double ) ( loops * size ) * sizeof( Real ) / oneGB;
-
-    HostArray hostArray, hostArray2;
-    CudaArray deviceArray, deviceArray2;
-    hostArray.setSize( size );
-    hostArray2.setSize( size );
-#ifdef HAVE_CUDA
-    deviceArray.setSize( size );
-    deviceArray2.setSize( size );
-#endif
-
-    Real resultHost, resultDevice;
-
-
-    // reset functions
-    auto reset1 = [&]() {
-        hostArray.setValue( 1.0 );
-#ifdef HAVE_CUDA
-        deviceArray.setValue( 1.0 );
-#endif
-    };
-    auto reset2 = [&]() {
-        hostArray2.setValue( 1.0 );
-#ifdef HAVE_CUDA
-        deviceArray2.setValue( 1.0 );
-#endif
-    };
-    auto reset12 = [&]() {
-        reset1();
-        reset2();
-    };
-
-
-    reset12();
-
-
-    auto compareHost = [&]() {
-        resultHost = (int) hostArray == hostArray2;
-    };
-    auto compareCuda = [&]() {
-        resultDevice = (int) deviceArray == deviceArray2;
-    };
-    benchmark.setOperation( "comparison (operator==)", 2 * datasetSize );
-    benchmark.time( reset1, "CPU", compareHost );
-#ifdef HAVE_CUDA
-    benchmark.time( reset1, "GPU", compareCuda );
-#endif
-
-
-    auto copyAssignHostHost = [&]() {
-        hostArray = hostArray2;
-    };
-    auto copyAssignCudaCuda = [&]() {
-        deviceArray = deviceArray2;
-    };
-    benchmark.setOperation( "copy (operator=)", 2 * datasetSize );
-    // copyBasetime is used later inside HAVE_CUDA guard, so the compiler will
-    // complain when compiling without CUDA
-    const double copyBasetime = benchmark.time( reset1, "CPU", copyAssignHostHost );
-#ifdef HAVE_CUDA
-    benchmark.time( reset1, "GPU", copyAssignCudaCuda );
-#endif
-
-
-    auto copyAssignHostCuda = [&]() {
-        deviceArray = hostArray;
-    };
-    auto copyAssignCudaHost = [&]() {
-        hostArray = deviceArray;
-    };
-#ifdef HAVE_CUDA
-    benchmark.setOperation( "copy (operator=)", datasetSize, copyBasetime );
-    benchmark.time( reset1,
-                    "CPU->GPU", copyAssignHostCuda,
-                    "GPU->CPU", copyAssignCudaHost );
-#endif
-
-
-    auto setValueHost = [&]() {
-        hostArray.setValue( 3.0 );
-    };
-    auto setValueCuda = [&]() {
-        deviceArray.setValue( 3.0 );
-    };
-    benchmark.setOperation( "setValue", datasetSize );
-    benchmark.time( reset1, "CPU", setValueHost );
-#ifdef HAVE_CUDA
-    benchmark.time( reset1, "GPU", setValueCuda );
-#endif
-
-
-    auto setSizeHost = [&]() {
-        hostArray.setSize( size );
-    };
-    auto setSizeCuda = [&]() {
-        deviceArray.setSize( size );
-    };
-    auto resetSize1 = [&]() {
-        hostArray.reset();
-#ifdef HAVE_CUDA
-        deviceArray.reset();
-#endif
-    };
-    benchmark.setOperation( "allocation (setSize)", datasetSize );
-    benchmark.time( resetSize1, "CPU", setSizeHost );
-#ifdef HAVE_CUDA
-    benchmark.time( resetSize1, "GPU", setSizeCuda );
-#endif
-
-
-    auto resetSizeHost = [&]() {
-        hostArray.reset();
-    };
-    auto resetSizeCuda = [&]() {
-        deviceArray.reset();
-    };
-    auto setSize1 = [&]() {
-        hostArray.setSize( size );
-#ifdef HAVE_CUDA
-        deviceArray.setSize( size );
-#endif
-    };
-    benchmark.setOperation( "deallocation (reset)", datasetSize );
-    benchmark.time( setSize1, "CPU", resetSizeHost );
-#ifdef HAVE_CUDA
-    benchmark.time( setSize1, "GPU", resetSizeCuda );
-#endif
-
-    return true;
-}
-
-} // namespace benchmarks
-} // namespace tnl
diff --git a/tests/benchmarks/benchmarks.h b/tests/benchmarks/benchmarks.h
deleted file mode 100644
index ce5e631a6899170cfeba58911be71e5cc17eb7e6..0000000000000000000000000000000000000000
--- a/tests/benchmarks/benchmarks.h
+++ /dev/null
@@ -1,437 +0,0 @@
-/***************************************************************************
-                          benchmarks.h  -  description
-                             -------------------
-    begin                : Dec 30, 2015
-    copyright            : (C) 2015 by Tomas Oberhuber et al.
-    email                : tomas.oberhuber@fjfi.cvut.cz
- ***************************************************************************/
-
-/* See Copyright Notice in tnl/Copyright */
-
-// Implemented by: Jakub Klinkovsky
-
-#pragma once
-
-#include <iostream>
-#include <iomanip>
-#include <map>
-#include <vector>
-
-#include <TNL/Timer.h>
-#include <TNL/String.h>
-
-namespace TNL
-{
-namespace benchmarks
-{
-
-const double oneGB = 1024.0 * 1024.0 * 1024.0;
-
-template< typename ComputeFunction,
-          typename ResetFunction >
-double
-timeFunction( ComputeFunction compute,
-              ResetFunction reset,
-              const int & loops )
-{
-    // the timer is constructed zero-initialized and stopped
-    Timer timer;
-
-    reset();
-    for(int i = 0; i < loops; ++i) {
-        // Explicit synchronization of the CUDA device
-        // TODO: not necessary for host computations
-#ifdef HAVE_CUDA
-        cudaDeviceSynchronize();
-#endif
-        timer.start();
-        compute();
-#ifdef HAVE_CUDA
-        cudaDeviceSynchronize();
-#endif
-        timer.stop();
-
-        reset();
-    }
-
-    return timer.getRealTime();
-}
-
-
-struct InternalError {};
-
-
-class Logging
-{
-public:
-    using MetadataElement = std::pair< const char*, String >;
-    using MetadataMap = std::map< const char*, String >;
-    using MetadataColumns = std::vector<MetadataElement>;
-
-    using HeaderElements = std::initializer_list< String >;
-    using RowElements = std::initializer_list< double >;
-
-    Logging( bool verbose = true )
-        : verbose(verbose)
-    { }
-
-    void
-    writeTitle( const String & title )
-    {
-        if( verbose )
-            std::cout << std::endl << "== " << title << " ==" << std::endl << std::endl;
-        log << ": title = " << title << std::endl;
-    }
-
-    void
-    writeMetadata( const MetadataMap & metadata )
-    {
-        if( verbose )
-            std::cout << "properties:" << std::endl;
-
-        for( auto & it : metadata ) {
-            if( verbose )
-                std::cout << "   " << it.first << " = " << it.second << std::endl;
-            log << ": " << it.first << " = " << it.second << std::endl;
-        }
-        if( verbose )
-            std::cout << std::endl;
-    }
-
-    void
-    writeTableHeader( const String & spanningElement,
-                      const HeaderElements & subElements )
-    {
-        using namespace std;
-
-        if( verbose && header_changed ) {
-            for( auto & it : metadataColumns ) {
-               std::cout << std::setw( 20 ) << it.first;
-            }
-
-            // spanning element is printed as usual column to stdout,
-            // but is excluded from header
-           std::cout << std::setw( 15 ) << "";
-
-            for( auto & it : subElements ) {
-               std::cout << std::setw( 15 ) << it;
-            }
-           std::cout << std::endl;
-
-            header_changed = false;
-        }
-
-        // initial indent string
-        header_indent = "!";
-        log << std::endl;
-        for( auto & it : metadataColumns ) {
-            log << header_indent << " " << it.first << std::endl;
-        }
-
-        // dump stacked spanning columns
-        if( horizontalGroups.size() > 0 )
-            while( horizontalGroups.back().second <= 0 ) {
-                horizontalGroups.pop_back();
-                header_indent.pop_back();
-            }
-        for( size_t i = 0; i < horizontalGroups.size(); i++ ) {
-            if( horizontalGroups[ i ].second > 0 ) {
-                log << header_indent << " " << horizontalGroups[ i ].first << std::endl;
-                header_indent += "!";
-            }
-        }
-
-        log << header_indent << " " << spanningElement << std::endl;
-        for( auto & it : subElements ) {
-            log << header_indent << "! " << it << std::endl;
-        }
-
-        if( horizontalGroups.size() > 0 ) {
-            horizontalGroups.back().second--;
-            header_indent.pop_back();
-        }
-    }
-
-    void
-    writeTableRow( const String & spanningElement,
-                   const RowElements & subElements )
-    {
-        using namespace std;
-
-        if( verbose ) {
-            for( auto & it : metadataColumns ) {
-               std::cout << std::setw( 20 ) << it.second;
-            }
-            // spanning element is printed as usual column to stdout
-           std::cout << std::setw( 15 ) << spanningElement;
-            for( auto & it : subElements ) {
-               std::cout << std::setw( 15 );
-                if( it != 0.0 )std::cout << it;
-                else std::cout << "N/A";
-            }
-           std::cout << std::endl;
-        }
-
-        // only when changed (the header has been already adjusted)
-        // print each element on separate line
-        for( auto & it : metadataColumns ) {
-            log << it.second << std::endl;
-        }
-
-        // benchmark data are indented
-        const String indent = "    ";
-        for( auto & it : subElements ) {
-            if( it != 0.0 ) log << indent << it << std::endl;
-            else log << indent << "N/A" << std::endl;
-        }
-    }
-
-    void
-    writeErrorMessage( const char* msg,
-                       const int & colspan = 1 )
-    {
-        // initial indent string
-        header_indent = "!";
-        log << std::endl;
-        for( auto & it : metadataColumns ) {
-            log << header_indent << " " << it.first << std::endl;
-        }
-
-        // make sure there is a header column for the message
-        if( horizontalGroups.size() == 0 )
-            horizontalGroups.push_back( {"", 1} );
-
-        // dump stacked spanning columns
-        while( horizontalGroups.back().second <= 0 ) {
-            horizontalGroups.pop_back();
-            header_indent.pop_back();
-        }
-        for( size_t i = 0; i < horizontalGroups.size(); i++ ) {
-            if( horizontalGroups[ i ].second > 0 ) {
-                log << header_indent << " " << horizontalGroups[ i ].first << std::endl;
-                header_indent += "!";
-            }
-        }
-        if( horizontalGroups.size() > 0 ) {
-            horizontalGroups.back().second -= colspan;
-            header_indent.pop_back();
-        }
-
-        // only when changed (the header has been already adjusted)
-        // print each element on separate line
-        for( auto & it : metadataColumns ) {
-            log << it.second << std::endl;
-        }
-        log << msg << std::endl;
-    }
-
-    void
-    closeTable()
-    {
-        log << std::endl;
-        header_indent = body_indent = "";
-        header_changed = true;
-        horizontalGroups.clear();
-    }
-
-    bool save( std::ostream & logFile )
-    {
-        closeTable();
-        logFile << log.str();
-        if( logFile.good() ) {
-            log.str() = "";
-            return true;
-        }
-        return false;
-    }
-
-protected:
-
-    // manual double -> String conversion with fixed precision
-    static String
-    _to_string( const double & num, const int & precision = 0, bool fixed = false )
-    {
-        std::stringstream str;
-        if( fixed )
-            str << std::fixed;
-        if( precision )
-            str << std::setprecision( precision );
-        str << num;
-        return String( str.str().data() );
-    }
-
-    std::stringstream log;
-    std::string header_indent;
-    std::string body_indent;
-
-    bool verbose;
-    MetadataColumns metadataColumns;
-    bool header_changed = true;
-    std::vector< std::pair< String, int > > horizontalGroups;
-};
-
-
-class Benchmark
-    : protected Logging
-{
-public:
-    using Logging::MetadataElement;
-    using Logging::MetadataMap;
-    using Logging::MetadataColumns;
-
-    Benchmark( const int & loops = 10,
-               bool verbose = true )
-        : Logging(verbose), loops(loops)
-    { }
-
-    // TODO: ensure that this is not called in the middle of the benchmark
-    // (or just remove it completely?)
-    void
-    setLoops( const int & loops )
-    {
-        this->loops = loops;
-    }
-
-    // Marks the start of a new benchmark
-    void
-    newBenchmark( const String & title )
-    {
-        closeTable();
-        writeTitle( title );
-    }
-
-    // Marks the start of a new benchmark (with custom metadata)
-    void
-    newBenchmark( const String & title,
-                  MetadataMap metadata )
-    {
-        closeTable();
-        writeTitle( title );
-        // add loops to metadata
-        metadata["loops"] = String(loops);
-        writeMetadata( metadata );
-    }
-
-    // Sets metadata columns -- values used for all subsequent rows until
-    // the next call to this function.
-    void
-    setMetadataColumns( const MetadataColumns & metadata )
-    {
-        if( metadataColumns != metadata )
-            header_changed = true;
-        metadataColumns = metadata;
-    }
-
-    // TODO: maybe should be renamed to createVerticalGroup and ensured that vertical and horizontal groups are not used within the same "Benchmark"
-    // Sets current operation -- operations expand the table vertically
-    //  - baseTime should be reset to 0.0 for most operations, but sometimes
-    //    it is useful to override it
-    //  - Order of operations inside a "Benchmark" does not matter, rows can be
-    //    easily sorted while converting to HTML.)
-    void
-    setOperation( const String & operation,
-                  const double & datasetSize = 0.0, // in GB
-                  const double & baseTime = 0.0 )
-    {
-        if( metadataColumns.size() > 0 && String(metadataColumns[ 0 ].first) == "operation" ) {
-            metadataColumns[ 0 ].second = operation;
-        }
-        else {
-            metadataColumns.insert( metadataColumns.begin(), {"operation", operation} );
-        }
-        setOperation( datasetSize, baseTime );
-        header_changed = true;
-    }
-
-    void
-    setOperation( const double & datasetSize = 0.0,
-                  const double & baseTime = 0.0 )
-    {
-        this->datasetSize = datasetSize;
-        this->baseTime = baseTime;
-    }
-
-    // Creates new horizontal groups inside a benchmark -- increases the number
-    // of columns in the "Benchmark", implies column spanning.
-    // (Useful e.g. for SpMV formats, different configurations etc.)
-    void
-    createHorizontalGroup( const String & name,
-                           const int & subcolumns )
-    {
-        if( horizontalGroups.size() == 0 ) {
-            horizontalGroups.push_back( {name, subcolumns} );
-        }
-        else {
-            auto & last = horizontalGroups.back();
-            if( last.first != name && last.second > 0 ) {
-                horizontalGroups.push_back( {name, subcolumns} );
-            }
-            else {
-                last.first = name;
-                last.second = subcolumns;
-            }
-        }
-    }
-
-    // Times a single ComputeFunction. Subsequent calls implicitly split
-    // the current "horizontal group" into sub-columns identified by
-    // "performer", which are further split into "bandwidth", "time" and
-    // "speedup" columns.
-    // TODO: allow custom columns bound to lambda functions (e.g. for Gflops calculation)
-    // Also terminates the recursion of the following variadic template.
-    template< typename ResetFunction,
-              typename ComputeFunction >
-    double
-    time( ResetFunction reset,
-          const String & performer,
-          ComputeFunction & compute )
-    {
-        const double time = timeFunction( compute, reset, loops );
-        const double bandwidth = datasetSize / time;
-        const double speedup = this->baseTime / time;
-        if( this->baseTime == 0.0 )
-            this->baseTime = time;
-
-        writeTableHeader( performer, HeaderElements({"bandwidth", "time", "speedup"}) );
-        writeTableRow( performer, RowElements({ bandwidth, time, speedup }) );
-
-        return this->baseTime;
-    }
-
-    // Recursive template function to deal with multiple computations with the
-    // same reset function.
-    template< typename ResetFunction,
-              typename ComputeFunction,
-              typename... NextComputations >
-    inline double
-    time( ResetFunction reset,
-          const String & performer,
-          ComputeFunction & compute,
-          NextComputations & ... nextComputations )
-    {
-        time( reset, performer, compute );
-        time( reset, nextComputations... );
-        return this->baseTime;
-    }
-
-    // Adds an error message to the log. Should be called in places where the
-    // "time" method could not be called (e.g. due to failed allocation).
-    void
-    addErrorMessage( const char* msg,
-                     const int & numberOfComputations = 1 )
-    {
-        // each computation has 3 subcolumns
-        const int colspan = 3 * numberOfComputations;
-        writeErrorMessage( msg, colspan );
-    }
-
-    using Logging::save;
-
-protected:
-    int loops;
-    double datasetSize = 0.0;
-    double baseTime = 0.0;
-};
-
-} // namespace benchmarks
-} // namespace tnl
diff --git a/tests/benchmarks/spmv.h b/tests/benchmarks/spmv.h
deleted file mode 100644
index cc957c7278ec9b21a418e627af9063e32bec3785..0000000000000000000000000000000000000000
--- a/tests/benchmarks/spmv.h
+++ /dev/null
@@ -1,191 +0,0 @@
-/***************************************************************************
-                          spmv.h  -  description
-                             -------------------
-    begin                : Dec 30, 2015
-    copyright            : (C) 2015 by Tomas Oberhuber et al.
-    email                : tomas.oberhuber@fjfi.cvut.cz
- ***************************************************************************/
-
-/* See Copyright Notice in tnl/Copyright */
-
-// Implemented by: Jakub Klinkovsky
-
-#pragma once
-
-#include "benchmarks.h"
-
-#include <TNL/Containers/List.h>
-#include <TNL/Pointers/DevicePointer.h>
-#include <TNL/Matrices/CSR.h>
-#include <TNL/Matrices/Ellpack.h>
-#include <TNL/Matrices/SlicedEllpack.h>
-#include <TNL/Matrices/ChunkedEllpack.h>
-
-namespace TNL
-{
-namespace benchmarks
-{
-
-// silly alias to match the number of template parameters with other formats
-template< typename Real, typename Device, typename Index >
-using SlicedEllpack = Matrices::SlicedEllpack< Real, Device, Index >;
-
-template< typename Matrix >
-int setHostTestMatrix( Matrix& matrix,
-                       const int elementsPerRow )
-{
-    const int size = matrix.getRows();
-    int elements( 0 );
-    for( int row = 0; row < size; row++ ) {
-        int col = row - elementsPerRow / 2;
-        for( int element = 0; element < elementsPerRow; element++ ) {
-            if( col + element >= 0 &&
-                col + element < size )
-            {
-                matrix.setElement( row, col + element, element + 1 );
-                elements++;
-            }
-        }
-    }
-    return elements;
-}
-
-#ifdef HAVE_CUDA
-template< typename Matrix >
-__global__ void setCudaTestMatrixKernel( Matrix* matrix,
-                                         const int elementsPerRow,
-                                         const int gridIdx )
-{
-    const int rowIdx = ( gridIdx * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
-    if( rowIdx >= matrix->getRows() )
-        return;
-    int col = rowIdx - elementsPerRow / 2;
-    for( int element = 0; element < elementsPerRow; element++ ) {
-        if( col + element >= 0 &&
-            col + element < matrix->getColumns() )
-           matrix->setElementFast( rowIdx, col + element, element + 1 );
-    }
-}
-#endif
-
-template< typename Matrix >
-void setCudaTestMatrix( Matrix& matrix,
-                        const int elementsPerRow )
-{
-#ifdef HAVE_CUDA
-    typedef typename Matrix::IndexType IndexType;
-    typedef typename Matrix::RealType RealType;
-    Pointers::DevicePointer< Matrix > kernel_matrix( matrix );
-    dim3 cudaBlockSize( 256 ), cudaGridSize( Devices::Cuda::getMaxGridSize() );
-    const IndexType cudaBlocks = roundUpDivision( matrix.getRows(), cudaBlockSize.x );
-    const IndexType cudaGrids = roundUpDivision( cudaBlocks, Devices::Cuda::getMaxGridSize() );
-    for( IndexType gridIdx = 0; gridIdx < cudaGrids; gridIdx++ ) {
-        if( gridIdx == cudaGrids - 1 )
-            cudaGridSize.x = cudaBlocks % Devices::Cuda::getMaxGridSize();
-        setCudaTestMatrixKernel< Matrix >
-            <<< cudaGridSize, cudaBlockSize >>>
-            ( &kernel_matrix.template modifyData< Devices::Cuda >(), elementsPerRow, gridIdx );
-        TNL_CHECK_CUDA_DEVICE;
-    }
-#endif
-}
-
-
-// TODO: rename as benchmark_SpMV_synthetic and move to spmv-synthetic.h
-template< typename Real,
-          template< typename, typename, typename > class Matrix,
-          template< typename, typename, typename > class Vector = Containers::Vector >
-bool
-benchmarkSpMV( Benchmark & benchmark,
-               const int & loops,
-               const int & size,
-               const int elementsPerRow = 5 )
-{
-    typedef Matrix< Real, Devices::Host, int > HostMatrix;
-    typedef Matrix< Real, Devices::Cuda, int > DeviceMatrix;
-    typedef Containers::Vector< Real, Devices::Host, int > HostVector;
-    typedef Containers::Vector< Real, Devices::Cuda, int > CudaVector;
-
-    HostMatrix hostMatrix;
-    DeviceMatrix deviceMatrix;
-    Containers::Vector< int, Devices::Host, int > hostRowLengths;
-    Containers::Vector< int, Devices::Cuda, int > deviceRowLengths;
-    HostVector hostVector, hostVector2;
-    CudaVector deviceVector, deviceVector2;
-
-    // create benchmark group
-    Containers::List< String > parsedType;
-    parseObjectType( HostMatrix::getType(), parsedType );
-    benchmark.createHorizontalGroup( parsedType[ 0 ], 2 );
-
-    hostRowLengths.setSize( size );
-    hostMatrix.setDimensions( size, size );
-    hostVector.setSize( size );
-    hostVector2.setSize( size );
-#ifdef HAVE_CUDA
-    deviceRowLengths.setSize( size );
-    deviceMatrix.setDimensions( size, size );
-    deviceVector.setSize( size );
-    deviceVector2.setSize( size );
-#endif
-
-    hostRowLengths.setValue( elementsPerRow );
-#ifdef HAVE_CUDA
-    deviceRowLengths.setValue( elementsPerRow );
-#endif
-
-    hostMatrix.setCompressedRowLengths( hostRowLengths );
-#ifdef HAVE_CUDA
-    deviceMatrix.setCompressedRowLengths( deviceRowLengths );
-#endif
-
-    const int elements = setHostTestMatrix< HostMatrix >( hostMatrix, elementsPerRow );
-    setCudaTestMatrix< DeviceMatrix >( deviceMatrix, elementsPerRow );
-    const double datasetSize = ( double ) loops * elements * ( 2 * sizeof( Real ) + sizeof( int ) ) / oneGB;
-
-    // reset function
-    auto reset = [&]() {
-        hostVector.setValue( 1.0 );
-        hostVector2.setValue( 0.0 );
-#ifdef HAVE_CUDA
-        deviceVector.setValue( 1.0 );
-        deviceVector2.setValue( 0.0 );
-#endif
-    };
-
-    // compute functions
-    auto spmvHost = [&]() {
-        hostMatrix.vectorProduct( hostVector, hostVector2 );
-    };
-    auto spmvCuda = [&]() {
-        deviceMatrix.vectorProduct( deviceVector, deviceVector2 );
-    };
-
-    benchmark.setOperation( datasetSize );
-    benchmark.time( reset, "CPU", spmvHost );
-#ifdef HAVE_CUDA
-    benchmark.time( reset, "GPU", spmvCuda );
-#endif
-
-    return true;
-}
-
-template< typename Real = double,
-          typename Index = int >
-bool
-benchmarkSpmvSynthetic( Benchmark & benchmark,
-                        const int & loops,
-                        const int & size,
-                        const int & elementsPerRow )
-{
-    bool result = true;
-    // TODO: benchmark all formats from tnl-benchmark-spmv (different parameters of the base formats)
-    result |= benchmarkSpMV< Real, Matrices::CSR >( benchmark, loops, size, elementsPerRow );
-    result |= benchmarkSpMV< Real, Matrices::Ellpack >( benchmark, loops, size, elementsPerRow );
-    result |= benchmarkSpMV< Real, SlicedEllpack >( benchmark, loops, size, elementsPerRow );
-    result |= benchmarkSpMV< Real, Matrices::ChunkedEllpack >( benchmark, loops, size, elementsPerRow );
-    return result;
-}
-
-} // namespace benchmarks
-} // namespace tnl
diff --git a/tests/benchmarks/tnl-benchmark-blas.h b/tests/benchmarks/tnl-benchmark-blas.h
deleted file mode 100644
index 8c71209534894d10a430bae874035c9d14d7bbc3..0000000000000000000000000000000000000000
--- a/tests/benchmarks/tnl-benchmark-blas.h
+++ /dev/null
@@ -1,192 +0,0 @@
-/***************************************************************************
-                          tnl-benchmark-blas.h  -  description
-                             -------------------
-    begin                : Jan 27, 2010
-    copyright            : (C) 2010 by Tomas Oberhuber et al.
-    email                : tomas.oberhuber@fjfi.cvut.cz
- ***************************************************************************/
-
-/* See Copyright Notice in tnl/Copyright */
-
-// Implemented by: Jakub Klinkovsky
-
-#pragma once
-
-#include <TNL/Devices/Host.h>
-#include <TNL/Devices/CudaDeviceInfo.h>
-#include <TNL/Devices/SystemInfo.h>
-#include <TNL/Config/ConfigDescription.h>
-#include <TNL/Config/ParameterContainer.h>
-
-#include "array-operations.h"
-#include "vector-operations.h"
-#include "spmv.h"
-
-using namespace TNL;
-using namespace TNL::benchmarks;
-
-
-// TODO: should benchmarks check the result of the computation?
-
-
-template< typename Real >
-void
-runBlasBenchmarks( Benchmark & benchmark,
-                   Benchmark::MetadataMap metadata,
-                   const std::size_t & minSize,
-                   const std::size_t & maxSize,
-                   const double & sizeStepFactor,
-                   const unsigned & loops,
-                   const unsigned & elementsPerRow )
-{
-    const String precision = getType< Real >();
-    metadata["precision"] = precision;
-
-    // Array operations
-    benchmark.newBenchmark( String("Array operations (") + precision + ")",
-                            metadata );
-    for( std::size_t size = minSize; size <= maxSize; size *= 2 ) {
-        benchmark.setMetadataColumns( Benchmark::MetadataColumns({
-           {"size", size},
-        } ));
-        benchmarkArrayOperations< Real >( benchmark, loops, size );
-    }
-
-    // Vector operations
-    benchmark.newBenchmark( String("Vector operations (") + precision + ")",
-                            metadata );
-    for( std::size_t size = minSize; size <= maxSize; size *= sizeStepFactor ) {
-        benchmark.setMetadataColumns( Benchmark::MetadataColumns({
-           {"size", size},
-        } ));
-        benchmarkVectorOperations< Real >( benchmark, loops, size );
-    }
-
-    // Sparse matrix-vector multiplication
-    benchmark.newBenchmark( String("Sparse matrix-vector multiplication (") + precision + ")",
-                            metadata );
-    for( std::size_t size = minSize; size <= maxSize; size *= 2 ) {
-        benchmark.setMetadataColumns( Benchmark::MetadataColumns({
-            {"rows", size},
-            {"columns", size},
-            {"elements per row", elementsPerRow},
-        } ));
-        benchmarkSpmvSynthetic< Real >( benchmark, loops, size, elementsPerRow );
-    }
-}
-
-void
-setupConfig( Config::ConfigDescription & config )
-{
-    config.addDelimiter( "Benchmark settings:" );
-    config.addEntry< String >( "log-file", "Log file name.", "tnl-benchmark-blas.log");
-    config.addEntry< String >( "output-mode", "Mode for opening the log file.", "overwrite" );
-    config.addEntryEnum( "append" );
-    config.addEntryEnum( "overwrite" );
-    config.addEntry< String >( "precision", "Precision of the arithmetics.", "double" );
-    config.addEntryEnum( "float" );
-    config.addEntryEnum( "double" );
-    config.addEntryEnum( "all" );
-    config.addEntry< int >( "min-size", "Minimum size of arrays/vectors used in the benchmark.", 100000 );
-    config.addEntry< int >( "max-size", "Minimum size of arrays/vectors used in the benchmark.", 10000000 );
-    config.addEntry< int >( "size-step-factor", "Factor determining the size of arrays/vectors used in the benchmark. First size is min-size and each following size is stepFactor*previousSize, up to max-size.", 2 );
-    config.addEntry< int >( "loops", "Number of iterations for every computation.", 10 );
-    config.addEntry< int >( "elements-per-row", "Number of elements per row of the sparse matrix used in the matrix-vector multiplication benchmark.", 5 );
-    config.addEntry< int >( "verbose", "Verbose mode.", 1 );
-
-    config.addDelimiter( "Device settings:" );
-    Devices::Host::configSetup( config );
-    Devices::Cuda::configSetup( config );
-}
-
-int
-main( int argc, char* argv[] )
-{
-    Config::ParameterContainer parameters;
-    Config::ConfigDescription conf_desc;
-
-    setupConfig( conf_desc );
-
-    if( ! parseCommandLine( argc, argv, conf_desc, parameters ) ) {
-        conf_desc.printUsage( argv[ 0 ] );
-        return 1;
-    }
-
-    Devices::Host::setup( parameters );
-    Devices::Cuda::setup( parameters );
-
-    const String & logFileName = parameters.getParameter< String >( "log-file" );
-    const String & outputMode = parameters.getParameter< String >( "output-mode" );
-    const String & precision = parameters.getParameter< String >( "precision" );
-    // FIXME: getParameter< std::size_t >() does not work with parameters added with addEntry< int >(),
-    // which have a default value. The workaround below works for int values, but it is not possible
-    // to pass 64-bit integer values
-//    const std::size_t minSize = parameters.getParameter< std::size_t >( "min-size" );
-//    const std::size_t maxSize = parameters.getParameter< std::size_t >( "max-size" );
-    const std::size_t minSize = parameters.getParameter< int >( "min-size" );
-    const std::size_t maxSize = parameters.getParameter< int >( "max-size" );
-    const unsigned sizeStepFactor = parameters.getParameter< unsigned >( "size-step-factor" );
-    const unsigned loops = parameters.getParameter< unsigned >( "loops" );
-    const unsigned elementsPerRow = parameters.getParameter< unsigned >( "elements-per-row" );
-    const unsigned verbose = parameters.getParameter< unsigned >( "verbose" );
-
-    if( sizeStepFactor <= 1 ) {
-        std::cerr << "The value of --size-step-factor must be greater than 1." << std::endl;
-        return EXIT_FAILURE;
-    }
-
-    // open log file
-    auto mode = std::ios::out;
-    if( outputMode == "append" )
-        mode |= std::ios::app;
-    std::ofstream logFile( logFileName.getString(), mode );
-
-    // init benchmark and common metadata
-    Benchmark benchmark( loops, verbose );
-
-    // prepare global metadata
-    const int cpu_id = 0;
-    Devices::CacheSizes cacheSizes = Devices::SystemInfo::getCPUCacheSizes( cpu_id );
-    String cacheInfo = String( cacheSizes.L1data ) + ", "
-                        + String( cacheSizes.L1instruction ) + ", "
-                        + String( cacheSizes.L2 ) + ", "
-                        + String( cacheSizes.L3 );
-#ifdef HAVE_CUDA
-    const int activeGPU = Devices::CudaDeviceInfo::getActiveDevice();
-    const String deviceArch = String( Devices::CudaDeviceInfo::getArchitectureMajor( activeGPU ) ) + "." +
-                              String( Devices::CudaDeviceInfo::getArchitectureMinor( activeGPU ) );
-#endif
-    Benchmark::MetadataMap metadata {
-        { "host name", Devices::SystemInfo::getHostname() },
-        { "architecture", Devices::SystemInfo::getArchitecture() },
-        { "system", Devices::SystemInfo::getSystemName() },
-        { "system release", Devices::SystemInfo::getSystemRelease() },
-        { "start time", Devices::SystemInfo::getCurrentTime() },
-        { "CPU model name", Devices::SystemInfo::getCPUModelName( cpu_id ) },
-        { "CPU cores", Devices::SystemInfo::getNumberOfCores( cpu_id ) },
-        { "CPU threads per core", Devices::SystemInfo::getNumberOfThreads( cpu_id ) / Devices::SystemInfo::getNumberOfCores( cpu_id ) },
-        { "CPU max frequency (MHz)", Devices::SystemInfo::getCPUMaxFrequency( cpu_id ) / 1e3 },
-        { "CPU cache sizes (L1d, L1i, L2, L3) (kiB)", cacheInfo },
-#ifdef HAVE_CUDA
-        { "GPU name", Devices::CudaDeviceInfo::getDeviceName( activeGPU ) },
-        { "GPU architecture", deviceArch },
-        { "GPU CUDA cores", Devices::CudaDeviceInfo::getCudaCores( activeGPU ) },
-        { "GPU clock rate (MHz)", (double) Devices::CudaDeviceInfo::getClockRate( activeGPU ) / 1e3 },
-        { "GPU global memory (GB)", (double) Devices::CudaDeviceInfo::getGlobalMemory( activeGPU ) / 1e9 },
-        { "GPU memory clock rate (MHz)", (double) Devices::CudaDeviceInfo::getMemoryClockRate( activeGPU ) / 1e3 },
-        { "GPU memory ECC enabled", Devices::CudaDeviceInfo::getECCEnabled( activeGPU ) },
-#endif
-    };
-
-    if( precision == "all" || precision == "float" )
-        runBlasBenchmarks< float >( benchmark, metadata, minSize, maxSize, sizeStepFactor, loops, elementsPerRow );
-    if( precision == "all" || precision == "double" )
-        runBlasBenchmarks< double >( benchmark, metadata, minSize, maxSize, sizeStepFactor, loops, elementsPerRow );
-
-    if( ! benchmark.save( logFile ) ) {
-        std::cerr << "Failed to write the benchmark results to file '" << parameters.getParameter< String >( "log-file" ) << "'." << std::endl;
-        return EXIT_FAILURE;
-    }
-
-    return EXIT_SUCCESS;
-}
diff --git a/tests/benchmarks/vector-operations.h b/tests/benchmarks/vector-operations.h
deleted file mode 100644
index cdf2443964a5bb33354de6e0ec0688018c7128a1..0000000000000000000000000000000000000000
--- a/tests/benchmarks/vector-operations.h
+++ /dev/null
@@ -1,442 +0,0 @@
-/***************************************************************************
-                          vector-operations.h  -  description
-                             -------------------
-    begin                : Dec 30, 2015
-    copyright            : (C) 2015 by Tomas Oberhuber et al.
-    email                : tomas.oberhuber@fjfi.cvut.cz
- ***************************************************************************/
-
-/* See Copyright Notice in tnl/Copyright */
-
-// Implemented by: Jakub Klinkovsky
-
-#pragma once
-
-#include <stdlib.h> // srand48
-
-#include "benchmarks.h"
-
-#include <TNL/Containers/Vector.h>
-
-#ifdef HAVE_CUDA
-#include "cublasWrappers.h"
-#endif
-
-namespace TNL
-{
-namespace benchmarks
-{
-
-template< typename Real = double,
-          typename Index = int >
-bool
-benchmarkVectorOperations( Benchmark & benchmark,
-                           const int & loops,
-                           const long & size )
-{
-    typedef Containers::Vector< Real, Devices::Host, Index > HostVector;
-    typedef Containers::Vector< Real, Devices::Cuda, Index > CudaVector;
-    using namespace std;
-
-    double datasetSize = ( double ) ( loops * size ) * sizeof( Real ) / oneGB;
-
-    HostVector hostVector, hostVector2;
-    CudaVector deviceVector, deviceVector2;
-    hostVector.setSize( size );
-    hostVector2.setSize( size );
-#ifdef HAVE_CUDA
-    deviceVector.setSize( size );
-    deviceVector2.setSize( size );
-#endif
-
-    Real resultHost, resultDevice;
-
-#ifdef HAVE_CUDA
-    cublasHandle_t cublasHandle;
-    cublasCreate( &cublasHandle );
-#endif
-
-
-    // reset functions
-    // (Make sure to always use some in benchmarks, even if it's not necessary
-    // to assure correct result - it helps to clear cache and avoid optimizations
-    // of the benchmark loop.)
-    auto reset1 = [&]() {
-        hostVector.setValue( 1.0 );
-#ifdef HAVE_CUDA
-        deviceVector.setValue( 1.0 );
-#endif
-        // A relatively harmless call to keep the compiler from realizing we
-        // don't actually do any useful work with the result of the reduciton.
-        srand48(resultHost);
-        resultHost = resultDevice = 0.0;
-    };
-    auto reset2 = [&]() {
-        hostVector2.setValue( 1.0 );
-#ifdef HAVE_CUDA
-        deviceVector2.setValue( 1.0 );
-#endif
-    };
-    auto reset12 = [&]() {
-        reset1();
-        reset2();
-    };
-
-
-    reset12();
-
-
-    auto maxHost = [&]() {
-        resultHost = hostVector.max();
-    };
-    auto maxHostGeneral = [&]() {
-        Real result( 0 );
-        Containers::Algorithms::ParallelReductionMax< Real > operation;
-        Containers::Algorithms::Reduction< Devices::Host >::reduce(
-              operation,
-              hostVector.getSize(),
-              hostVector.getData(),
-              ( Real* ) 0,
-              result );
-        return result;
-    };
-    auto maxCuda = [&]() {
-        resultDevice = deviceVector.max();
-    };
-    benchmark.setOperation( "max", datasetSize );
-    benchmark.time( reset1, "CPU", maxHost );
-    benchmark.time( reset1, "CPU (general)", maxHostGeneral );
-#ifdef HAVE_CUDA
-    benchmark.time( reset1, "GPU", maxCuda );
-#endif
-
-
-    auto minHost = [&]() {
-        resultHost = hostVector.min();
-    };
-    auto minHostGeneral = [&]() {
-        Real result( 0 );
-        Containers::Algorithms::ParallelReductionMin< Real > operation;
-        Containers::Algorithms::Reduction< Devices::Host >::reduce(
-              operation,
-              hostVector.getSize(),
-              hostVector.getData(),
-              ( Real* ) 0,
-              result );
-        return result;
-    };
-    auto minCuda = [&]() {
-        resultDevice = deviceVector.min();
-    };
-    benchmark.setOperation( "min", datasetSize );
-    benchmark.time( reset1, "CPU", minHost );
-    benchmark.time( reset1, "CPU (general)", minHostGeneral );
-#ifdef HAVE_CUDA
-    benchmark.time( reset1, "GPU", minCuda );
-#endif
-
-
-    auto absMaxHost = [&]() {
-        resultHost = hostVector.absMax();
-    };
-    auto absMaxHostGeneral = [&]() {
-        Real result( 0 );
-        Containers::Algorithms::ParallelReductionAbsMax< Real > operation;
-        Containers::Algorithms::Reduction< Devices::Host >::reduce(
-              operation,
-              hostVector.getSize(),
-              hostVector.getData(),
-              ( Real* ) 0,
-              result );
-        return result;
-    };
-    auto absMaxCuda = [&]() {
-        resultDevice = deviceVector.absMax();
-    };
-#ifdef HAVE_CUDA
-    auto absMaxCublas = [&]() {
-        int index = 0;
-        cublasIgamax( cublasHandle, size,
-                      deviceVector.getData(), 1,
-                      &index );
-        resultDevice = deviceVector.getElement( index );
-    };
-#endif
-    benchmark.setOperation( "absMax", datasetSize );
-    benchmark.time( reset1, "CPU", absMaxHost );
-    benchmark.time( reset1, "CPU (general)", absMaxHostGeneral );
-#ifdef HAVE_CUDA
-    benchmark.time( reset1, "GPU", absMaxCuda );
-    benchmark.time( reset1, "cuBLAS", absMaxCublas );
-#endif
-
-
-    auto absMinHost = [&]() {
-        resultHost = hostVector.absMin();
-    };
-    auto absMinHostGeneral = [&]() {
-        Real result( 0 );
-        Containers::Algorithms::ParallelReductionAbsMin< Real > operation;
-        Containers::Algorithms::Reduction< Devices::Host >::reduce(
-              operation,
-              hostVector.getSize(),
-              hostVector.getData(),
-              ( Real* ) 0,
-              result );
-        return result;
-    };
-    auto absMinCuda = [&]() {
-        resultDevice = deviceVector.absMin();
-    };
-#ifdef HAVE_CUDA
-    auto absMinCublas = [&]() {
-        int index = 0;
-        cublasIgamin( cublasHandle, size,
-                      deviceVector.getData(), 1,
-                      &index );
-        resultDevice = deviceVector.getElement( index );
-    };
-#endif
-    benchmark.setOperation( "absMin", datasetSize );
-    benchmark.time( reset1, "CPU", absMinHost );
-    benchmark.time( reset1, "CPU (general)", absMinHostGeneral );
-#ifdef HAVE_CUDA
-    benchmark.time( reset1, "GPU", absMinCuda );
-    benchmark.time( reset1, "cuBLAS", absMinCublas );
-#endif
-
-
-    auto sumHost = [&]() {
-        resultHost = hostVector.sum();
-    };
-    auto sumHostGeneral = [&]() {
-        Real result( 0 );
-        Containers::Algorithms::ParallelReductionSum< Real > operation;
-        Containers::Algorithms::Reduction< Devices::Host >::reduce(
-              operation,
-              hostVector.getSize(),
-              hostVector.getData(),
-              ( Real* ) 0,
-              result );
-        return result;
-    };
-    auto sumCuda = [&]() {
-        resultDevice = deviceVector.sum();
-    };
-    benchmark.setOperation( "sum", datasetSize );
-    benchmark.time( reset1, "CPU", sumHost );
-    benchmark.time( reset1, "CPU (general)", sumHostGeneral );
-#ifdef HAVE_CUDA
-    benchmark.time( reset1, "GPU", sumCuda );
-#endif
-
-
-    auto l1normHost = [&]() {
-        resultHost = hostVector.lpNorm( 1.0 );
-    };
-    auto l1normHostGeneral = [&]() {
-        Real result( 0 );
-        Containers::Algorithms::ParallelReductionAbsSum< Real > operation;
-        Containers::Algorithms::Reduction< Devices::Host >::reduce(
-              operation,
-              hostVector.getSize(),
-              hostVector.getData(),
-              ( Real* ) 0,
-              result );
-        return result;
-    };
-    auto l1normCuda = [&]() {
-        resultDevice = deviceVector.lpNorm( 1.0 );
-    };
-#ifdef HAVE_CUDA
-    auto l1normCublas = [&]() {
-        cublasGasum( cublasHandle, size,
-                     deviceVector.getData(), 1,
-                     &resultDevice );
-    };
-#endif
-    benchmark.setOperation( "l1 norm", datasetSize );
-    benchmark.time( reset1, "CPU", l1normHost );
-    benchmark.time( reset1, "CPU (general)", l1normHostGeneral );
-#ifdef HAVE_CUDA
-    benchmark.time( reset1, "GPU", l1normCuda );
-    benchmark.time( reset1, "cuBLAS", l1normCublas );
-#endif
-
-
-    auto l2normHost = [&]() {
-        resultHost = hostVector.lpNorm( 2.0 );
-    };
-    auto l2normHostGeneral = [&]() {
-        Real result( 0 );
-        Containers::Algorithms::ParallelReductionL2Norm< Real > operation;
-        Containers::Algorithms::Reduction< Devices::Host >::reduce(
-              operation,
-              hostVector.getSize(),
-              hostVector.getData(),
-              ( Real* ) 0,
-              result );
-        return result;
-    };
-    auto l2normCuda = [&]() {
-        resultDevice = deviceVector.lpNorm( 2.0 );
-    };
-#ifdef HAVE_CUDA
-    auto l2normCublas = [&]() {
-        cublasGnrm2( cublasHandle, size,
-                     deviceVector.getData(), 1,
-                     &resultDevice );
-    };
-#endif
-    benchmark.setOperation( "l2 norm", datasetSize );
-    benchmark.time( reset1, "CPU", l2normHost );
-    benchmark.time( reset1, "CPU (general)", l2normHostGeneral );
-#ifdef HAVE_CUDA
-    benchmark.time( reset1, "GPU", l2normCuda );
-    benchmark.time( reset1, "cuBLAS", l2normCublas );
-#endif
-
-
-    auto l3normHost = [&]() {
-        resultHost = hostVector.lpNorm( 3.0 );
-    };
-    auto l3normHostGeneral = [&]() {
-        Real result( 0 );
-        Containers::Algorithms::ParallelReductionLpNorm< Real > operation;
-        operation.setPower( 3.0 );
-        Containers::Algorithms::Reduction< Devices::Host >::reduce(
-              operation,
-              hostVector.getSize(),
-              hostVector.getData(),
-              ( Real* ) 0,
-              result );
-        return result;
-    };
-    auto l3normCuda = [&]() {
-        resultDevice = deviceVector.lpNorm( 3.0 );
-    };
-    benchmark.setOperation( "l3 norm", datasetSize );
-    benchmark.time( reset1, "CPU", l3normHost );
-    benchmark.time( reset1, "CPU (general)", l3normHostGeneral );
-#ifdef HAVE_CUDA
-    benchmark.time( reset1, "GPU", l3normCuda );
-#endif
-
-
-    auto scalarProductHost = [&]() {
-        resultHost = hostVector.scalarProduct( hostVector2 );
-    };
-    auto scalarProductHostGeneral = [&]() {
-        Real result( 0 );
-        Containers::Algorithms::ParallelReductionScalarProduct< Real, Real > operation;
-        Containers::Algorithms::Reduction< Devices::Host >::reduce(
-              operation,
-              hostVector.getSize(),
-              hostVector.getData(),
-              hostVector2.getData(),
-              result );
-        return result;
-    };
-    auto scalarProductCuda = [&]() {
-        resultDevice = deviceVector.scalarProduct( deviceVector2 );
-    };
-#ifdef HAVE_CUDA
-    auto scalarProductCublas = [&]() {
-        cublasGdot( cublasHandle, size,
-                    deviceVector.getData(), 1,
-                    deviceVector2.getData(), 1,
-                    &resultDevice );
-    };
-#endif
-    benchmark.setOperation( "scalar product", 2 * datasetSize );
-    benchmark.time( reset1, "CPU", scalarProductHost );
-    benchmark.time( reset1, "CPU (general)", scalarProductHostGeneral );
-#ifdef HAVE_CUDA
-    benchmark.time( reset1, "GPU", scalarProductCuda );
-    benchmark.time( reset1, "cuBLAS", scalarProductCublas );
-#endif
-
-    /*
-   std::cout << "Benchmarking prefix-sum:" << std::endl;
-    timer.reset();
-    timer.start();
-    hostVector.computePrefixSum();
-    timer.stop();
-    timeHost = timer.getTime();
-    bandwidth = 2 * datasetSize / loops / timer.getTime();
-   std::cout << "  CPU: bandwidth: " << bandwidth << " GB/sec, time: " << timer.getTime() << " sec." << std::endl;
-
-    timer.reset();
-    timer.start();
-    deviceVector.computePrefixSum();
-    timer.stop();
-    timeDevice = timer.getTime();
-    bandwidth = 2 * datasetSize / loops / timer.getTime();
-   std::cout << "  GPU: bandwidth: " << bandwidth << " GB/sec, time: " << timer.getTime() << " sec." << std::endl;
-   std::cout << "  CPU/GPU speedup: " << timeHost / timeDevice << std::endl;
-
-    HostVector auxHostVector;
-    auxHostVector.setLike( deviceVector );
-    auxHostVector = deviceVector;
-    for( int i = 0; i < size; i++ )
-       if( hostVector.getElement( i ) != auxHostVector.getElement( i ) )
-       {
-          std::cerr << "Error in prefix sum at position " << i << ":  " << hostVector.getElement( i ) << " != " << auxHostVector.getElement( i ) << std::endl;
-       }
-    */
-
-
-    auto multiplyHost = [&]() {
-        hostVector *= 0.5;
-    };
-    auto multiplyCuda = [&]() {
-        deviceVector *= 0.5;
-    };
-#ifdef HAVE_CUDA
-    auto multiplyCublas = [&]() {
-        const Real alpha = 0.5;
-        cublasGscal( cublasHandle, size,
-                     &alpha,
-                     deviceVector.getData(), 1 );
-    };
-#endif
-    benchmark.setOperation( "scalar multiplication", 2 * datasetSize );
-    benchmark.time( reset1, "CPU", multiplyHost );
-#ifdef HAVE_CUDA
-    benchmark.time( reset1, "GPU", multiplyCuda );
-    benchmark.time( reset1, "cuBLAS", multiplyCublas );
-#endif
-
-
-    auto addVectorHost = [&]() {
-        hostVector.addVector( hostVector2 );
-    };
-    auto addVectorCuda = [&]() {
-        deviceVector.addVector( deviceVector2 );
-    };
-#ifdef HAVE_CUDA
-    auto addVectorCublas = [&]() {
-        const Real alpha = 1.0;
-        cublasGaxpy( cublasHandle, size,
-                     &alpha,
-                     deviceVector2.getData(), 1,
-                     deviceVector.getData(), 1 );
-    };
-#endif
-    benchmark.setOperation( "vector addition", 3 * datasetSize );
-    benchmark.time( reset1, "CPU", addVectorHost );
-#ifdef HAVE_CUDA
-    benchmark.time( reset1, "GPU", addVectorCuda );
-    benchmark.time( reset1, "cuBLAS", addVectorCublas );
-#endif
-
-
-#ifdef HAVE_CUDA
-    cublasDestroy( cublasHandle );
-#endif
-
-    return true;
-}
-
-} // namespace benchmarks
-} // namespace tnl