diff --git a/src/Benchmarks/Benchmarks.h b/src/Benchmarks/Benchmarks.h
index 007e930012363796bc9388a70ddd5ba819e5b264..0f719301ddb472d3bfa701ed88f91473867e844d 100644
--- a/src/Benchmarks/Benchmarks.h
+++ b/src/Benchmarks/Benchmarks.h
@@ -16,6 +16,8 @@
 #include <iomanip>
 #include <map>
 #include <vector>
+#include <exception>
+#include <limits>
 
 #include <TNL/Timer.h>
 #include <TNL/String.h>
@@ -310,7 +312,6 @@ public:
    {
       closeTable();
       writeTitle( title );
-      monitor.setStage( title.getString() );
    }
 
    // Marks the start of a new benchmark (with custom metadata)
@@ -320,7 +321,6 @@ public:
    {
       closeTable();
       writeTitle( title );
-      monitor.setStage( title.getString() );
       // add loops to metadata
       metadata["loops"] = String(loops);
       writeMetadata( metadata );
@@ -347,6 +347,7 @@ public:
                  const double datasetSize = 0.0, // in GB
                  const double baseTime = 0.0 )
    {
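+      // show the current operation as the stage reported by the solver monitor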
+      monitor.setStage( operation.getString() );
       if( metadataColumns.size() > 0 && String(metadataColumns[ 0 ].first) == "operation" ) {
          metadataColumns[ 0 ].second = operation;
       }
@@ -401,13 +402,19 @@ public:
          ComputeFunction & compute )
    {
       double time;
-      if( verbose ) {
-         // run the monitor main loop
-         Solvers::SolverMonitorThread monitor_thread( monitor );
-         time = timeFunction( compute, reset, loops, monitor );
+      try {
+         if( verbose ) {
+            // run the monitor main loop
+            Solvers::SolverMonitorThread monitor_thread( monitor );
+            time = timeFunction( compute, reset, loops, monitor );
+         }
+         else {
+            time = timeFunction( compute, reset, loops, monitor );
+         }
       }
-      else {
-         time = timeFunction( compute, reset, loops, monitor );
+      catch( const std::exception& e ) {
+         std::cerr << "timeFunction failed with a C++ exception: " << e.what() << std::endl;
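+         // mark the measurement as failed; the NaN propagates into the reported time and bandwidth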
+         time = std::numeric_limits<double>::quiet_NaN();
       }
 
       const double bandwidth = datasetSize / time;
@@ -450,6 +457,12 @@ public:
 
    using Logging::save;
 
+   Solvers::IterativeSolverMonitor< double, int >&
+   getMonitor()
+   {
+      return monitor;
+   }
+
 protected:
    int loops;
    double datasetSize = 0.0;
diff --git a/src/Benchmarks/LinearSolvers/CMakeLists.txt b/src/Benchmarks/LinearSolvers/CMakeLists.txt
index 1a95c92f791c7d7a0176415fd4968d8aa6bb8982..af0187928335a85638fa07680b79a7ad797c601a 100644
--- a/src/Benchmarks/LinearSolvers/CMakeLists.txt
+++ b/src/Benchmarks/LinearSolvers/CMakeLists.txt
@@ -1,9 +1,12 @@
 if( BUILD_CUDA )
-    CUDA_ADD_EXECUTABLE( tnl-benchmark-linear-solvers tnl-benchmark-linear-solvers.cu )
-    TARGET_LINK_LIBRARIES( tnl-benchmark-linear-solvers tnl )
-else()
-    ADD_EXECUTABLE( tnl-benchmark-linear-solvers tnl-benchmark-linear-solvers.cpp )
-    TARGET_LINK_LIBRARIES( tnl-benchmark-linear-solvers tnl )
+   CUDA_ADD_EXECUTABLE( tnl-benchmark-linear-solvers-cuda tnl-benchmark-linear-solvers.cu
+                        OPTIONS -DHAVE_CUSPARSE )
+   TARGET_LINK_LIBRARIES( tnl-benchmark-linear-solvers-cuda tnl ${CUDA_cusparse_LIBRARY} )
+
+   install( TARGETS tnl-benchmark-linear-solvers-cuda RUNTIME DESTINATION bin )
 endif()
 
+ADD_EXECUTABLE( tnl-benchmark-linear-solvers tnl-benchmark-linear-solvers.cpp )
+TARGET_LINK_LIBRARIES( tnl-benchmark-linear-solvers tnl )
+
 install( TARGETS tnl-benchmark-linear-solvers RUNTIME DESTINATION bin )
diff --git a/src/Benchmarks/LinearSolvers/CuSolverWrapper.h b/src/Benchmarks/LinearSolvers/CuSolverWrapper.h
new file mode 100644
index 0000000000000000000000000000000000000000..40b827ef9a8854f0057dbc1a0f3c556ba33bd703
--- /dev/null
+++ b/src/Benchmarks/LinearSolvers/CuSolverWrapper.h
@@ -0,0 +1,135 @@
+#pragma once
+
+#ifdef HAVE_CUDA
+
+#include <cusolverSp.h>
+
+#include <TNL/Config/ConfigDescription.h>
+#include <TNL/Matrices/CSR.h>
+#include <TNL/Solvers/Linear/LinearSolver.h>
+
+
+template< typename Matrix >
+struct is_csr_matrix
+{
+    static constexpr bool value = false;
+};
+
+template< typename Real, typename Device, typename Index >
+struct is_csr_matrix< TNL::Matrices::CSR< Real, Device, Index > >
+{
+    static constexpr bool value = true;
+};
+
+
+namespace TNL {
+namespace Solvers {
+namespace Linear {
+
+template< typename Matrix >
+class CuSolverWrapper
+    : public LinearSolver< Matrix >
+{
+    static_assert( ::is_csr_matrix< Matrix >::value, "The CuSolverWrapper solver is available only for CSR matrices." );
+    static_assert( std::is_same< typename Matrix::DeviceType, Devices::Cuda >::value, "CuSolverWrapper is available only on CUDA" );
+    static_assert( std::is_same< typename Matrix::RealType, float >::value ||
+                   std::is_same< typename Matrix::RealType, double >::value, "unsupported RealType" );
+    static_assert( std::is_same< typename Matrix::IndexType, int >::value, "unsupported IndexType" );
+
+    using Base = LinearSolver< Matrix >;
+public:
+    using RealType = typename Base::RealType;
+    using DeviceType = typename Base::DeviceType;
+    using IndexType = typename Base::IndexType;
+    using VectorViewType = typename Base::VectorViewType;
+    using ConstVectorViewType = typename Base::ConstVectorViewType;
+
+    bool solve( ConstVectorViewType b, VectorViewType x ) override
+    {
+        TNL_ASSERT_EQ( this->matrix->getRows(), this->matrix->getColumns(), "matrix must be square" );
+        TNL_ASSERT_EQ( this->matrix->getColumns(), x.getSize(), "wrong size of the solution vector" );
+        TNL_ASSERT_EQ( this->matrix->getColumns(), b.getSize(), "wrong size of the right hand side" );
+
+        const IndexType size = this->matrix->getRows();
+
+        this->resetIterations();
+        this->setResidue( this->getConvergenceResidue() + 1.0 );
+
+        cusolverSpHandle_t handle;
+        cusolverStatus_t status;
+        cusparseStatus_t cusparse_status;
+        cusparseMatDescr_t mat_descr;
+
+        status = cusolverSpCreate( &handle );
+        if( status != CUSOLVER_STATUS_SUCCESS ) {
+            std::cerr << "cusolverSpCreate failed: " << status << std::endl;
+            return false;
+        }
+
+        cusparse_status = cusparseCreateMatDescr( &mat_descr );
+        if( cusparse_status != CUSPARSE_STATUS_SUCCESS ) {
+            std::cerr << "cusparseCreateMatDescr failed: " << cusparse_status << std::endl;
+            return false;
+        }
+
+        cusparseSetMatType( mat_descr, CUSPARSE_MATRIX_TYPE_GENERAL );
+        cusparseSetMatIndexBase( mat_descr, CUSPARSE_INDEX_BASE_ZERO );
+
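+        // cusolverSp{S,D}csrlsvqr solves A*x = b on the device via sparse QR factorization;
+        // 'singularity' is set to -1 if A is non-singular with respect to 'tol'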
+        const RealType tol = 1e-16;
+        const int reorder = 0;
+        int singularity = 0;
+
+        if( std::is_same< typename Matrix::RealType, float >::value )
+        {
+            status = cusolverSpScsrlsvqr( handle,
+                                          size,
+                                          this->matrix->getValues().getSize(),
+                                          mat_descr,
+                                          (const float*) this->matrix->getValues().getData(),
+                                          this->matrix->getRowPointers().getData(),
+                                          this->matrix->getColumnIndexes().getData(),
+                                          (const float*) b.getData(),
+                                          tol,
+                                          reorder,
+                                          (float*) x.getData(),
+                                          &singularity );
+
+            if( status != CUSOLVER_STATUS_SUCCESS ) {
+                std::cerr << "cusolverSpScsrlsvqr failed: " << status << std::endl;
+                return false;
+            }
+        }
+
+        if( std::is_same< typename Matrix::RealType, double >::value )
+        {
+            status = cusolverSpDcsrlsvqr( handle,
+                                          size,
+                                          this->matrix->getValues().getSize(),
+                                          mat_descr,
+                                          (const double*) this->matrix->getValues().getData(),
+                                          this->matrix->getRowPointers().getData(),
+                                          this->matrix->getColumnIndexes().getData(),
+                                          (const double*) b.getData(),
+                                          tol,
+                                          reorder,
+                                          (double*) x.getData(),
+                                          &singularity );
+
+            if( status != CUSOLVER_STATUS_SUCCESS ) {
+                std::cerr << "cusolverSpDcsrlsvqr failed: " << status << std::endl;
+                return false;
+            }
+        }
+
+        cusparseDestroyMatDescr( mat_descr );
+        cusolverSpDestroy( handle );
+
+        return true;
+    }
+};
+
+} // namespace Linear
+} // namespace Solvers
+} // namespace TNL
+
+#endif
diff --git a/src/Benchmarks/LinearSolvers/benchmarks.h b/src/Benchmarks/LinearSolvers/benchmarks.h
new file mode 100644
index 0000000000000000000000000000000000000000..1ccde92e6cbbf401e0630f84da99963cc1ed0180
--- /dev/null
+++ b/src/Benchmarks/LinearSolvers/benchmarks.h
@@ -0,0 +1,191 @@
+#pragma once
+
+#include <TNL/Pointers/SharedPointer.h>
+#include <TNL/Config/ParameterContainer.h>
+#include <TNL/Solvers/IterativeSolverMonitor.h>
+#include <TNL/DistributedContainers/DistributedMatrix.h>
+
+#include "../Benchmarks.h"
+
+#ifdef HAVE_ARMADILLO
+#include <armadillo>
+#include <TNL/Matrices/CSR.h>
+#endif
+
+#include <stdexcept>  // std::runtime_error
+
+using namespace TNL;
+using namespace TNL::Pointers;
+using namespace TNL::Benchmarks;
+
+
+template< typename Device >
+const char*
+getPerformer()
+{
+   if( std::is_same< Device, Devices::Cuda >::value )
+      return "GPU";
+   return "CPU";
+}
+
+template< typename Matrix >
+void barrier( const Matrix& matrix )
+{
+}
+
+template< typename Matrix, typename Communicator >
+void barrier( const DistributedContainers::DistributedMatrix< Matrix, Communicator >& matrix )
+{
+   Communicator::Barrier( matrix.getCommunicationGroup() );
+}
+
+template< template<typename> class Preconditioner, typename Matrix >
+void
+benchmarkPreconditionerUpdate( Benchmark& benchmark,
+                               const Config::ParameterContainer& parameters,
+                               const SharedPointer< Matrix >& matrix )
+{
+   barrier( *matrix );
+   const char* performer = getPerformer< typename Matrix::DeviceType >();
+   Preconditioner< Matrix > preconditioner;
+   preconditioner.setup( parameters );
+
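+   // nothing to reset between loops; only the preconditioner update itself is timed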
+   auto reset = []() {};
+   auto compute = [&]() {
+      preconditioner.update( matrix );
+      barrier( *matrix );
+   };
+
+   benchmark.time( reset, performer, compute );
+}
+
+template< template<typename> class Solver, template<typename> class Preconditioner, typename Matrix, typename Vector >
+void
+benchmarkSolver( Benchmark& benchmark,
+                 const Config::ParameterContainer& parameters,
+                 const SharedPointer< Matrix >& matrix,
+                 const Vector& x0,
+                 const Vector& b )
+{
+   barrier( *matrix );
+   const char* performer = getPerformer< typename Matrix::DeviceType >();
+
+   // setup
+   Solver< Matrix > solver;
+   solver.setup( parameters );
+   solver.setMatrix( matrix );
+
+   // FIXME: getMonitor returns solver monitor specialized for double and int
+   solver.setSolverMonitor( benchmark.getMonitor() );
+
+   auto pre = std::make_shared< Preconditioner< Matrix > >();
+   pre->setup( parameters );
+   solver.setPreconditioner( pre );
+   // preconditioner update may throw if it's not implemented for CUDA
+   try {
+      pre->update( matrix );
+   }
+   catch( const std::runtime_error& ) {}
+
+   Vector x;
+   x.setLike( x0 );
+
+   // reset function
+   auto reset = [&]() {
+      x = x0;
+   };
+
+   // benchmark function
+   auto compute = [&]() {
+      const bool converged = solver.solve( b, x );
+      barrier( *matrix );
+      if( ! converged )
+         throw std::runtime_error("solver did not converge");
+   };
+
+   benchmark.time( reset, performer, compute );
+
+   // TODO: add extra columns to the benchmark (iterations, preconditioned residue, true residue)
+//   Vector r;
+//   r.setLike( x );
+//   matrix->vectorProduct( x, r );
+//   r.addVector( b, 1.0, -1.0 );
+//   const double trueResidue = r.lpNorm( 2.0 ) / b.lpNorm( 2.0 );
+
+//    std::cout << "Converged: " << solver.checkConvergence() << ", iterations = " << solver.getIterations() << ", residue = " << solver.getResidue() << "    " << std::endl;
+//    std::cout << "Mean time: " << time / loops << " seconds." << std::endl;
+//    std::cout << outputPrefix
+//              << "converged: "   << std::setw( 5) << std::boolalpha << ( ! std::isnan(solver.getResidue()) && solver.getResidue() < solver.getConvergenceResidue() ) << "   "
+//              << "iterations = " << std::setw( 4) << solver.getIterations() << "   "
+//              << "preconditioned residue = " << std::setw(10) << solver.getResidue() << "   "
+//              << "true residue = " << std::setw(10) << trueResidue << "   "
+//              << "mean time = "  << std::setw( 9) << time / loops << " seconds." << std::endl;
+}
+
+#ifdef HAVE_ARMADILLO
+// TODO: make a TNL solver like UmfpackWrapper
+template< typename Vector >
+void
+benchmarkArmadillo( const Config::ParameterContainer& parameters,
+                    const Pointers::SharedPointer< Matrices::CSR< double, Devices::Host, int > >& matrix,
+                    const Vector& x0,
+                    const Vector& b )
+{
+    using namespace TNL;
+
+    // copy matrix into Armadillo's class
+    // sp_mat is in CSC format
+    arma::uvec _colptr( matrix->getRowPointers().getSize() );
+    for( int i = 0; i < matrix->getRowPointers().getSize(); i++ )
+        _colptr[ i ] = matrix->getRowPointers()[ i ];
+    arma::uvec _rowind( matrix->getColumnIndexes().getSize() );
+    for( int i = 0; i < matrix->getColumnIndexes().getSize(); i++ )
+        _rowind[ i ] = matrix->getColumnIndexes()[ i ];
+    arma::vec _values( matrix->getValues().getData(), matrix->getValues().getSize() );
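+    // the CSR arrays reinterpreted as CSC describe the transposed matrix, hence the .t() below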
+    arma::sp_mat AT( _rowind,
+                     _colptr,
+                     _values,
+                     matrix->getColumns(),
+                     matrix->getRows() );
+    arma::sp_mat A = AT.t();
+
+    Vector x;
+    x.setLike( x0 );
+
+    // Armadillo vector using the same memory as x (with copy_aux_mem=false, strict=true)
+    arma::vec arma_x( x.getData(), x.getSize(), false, true );
+    arma::vec arma_b( b.getData(), b.getSize() );
+
+    arma::superlu_opts settings;
+//    settings.equilibrate = false;
+    settings.equilibrate = true;
+    settings.pivot_thresh = 1.0;
+//    settings.permutation = arma::superlu_opts::COLAMD;
+    settings.permutation = arma::superlu_opts::MMD_AT_PLUS_A;
+//    settings.refine = arma::superlu_opts::REF_DOUBLE;
+    settings.refine = arma::superlu_opts::REF_NONE;
+
+    // reset function
+    auto reset = [&]() {
+        x = x0;
+    };
+
+    // benchmark function
+    auto compute = [&]() {
+        const bool converged = arma::spsolve( arma_x, A, arma_b, "superlu", settings );
+        if( ! converged )
+            throw std::runtime_error("solver did not converge");
+    };
+
+    const int loops = parameters.getParameter< int >( "loops" );
+    double time = timeFunction( compute, reset, loops );
+
+    arma::vec r = A * arma_x - arma_b;
+//    std::cout << "Converged: " << (time > 0) << ", residue = " << arma::norm( r ) / arma::norm( arma_b ) << "                                                 " << std::endl;
+//    std::cout << "Mean time: " << time / loops << " seconds." << std::endl;
+    std::cout << "Converged: "   << std::setw( 5) << std::boolalpha << (time > 0) << "   "
+              << "iterations = " << std::setw( 4) << "N/A" << "   "
+              << "residue = "    << std::setw(10) << arma::norm( r ) / arma::norm( arma_b ) << "   "
+              << "mean time = "  << std::setw( 9) << time / loops << " seconds." << std::endl;
+}
+#endif
diff --git a/src/Benchmarks/LinearSolvers/tnl-benchmark-linear-solvers.cpp b/src/Benchmarks/LinearSolvers/tnl-benchmark-linear-solvers.cpp
index 917636d6bfe0830f58ecfbf93992216b1a0b8177..f335b4f06733d635a09a4a6eac19a78fd4734637 100644
--- a/src/Benchmarks/LinearSolvers/tnl-benchmark-linear-solvers.cpp
+++ b/src/Benchmarks/LinearSolvers/tnl-benchmark-linear-solvers.cpp
@@ -1,11 +1 @@
-/***************************************************************************
-                          tnl-benchmark-linear-solvers.cpp  -  description
-                             -------------------
-    begin                : Jun 22, 2014
-    copyright            : (C) 2014 by Tomas Oberhuber
-    email                : tomas.oberhuber@fjfi.cvut.cz
- ***************************************************************************/
-
-/* See Copyright Notice in tnl/Copyright */
-
 #include "tnl-benchmark-linear-solvers.h"
diff --git a/src/Benchmarks/LinearSolvers/tnl-benchmark-linear-solvers.cu b/src/Benchmarks/LinearSolvers/tnl-benchmark-linear-solvers.cu
index 2a117204e76a9881d55de83a13f31fcd40e154ae..f335b4f06733d635a09a4a6eac19a78fd4734637 100644
--- a/src/Benchmarks/LinearSolvers/tnl-benchmark-linear-solvers.cu
+++ b/src/Benchmarks/LinearSolvers/tnl-benchmark-linear-solvers.cu
@@ -1,11 +1 @@
-/***************************************************************************
-                          tnl-benchmark-linear-solvers.cu  -  description
-                             -------------------
-    begin                : Jun 22, 2014
-    copyright            : (C) 2014 by Tomas Oberhuber
-    email                : tomas.oberhuber@fjfi.cvut.cz
- ***************************************************************************/
-
-/* See Copyright Notice in tnl/Copyright */
-
 #include "tnl-benchmark-linear-solvers.h"
diff --git a/src/Benchmarks/LinearSolvers/tnl-benchmark-linear-solvers.h b/src/Benchmarks/LinearSolvers/tnl-benchmark-linear-solvers.h
index 3e50f973e5c4b36cd3721fa53ea224d7dfa49f66..fe10fd72e1d0d332b6a5873b3cf746006858cfee 100644
--- a/src/Benchmarks/LinearSolvers/tnl-benchmark-linear-solvers.h
+++ b/src/Benchmarks/LinearSolvers/tnl-benchmark-linear-solvers.h
@@ -1,227 +1,472 @@
 /***************************************************************************
                           tnl-benchmark-linear-solvers.h  -  description
                              -------------------
-    begin                : Jun 22, 2014
-    copyright            : (C) 2014 by Tomas Oberhuber
+    begin                : Sep 18, 2018
+    copyright            : (C) 2018 by Tomas Oberhuber et al.
     email                : tomas.oberhuber@fjfi.cvut.cz
  ***************************************************************************/
 
 /* See Copyright Notice in tnl/Copyright */
 
-#ifndef TNL_BENCHMARK_LINEAR_SOLVERS_H_
-#define TNL_BENCHMARK_LINEAR_SOLVERS_H_
+// Implemented by: Jakub Klinkovsky
 
-#include <fstream>
-#include <iomanip>
-#include <unistd.h>
+#pragma once
+
+#ifndef NDEBUG
+#include <TNL/Debugging/FPE.h>
+#endif
 
 #include <TNL/Config/ConfigDescription.h>
 #include <TNL/Config/ParameterContainer.h>
-#include <TNL/Timer.h>
-#include <TNL/Pointers/SharedPointer.h>
-#include <TNL/Matrices/Dense.h>
-#include <TNL/Matrices/Tridiagonal.h>
-#include <TNL/Matrices/Multidiagonal.h>
-#include <TNL/Matrices/CSR.h>
-#include <TNL/Matrices/Ellpack.h>
-#include <TNL/Matrices/SlicedEllpack.h>
-#include <TNL/Matrices/ChunkedEllpack.h>
-#include <TNL/Matrices/MatrixReader.h>
+#include <TNL/Devices/Host.h>
+#include <TNL/Devices/Cuda.h>
+#include <TNL/Communicators/MpiCommunicator.h>
+#include <TNL/Communicators/NoDistrCommunicator.h>
+#include <TNL/Communicators/ScopedInitializer.h>
+#include <TNL/DistributedContainers/Partitioner.h>
+#include <TNL/DistributedContainers/DistributedVector.h>
+#include <TNL/DistributedContainers/DistributedMatrix.h>
+#include <TNL/Solvers/Linear/Preconditioners/Diagonal.h>
+#include <TNL/Solvers/Linear/Preconditioners/ILU0.h>
+#include <TNL/Solvers/Linear/Preconditioners/ILUT.h>
 #include <TNL/Solvers/Linear/GMRES.h>
-#include <TNL/Solvers/Linear/CG.h>
-#include <TNL/Solvers/Linear/BICGStab.h>
+#include <TNL/Solvers/Linear/CWYGMRES.h>
 #include <TNL/Solvers/Linear/TFQMR.h>
-#include <TNL/Solvers/IterativeSolverMonitor.h>
+#include <TNL/Solvers/Linear/BICGStab.h>
+#include <TNL/Solvers/Linear/BICGStabL.h>
+#include <TNL/Solvers/Linear/UmfpackWrapper.h>
+
+#include "../Benchmarks.h"
+#include "../DistSpMV/ordering.h"
+#include "benchmarks.h"
+
+// FIXME: nvcc 8.0 fails when cusolverSp.h is included (works fine with clang):
+// /opt/cuda/include/cuda_fp16.h(3068): error: more than one instance of overloaded function "isinf" matches the argument list:
+//             function "isinf(float)"
+//             function "std::isinf(float)"
+//             argument types are: (float)
+#if defined(HAVE_CUDA) && !defined(__NVCC__)
+   #include "CuSolverWrapper.h"
+   #define HAVE_CUSOLVER
+#endif
 
-using namespace TNL;
-using namespace TNL::Matrices;
+#include <TNL/Matrices/SlicedEllpack.h>
 
-void configSetup( Config::ConfigDescription& config )
+using namespace TNL;
+using namespace TNL::Benchmarks;
+using namespace TNL::Pointers;
+
+#ifdef HAVE_MPI
+using CommunicatorType = Communicators::MpiCommunicator;
+#else
+using CommunicatorType = Communicators::NoDistrCommunicator;
+#endif
+
+
+template< typename Matrix, typename Vector >
+void
+benchmarkIterativeSolvers( Benchmark& benchmark,
+// FIXME: ParameterContainer should be copyable, but that leads to double-free
+//                           Config::ParameterContainer parameters,
+                           Config::ParameterContainer& parameters,
+                           const SharedPointer< Matrix >& matrixPointer,
+                           const Vector& x0,
+                           const Vector& b )
 {
-   config.addDelimiter                            ( "General settings:" );
-   config.addRequiredEntry< String >( "test" , "Test to be performed." );
-      config.addEntryEnum< String >( "mtx" );
-      config.addEntryEnum< String >( "tnl" );
-   config.addRequiredEntry< String >( "input-file" , "Input binary file name." );
-   config.addEntry< String >( "log-file", "Log file name.", "tnl-benchmark-linear-solvers.log");
-   config.addEntry< String >( "precision", "Precision of the arithmetics.", "double" );
-   config.addEntry< String >( "matrix-format", "Matrix format.", "csr" );
-      config.addEntryEnum< String >( "dense" );
-      config.addEntryEnum< String >( "tridiagonal" );
-      config.addEntryEnum< String >( "multidiagonal" );
-      config.addEntryEnum< String >( "ellpack" );
-      config.addEntryEnum< String >( "sliced-ellpack" );
-      config.addEntryEnum< String >( "chunked-ellpack" );
-      config.addEntryEnum< String >( "csr" );
-   config.addEntry< String >( "solver", "Linear solver.", "gmres" );
-      config.addEntryEnum< String >( "sor" );
-      config.addEntryEnum< String >( "cg" );
-      config.addEntryEnum< String >( "gmres" );
-   config.addEntry< String >( "device", "Device.", "host" );
-      config.addEntryEnum< String >( "host" );
-      config.addEntryEnum< String >( "cuda" );
-   config.addEntry< int >( "verbose", "Verbose mode.", 1 );
-}
+#ifdef HAVE_CUDA
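+   // prepare GPU copies of the matrix and vectors so that every solver is benchmarked on both host and CUDA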
+   using CudaMatrix = typename Matrix::CudaType;
+   using CudaVector = typename Vector::CudaType;
+
+   CudaVector cuda_x0, cuda_b;
+   cuda_x0 = x0;
+   cuda_b = b;
+
+   SharedPointer< CudaMatrix > cudaMatrixPointer;
+   *cudaMatrixPointer = *matrixPointer;
+
+   // synchronize shared pointers
+   Devices::Cuda::synchronizeDevice();
+#endif
+
+   using namespace Solvers::Linear;
+   using namespace Solvers::Linear::Preconditioners;
+
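+   // maximal value of the ell parameter for BiCGstab(ell)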
+   const int ell_max = 2;
+
+   benchmark.setOperation("preconditioner update (Jacobi)");
+   benchmarkPreconditionerUpdate< Diagonal >( benchmark, parameters, matrixPointer );
+#ifdef HAVE_CUDA
+   benchmarkPreconditionerUpdate< Diagonal >( benchmark, parameters, cudaMatrixPointer );
+#endif
+
+   benchmark.setOperation("GMRES (Jacobi)");
+   benchmarkSolver< GMRES, Diagonal >( benchmark, parameters, matrixPointer, x0, b );
+#ifdef HAVE_CUDA
+   benchmarkSolver< GMRES, Diagonal >( benchmark, parameters, cudaMatrixPointer, cuda_x0, cuda_b );
+#endif
+
+   benchmark.setOperation("CWYGMRES (Jacobi)");
+   benchmarkSolver< CWYGMRES, Diagonal >( benchmark, parameters, matrixPointer, x0, b );
+#ifdef HAVE_CUDA
+   benchmarkSolver< CWYGMRES, Diagonal >( benchmark, parameters, cudaMatrixPointer, cuda_x0, cuda_b );
+#endif
+
+   benchmark.setOperation("TFQMR (Jacobi)");
+   benchmarkSolver< TFQMR, Diagonal >( benchmark, parameters, matrixPointer, x0, b );
+#ifdef HAVE_CUDA
+   benchmarkSolver< TFQMR, Diagonal >( benchmark, parameters, cudaMatrixPointer, cuda_x0, cuda_b );
+#endif
+
+   benchmark.setOperation("BiCGstab (Jacobi)");
+   benchmarkSolver< BICGStab, Diagonal >( benchmark, parameters, matrixPointer, x0, b );
+#ifdef HAVE_CUDA
+   benchmarkSolver< BICGStab, Diagonal >( benchmark, parameters, cudaMatrixPointer, cuda_x0, cuda_b );
+#endif
+
+   for( int ell = 1; ell <= ell_max; ell++ ) {
+      parameters.template setParameter< int >( "bicgstab-ell", ell );
+      benchmark.setOperation("BiCGstab(" + String(ell) + ") (Jacobi)");
+      benchmarkSolver< BICGStabL, Diagonal >( benchmark, parameters, matrixPointer, x0, b );
+#ifdef HAVE_CUDA
+      benchmarkSolver< BICGStabL, Diagonal >( benchmark, parameters, cudaMatrixPointer, cuda_x0, cuda_b );
+#endif
+   }
 
-template< typename Solver >
-bool benchmarkSolver( const Config::ParameterContainer& parameters,
-                      Pointers::SharedPointer<  typename Solver::MatrixType >& matrix)
-{
-   typedef typename Solver::MatrixType MatrixType;
-   typedef typename MatrixType::RealType RealType;
-   typedef typename MatrixType::DeviceType DeviceType;
-   typedef typename MatrixType::IndexType IndexType;
-   typedef Containers::Vector< RealType, DeviceType, IndexType > VectorType;
-   typedef Pointers::SharedPointer<  MatrixType > MatrixPointer;
-
-   VectorType x, y, b;
-   x.setSize( matrix->getColumns() );
-   x.setValue( 1.0 / ( RealType ) matrix->getColumns() );
-   y.setSize( matrix->getColumns() );
-   b.setSize( matrix->getRows() );
-   matrix->vectorProduct( x, b );
-
-   Solver solver;
-   Solvers::IterativeSolverMonitor< RealType, IndexType > monitor;
-   monitor.setVerbose( 1 );
-   solver.setSolverMonitor( monitor );
-   solver.setMatrix( matrix );
-   solver.setConvergenceResidue( 1.0e-6 );
-   solver.solve( b, y );
-   std::cout << std::endl;
-   return true;
-}
 
-template< typename Matrix >
-bool readMatrix( const Config::ParameterContainer& parameters,
-                 Matrix& matrix )
-{
-   const String fileName = parameters.getParameter< String >( "input-file" );
+   benchmark.setOperation("preconditioner update (ILU0)");
+   benchmarkPreconditionerUpdate< ILU0 >( benchmark, parameters, matrixPointer );
+#ifdef HAVE_CUDA
+   benchmarkPreconditionerUpdate< ILU0 >( benchmark, parameters, cudaMatrixPointer );
+#endif
+
+   benchmark.setOperation("GMRES (ILU0)");
+   benchmarkSolver< GMRES, ILU0 >( benchmark, parameters, matrixPointer, x0, b );
+#ifdef HAVE_CUDA
+   benchmarkSolver< GMRES, ILU0 >( benchmark, parameters, cudaMatrixPointer, cuda_x0, cuda_b );
+#endif
+
+   benchmark.setOperation("CWYGMRES (ILU0)");
+   benchmarkSolver< CWYGMRES, ILU0 >( benchmark, parameters, matrixPointer, x0, b );
+#ifdef HAVE_CUDA
+   benchmarkSolver< CWYGMRES, ILU0 >( benchmark, parameters, cudaMatrixPointer, cuda_x0, cuda_b );
+#endif
+
+   benchmark.setOperation("TFQMR (ILU0)");
+   benchmarkSolver< TFQMR, ILU0 >( benchmark, parameters, matrixPointer, x0, b );
+#ifdef HAVE_CUDA
+   benchmarkSolver< TFQMR, ILU0 >( benchmark, parameters, cudaMatrixPointer, cuda_x0, cuda_b );
+#endif
+
+   benchmark.setOperation("BiCGstab (ILU0)");
+   benchmarkSolver< BICGStab, ILU0 >( benchmark, parameters, matrixPointer, x0, b );
+#ifdef HAVE_CUDA
+   benchmarkSolver< BICGStab, ILU0 >( benchmark, parameters, cudaMatrixPointer, cuda_x0, cuda_b );
+#endif
+
+   for( int ell = 1; ell <= ell_max; ell++ ) {
+      parameters.template setParameter< int >( "bicgstab-ell", ell );
+      benchmark.setOperation("BiCGstab(" + String(ell) + ") (ILU0)");
+      benchmarkSolver< BICGStabL, ILU0 >( benchmark, parameters, matrixPointer, x0, b );
+#ifdef HAVE_CUDA
+      benchmarkSolver< BICGStabL, ILU0 >( benchmark, parameters, cudaMatrixPointer, cuda_x0, cuda_b );
+#endif
+   }
 
-   Matrix* hostMatrix;
-   if( std::is_same< typename Matrix::DeviceType, Devices::Cuda >::value )
-   {
 
+   benchmark.setOperation("preconditioner update (ILUT)");
+   benchmarkPreconditionerUpdate< ILUT >( benchmark, parameters, matrixPointer );
+#ifdef HAVE_CUDA
+   benchmarkPreconditionerUpdate< ILUT >( benchmark, parameters, cudaMatrixPointer );
+#endif
+
+   benchmark.setOperation("GMRES (ILUT)");
+   benchmarkSolver< GMRES, ILUT >( benchmark, parameters, matrixPointer, x0, b );
+#ifdef HAVE_CUDA
+   benchmarkSolver< GMRES, ILUT >( benchmark, parameters, cudaMatrixPointer, cuda_x0, cuda_b );
+#endif
+
+   benchmark.setOperation("CWYGMRES (ILUT)");
+   benchmarkSolver< CWYGMRES, ILUT >( benchmark, parameters, matrixPointer, x0, b );
+#ifdef HAVE_CUDA
+   benchmarkSolver< CWYGMRES, ILUT >( benchmark, parameters, cudaMatrixPointer, cuda_x0, cuda_b );
+#endif
+
+   benchmark.setOperation("TFQMR (ILUT)");
+   benchmarkSolver< TFQMR, ILUT >( benchmark, parameters, matrixPointer, x0, b );
+#ifdef HAVE_CUDA
+   benchmarkSolver< TFQMR, ILUT >( benchmark, parameters, cudaMatrixPointer, cuda_x0, cuda_b );
+#endif
+
+   benchmark.setOperation("BiCGstab (ILUT)");
+   benchmarkSolver< BICGStab, ILUT >( benchmark, parameters, matrixPointer, x0, b );
+#ifdef HAVE_CUDA
+   benchmarkSolver< BICGStab, ILUT >( benchmark, parameters, cudaMatrixPointer, cuda_x0, cuda_b );
+#endif
+
+   for( int ell = 1; ell <= ell_max; ell++ ) {
+      parameters.template setParameter< int >( "bicgstab-ell", ell );
+      benchmark.setOperation("BiCGstab(" + String(ell) + ") (ILUT)");
+      benchmarkSolver< BICGStabL, ILUT >( benchmark, parameters, matrixPointer, x0, b );
+#ifdef HAVE_CUDA
+      benchmarkSolver< BICGStabL, ILUT >( benchmark, parameters, cudaMatrixPointer, cuda_x0, cuda_b );
+#endif
    }
-   if( std::is_same< typename Matrix::DeviceType, Devices::Host >::value )
-   {
-      hostMatrix = &matrix;
-      try
-      {
-         if( ! MatrixReader< Matrix >::readMtxFile( fileName, *hostMatrix ) )
-         {
-            std::cerr << "I am not able to read the matrix file " << fileName << "." << std::endl;
-            /*logFile << std::endl;
-            logFile << inputFileName << std::endl;
-            logFile << "Benchmark failed: Unable to read the matrix." << std::endl;*/
-            return false;
-         }
-      }
-      catch( std::bad_alloc )
-      {
-         std::cerr << "Not enough memory to read the matrix." << std::endl;
-         /*logFile << std::endl;
-         logFile << inputFileName << std::endl;
-         logFile << "Benchmark failed: Not enough memory." << std::endl;*/
-         return false;
-      }
-   }
-   return true;
 }
 
-template< typename Matrix >
-bool resolveLinearSolver( const Config::ParameterContainer& parameters )
+template< typename MatrixType >
+struct LinearSolversBenchmark
 {
-   const String& solver = parameters.getParameter< String >( "solver" );
-   typedef Pointers::SharedPointer<  Matrix > MatrixPointer;
+   using RealType = typename MatrixType::RealType;
+   using DeviceType = typename MatrixType::DeviceType;
+   using IndexType = typename MatrixType::IndexType;
+   using VectorType = Containers::Vector< RealType, DeviceType, IndexType >;
+
+   using Partitioner = DistributedContainers::Partitioner< IndexType, CommunicatorType >;
+   using DistributedMatrix = DistributedContainers::DistributedMatrix< MatrixType, CommunicatorType >;
+   using DistributedVector = DistributedContainers::DistributedVector< RealType, DeviceType, IndexType, CommunicatorType >;
+   using DistributedRowLengths = typename DistributedMatrix::CompressedRowLengthsVector;
+
+   static bool
+   run( Benchmark& benchmark,
+        Benchmark::MetadataMap metadata,
+// FIXME: ParameterContainer should be copyable, but that leads to double-free
+//        const Config::ParameterContainer& parameters )
+        Config::ParameterContainer& parameters )
+   {
+      SharedPointer< MatrixType > matrixPointer;
+      VectorType x0, b;
+      if( ! matrixPointer->load( parameters.getParameter< String >( "input-matrix" ) ) ||
+          ! x0.load( parameters.getParameter< String >( "input-dof" ) ) ||
+          ! b.load( parameters.getParameter< String >( "input-rhs" ) ) )
+          return false;
+
+      typename MatrixType::CompressedRowLengthsVector rowLengths;
+      matrixPointer->getCompressedRowLengths( rowLengths );
+      const IndexType maxRowLength = rowLengths.max();
+
+      const String name = String( (CommunicatorType::isDistributed()) ? "Distributed linear solvers" : "Linear solvers" )
+                          + " (" + parameters.getParameter< String >( "name" ) + "): ";
+      benchmark.newBenchmark( name, metadata );
+      benchmark.setMetadataColumns( Benchmark::MetadataColumns({
+         // TODO: strip the device
+//         {"matrix type", matrixPointer->getType()},
+         {"rows", matrixPointer->getRows()},
+         {"columns", matrixPointer->getColumns()},
+         // FIXME: getMaxRowLength() returns 0 for matrices loaded from file
+//         {"max elements per row", matrixPointer->getMaxRowLength()},
+         {"max elements per row", maxRowLength},
+      } ));
+
+      const bool reorder = parameters.getParameter< bool >( "reorder-dofs" );
+      if( reorder ) {
+         using PermutationVector = Containers::Vector< IndexType, DeviceType, IndexType >;
+         PermutationVector perm, iperm;
+         getTrivialOrdering( *matrixPointer, perm, iperm );
+         SharedPointer< MatrixType > matrix_perm;
+         VectorType x0_perm, b_perm;
+         reorderMatrix( *matrixPointer, *matrix_perm, perm, iperm );
+         reorderVector( x0, x0_perm, perm );
+         reorderVector( b, b_perm, perm );
+         if( CommunicatorType::isDistributed() )
+            runDistributed( benchmark, metadata, parameters, matrix_perm, x0_perm, b_perm );
+         else
+            runNonDistributed( benchmark, metadata, parameters, matrix_perm, x0_perm, b_perm );
+      }
+      else {
+         if( CommunicatorType::isDistributed() )
+            runDistributed( benchmark, metadata, parameters, matrixPointer, x0, b );
+         else
+            runNonDistributed( benchmark, metadata, parameters, matrixPointer, x0, b );
+      }
 
-   MatrixPointer matrix;
-   if( ! readMatrix( parameters, *matrix ) )
-      return false;
+      return true;
+   }
 
-   if( solver == "gmres" )
-      return benchmarkSolver< Solvers::Linear::GMRES< Matrix > >( parameters, matrix );
+   static void
+   runDistributed( Benchmark& benchmark,
+                   Benchmark::MetadataMap metadata,
+// FIXME: ParameterContainer should be copyable, but that leads to double-free
+//                   const Config::ParameterContainer& parameters,
+                   Config::ParameterContainer& parameters,
+                   const SharedPointer< MatrixType >& matrixPointer,
+                   const VectorType& x0,
+                   const VectorType& b )
+   {
+      // set up the distributed matrix
+      const auto group = CommunicatorType::AllGroup;
+      const auto localRange = Partitioner::splitRange( matrixPointer->getRows(), group );
+      SharedPointer< DistributedMatrix > distMatrixPointer( localRange, matrixPointer->getRows(), matrixPointer->getColumns(), group );
+      DistributedVector dist_x0( localRange, matrixPointer->getRows(), group );
+      DistributedVector dist_b( localRange, matrixPointer->getRows(), group );
+
+      // copy the row lengths from the global matrix to the distributed matrix
+      DistributedRowLengths distributedRowLengths( localRange, matrixPointer->getRows(), group );
+      for( IndexType i = 0; i < distMatrixPointer->getLocalMatrix().getRows(); i++ ) {
+         const auto gi = distMatrixPointer->getLocalRowRange().getGlobalIndex( i );
+         distributedRowLengths[ gi ] = matrixPointer->getRowLength( gi );
+      }
+      distMatrixPointer->setCompressedRowLengths( distributedRowLengths );
+
+      // copy data from the global matrix/vector into the distributed matrix/vector
+      for( IndexType i = 0; i < distMatrixPointer->getLocalMatrix().getRows(); i++ ) {
+         const auto gi = distMatrixPointer->getLocalRowRange().getGlobalIndex( i );
+         dist_x0[ gi ] = x0[ gi ];
+         dist_b[ gi ] = b[ gi ];
+
+         const IndexType rowLength = matrixPointer->getRowLength( gi );
+         IndexType columns[ rowLength ];
+         RealType values[ rowLength ];
+         matrixPointer->getRowFast( gi, columns, values );
+         distMatrixPointer->setRowFast( gi, columns, values, rowLength );
+      }
 
-   if( solver == "cg" )
-      return benchmarkSolver< Solvers::Linear::CG< Matrix > >( parameters, matrix );
+      std::cout << "Iterative solvers:" << std::endl;
+      benchmarkIterativeSolvers( benchmark, parameters, distMatrixPointer, dist_x0, dist_b );
+   }
 
-   if( solver == "bicgstab" )
-      return benchmarkSolver< Solvers::Linear::BICGStab< Matrix > >( parameters, matrix );
+   static void
+   runNonDistributed( Benchmark& benchmark,
+                      Benchmark::MetadataMap metadata,
+// FIXME: ParameterContainer should be copyable, but that leads to double-free
+//                      const Config::ParameterContainer& parameters,
+                      Config::ParameterContainer& parameters,
+                      const SharedPointer< MatrixType >& matrixPointer,
+                      const VectorType& x0,
+                      const VectorType& b )
+   {
+      // direct solvers
+      if( parameters.getParameter< bool >( "with-direct" ) ) {
+         using CSR = Matrices::CSR< RealType, DeviceType, IndexType >;
+         SharedPointer< CSR > matrixCopy;
+         Matrices::copySparseMatrix( *matrixCopy, *matrixPointer );
+
+#ifdef HAVE_UMFPACK
+         std::cout << "UMFPACK wrapper:" << std::endl;
+         using namespace Solvers::Linear;
+         using namespace Solvers::Linear::Preconditioners;
+         benchmarkSolver< UmfpackWrapper, Preconditioner >( benchmark, parameters, matrixCopy, x0, b );
+#endif
+
+#ifdef HAVE_ARMADILLO
+         std::cout << "Armadillo wrapper (which wraps SuperLU):" << std::endl;
+         benchmarkArmadillo( parameters, matrixCopy, x0, b );
+#endif
+      }
 
-   if( solver == "tfqmr" )
-      return benchmarkSolver< Solvers::Linear::TFQMR< Matrix > >( parameters, matrix );
+      std::cout << "Iterative solvers:" << std::endl;
+      benchmarkIterativeSolvers( benchmark, parameters, matrixPointer, x0, b );
 
-   std::cerr << "Unknown solver " << solver << "." << std::endl;
-   return false;
-}
+#ifdef HAVE_CUSOLVER
+      std::cout << "CuSOLVER:" << std::endl;
+      {
+         using CSR = Matrices::CSR< RealType, DeviceType, IndexType >;
+         SharedPointer< CSR > matrixCopy;
+         Matrices::copySparseMatrix( *matrixCopy, *matrixPointer );
+
+         SharedPointer< typename CSR::CudaType > cuda_matrixCopy;
+         *cuda_matrixCopy = *matrixCopy;
+         typename VectorType::CudaType cuda_x0, cuda_b;
+         cuda_x0.setLike( x0 );
+         cuda_b.setLike( b );
+         cuda_x0 = x0;
+         cuda_b = b;
+
+         using namespace Solvers::Linear;
+         using namespace Solvers::Linear::Preconditioners;
+         benchmarkSolver< CuSolverWrapper, Preconditioner >( benchmark, parameters, cuda_matrixCopy, cuda_x0, cuda_b );
+      }
+#endif
+   }
+};
 
-template< typename Real,
-          typename Device >
-bool resolveMatrixFormat( const Config::ParameterContainer& parameters )
+void
+configSetup( Config::ConfigDescription& config )
 {
-   const String& matrixFormat = parameters.getParameter< String >( "matrix-format" );
-
-   if( matrixFormat == "dense" )
-      return resolveLinearSolver< Dense< Real, Device, int > >( parameters );
-
-   if( matrixFormat == "tridiagonal" )
-      return resolveLinearSolver< Tridiagonal< Real, Device, int > >( parameters );
-
-   if( matrixFormat == "multidiagonal" )
-      return resolveLinearSolver< Multidiagonal< Real, Device, int > >( parameters );
-
-   if( matrixFormat == "ellpack" )
-      return resolveLinearSolver< Ellpack< Real, Device, int > >( parameters );
-
-   if( matrixFormat == "sliced-ellpack" )
-      return resolveLinearSolver< SlicedEllpack< Real, Device, int > >( parameters );
-
-   if( matrixFormat == "chunked-ellpack" )
-      return resolveLinearSolver< ChunkedEllpack< Real, Device, int > >( parameters );
-
-   if( matrixFormat == "csr" )
-      return resolveLinearSolver< CSR< Real, Device, int > >( parameters );
-
-   std::cerr << "Unknown matrix format " << matrixFormat << "." << std::endl;
-   return false;
+   config.addDelimiter( "Benchmark settings:" );
+   config.addEntry< String >( "log-file", "Log file name.", "tnl-benchmark-linear-solvers.log");
+   config.addEntry< String >( "output-mode", "Mode for opening the log file.", "overwrite" );
+   config.addEntryEnum( "append" );
+   config.addEntryEnum( "overwrite" );
+   config.addEntry< int >( "loops", "Number of repetitions of the benchmark.", 10 );
+   config.addRequiredEntry< String >( "input-matrix", "File name of the input matrix (in binary TNL format)." );
+   config.addRequiredEntry< String >( "input-dof", "File name of the input DOF vector (in binary TNL format)." );
+   config.addRequiredEntry< String >( "input-rhs", "File name of the input right-hand-side vector (in binary TNL format)." );
+   config.addEntry< String >( "name", "Name of the matrix in the benchmark.", "" );
+   config.addEntry< int >( "verbose", "Verbose mode.", 1 );
+   config.addEntry< bool >( "reorder-dofs", "Reorder matrix entries corresponding to the same DOF together.", false );
+   config.addEntry< bool >( "with-direct", "Includes the 3rd party direct solvers in the benchmark.", false );
+
+   config.addDelimiter( "Device settings:" );
+   Devices::Host::configSetup( config );
+   Devices::Cuda::configSetup( config );
+   CommunicatorType::configSetup( config );
+
+   config.addDelimiter( "Linear solver settings:" );
+   Solvers::IterativeSolver< double, int >::configSetup( config );
+   using Matrix = Matrices::SlicedEllpack< double, Devices::Host, int >;
+   using GMRES = Solvers::Linear::GMRES< Matrix >;
+   GMRES::configSetup( config );
+   using BiCGstabL = Solvers::Linear::BICGStabL< Matrix >;
+   BiCGstabL::configSetup( config );
+   using ILUT = Solvers::Linear::Preconditioners::ILUT< Matrix >;
+   ILUT::configSetup( config );
 }
 
-template< typename Real >
-bool resolveDevice( const Config::ParameterContainer& parameters )
+int
+main( int argc, char* argv[] )
 {
-   const String& device = parameters.getParameter< String >( "device" );
-
-   if( device == "host" )
-      return resolveMatrixFormat< Real, Devices::Host >( parameters );
+#ifndef NDEBUG
+   Debugging::trackFloatingPointExceptions();
+#endif
 
-   if( device == "cuda" )
-      return resolveMatrixFormat< Real, Devices::Cuda >( parameters );
-
-   std::cerr << "Uknown device " << device << "." << std::endl;
-   return false;
-}
-
-int main( int argc, char* argv[] )
-{
    Config::ParameterContainer parameters;
    Config::ConfigDescription conf_desc;
 
    configSetup( conf_desc );
- 
-   if( ! parseCommandLine( argc, argv, conf_desc, parameters ) )
-   {
+
+   Communicators::ScopedInitializer< CommunicatorType > scopedInit(argc, argv);
+   const int rank = CommunicatorType::GetRank( CommunicatorType::AllGroup );
+
+   if( ! parseCommandLine( argc, argv, conf_desc, parameters ) ) {
       conf_desc.printUsage( argv[ 0 ] );
-      return 1;
+      return EXIT_FAILURE;
    }
-   const String& precision = parameters.getParameter< String >( "precision" );
-   if( precision == "float" )
-      if( ! resolveDevice< float >( parameters ) )
+   if( ! Devices::Host::setup( parameters ) ||
+       ! Devices::Cuda::setup( parameters ) ||
+       ! CommunicatorType::setup( parameters ) )
+      return EXIT_FAILURE;
+
+   const String & logFileName = parameters.getParameter< String >( "log-file" );
+   const String & outputMode = parameters.getParameter< String >( "output-mode" );
+   const unsigned loops = parameters.getParameter< unsigned >( "loops" );
+   const unsigned verbose = (rank == 0) ? parameters.getParameter< unsigned >( "verbose" ) : 0;
+
+   // open log file
+   auto mode = std::ios::out;
+   if( outputMode == "append" )
+       mode |= std::ios::app;
+   std::ofstream logFile;
+   if( rank == 0 )
+      logFile.open( logFileName.getString(), mode );
+
+   // init benchmark and common metadata
+   Benchmark benchmark( loops, verbose );
+
+   // prepare global metadata
+   Benchmark::MetadataMap metadata = getHardwareMetadata();
+
+   // TODO: implement resolveMatrixType
+//   return ! Matrices::resolveMatrixType< MainConfig,
+//                                         Devices::Host,
+//                                         LinearSolversBenchmark >( benchmark, metadata, parameters );
+   using MatrixType = Matrices::SlicedEllpack< double, Devices::Host, int >;
+   const bool status = LinearSolversBenchmark< MatrixType >::run( benchmark, metadata, parameters );
+
+   if( rank == 0 )
+      if( ! benchmark.save( logFile ) ) {
+         std::cerr << "Failed to write the benchmark results to file '" << parameters.getParameter< String >( "log-file" ) << "'." << std::endl;
          return EXIT_FAILURE;
-   if( precision == "double" )
-      if( ! resolveDevice< double >( parameters ) )
-         return EXIT_FAILURE;
-   return EXIT_SUCCESS;
-}
-
+      }
 
-#endif /* TNL_BENCHMARK_LINEAR_SOLVERS_H_ */
+   return ! status;
+}