diff --git a/CMakeLists.txt b/CMakeLists.txt
index 0ef868cc7622d04150a5358e5b6506e98b03b11e..68252ba6a74c45adc73dbd4e18b0afb5c81e5a67 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -25,11 +25,11 @@ option(WITH_GMP "Build with GMP support" OFF)
 option(WITH_TESTS "Build tests" ON)
 option(WITH_PROFILING "Enable code profiling compiler flags" OFF )
 option(WITH_COVERAGE "Enable code coverage reports from unit tests" OFF)
-option(WITH_EXAMPLES "Compile the 'examples' directory" ON)
+option(WITH_EXAMPLES "Compile the 'src/Examples' directory" ON)
 option(WITH_TOOLS "Compile the 'src/Tools' directory" ON)
 option(WITH_BENCHMARKS "Compile the 'src/Benchmarks' directory" ON)
 option(WITH_PYTHON "Compile the Python bindings" ON)
-option(WITH_DOC "Generate documentation" ON)
+option(WITH_DOC "Build examples included in the documentation" ON)
 
 # install paths relative to the cmake's prefix
 set( TNL_TARGET_INCLUDE_DIRECTORY "include/TNL" )
@@ -81,10 +81,10 @@ set( CMAKE_CXX_STANDARD 14 )
 set( CMAKE_CXX_STANDARD_REQUIRED ON )
 set( CMAKE_CXX_EXTENSIONS OFF )
 
-# set Debug/Release options
+# set default build options
 set( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -pthread -Wall -Wno-unused-local-typedefs -Wno-unused-variable -Wno-unknown-pragmas" )
 set( CMAKE_CXX_FLAGS_DEBUG "-g" )
-set( CMAKE_CXX_FLAGS_RELEASE "-O3 -march=native -mtune=native -DNDEBUG" )
+set( CMAKE_CXX_FLAGS_RELEASE "-O3 -DNDEBUG" )
 # pass -rdynamic only in Debug mode
 set( CMAKE_SHARED_LIBRARY_LINK_C_FLAGS "" )
 set( CMAKE_SHARED_LIBRARY_LINK_C_FLAGS_DEBUG "-rdynamic" )
@@ -96,6 +96,13 @@ set( CMAKE_SHARED_LINKER_FLAGS "" )
 set( CMAKE_SHARED_LINKER_FLAGS_DEBUG "-rdynamic" )
 set( CMAKE_SHARED_LINKER_FLAGS_RELEASE "" )
 
+# set additional Debug/Release options using generator expressions
+# (that way we can exclude some options for specific targets, see https://stackoverflow.com/a/59734798 for details)
+add_compile_options(
+   # GOTCHA: CMake uses semicolons as list item separator, spaces would lead to a single argument inside double-quotes on the command line
+   "$<$<CONFIG:RELEASE>:-march=native;-mtune=native>"
+)
+
 # disable GCC's infamous "maybe-uninitialized" warning (it produces mostly false positives)
 if( CMAKE_CXX_COMPILER_ID STREQUAL "GNU" )
    set( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-maybe-uninitialized" )
@@ -300,7 +307,7 @@ add_subdirectory( src )
 add_subdirectory( share )
 
 # Add subdirectories for examples included in the documentation
-if( ${WITH_DOC} OR ${WITH_EXAMPLES} )
+if( ${WITH_DOC} )
    set( TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH "${CMAKE_SOURCE_DIR}/Documentation/output_snippets" )
    file(MAKE_DIRECTORY ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH})
    add_subdirectory( Documentation/Examples )
diff --git a/build b/build
index 68966f0f423d13fd2cfb69a380fe02ea9bc25e8f..76169a7c4b20cdc8c6750c23abd7aa6fb675c5e6 100755
--- a/build
+++ b/build
@@ -87,10 +87,11 @@ if [[ ${HELP} == "yes" ]]; then
     echo "   --with-profiling=yes/no               Enables code profiling compiler falgs. 'no' by default."
     echo "   --with-coverage=yes/no                Enables code coverage reports for unit tests. 'no' by default (lcov is required)."
     echo "   --with-doc=yes/no                     Build documentation. 'yes' by default."
-    echo "   --with-examples=yes/no                Compile the 'examples' directory. 'yes' by default."
+    echo "   --with-examples=yes/no                Compile the 'src/Examples' directory. 'yes' by default."
     echo "   --with-tools=yes/no                   Compile the 'src/Tools' directory. 'yes' by default."
     echo "   --with-python=yes/no                  Compile the Python bindings. 'yes' by default."
     echo "   --with-benchmarks=yes/no              Compile the 'src/Benchmarks' directory. 'yes' by default."
+    echo "   --with-doc=yes/no                     Generate the documentation. 'yes' by default."
     echo "   --cmake=CMAKE                         Path to cmake. 'cmake' by default."
     echo "   --verbose                             It enables verbose build."
     echo "   --root-dir=PATH                       Path to the TNL source code root dir."
diff --git a/src/Benchmarks/BLAS/CMakeLists.txt b/src/Benchmarks/BLAS/CMakeLists.txt
index 3f040abc893464efb196d5a40ef1e1f3edbcbbab..81d83753329658c960b47a08d3f1ac58881bd9ff 100644
--- a/src/Benchmarks/BLAS/CMakeLists.txt
+++ b/src/Benchmarks/BLAS/CMakeLists.txt
@@ -5,16 +5,28 @@ else()
     add_executable( tnl-benchmark-blas tnl-benchmark-blas.cpp )
 endif()
 
-find_library( CBLAS_LIBRARY NAMES cblas
-           PATHS /usr/lib
-                 /usr/lib64
-                 /usr/lib/x86_64-linux-gnu
-                 /usr/local/lib
-                 /usr/local/lib64 )
+find_library( CBLAS_LIBRARY NAMES cblas )
+
+# fallback for Centos 7.5 - libcblas.so does not exist, link to libtatlas.so or libsatlas.so
+# https://forums.centos.org/viewtopic.php?t=48543
+find_library( TATLAS_LIBRARY NAMES tatlas
+              PATH_SUFFIXES atlas )
+find_library( SATLAS_LIBRARY NAMES satlas
+              PATH_SUFFIXES atlas )
+
 if( CBLAS_LIBRARY )
    target_compile_definitions( tnl-benchmark-blas PUBLIC "-DHAVE_BLAS" )
    target_link_libraries( tnl-benchmark-blas ${CBLAS_LIBRARY} )
+elseif( TATLAS_LIBRARY )
+   target_compile_definitions( tnl-benchmark-blas PUBLIC "-DHAVE_BLAS" )
+   target_link_libraries( tnl-benchmark-blas ${TATLAS_LIBRARY} )
+elseif( SATLAS_LIBRARY )
+   target_compile_definitions( tnl-benchmark-blas PUBLIC "-DHAVE_BLAS" )
+   target_link_libraries( tnl-benchmark-blas ${SATLAS_LIBRARY} )
 else()
+   # FIXME: We require the CBLAS interface, but CMake's FindBLAS cannot detect that,
+   #        so this fails unless the BLAS implementation includes it in the same
+   #        shared library file as the Fortran implementation (e.g. OpenBLAS does that).
    find_package( BLAS )
    if( BLAS_FOUND )
       target_compile_definitions( tnl-benchmark-blas PUBLIC "-DHAVE_BLAS" )
diff --git a/src/Benchmarks/BLAS/blasWrappers.h b/src/Benchmarks/BLAS/blasWrappers.h
index d1e0edff119c733126db27cdcc913cd189d4468e..ce30060bf399fa80c8119dc185edb84a5e3010ca 100644
--- a/src/Benchmarks/BLAS/blasWrappers.h
+++ b/src/Benchmarks/BLAS/blasWrappers.h
@@ -2,7 +2,13 @@
 
 #ifdef HAVE_BLAS
 
+// HOTFIX: cblas.h from the atlas-devel package (version 3.10.1-12.el7) on CentOS 7
+// does not declare the functions as `extern "C"`, which breaks name mangling.
+// Note that nested `extern "C"` is valid and correct:
+// https://stackoverflow.com/questions/48099828/what-happens-if-you-nest-extern-c
+extern "C" {
 #include <cblas.h>
+}
 
 inline int blasIgamax( int n, const float *x, int incx )
 {
diff --git a/src/Benchmarks/LinearSolvers/benchmarks.h b/src/Benchmarks/LinearSolvers/benchmarks.h
index 577907c78ab13253c6c46c9db4d182b00d11957e..7b22cdfc1ef30aaddc29cc35067962ed4530f257 100644
--- a/src/Benchmarks/LinearSolvers/benchmarks.h
+++ b/src/Benchmarks/LinearSolvers/benchmarks.h
@@ -145,7 +145,7 @@ benchmarkSolver( Benchmark& benchmark,
 
       virtual HeaderElements getTableHeader() const override
       {
-         return HeaderElements({"time", "speedup", "converged", "iterations", "residue_precond", "residue_true"});
+         return HeaderElements({"time", "stddev", "stddev/time", "speedup", "converged", "iterations", "residue_precond", "residue_true"});
       }
 
       virtual RowElements getRowElements() const override
@@ -160,7 +160,7 @@ benchmarkSolver( Benchmark& benchmark,
          r = b - r;
          const double residue_true = lpNorm( r, 2.0 ) / lpNorm( b, 2.0 );
 
-         return RowElements({ time, speedup, (double) converged, (double) iterations,
+         return RowElements({ time, stddev, stddev/time, speedup, (double) converged, (double) iterations,
                               residue_precond, residue_true });
       }
    };
diff --git a/src/Benchmarks/LinearSolvers/tnl-benchmark-linear-solvers.h b/src/Benchmarks/LinearSolvers/tnl-benchmark-linear-solvers.h
index 0701b647a42416e6439cd02a1ef10157512210f3..4aabf39cd4bae98bc411fcc95feef56672b039ca 100644
--- a/src/Benchmarks/LinearSolvers/tnl-benchmark-linear-solvers.h
+++ b/src/Benchmarks/LinearSolvers/tnl-benchmark-linear-solvers.h
@@ -15,6 +15,7 @@
 #include <set>
 #include <sstream>
 #include <string>
+#include <random>
 
 #ifndef NDEBUG
 #include <TNL/Debugging/FPE.h>
@@ -30,6 +31,7 @@
 #include <TNL/Containers/Partitioner.h>
 #include <TNL/Containers/DistributedVector.h>
 #include <TNL/Matrices/DistributedMatrix.h>
+#include <TNL/Matrices/MatrixReader.h>
 #include <TNL/Solvers/Linear/Preconditioners/Diagonal.h>
 #include <TNL/Solvers/Linear/Preconditioners/ILU0.h>
 #include <TNL/Solvers/Linear/Preconditioners/ILUT.h>
@@ -68,12 +70,19 @@ using CommunicatorType = Communicators::NoDistrCommunicator;
 
 static const std::set< std::string > valid_solvers = {
    "gmres",
-   "cwygmres",
    "tfqmr",
    "bicgstab",
    "bicgstab-ell",
 };
 
+static const std::set< std::string > valid_gmres_variants = {
+   "CGS",
+   "CGSR",
+   "MGS",
+   "MGSR",
+   "CWY",
+};
+
 static const std::set< std::string > valid_preconditioners = {
    "jacobi",
    "ilu0",
@@ -85,19 +94,19 @@ parse_comma_list( const Config::ParameterContainer& parameters,
                   const char* parameter,
                   const std::set< std::string >& options )
 {
-   const String solvers = parameters.getParameter< String >( parameter );
+   const String param = parameters.getParameter< String >( parameter );
 
-   if( solvers == "all" )
+   if( param == "all" )
       return options;
 
-   std::stringstream ss( solvers.getString() );
+   std::stringstream ss( param.getString() );
    std::string s;
    std::set< std::string > set;
 
    while( std::getline( ss, s, ',' ) ) {
       if( ! options.count( s ) )
          throw std::logic_error( std::string("Invalid value in the comma-separated list for the parameter '")
-                                 + parameter + "': '" + s + "'. The list contains: '" + solvers.getString() + "'." );
+                                 + parameter + "': '" + s + "'. The list contains: '" + param.getString() + "'." );
 
       set.insert( s );
 
@@ -108,6 +117,42 @@ parse_comma_list( const Config::ParameterContainer& parameters,
    return set;
 }
 
+// TODO: implement this in TNL::String
+bool ends_with( const std::string& value, const std::string& ending )
+{
+   if (ending.size() > value.size())
+      return false;
+   return std::equal(ending.rbegin(), ending.rend(), value.rbegin());
+}
+
+// initialize all vector entries with a unioformly distributed random value from the interval [a, b]
+template< typename Vector >
+void set_random_vector( Vector& v, typename Vector::RealType a, typename Vector::RealType b )
+{
+   using RealType = typename Vector::RealType;
+   using IndexType = typename Vector::IndexType;
+   // random device will be used to obtain a seed for the random number engine
+   std::random_device rd;
+   // initialize the standard mersenne_twister_engine with rd() as the seed
+   std::mt19937 gen(rd());
+   // create uniform distribution
+   std::uniform_real_distribution< RealType > dis(a, b);
+
+   // create host vector
+   typename Vector::template Self< RealType, Devices::Host > host_v;
+   host_v.setSize( v.getSize() );
+
+   // initialize the host vector
+   auto kernel = [&] ( IndexType i )
+   {
+      host_v[ i ] = dis(gen);
+   };
+   Algorithms::ParallelFor< Devices::Host >::exec( (IndexType) 0, host_v.getSize(), kernel );
+
+   // copy the data to the device vector
+   v = host_v;
+}
+
 template< typename Matrix, typename Vector >
 void
 benchmarkIterativeSolvers( Benchmark& benchmark,
@@ -138,6 +183,7 @@ benchmarkIterativeSolvers( Benchmark& benchmark,
 
    const int ell_max = 2;
    const std::set< std::string > solvers = parse_comma_list( parameters, "solvers", valid_solvers );
+   const std::set< std::string > gmresVariants = parse_comma_list( parameters, "gmres-variants", valid_gmres_variants );
    const std::set< std::string > preconditioners = parse_comma_list( parameters, "preconditioners", valid_preconditioners );
    const bool with_preconditioner_update = parameters.getParameter< bool >( "with-preconditioner-update" );
 
@@ -151,21 +197,14 @@ benchmarkIterativeSolvers( Benchmark& benchmark,
       }
 
       if( solvers.count( "gmres" ) ) {
-         benchmark.setOperation("GMRES (Jacobi)");
-         parameters.template setParameter< String >( "gmres-variant", "MGSR" );
-         benchmarkSolver< GMRES, Diagonal >( benchmark, parameters, matrixPointer, x0, b );
-         #ifdef HAVE_CUDA
-         benchmarkSolver< GMRES, Diagonal >( benchmark, parameters, cudaMatrixPointer, cuda_x0, cuda_b );
-         #endif
-      }
-
-      if( solvers.count( "cwygmres" ) ) {
-         benchmark.setOperation("CWYGMRES (Jacobi)");
-         parameters.template setParameter< String >( "gmres-variant", "CWY" );
-         benchmarkSolver< GMRES, Diagonal >( benchmark, parameters, matrixPointer, x0, b );
-         #ifdef HAVE_CUDA
-         benchmarkSolver< GMRES, Diagonal >( benchmark, parameters, cudaMatrixPointer, cuda_x0, cuda_b );
-         #endif
+         for( auto variant : gmresVariants ) {
+            benchmark.setOperation(variant + "-GMRES (Jacobi)");
+            parameters.template setParameter< String >( "gmres-variant", variant.c_str() );
+            benchmarkSolver< GMRES, Diagonal >( benchmark, parameters, matrixPointer, x0, b );
+            #ifdef HAVE_CUDA
+            benchmarkSolver< GMRES, Diagonal >( benchmark, parameters, cudaMatrixPointer, cuda_x0, cuda_b );
+            #endif
+         }
       }
 
       if( solvers.count( "tfqmr" ) ) {
@@ -208,21 +247,14 @@ benchmarkIterativeSolvers( Benchmark& benchmark,
       }
 
       if( solvers.count( "gmres" ) ) {
-         benchmark.setOperation("GMRES (ILU0)");
-         parameters.template setParameter< String >( "gmres-variant", "MGSR" );
-         benchmarkSolver< GMRES, ILU0 >( benchmark, parameters, matrixPointer, x0, b );
-         #ifdef HAVE_CUDA
-         benchmarkSolver< GMRES, ILU0 >( benchmark, parameters, cudaMatrixPointer, cuda_x0, cuda_b );
-         #endif
-      }
-
-      if( solvers.count( "cwygmres" ) ) {
-         benchmark.setOperation("CWYGMRES (ILU0)");
-         parameters.template setParameter< String >( "gmres-variant", "CWY" );
-         benchmarkSolver< GMRES, ILU0 >( benchmark, parameters, matrixPointer, x0, b );
-         #ifdef HAVE_CUDA
-         benchmarkSolver< GMRES, ILU0 >( benchmark, parameters, cudaMatrixPointer, cuda_x0, cuda_b );
-         #endif
+         for( auto variant : gmresVariants ) {
+            benchmark.setOperation(variant + "-GMRES (ILU0)");
+            parameters.template setParameter< String >( "gmres-variant", variant.c_str() );
+            benchmarkSolver< GMRES, ILU0 >( benchmark, parameters, matrixPointer, x0, b );
+            #ifdef HAVE_CUDA
+            benchmarkSolver< GMRES, ILU0 >( benchmark, parameters, cudaMatrixPointer, cuda_x0, cuda_b );
+            #endif
+         }
       }
 
       if( solvers.count( "tfqmr" ) ) {
@@ -264,21 +296,14 @@ benchmarkIterativeSolvers( Benchmark& benchmark,
       }
 
       if( solvers.count( "gmres" ) ) {
-         benchmark.setOperation("GMRES (ILUT)");
-         parameters.template setParameter< String >( "gmres-variant", "MGSR" );
-         benchmarkSolver< GMRES, ILUT >( benchmark, parameters, matrixPointer, x0, b );
-         #ifdef HAVE_CUDA
-         benchmarkSolver< GMRES, ILUT >( benchmark, parameters, cudaMatrixPointer, cuda_x0, cuda_b );
-         #endif
-      }
-
-      if( solvers.count( "cwygmres" ) ) {
-         benchmark.setOperation("CWYGMRES (ILUT)");
-         parameters.template setParameter< String >( "gmres-variant", "CWY" );
-         benchmarkSolver< GMRES, ILUT >( benchmark, parameters, matrixPointer, x0, b );
-         #ifdef HAVE_CUDA
-         benchmarkSolver< GMRES, ILUT >( benchmark, parameters, cudaMatrixPointer, cuda_x0, cuda_b );
-         #endif
+         for( auto variant : gmresVariants ) {
+            benchmark.setOperation(variant + "-GMRES (ILUT)");
+            parameters.template setParameter< String >( "gmres-variant", variant.c_str() );
+            benchmarkSolver< GMRES, ILUT >( benchmark, parameters, matrixPointer, x0, b );
+            #ifdef HAVE_CUDA
+            benchmarkSolver< GMRES, ILUT >( benchmark, parameters, cudaMatrixPointer, cuda_x0, cuda_b );
+            #endif
+         }
       }
 
       if( solvers.count( "tfqmr" ) ) {
@@ -330,11 +355,42 @@ struct LinearSolversBenchmark
 //        const Config::ParameterContainer& parameters )
         Config::ParameterContainer& parameters )
    {
+      const String file_matrix = parameters.getParameter< String >( "input-matrix" );
+      const String file_dof = parameters.getParameter< String >( "input-dof" );
+      const String file_rhs = parameters.getParameter< String >( "input-rhs" );
+
       SharedPointer< MatrixType > matrixPointer;
       VectorType x0, b;
-      matrixPointer->load( parameters.getParameter< String >( "input-matrix" ) );
-      File( parameters.getParameter< String >( "input-dof" ), std::ios_base::in ) >> x0;
-      File( parameters.getParameter< String >( "input-rhs" ), std::ios_base::in ) >> b;
+
+      // load the matrix
+      if( ends_with( file_matrix, ".mtx" ) ) {
+         Matrices::MatrixReader< MatrixType > reader;
+         if( ! reader.readMtxFile( file_matrix, *matrixPointer ) )
+            return false;
+      }
+      else {
+         matrixPointer->load( file_matrix );
+      }
+
+      // load the vectors
+      if( file_dof && file_rhs ) {
+         File( file_dof, std::ios_base::in ) >> x0;
+         File( file_rhs, std::ios_base::in ) >> b;
+      }
+      else {
+         // set x0 := 0
+         x0.setSize( matrixPointer->getColumns() );
+         x0 = 0;
+
+         // generate random vector x
+         VectorType x;
+         x.setSize( matrixPointer->getColumns() );
+         set_random_vector( x, 1e2, 1e3 );
+
+         // set b := A*x
+         b.setSize( matrixPointer->getRows() );
+         matrixPointer->vectorProduct( x, b );
+      }
 
       typename MatrixType::CompressedRowLengthsVector rowLengths;
       matrixPointer->getCompressedRowLengths( rowLengths );
@@ -488,14 +544,15 @@ configSetup( Config::ConfigDescription& config )
    config.addEntryEnum( "append" );
    config.addEntryEnum( "overwrite" );
    config.addEntry< int >( "loops", "Number of repetitions of the benchmark.", 10 );
-   config.addRequiredEntry< String >( "input-matrix", "File name of the input matrix (in binary TNL format)." );
-   config.addRequiredEntry< String >( "input-dof", "File name of the input DOF vector (in binary TNL format)." );
-   config.addRequiredEntry< String >( "input-rhs", "File name of the input right-hand-side vector (in binary TNL format)." );
+   config.addRequiredEntry< String >( "input-matrix", "File name of the input matrix (in binary TNL format or textual MTX format)." );
+   config.addEntry< String >( "input-dof", "File name of the input DOF vector (in binary TNL format).", "" );
+   config.addEntry< String >( "input-rhs", "File name of the input right-hand-side vector (in binary TNL format).", "" );
    config.addEntry< String >( "name", "Name of the matrix in the benchmark.", "" );
    config.addEntry< int >( "verbose", "Verbose mode.", 1 );
    config.addEntry< bool >( "reorder-dofs", "Reorder matrix entries corresponding to the same DOF together.", false );
    config.addEntry< bool >( "with-direct", "Includes the 3rd party direct solvers in the benchmark.", false );
-   config.addEntry< String >( "solvers", "Comma-separated list of solvers to run benchmarks for. Options: gmres, cwygmres, tfqmr, bicgstab, bicgstab-ell.", "all" );
+   config.addEntry< String >( "solvers", "Comma-separated list of solvers to run benchmarks for. Options: gmres, tfqmr, bicgstab, bicgstab-ell.", "all" );
+   config.addEntry< String >( "gmres-variants", "Comma-separated list of GMRES variants to run benchmarks for. Options: CGS, CGSR, MGS, MGSR, CWY.", "all" );
    config.addEntry< String >( "preconditioners", "Comma-separated list of preconditioners to run benchmarks for. Options: jacobi, ilu0, ilut.", "all" );
    config.addEntry< bool >( "with-preconditioner-update", "Run benchmark for the preconditioner update.", true );
    config.addEntry< String >( "devices", "Run benchmarks on these devices.", "all" );
diff --git a/src/Python/pytnl/tnl/CMakeLists.txt b/src/Python/pytnl/tnl/CMakeLists.txt
index b6c4182b88d7f8c4d1f2ca9acb8478daec0bd45e..de405e5e50c0549e2cf846b06a82d1acb07d9414 100644
--- a/src/Python/pytnl/tnl/CMakeLists.txt
+++ b/src/Python/pytnl/tnl/CMakeLists.txt
@@ -16,6 +16,16 @@ pybind11_add_module( pytnl ${sources} )
 # rename the shared library to tnl.cpython-XXm-x86_64-linux-gnu.so
 set_target_properties( pytnl PROPERTIES LIBRARY_OUTPUT_NAME tnl )
 
+# Skip -march=native -mtune=native for pytnl - optimizing python bindings for
+# a specific architecture is not very useful and prevents using Python tools on
+# hybrid clusters.
+get_target_property( pytnl_COMPILE_OPTIONS pytnl COMPILE_OPTIONS )
+if( pytnl_COMPILE_OPTIONS )
+   string( REPLACE "-march=native" "" pytnl_COMPILE_OPTIONS "${pytnl_COMPILE_OPTIONS}" )
+   string( REPLACE "-mtune=native" "" pytnl_COMPILE_OPTIONS "${pytnl_COMPILE_OPTIONS}" )
+   set_target_properties( pytnl PROPERTIES COMPILE_OPTIONS "${pytnl_COMPILE_OPTIONS}" )
+endif()
+
 # We have bindings for unsafe objects (e.g. Array::operator[]) where assertion
 # is the only safeguard, so we need to translate the TNL::AssertionError to
 # Python's AssertionError.
diff --git a/src/TNL/Matrices/MatrixReader_impl.h b/src/TNL/Matrices/MatrixReader_impl.h
index dd6ddc07245150c859946e97b2fde3a66021aaa3..0415d5f8d1d60af87f82a6efff5f2a5ea4d18ab7 100644
--- a/src/TNL/Matrices/MatrixReader_impl.h
+++ b/src/TNL/Matrices/MatrixReader_impl.h
@@ -165,7 +165,7 @@ template< typename Matrix >
 bool MatrixReader< Matrix >::checkMtxHeader( const String& header,
                                                 bool& symmetric )
 {
-   std::vector< String > parsedLine = header.split();
+   std::vector< String > parsedLine = header.split( ' ', String::SplitSkip::SkipEmpty );
    if( (int) parsedLine.size() < 5 )
       return false;
    if( parsedLine[ 0 ] != "%%MatrixMarket" )
@@ -231,7 +231,7 @@ bool MatrixReader< Matrix >::readMtxHeader( std::istream& file,
          return false;
       }
 
-      parsedLine = line.split();
+      parsedLine = line.split( ' ', String::SplitSkip::SkipEmpty );
       if( (int) parsedLine.size() != 3 )
       {
          std::cerr << "Wrong number of parameters in the matrix header." << std::endl;
@@ -389,7 +389,7 @@ bool MatrixReader< Matrix >::parseMtxLineWithElement( const String& line,
                                                          IndexType& column,
                                                          RealType& value )
 {
-   std::vector< String > parsedLine = line.split();
+   std::vector< String > parsedLine = line.split( ' ', String::SplitSkip::SkipEmpty );
    if( (int) parsedLine.size() != 3 )
    {
       std::cerr << "Wrong number of parameters in the matrix row at line:" << line << std::endl;
diff --git a/src/TNL/Solvers/IterativeSolver.h b/src/TNL/Solvers/IterativeSolver.h
index 6550e5e497c461d26879ca6eeb3e9e989e04ab07..75487fa0197a51b79897384de3b12ecec12c667b 100644
--- a/src/TNL/Solvers/IterativeSolver.h
+++ b/src/TNL/Solvers/IterativeSolver.h
@@ -2,7 +2,7 @@
                           IterativeSolver.h  -  description
                              -------------------
     begin                : Oct 19, 2012
-    copyright            : (C) 2012 by Tomas Oberhuber
+    copyright            : (C) 2012 by Tomas Oberhuber et al.
     email                : tomas.oberhuber@fjfi.cvut.cz
  ***************************************************************************/
 
@@ -10,6 +10,8 @@
 
 #pragma once
 
+#include <limits>
+
 #include <TNL/Config/ConfigDescription.h>
 #include <TNL/Config/ParameterContainer.h>
 #include <TNL/Solvers/IterativeSolverMonitor.h>
@@ -22,17 +24,16 @@ template< typename Real,
           typename SolverMonitor = IterativeSolverMonitor< Real, Index > >
 class IterativeSolver
 {
-   public:
+public:
+   using SolverMonitorType = SolverMonitor;
 
-      using SolverMonitorType = SolverMonitor;
-      
-   IterativeSolver();
+   IterativeSolver() = default;
 
    static void configSetup( Config::ConfigDescription& config,
                             const String& prefix = "" );
 
    bool setup( const Config::ParameterContainer& parameters,
-              const String& prefix = "" );
+               const String& prefix = "" );
 
    void setMaxIterations( const Index& maxIterations );
 
@@ -70,27 +71,27 @@ class IterativeSolver
 
    void refreshSolverMonitor( bool force = false );
 
+protected:
+   Index maxIterations = 1000000000;
 
-   protected:
+   Index minIterations = 0;
 
-   Index maxIterations;
+   Index currentIteration = 0;
 
-   Index minIterations;
+   Real convergenceResidue = 1e-6;
 
-   Index currentIteration;
+   // If the current residue is greater than divergenceResidue, the solver is stopped.
+   Real divergenceResidue = std::numeric_limits< float >::max();
 
-   Real convergenceResidue;
+   Real currentResidue = 0;
 
-   /****
-    * If the current residue is over divergenceResidue the solver is stopped.
-    */
-   Real divergenceResidue;
+   SolverMonitor* solverMonitor = nullptr;
 
-   Real currentResidue;
+   Index refreshRate = 1;
 
-   SolverMonitor* solverMonitor;
+   String residualHistoryFileName = "";
 
-   Index refreshRate;
+   std::ofstream residualHistoryFile;
 };
 
 } // namespace Solvers
diff --git a/src/TNL/Solvers/IterativeSolver_impl.h b/src/TNL/Solvers/IterativeSolver_impl.h
index 528a08ce6febb0a55a0c8b127d9aa90a925f2c18..8f28e8847f52ede82e68d6117312638c499fac5b 100644
--- a/src/TNL/Solvers/IterativeSolver_impl.h
+++ b/src/TNL/Solvers/IterativeSolver_impl.h
@@ -2,7 +2,7 @@
                           IterativeSolver_impl.h  -  description
                              -------------------
     begin                : Oct 19, 2012
-    copyright            : (C) 2012 by Tomas Oberhuber
+    copyright            : (C) 2012 by Tomas Oberhuber et al.
     email                : tomas.oberhuber@fjfi.cvut.cz
  ***************************************************************************/
 
@@ -11,34 +11,21 @@
 #pragma once
 
 #include <cmath>
-#include <float.h>
-#include <limits>
 
 #include "IterativeSolver.h"
 
 namespace TNL {
-namespace Solvers {   
+namespace Solvers {
 
 template< typename Real, typename Index, typename SolverMonitor >
-IterativeSolver< Real, Index, SolverMonitor >::IterativeSolver()
-: maxIterations( 1000000000 ),
-  minIterations( 0 ),
-  currentIteration( 0 ),
-  convergenceResidue( 1.0e-6 ),
-  divergenceResidue( DBL_MAX ),
-  currentResidue( 0 ),
-  solverMonitor( 0 ),
-  refreshRate( 1 )
-{
-};
-
-template< typename Real, typename Index, typename SolverMonitor >
-void IterativeSolver< Real, Index, SolverMonitor >::configSetup( Config::ConfigDescription& config,
-                                                      const String& prefix )
+void
+IterativeSolver< Real, Index, SolverMonitor >::
+configSetup( Config::ConfigDescription& config,
+             const String& prefix )
 {
    config.addEntry< int >   ( prefix + "max-iterations", "Maximal number of iterations the solver may perform.", 1000000000 );
    config.addEntry< int >   ( prefix + "min-iterations", "Minimal number of iterations the solver must perform.", 0 );
-   
+
    // The default value for the convergence residue MUST be zero since not in all problems we want to stop the solver
    // when we reach a state near a steady state. This can be only temporary if, for example, when the boundary conditions
    // are time dependent (growing velocity at inlet starting from 0).
@@ -46,56 +33,74 @@ void IterativeSolver< Real, Index, SolverMonitor >::configSetup( Config::ConfigD
    config.addEntry< double >( prefix + "divergence-residue", "Divergence occurs when the residue exceeds given limit.", std::numeric_limits< float >::max() );
    // TODO: setting refresh rate should be done in SolverStarter::setup (it's not a parameter of the IterativeSolver)
    config.addEntry< int >   ( prefix + "refresh-rate", "Number of iterations between solver monitor refreshes.", 1 );
+
+   config.addEntry< String >( prefix + "residual-history-file", "Path to the file where the residual history will be saved.", "" );
 }
 
 template< typename Real, typename Index, typename SolverMonitor >
-bool IterativeSolver< Real, Index, SolverMonitor >::setup( const Config::ParameterContainer& parameters,
-                                               const String& prefix )
+bool
+IterativeSolver< Real, Index, SolverMonitor >::
+setup( const Config::ParameterContainer& parameters,
+       const String& prefix )
 {
-   this->setMaxIterations( parameters.getParameter< int >( "max-iterations" ) );
-   this->setMinIterations( parameters.getParameter< int >( "min-iterations" ) );
-   this->setConvergenceResidue( parameters.getParameter< double >( "convergence-residue" ) );
-   this->setDivergenceResidue( parameters.getParameter< double >( "divergence-residue" ) );
+   this->setMaxIterations( parameters.getParameter< int >( prefix + "max-iterations" ) );
+   this->setMinIterations( parameters.getParameter< int >( prefix + "min-iterations" ) );
+   this->setConvergenceResidue( parameters.getParameter< double >( prefix + "convergence-residue" ) );
+   this->setDivergenceResidue( parameters.getParameter< double >( prefix + "divergence-residue" ) );
    // TODO: setting refresh rate should be done in SolverStarter::setup (it's not a parameter of the IterativeSolver)
-   this->setRefreshRate( parameters.getParameter< int >( "refresh-rate" ) );
+   this->setRefreshRate( parameters.getParameter< int >( prefix + "refresh-rate" ) );
+   this->residualHistoryFileName = parameters.getParameter< String >( prefix + "residual-history-file" );
+   if( this->residualHistoryFileName )
+      this->residualHistoryFile.open( this->residualHistoryFileName.getString() );
    return true;
 }
 
 template< typename Real, typename Index, typename SolverMonitor >
-void IterativeSolver< Real, Index, SolverMonitor >::setMaxIterations( const Index& maxIterations )
+void
+IterativeSolver< Real, Index, SolverMonitor >::
+setMaxIterations( const Index& maxIterations )
 {
    this->maxIterations = maxIterations;
 }
 
 template< typename Real, typename Index, typename SolverMonitor >
-const Index& IterativeSolver< Real, Index, SolverMonitor >::getMaxIterations() const
+const Index&
+IterativeSolver< Real, Index, SolverMonitor >::
+getMaxIterations() const
 {
    return this->maxIterations;
 }
 
 template< typename Real, typename Index, typename SolverMonitor >
-void IterativeSolver< Real, Index, SolverMonitor >::setMinIterations( const Index& minIterations )
+void
+IterativeSolver< Real, Index, SolverMonitor >::
+setMinIterations( const Index& minIterations )
 {
    this->minIterations = minIterations;
 }
 
 template< typename Real, typename Index, typename SolverMonitor >
-const Index& IterativeSolver< Real, Index, SolverMonitor >::getMinIterations() const
+const Index&
+IterativeSolver< Real, Index, SolverMonitor >::
+getMinIterations() const
 {
    return this->minIterations;
 }
 
 template< typename Real, typename Index, typename SolverMonitor >
-void IterativeSolver< Real, Index, SolverMonitor >::resetIterations()
+void
+IterativeSolver< Real, Index, SolverMonitor >::
+resetIterations()
 {
    this->currentIteration = 0;
    if( this->solverMonitor )
       this->solverMonitor->setIterations( 0 );
-   
 }
 
 template< typename Real, typename Index, typename SolverMonitor >
-bool IterativeSolver< Real, Index, SolverMonitor >::nextIteration()
+bool
+IterativeSolver< Real, Index, SolverMonitor >::
+nextIteration()
 {
    // this->checkNextIteration() must be called before the iteration counter is incremented
    bool result = this->checkNextIteration();
@@ -108,7 +113,9 @@ bool IterativeSolver< Real, Index, SolverMonitor >::nextIteration()
 }
 
 template< typename Real, typename Index, typename SolverMonitor >
-bool IterativeSolver< Real, Index, SolverMonitor >::checkNextIteration()
+bool
+IterativeSolver< Real, Index, SolverMonitor >::
+checkNextIteration()
 {
    this->refreshSolverMonitor();
 
@@ -116,7 +123,7 @@ bool IterativeSolver< Real, Index, SolverMonitor >::checkNextIteration()
        this->getIterations() > this->getMaxIterations()  ||
        ( this->getResidue() > this->getDivergenceResidue() && this->getIterations() >= this->getMinIterations() ) ||
        ( this->getResidue() < this->getConvergenceResidue() && this->getIterations() >= this->getMinIterations() ) )
-      return false;   
+      return false;
    return true;
 }
 
@@ -158,59 +165,81 @@ getIterations() const
 }
 
 template< typename Real, typename Index, typename SolverMonitor >
-void IterativeSolver< Real, Index, SolverMonitor >::setConvergenceResidue( const Real& convergenceResidue )
+void
+IterativeSolver< Real, Index, SolverMonitor >::
+setConvergenceResidue( const Real& convergenceResidue )
 {
    this->convergenceResidue = convergenceResidue;
 }
 
 template< typename Real, typename Index, typename SolverMonitor >
-const Real& IterativeSolver< Real, Index, SolverMonitor >::getConvergenceResidue() const
+const Real&
+IterativeSolver< Real, Index, SolverMonitor >::
+getConvergenceResidue() const
 {
    return this->convergenceResidue;
 }
 
 template< typename Real, typename Index, typename SolverMonitor >
-void IterativeSolver< Real, Index, SolverMonitor >::setDivergenceResidue( const Real& divergenceResidue )
+void
+IterativeSolver< Real, Index, SolverMonitor >::
+setDivergenceResidue( const Real& divergenceResidue )
 {
    this->divergenceResidue = divergenceResidue;
 }
 
 template< typename Real, typename Index, typename SolverMonitor >
-const Real& IterativeSolver< Real, Index, SolverMonitor >::getDivergenceResidue() const
+const Real&
+IterativeSolver< Real, Index, SolverMonitor >::
+getDivergenceResidue() const
 {
    return this->divergenceResidue;
 }
 
-
 template< typename Real, typename Index, typename SolverMonitor >
-void IterativeSolver< Real, Index, SolverMonitor >::setResidue( const Real& residue )
+void
+IterativeSolver< Real, Index, SolverMonitor >::
+setResidue( const Real& residue )
 {
    this->currentResidue = residue;
    if( this->solverMonitor )
       this->solverMonitor->setResidue( this->getResidue() );
+   if( this->residualHistoryFile ) {
+      if( this->getIterations() == 0 )
+         this->residualHistoryFile << "\n";
+      this->residualHistoryFile << this->getIterations() << "\t" << std::scientific << residue << std::endl;
+   }
 }
 
 template< typename Real, typename Index, typename SolverMonitor >
-const Real& IterativeSolver< Real, Index, SolverMonitor >::getResidue() const
+const Real&
+IterativeSolver< Real, Index, SolverMonitor >::
+getResidue() const
 {
    return this->currentResidue;
 }
 
 // TODO: setting refresh rate should be done in SolverStarter::setup (it's not a parameter of the IterativeSolver)
 template< typename Real, typename Index, typename SolverMonitor >
-void IterativeSolver< Real, Index, SolverMonitor >::setRefreshRate( const Index& refreshRate )
+void
+IterativeSolver< Real, Index, SolverMonitor >::
+setRefreshRate( const Index& refreshRate )
 {
    this->refreshRate = refreshRate;
 }
 
 template< typename Real, typename Index, typename SolverMonitor >
-void IterativeSolver< Real, Index, SolverMonitor >::setSolverMonitor( SolverMonitorType& solverMonitor )
+void
+IterativeSolver< Real, Index, SolverMonitor >::
+setSolverMonitor( SolverMonitorType& solverMonitor )
 {
    this->solverMonitor = &solverMonitor;
 }
 
 template< typename Real, typename Index, typename SolverMonitor >
-void IterativeSolver< Real, Index, SolverMonitor >::refreshSolverMonitor( bool force )
+void
+IterativeSolver< Real, Index, SolverMonitor >::
+refreshSolverMonitor( bool force )
 {
    if( this->solverMonitor )
    {
diff --git a/src/TNL/Solvers/Linear/GMRES.h b/src/TNL/Solvers/Linear/GMRES.h
index f1a4b87328a630411c634bed6744494f76afea02..e1c02f0ab5eadbaea7ba2c2678641b0c1a05ee6b 100644
--- a/src/TNL/Solvers/Linear/GMRES.h
+++ b/src/TNL/Solvers/Linear/GMRES.h
@@ -53,7 +53,16 @@ protected:
    using HostView = typename DeviceView::template Self< RealType, Devices::Host >;
    using HostVector = typename DeviceVector::template Self< RealType, Devices::Host >;;
 
-   enum class Variant { MGS, MGSR, CWY };
+   enum class Variant { CGS, CGSR, MGS, MGSR, CWY };
+
+// nvcc allows __cuda_callable__ lambdas only in public methods
+#ifdef __NVCC__
+public:
+#endif
+   int orthogonalize_CGS( const int m, const RealType normb, const RealType beta );
+#ifdef __NVCC__
+protected:
+#endif
 
    int orthogonalize_MGS( const int m, const RealType normb, const RealType beta );
 
diff --git a/src/TNL/Solvers/Linear/GMRES_impl.h b/src/TNL/Solvers/Linear/GMRES_impl.h
index d6cb8fdd095120c35cbf303a734ed7c5667bb79d..8e653e28b2d783e4a342f899eb794554d11c131a 100644
--- a/src/TNL/Solvers/Linear/GMRES_impl.h
+++ b/src/TNL/Solvers/Linear/GMRES_impl.h
@@ -31,6 +31,8 @@ configSetup( Config::ConfigDescription& config,
              const String& prefix )
 {
    config.addEntry< String >( prefix + "gmres-variant", "Minimal number of iterations after which the GMRES restarts.", "CWY" );
+      config.addEntryEnum( "CGS" );
+      config.addEntryEnum( "CGSR" );
       config.addEntryEnum( "MGS" );
       config.addEntryEnum( "MGSR" );
       config.addEntryEnum( "CWY" );
@@ -47,7 +49,11 @@ setup( const Config::ParameterContainer& parameters,
        const String& prefix )
 {
    const String var = parameters.getParameter< String >( prefix + "gmres-variant" );
-   if( var == "MGS" )
+   if( var == "CGS" )
+      variant = Variant::CGS;
+   else if( var == "CGSR" )
+      variant = Variant::CGSR;
+   else if( var == "MGS" )
       variant = Variant::MGS;
    else if( var == "MGSR" )
       variant = Variant::MGSR;
@@ -133,6 +139,10 @@ solve( ConstVectorViewType b, VectorViewType x )
       // orthogonalization
       int o_steps = 0;
       switch( variant ) {
+         case Variant::CGS:
+         case Variant::CGSR:
+            o_steps = orthogonalize_CGS( m, normb, beta );
+            break;
          case Variant::MGS:
          case Variant::MGSR:
             o_steps = orthogonalize_MGS( m, normb, beta );
@@ -167,6 +177,102 @@ solve( ConstVectorViewType b, VectorViewType x )
    return this->checkConvergence();
 }
 
+template< typename Matrix >
+int
+GMRES< Matrix >::
+orthogonalize_CGS( const int m, const RealType normb, const RealType beta )
+{
+   // initial binding to _M_tmp sets the correct local range, global size and
+   // communication group for distributed views
+   VectorViewType v_i( _M_tmp.getView() );
+   VectorViewType v_k( _M_tmp.getView() );
+
+   /***
+    * v_0 = r / | r | =  1.0 / beta * r
+    */
+   v_i.bind( V.getData(), size );
+   v_i = (1.0 / beta) * r;
+
+   H.setValue( 0.0 );
+   s.setValue( 0.0 );
+   s[ 0 ] = beta;
+
+   /****
+    * Starting m-loop
+    */
+   for( int i = 0; i < m && this->nextIteration(); i++ ) {
+      v_i.bind( &V.getData()[ i * ldSize ], size );
+      /****
+       * Solve w from M w = A v_i
+       */
+      preconditioned_matvec( w, v_i );
+
+      for( int k = 0; k <= i; k++ )
+         H[ k + i * (m + 1) ] = 0.0;
+      const int reorthogonalize = (variant == Variant::CGSR) ? 2 : 1;
+      for( int l = 0; l < reorthogonalize; l++ ) {
+         // auxiliary array for the H coefficients of the current l-loop
+         RealType H_l[i + 1];
+
+         // CGS part 1: compute projection coefficients
+//         for( int k = 0; k <= i; k++ ) {
+//            v_k.bind( &V.getData()[ k * ldSize ], size );
+//            H_l[k] = (w, v_k);
+//            H[ k + i * (m + 1) ] += H_l[k];
+//         }
+         // H_l = V_i^T * w
+         const RealType* _V = V.getData();
+         const RealType* _w = Traits::getConstLocalView( w ).getData();
+         const IndexType ldSize = this->ldSize;
+         auto fetch = [_V, _w, ldSize] __cuda_callable__ ( IndexType idx, int k ) { return _V[ idx + k * ldSize ] * _w[ idx ]; };
+         Algorithms::Multireduction< DeviceType >::reduce
+                  ( (RealType) 0,
+                    fetch,
+                    std::plus<>{},
+                    size,
+                    i + 1,
+                    H_l );
+         for( int k = 0; k <= i; k++ )
+            H[ k + i * (m + 1) ] += H_l[k];
+
+         // CGS part 2: subtract the projections
+//         for( int k = 0; k <= i; k++ ) {
+//            v_k.bind( &V.getData()[ k * ldSize ], size );
+//            w = w - H_l[k] * v_k;
+//         }
+         // w := w - V_i * H_l
+         Matrices::MatrixOperations< DeviceType >::
+            gemv( size, i + 1,
+                  -1.0, V.getData(), ldSize, H_l,
+                  1.0, Traits::getLocalView( w ).getData() );
+      }
+      /***
+       * H_{i+1,i} = |w|
+       */
+      RealType normw = lpNorm( w, 2.0 );
+      H[ i + 1 + i * (m + 1) ] = normw;
+
+      /***
+       * v_{i+1} = w / |w|
+       */
+      v_i.bind( &V.getData()[ ( i + 1 ) * ldSize ], size );
+      v_i = (1.0 / normw) * w;
+
+      /****
+       * Applying the Givens rotations G_0, ..., G_i
+       */
+      apply_givens_rotations( i, m );
+
+      this->setResidue( std::fabs( s[ i + 1 ] ) / normb );
+      if( ! this->checkNextIteration() )
+         return i;
+      else
+         this->refreshSolverMonitor();
+   }
+
+   return m;
+}
+
 template< typename Matrix >
 int
 GMRES< Matrix >::
@@ -198,13 +304,13 @@ orthogonalize_MGS( const int m, const RealType normb, const RealType beta )
       preconditioned_matvec( w, v_i );
 
       for( int k = 0; k <= i; k++ )
-         H[ k + i * ( m + 1 ) ] = 0.0;
+         H[ k + i * (m + 1) ] = 0.0;
       const int reorthogonalize = (variant == Variant::MGSR) ? 2 : 1;
       for( int l = 0; l < reorthogonalize; l++ )
          for( int k = 0; k <= i; k++ ) {
             v_k.bind( &V.getData()[ k * ldSize ], size );
             /***
-             * H_{k,i} = ( w, v_k )
+             * H_{k,i} = (w, v_k)
              */
             RealType H_k_i = (w, v_k);
             H[ k + i * (m + 1) ] += H_k_i;
diff --git a/src/TNL/Solvers/ODE/ExplicitSolver_impl.h b/src/TNL/Solvers/ODE/ExplicitSolver_impl.h
index 46e69381c82610d6d69854342a6e90caf8654ad9..eef1a30accbac59433876da2ddfe03aa06c8470c 100644
--- a/src/TNL/Solvers/ODE/ExplicitSolver_impl.h
+++ b/src/TNL/Solvers/ODE/ExplicitSolver_impl.h
@@ -10,6 +10,8 @@
 
 #pragma once
 
+#include <cfloat>
+
 namespace TNL {
 namespace Solvers {
 namespace ODE {