diff --git a/src/Benchmarks/BLAS/array-operations.h b/src/Benchmarks/BLAS/array-operations.h
index 9ee6ff8a0699bbc96c63e34942f9ee890d88685f..b5cf9ff58da754aaa4c8ce645989afbb25e61f7c 100644
--- a/src/Benchmarks/BLAS/array-operations.h
+++ b/src/Benchmarks/BLAS/array-operations.h
@@ -72,9 +72,9 @@ benchmarkArrayOperations( Benchmark & benchmark,
       resultDevice = (int) deviceArray == deviceArray2;
    };
    benchmark.setOperation( "comparison (operator==)", 2 * datasetSize );
-   benchmark.time( reset1, "CPU", compareHost );
+   benchmark.time< Devices::Host >( reset1, "CPU", compareHost );
 #ifdef HAVE_CUDA
-   benchmark.time( reset1, "GPU", compareCuda );
+   benchmark.time< Devices::Cuda >( reset1, "GPU", compareCuda );
 #endif
 
 
@@ -87,9 +87,9 @@ benchmarkArrayOperations( Benchmark & benchmark,
    benchmark.setOperation( "copy (operator=)", 2 * datasetSize );
    // copyBasetime is used later inside HAVE_CUDA guard, so the compiler will
    // complain when compiling without CUDA
-   const double copyBasetime = benchmark.time( reset1, "CPU", copyAssignHostHost );
+   const double copyBasetime = benchmark.time< Devices::Host >( reset1, "CPU", copyAssignHostHost );
 #ifdef HAVE_CUDA
-   benchmark.time( reset1, "GPU", copyAssignCudaCuda );
+   benchmark.time< Devices::Cuda >( reset1, "GPU", copyAssignCudaCuda );
 #endif
 
 
@@ -101,8 +101,8 @@ benchmarkArrayOperations( Benchmark & benchmark,
    };
 #ifdef HAVE_CUDA
    benchmark.setOperation( "copy (operator=)", datasetSize, copyBasetime );
-   benchmark.time( reset1, "CPU->GPU", copyAssignHostCuda );
-   benchmark.time( reset1, "GPU->CPU", copyAssignCudaHost );
+   benchmark.time< Devices::Cuda >( reset1, "CPU->GPU", copyAssignHostCuda );
+   benchmark.time< Devices::Cuda >( reset1, "GPU->CPU", copyAssignCudaHost );
 #endif
 
 
@@ -113,9 +113,9 @@ benchmarkArrayOperations( Benchmark & benchmark,
       deviceArray.setValue( 3.0 );
    };
    benchmark.setOperation( "setValue", datasetSize );
-   benchmark.time( reset1, "CPU", setValueHost );
+   benchmark.time< Devices::Host >( reset1, "CPU", setValueHost );
 #ifdef HAVE_CUDA
-   benchmark.time( reset1, "GPU", setValueCuda );
+   benchmark.time< Devices::Cuda >( reset1, "GPU", setValueCuda );
 #endif
 
 
@@ -132,9 +132,9 @@ benchmarkArrayOperations( Benchmark & benchmark,
 #endif
    };
    benchmark.setOperation( "allocation (setSize)", datasetSize );
-   benchmark.time( resetSize1, "CPU", setSizeHost );
+   benchmark.time< Devices::Host >( resetSize1, "CPU", setSizeHost );
 #ifdef HAVE_CUDA
-   benchmark.time( resetSize1, "GPU", setSizeCuda );
+   benchmark.time< Devices::Cuda >( resetSize1, "GPU", setSizeCuda );
 #endif
 
 
@@ -151,9 +151,9 @@ benchmarkArrayOperations( Benchmark & benchmark,
 #endif
    };
    benchmark.setOperation( "deallocation (reset)", datasetSize );
-   benchmark.time( setSize1, "CPU", resetSizeHost );
+   benchmark.time< Devices::Host >( setSize1, "CPU", resetSizeHost );
 #ifdef HAVE_CUDA
-   benchmark.time( setSize1, "GPU", resetSizeCuda );
+   benchmark.time< Devices::Cuda >( setSize1, "GPU", resetSizeCuda );
 #endif
 
    return true;
diff --git a/src/Benchmarks/BLAS/spmv.h b/src/Benchmarks/BLAS/spmv.h
index 9df40f4ec6b0db6d8d268dd5d38cef92b9fc61c8..7299f828a51f11ed5cab5ecd178e0e78040fcc90 100644
--- a/src/Benchmarks/BLAS/spmv.h
+++ b/src/Benchmarks/BLAS/spmv.h
@@ -163,9 +163,9 @@ benchmarkSpMV( Benchmark & benchmark,
    };
 
    benchmark.setOperation( datasetSize );
-   benchmark.time( reset, "CPU", spmvHost );
+   benchmark.time< Devices::Host >( reset, "CPU", spmvHost );
 #ifdef HAVE_CUDA
-   benchmark.time( reset, "GPU", spmvCuda );
+   benchmark.time< Devices::Cuda >( reset, "GPU", spmvCuda );
 #endif
 
    return true;
diff --git a/src/Benchmarks/BLAS/vector-operations.h b/src/Benchmarks/BLAS/vector-operations.h
index 8dd63de85b985f72a9a1dc3888f2e587234340be..e191b8fbb9a7949b7aba399591c5b2928909d893 100644
--- a/src/Benchmarks/BLAS/vector-operations.h
+++ b/src/Benchmarks/BLAS/vector-operations.h
@@ -90,9 +90,9 @@ benchmarkVectorOperations( Benchmark & benchmark,
       resultDevice = deviceVector.max();
    };
    benchmark.setOperation( "max", datasetSize );
-   benchmark.time( reset1, "CPU", maxHost );
+   benchmark.time< Devices::Host >( reset1, "CPU", maxHost );
 #ifdef HAVE_CUDA
-   benchmark.time( reset1, "GPU", maxCuda );
+   benchmark.time< Devices::Cuda >( reset1, "GPU", maxCuda );
 #endif
 
 
@@ -103,9 +103,9 @@ benchmarkVectorOperations( Benchmark & benchmark,
       resultDevice = deviceVector.min();
    };
    benchmark.setOperation( "min", datasetSize );
-   benchmark.time( reset1, "CPU", minHost );
+   benchmark.time< Devices::Host >( reset1, "CPU", minHost );
 #ifdef HAVE_CUDA
-   benchmark.time( reset1, "GPU", minCuda );
+   benchmark.time< Devices::Cuda >( reset1, "GPU", minCuda );
 #endif
 
 
@@ -125,10 +125,10 @@ benchmarkVectorOperations( Benchmark & benchmark,
    };
 #endif
    benchmark.setOperation( "absMax", datasetSize );
-   benchmark.time( reset1, "CPU", absMaxHost );
+   benchmark.time< Devices::Host >( reset1, "CPU", absMaxHost );
 #ifdef HAVE_CUDA
-   benchmark.time( reset1, "GPU", absMaxCuda );
-   benchmark.time( reset1, "cuBLAS", absMaxCublas );
+   benchmark.time< Devices::Cuda >( reset1, "GPU", absMaxCuda );
+   benchmark.time< Devices::Cuda >( reset1, "cuBLAS", absMaxCublas );
 #endif
 
 
@@ -148,10 +148,10 @@ benchmarkVectorOperations( Benchmark & benchmark,
    };
 #endif
    benchmark.setOperation( "absMin", datasetSize );
-   benchmark.time( reset1, "CPU", absMinHost );
+   benchmark.time< Devices::Host >( reset1, "CPU", absMinHost );
 #ifdef HAVE_CUDA
-   benchmark.time( reset1, "GPU", absMinCuda );
-   benchmark.time( reset1, "cuBLAS", absMinCublas );
+   benchmark.time< Devices::Cuda >( reset1, "GPU", absMinCuda );
+   benchmark.time< Devices::Cuda >( reset1, "cuBLAS", absMinCublas );
 #endif
 
 
@@ -162,9 +162,9 @@ benchmarkVectorOperations( Benchmark & benchmark,
       resultDevice = deviceVector.sum();
    };
    benchmark.setOperation( "sum", datasetSize );
-   benchmark.time( reset1, "CPU", sumHost );
+   benchmark.time< Devices::Host >( reset1, "CPU", sumHost );
 #ifdef HAVE_CUDA
-   benchmark.time( reset1, "GPU", sumCuda );
+   benchmark.time< Devices::Cuda >( reset1, "GPU", sumCuda );
 #endif
 
 
@@ -182,10 +182,10 @@ benchmarkVectorOperations( Benchmark & benchmark,
    };
 #endif
    benchmark.setOperation( "l1 norm", datasetSize );
-   benchmark.time( reset1, "CPU", l1normHost );
+   benchmark.time< Devices::Host >( reset1, "CPU", l1normHost );
 #ifdef HAVE_CUDA
-   benchmark.time( reset1, "GPU", l1normCuda );
-   benchmark.time( reset1, "cuBLAS", l1normCublas );
+   benchmark.time< Devices::Cuda >( reset1, "GPU", l1normCuda );
+   benchmark.time< Devices::Cuda >( reset1, "cuBLAS", l1normCublas );
 #endif
 
 
@@ -203,10 +203,10 @@ benchmarkVectorOperations( Benchmark & benchmark,
    };
 #endif
    benchmark.setOperation( "l2 norm", datasetSize );
-   benchmark.time( reset1, "CPU", l2normHost );
+   benchmark.time< Devices::Host >( reset1, "CPU", l2normHost );
 #ifdef HAVE_CUDA
-   benchmark.time( reset1, "GPU", l2normCuda );
-   benchmark.time( reset1, "cuBLAS", l2normCublas );
+   benchmark.time< Devices::Cuda >( reset1, "GPU", l2normCuda );
+   benchmark.time< Devices::Cuda >( reset1, "cuBLAS", l2normCublas );
 #endif
 
 
@@ -217,9 +217,9 @@ benchmarkVectorOperations( Benchmark & benchmark,
       resultDevice = deviceVector.lpNorm( 3.0 );
    };
    benchmark.setOperation( "l3 norm", datasetSize );
-   benchmark.time( reset1, "CPU", l3normHost );
+   benchmark.time< Devices::Host >( reset1, "CPU", l3normHost );
 #ifdef HAVE_CUDA
-   benchmark.time( reset1, "GPU", l3normCuda );
+   benchmark.time< Devices::Cuda >( reset1, "GPU", l3normCuda );
 #endif
 
 
@@ -238,10 +238,10 @@ benchmarkVectorOperations( Benchmark & benchmark,
    };
 #endif
    benchmark.setOperation( "scalar product", 2 * datasetSize );
-   benchmark.time( reset1, "CPU", scalarProductHost );
+   benchmark.time< Devices::Host >( reset1, "CPU", scalarProductHost );
 #ifdef HAVE_CUDA
-   benchmark.time( reset1, "GPU", scalarProductCuda );
-   benchmark.time( reset1, "cuBLAS", scalarProductCublas );
+   benchmark.time< Devices::Cuda >( reset1, "GPU", scalarProductCuda );
+   benchmark.time< Devices::Cuda >( reset1, "cuBLAS", scalarProductCublas );
 #endif
 
    /*
@@ -289,10 +289,10 @@ benchmarkVectorOperations( Benchmark & benchmark,
    };
 #endif
    benchmark.setOperation( "scalar multiplication", 2 * datasetSize );
-   benchmark.time( reset1, "CPU", multiplyHost );
+   benchmark.time< Devices::Host >( reset1, "CPU", multiplyHost );
 #ifdef HAVE_CUDA
-   benchmark.time( reset1, "GPU", multiplyCuda );
-   benchmark.time( reset1, "cuBLAS", multiplyCublas );
+   benchmark.time< Devices::Cuda >( reset1, "GPU", multiplyCuda );
+   benchmark.time< Devices::Cuda >( reset1, "cuBLAS", multiplyCublas );
 #endif
 
 
@@ -312,10 +312,10 @@ benchmarkVectorOperations( Benchmark & benchmark,
    };
 #endif
    benchmark.setOperation( "vector addition", 3 * datasetSize );
-   benchmark.time( reset1, "CPU", addVectorHost );
+   benchmark.time< Devices::Host >( reset1, "CPU", addVectorHost );
 #ifdef HAVE_CUDA
-   benchmark.time( reset1, "GPU", addVectorCuda );
-   benchmark.time( reset1, "cuBLAS", addVectorCublas );
+   benchmark.time< Devices::Cuda >( reset1, "GPU", addVectorCuda );
+   benchmark.time< Devices::Cuda >( reset1, "cuBLAS", addVectorCublas );
 #endif
 
 
diff --git a/src/Benchmarks/Benchmarks.h b/src/Benchmarks/Benchmarks.h
index c371e2dfb3ec1bba387bddbdb454abbaaab44f6e..435e7037379be182844dfe39b672bc39a258af68 100644
--- a/src/Benchmarks/Benchmarks.h
+++ b/src/Benchmarks/Benchmarks.h
@@ -34,53 +34,60 @@ namespace Benchmarks {
 
 const double oneGB = 1024.0 * 1024.0 * 1024.0;
 
-template< typename ComputeFunction,
-          typename ResetFunction,
-          typename Monitor = TNL::Solvers::IterativeSolverMonitor< double, int > >
-double
-timeFunction( ComputeFunction compute,
-              ResetFunction reset,
-              int loops,
-              const double& minTime, 
-              Monitor && monitor = Monitor() )
+template< typename Device >
+class FunctionTimer
 {
-   // the timer is constructed zero-initialized and stopped
-   Timer timer;
-
-   // set timer to the monitor
-   monitor.setTimer( timer );
-
-   // warm up
-   reset();
-   compute();
-
-   //timer.start();
-   int i;
-   for( i = 0;
-        i < loops || timer.getRealTime() < minTime;
-        ++i) 
-   {
-      // abuse the monitor's "time" for loops
-      monitor.setTime( i + 1 );
-
-      reset();
-
-      // Explicit synchronization of the CUDA device
-      // TODO: not necessary for host computations
-#ifdef HAVE_CUDA
-      cudaDeviceSynchronize();
+   public:
+      using DeviceType = Device;
+
+      template< typename ComputeFunction,
+                typename ResetFunction,
+                typename Monitor = TNL::Solvers::IterativeSolverMonitor< double, int > >
+      static double
+      timeFunction( ComputeFunction compute,
+                    ResetFunction reset,
+                    int loops,
+                    const double& minTime,
+                    Monitor && monitor = Monitor() )
+      {
+         // the timer is constructed zero-initialized and stopped
+         Timer timer;
+
+         // set timer to the monitor
+         monitor.setTimer( timer );
+
+         // warm up
+         reset();
+         compute();
+
+         //timer.start();
+         int i;
+         for( i = 0;
+              i < loops || timer.getRealTime() < minTime;
+              ++i) 
+         {
+            // abuse the monitor's "time" for loops
+            monitor.setTime( i + 1 );
+
+            reset();
+
+            // Explicit synchronization of the CUDA device
+#ifdef HAVE_CUDA
+            if( std::is_same< Device, Devices::Cuda >::value )
+               cudaDeviceSynchronize();
 #endif
-      timer.start();
-      compute();
+            timer.start();
+            compute();
 #ifdef HAVE_CUDA
-      cudaDeviceSynchronize();
+            if( std::is_same< Device, Devices::Cuda >::value )
+               cudaDeviceSynchronize();
 #endif
-      timer.stop();
-   }
-
-   return timer.getRealTime() / ( double ) i;
-}
+            timer.stop();
+         }
 
+         return timer.getRealTime() / ( double ) i;
+      }
+};
 
 class Logging
 {
@@ -443,7 +450,8 @@ public:
    // "speedup" columns.
    // TODO: allow custom columns bound to lambda functions (e.g. for Gflops calculation)
    // Also terminates the recursion of the following variadic template.
-   template< typename ResetFunction,
+   template< typename Device,
+             typename ResetFunction,
              typename ComputeFunction >
    double
    time( ResetFunction reset,
@@ -456,10 +464,10 @@ public:
          if( verbose > 1 ) {
             // run the monitor main loop
             Solvers::SolverMonitorThread monitor_thread( monitor );
-            result.time = timeFunction( compute, reset, loops, minTime, monitor );
+            result.time = FunctionTimer< Device >::timeFunction( compute, reset, loops, minTime, monitor );
          }
          else {
-            result.time = timeFunction( compute, reset, loops, minTime, monitor );
+            result.time = FunctionTimer< Device >::timeFunction( compute, reset, loops, minTime, monitor );
          }
       }
       catch ( const std::exception& e ) {
@@ -477,7 +485,8 @@ public:
       return this->baseTime;
    }
 
-   template< typename ResetFunction,
+   template< typename Device,
+             typename ResetFunction,
              typename ComputeFunction,
              typename... NextComputations >
    inline double
@@ -486,7 +495,7 @@ public:
          ComputeFunction & compute )
    {
       BenchmarkResult result;
-      return time( reset, performer, compute, result );
+      return time< Device, ResetFunction, ComputeFunction >( reset, performer, compute, result );
    }
 
    // Adds an error message to the log. Should be called in places where the
diff --git a/src/Benchmarks/DistSpMV/tnl-benchmark-distributed-spmv.h b/src/Benchmarks/DistSpMV/tnl-benchmark-distributed-spmv.h
index a3bd76753aaab02367e5f509cead9dacee2f1b62..55c6bc1567aafc3196dfe9da1cc3e92803ec65b7 100644
--- a/src/Benchmarks/DistSpMV/tnl-benchmark-distributed-spmv.h
+++ b/src/Benchmarks/DistSpMV/tnl-benchmark-distributed-spmv.h
@@ -62,7 +62,7 @@ benchmarkSpmv( Benchmark& benchmark,
       matrix.vectorProduct( x, y );
    };
 
-   benchmark.time( reset, performer, compute );
+   benchmark.time< typename Matrix::DeviceType >( reset, performer, compute );
 }
 
 template< typename Matrix, typename Vector >
@@ -114,7 +114,7 @@ benchmarkDistributedSpmv( Benchmark& benchmark,
       Matrix::CommunicatorType::Barrier( matrix.getCommunicationGroup() );
    };
 
-   benchmark.time( reset, performer, compute );
+   benchmark.time< typename Matrix::DeviceType >( reset, performer, compute );
 }
 
 template< typename Matrix, typename Vector >
diff --git a/src/Benchmarks/LinearSolvers/benchmarks.h b/src/Benchmarks/LinearSolvers/benchmarks.h
index a82ec2dc297ba008d45b5291ece3df07d47fefe7..c6278a76b1e10de7c830a6bae27476623796e42b 100644
--- a/src/Benchmarks/LinearSolvers/benchmarks.h
+++ b/src/Benchmarks/LinearSolvers/benchmarks.h
@@ -73,7 +73,7 @@ benchmarkPreconditionerUpdate( Benchmark& benchmark,
       barrier( matrix );
    };
 
-   benchmark.time( reset, performer, compute );
+   benchmark.time< typename Matrix::DeviceType >( reset, performer, compute );
 }
 
 template< template<typename> class Solver, template<typename> class Preconditioner, typename Matrix, typename Vector >
@@ -166,7 +166,7 @@ benchmarkSolver( Benchmark& benchmark,
    };
    MyBenchmarkResult benchmarkResult( solver, matrix, x, b );
 
-   benchmark.time( reset, performer, compute, benchmarkResult );
+   benchmark.time< typename Matrix::DeviceType >( reset, performer, compute, benchmarkResult );
 }
 
 #ifdef HAVE_ARMADILLO
diff --git a/src/Benchmarks/Traversers/tnl-benchmark-traversers.h b/src/Benchmarks/Traversers/tnl-benchmark-traversers.h
index f1c4efeed635ea1566d8b087b564168603ae1670..9e80b0d062f47424e293cfa81f1646384b5ad9eb 100644
--- a/src/Benchmarks/Traversers/tnl-benchmark-traversers.h
+++ b/src/Benchmarks/Traversers/tnl-benchmark-traversers.h
@@ -41,7 +41,7 @@ bool runBenchmark( const Config::ParameterContainer& parameters,
    const int maxSize = parameters.getParameter< int >( "max-size" );
 
    // Full grid traversing
-   benchmark.newBenchmark( String("Full grid traversing - write 1" + convertToString( Dimension ) + "D" ), metadata );
+   benchmark.newBenchmark( String("Full grid traversing - write 1 " + convertToString( Dimension ) + "D" ), metadata );
    for( std::size_t size = minSize; size <= maxSize; size *= 2 )
    {
 
@@ -78,9 +78,9 @@ bool runBenchmark( const Config::ParameterContainer& parameters,
       };
 
       benchmark.setOperation( "Pure C", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB );
-      benchmark.time( hostReset, "CPU", hostWriteOneUsingPureC );
+      benchmark.time< Devices::Host >( hostReset, "CPU", hostWriteOneUsingPureC );
 #ifdef HAVE_CUDA
-      benchmark.time( cudaReset, "GPU", cudaWriteOneUsingPureC );
+      benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaWriteOneUsingPureC );
 #endif
 
       /****
@@ -97,9 +97,9 @@ bool runBenchmark( const Config::ParameterContainer& parameters,
       }; 
 
       benchmark.setOperation( "parallel for", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB );
-      benchmark.time( hostReset, "CPU", hostWriteOneUsingParallelFor );
+      benchmark.time< Devices::Host >( hostReset, "CPU", hostWriteOneUsingParallelFor );
 #ifdef HAVE_CUDA
-      benchmark.time( cudaReset, "GPU", cudaWriteOneUsingParallelFor );
+      benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaWriteOneUsingParallelFor );
 #endif
 
       /****
@@ -113,12 +113,12 @@ bool runBenchmark( const Config::ParameterContainer& parameters,
       auto cudaWriteOneUsingTraverser = [&] ()
       {
          cudaTraverserBenchmark.writeOneUsingTraverser();
-      }
+      };
 
       benchmark.setOperation( "traverser", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB );
-      benchmark.time( hostReset, "CPU", hostWriteOneUsingTraverser );
+      benchmark.time< Devices::Host >( hostReset, "CPU", hostWriteOneUsingTraverser );
 #ifdef HAVE_CUDA
-      benchmark.time( cudaReset, "GPU", cudaWriteOneUsingTraverser );
+      benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaWriteOneUsingTraverser );
 #endif
    }
    return true;