diff --git a/tests/benchmarks/benchmarks.h b/tests/benchmarks/benchmarks.h
index b61b027400264af2897662e5a040f5bd300291aa..5a8e488e8dd0588c3f4f7038e083a28a1cfb5322 100644
--- a/tests/benchmarks/benchmarks.h
+++ b/tests/benchmarks/benchmarks.h
@@ -9,6 +9,8 @@ namespace tnl
 namespace benchmarks
 {
 
+const double oneGB = 1024.0 * 1024.0 * 1024.0;  // 2^30; divisor used to express byte counts in GB
+
 // TODO: add data member for error message
 struct BenchmarkError {};
 
diff --git a/tests/benchmarks/tnl-cuda-benchmarks.h b/tests/benchmarks/tnl-cuda-benchmarks.h
index 9699ac029c7e306c1b6d46b010658c8c5467dab1..9809911cedcb50042d19f7c29a2ab8d577048566 100644
--- a/tests/benchmarks/tnl-cuda-benchmarks.h
+++ b/tests/benchmarks/tnl-cuda-benchmarks.h
@@ -19,13 +19,12 @@
 #define TNLCUDBENCHMARKS_H_
 
 #include <tnlConfig.h>
-#include <core/vectors/tnlVector.h>
 #include <core/tnlList.h>
 #include <matrices/tnlSlicedEllpackMatrix.h>
 #include <matrices/tnlEllpackMatrix.h>
 #include <matrices/tnlCSRMatrix.h>
 
-#include "benchmarks.h"
+#include "vector-operations.h"
 
 using namespace tnl::benchmarks;
 
@@ -37,7 +36,6 @@ using namespace tnl::benchmarks;
 template< typename Real, typename Device, typename Index >
 using SlicedEllpackMatrix = tnlSlicedEllpackMatrix< Real, Device, Index >;
 
-const double oneGB = 1024.0 * 1024.0 * 1024.0;
 
 
 // TODO:
@@ -199,9 +197,6 @@ int main( int argc, char* argv[] )
 #ifdef HAVE_CUDA
    
    typedef double Real;
-   typedef tnlVector< Real, tnlHost > HostVector;
-   typedef tnlVector< Real, tnlCuda > CudaVector;
-
    
    /****
     * The first argument of this program is the size od data set to be reduced.
@@ -217,139 +212,7 @@ int main( int argc, char* argv[] )
    if( argc > 3 )
       elementsPerRow = atoi( argv[ 3 ] );
    
-   
-   double datasetSize = ( double ) ( loops * size ) * sizeof( Real ) / oneGB;
-   
-   HostVector hostVector, hostVector2;
-   CudaVector deviceVector, deviceVector2;
-   hostVector.setSize( size );
-   if( ! deviceVector.setSize( size ) )
-      return EXIT_FAILURE;
-   hostVector2.setLike( hostVector );
-   if( ! deviceVector2.setLike( deviceVector ) )
-      return EXIT_FAILURE;
-
-   Real resultHost, resultDevice;
-
-
-   // check functions
-   auto compare1 = [&]() {
-      return hostVector == deviceVector;
-   };
-   auto compare2 = [&]() {
-      return hostVector2 == deviceVector2;
-   };
-   auto compare12 = [&]() {
-      return compare1() && compare2();
-   };
-   auto compareScalars = [&]() {
-      return resultHost == resultDevice;
-   };
-
-   // reset functions
-   auto reset1 = [&]() {
-      hostVector.setValue( 1.0 );
-      deviceVector.setValue( 1.0 );
-   };
-   auto reset2 = [&]() {
-      hostVector2.setValue( 1.0 );
-      deviceVector2.setValue( 1.0 );
-   };
-   auto reset12 = [&]() {
-      reset1();
-      reset2();
-   };
-
-
-   reset12();
-
-   cout << "Benchmarking CPU-GPU memory transfer:" << endl;
-   auto copyAssign = [&]() {
-      deviceVector = hostVector;
-   };
-   cout << "  ";
-   benchmarkSingle( loops, datasetSize, copyAssign, compare1, reset1 );
-    
-
-   cout << "Benchmarking vector addition:" << endl;
-   auto addVectorHost = [&]() {
-      hostVector.addVector( hostVector2 );
-   };
-   auto addVectorCuda = [&]() {
-      deviceVector.addVector( deviceVector2 );
-      // TODO: synchronization should be part of addVector
-      cudaThreadSynchronize();
-   };
-   benchmarkCuda( loops, 3 * datasetSize, addVectorHost, addVectorCuda, compare1, reset1 );
-
-
-   cout << "Benchmarking scalar product:" << endl;
-   auto scalarProductHost = [&]() {
-      resultHost = hostVector.scalarProduct( hostVector2 );
-   };
-   auto scalarProductCuda = [&]() {
-      resultDevice = deviceVector.scalarProduct( deviceVector2 );
-   };
-   benchmarkCuda( loops, 2 * datasetSize, scalarProductHost, scalarProductCuda, compareScalars, voidFunc );
-
-/* TODO
-#ifdef HAVE_CUBLAS
-   cout << "Benchmarking scalar product on GPU with Cublas: " << endl;
-   cublasHandle_t handle;
-   cublasCreate( &handle );
-   timer.reset();
-   timer.start();   
-   for( int i = 0; i < loops; i++ )
-      cublasDdot( handle,
-                  size,
-                  deviceVector.getData(), 1,
-                  deviceVector.getData(), 1,
-                  &resultDevice );
-   cudaThreadSynchronize();
-   timer.stop();
-   bandwidth = 2 * datasetSize / timer.getTime();
-   cout << "bandwidth: " << bandwidth << " GB/sec, time: " << timer.getTime() << " sec." << endl;
-#endif    
-*/
-
-   cout << "Benchmarking L2 norm: " << endl;
-   auto l2normHost = [&]() {
-      resultHost = hostVector.lpNorm( 2.0 );
-   };
-   auto l2normCuda = [&]() {
-      resultDevice = deviceVector.lpNorm( 2.0 );
-   };
-   benchmarkCuda( loops, datasetSize, l2normHost, l2normCuda, compareScalars, voidFunc );
-
-
-   /*
-   cout << "Benchmarking prefix-sum:" << endl;
-   timer.reset();
-   timer.start();
-   hostVector.computePrefixSum();
-   timer.stop();
-   timeHost = timer.getTime();
-   bandwidth = 2 * datasetSize / loops / timer.getTime();
-   cout << "  CPU: bandwidth: " << bandwidth << " GB/sec, time: " << timer.getTime() << " sec." << endl;
-   
-   timer.reset();
-   timer.start();
-   deviceVector.computePrefixSum();
-   timer.stop();
-   timeDevice = timer.getTime();
-   bandwidth = 2 * datasetSize / loops / timer.getTime();
-   cout << "  GPU: bandwidth: " << bandwidth << " GB/sec, time: " << timer.getTime() << " sec." << endl;
-   cout << "  CPU/GPU speedup: " << timeHost / timeDevice << endl;
-
-   HostVector auxHostVector;
-   auxHostVector.setLike( deviceVector );
-   auxHostVector = deviceVector;
-   for( int i = 0; i < size; i++ )
-      if( hostVector.getElement( i ) != auxHostVector.getElement( i ) )
-      {
-         cerr << "Error in prefix sum at position " << i << ":  " << hostVector.getElement( i ) << " != " << auxHostVector.getElement( i ) << endl;
-      }
-*/
+   if( ! benchmarkVectorOperations< Real >( loops, size ) ) return EXIT_FAILURE;  // false = device vector setup failed
 
    benchmarkSpMV< Real, tnlEllpackMatrix >( loops, size, elementsPerRow );
    benchmarkSpMV< Real, SlicedEllpackMatrix >( loops, size, elementsPerRow );
diff --git a/tests/benchmarks/vector-operations.h b/tests/benchmarks/vector-operations.h
new file mode 100644
index 0000000000000000000000000000000000000000..b15bfeb843b66aeb2face40f9bad7a4d43156739
--- /dev/null
+++ b/tests/benchmarks/vector-operations.h
@@ -0,0 +1,159 @@
+#pragma once
+
+#include "benchmarks.h"
+
+#include <core/vectors/tnlVector.h>
+
+namespace tnl
+{
+namespace benchmarks
+{
+
+/**
+ * Benchmarks basic vector operations on host and CUDA vectors of `size`
+ * elements: host-to-device assignment, addVector, scalarProduct, lpNorm( 2.0 ).
+ *
+ * \tparam Real   element type of the benchmarked vectors
+ * \tparam Index  index type of the benchmarked vectors
+ * \param loops   forwarded to benchmarkSingle/benchmarkCuda (presumably the
+ *                repetition count); also scales the reported dataset size
+ * \param size    number of elements of each vector
+ * \return false when setSize/setLike fails for a device vector, true otherwise
+ */
+template< typename Real = double,
+          typename Index = int >
+bool
+benchmarkVectorOperations( const int & loops,
+                           const int & size )
+{
+    typedef tnlVector< Real, tnlHost, Index > HostVector;
+    typedef tnlVector< Real, tnlCuda, Index > CudaVector;
+    using namespace std;
+
+    // Convert to double before multiplying: loops * size evaluated in int
+    // overflows for large inputs and would corrupt the reported bandwidth.
+    double datasetSize = ( double ) loops * ( double ) size * sizeof( Real ) / oneGB;
+
+    HostVector hostVector, hostVector2;
+    CudaVector deviceVector, deviceVector2;
+    hostVector.setSize( size );
+    if( ! deviceVector.setSize( size ) )
+        return false;
+    hostVector2.setLike( hostVector );
+    if( ! deviceVector2.setLike( deviceVector ) )
+        return false;
+
+    Real resultHost, resultDevice;
+
+    // check functions
+    auto compare1 = [&]() { return hostVector == deviceVector; };
+    auto compare2 = [&]() { return hostVector2 == deviceVector2; };
+    auto compare12 = [&]() { return compare1() && compare2(); };
+    // NOTE(review): exact == on floating-point results of host and device
+    auto compareScalars = [&]() { return resultHost == resultDevice; };
+
+    // reset functions
+    auto reset1 = [&]() {
+        hostVector.setValue( 1.0 );
+        deviceVector.setValue( 1.0 );
+    };
+    auto reset2 = [&]() {
+        hostVector2.setValue( 1.0 );
+        deviceVector2.setValue( 1.0 );
+    };
+    auto reset12 = [&]() {
+        reset1();
+        reset2();
+    };
+
+    reset12();
+
+    cout << "Benchmarking CPU-GPU memory transfer:" << endl;
+    auto copyAssign = [&]() { deviceVector = hostVector; };
+    cout << "  ";
+    benchmarkSingle( loops, datasetSize, copyAssign, compare1, reset1 );
+
+    cout << "Benchmarking vector addition:" << endl;
+    auto addVectorHost = [&]() {
+        hostVector.addVector( hostVector2 );
+    };
+    auto addVectorCuda = [&]() {
+        deviceVector.addVector( deviceVector2 );
+        // TODO: synchronization should be part of addVector
+        // cudaDeviceSynchronize() replaces the deprecated cudaThreadSynchronize()
+        cudaDeviceSynchronize();
+    };
+    benchmarkCuda( loops, 3 * datasetSize, addVectorHost, addVectorCuda, compare1, reset1 );
+
+    cout << "Benchmarking scalar product:" << endl;
+    auto scalarProductHost = [&]() {
+        resultHost = hostVector.scalarProduct( hostVector2 );
+    };
+    auto scalarProductCuda = [&]() {
+        resultDevice = deviceVector.scalarProduct( deviceVector2 );
+    };
+    benchmarkCuda( loops, 2 * datasetSize, scalarProductHost, scalarProductCuda, compareScalars, voidFunc );
+
+/* TODO
+#ifdef HAVE_CUBLAS
+   cout << "Benchmarking scalar product on GPU with Cublas: " << endl;
+   cublasHandle_t handle;
+   cublasCreate( &handle );
+   timer.reset();
+   timer.start();
+   for( int i = 0; i < loops; i++ )
+      cublasDdot( handle,
+                  size,
+                  deviceVector.getData(), 1,
+                  deviceVector.getData(), 1,
+                  &resultDevice );
+   cudaDeviceSynchronize();
+   timer.stop();
+   bandwidth = 2 * datasetSize / timer.getTime();
+   cout << "bandwidth: " << bandwidth << " GB/sec, time: " << timer.getTime() << " sec." << endl;
+#endif
+*/
+
+    cout << "Benchmarking L2 norm: " << endl;
+    auto l2normHost = [&]() {
+        resultHost = hostVector.lpNorm( 2.0 );
+    };
+    auto l2normCuda = [&]() {
+        resultDevice = deviceVector.lpNorm( 2.0 );
+    };
+    benchmarkCuda( loops, datasetSize, l2normHost, l2normCuda, compareScalars, voidFunc );
+
+    /*
+    cout << "Benchmarking prefix-sum:" << endl;
+    timer.reset();
+    timer.start();
+    hostVector.computePrefixSum();
+    timer.stop();
+    timeHost = timer.getTime();
+    bandwidth = 2 * datasetSize / loops / timer.getTime();
+    cout << "  CPU: bandwidth: " << bandwidth << " GB/sec, time: " << timer.getTime() << " sec." << endl;
+
+    timer.reset();
+    timer.start();
+    deviceVector.computePrefixSum();
+    timer.stop();
+    timeDevice = timer.getTime();
+    bandwidth = 2 * datasetSize / loops / timer.getTime();
+    cout << "  GPU: bandwidth: " << bandwidth << " GB/sec, time: " << timer.getTime() << " sec." << endl;
+    cout << "  CPU/GPU speedup: " << timeHost / timeDevice << endl;
+
+    HostVector auxHostVector;
+    auxHostVector.setLike( deviceVector );
+    auxHostVector = deviceVector;
+    for( int i = 0; i < size; i++ )
+       if( hostVector.getElement( i ) != auxHostVector.getElement( i ) )
+       {
+          cerr << "Error in prefix sum at position " << i << ":  " << hostVector.getElement( i ) << " != " << auxHostVector.getElement( i ) << endl;
+       }
+    */
+
+    return true;
+}
+
+} // namespace benchmarks
+} // namespace tnl