diff --git a/tests/benchmarks/cublasWrappers.h b/tests/benchmarks/cublasWrappers.h
index 6369c0f12f34f9deb6ca2094f4255f9ab9142ed9..a7520a34d5d13ced796d8b2366b0382887f09a49 100644
--- a/tests/benchmarks/cublasWrappers.h
+++ b/tests/benchmarks/cublasWrappers.h
@@ -5,6 +5,70 @@
 
 #include <cublas_v2.h>
 
+// float overload of cublasIgamax: passes every argument through to cublasIsamax and returns its status.
+inline cublasStatus_t
+cublasIgamax( cublasHandle_t handle, int n, const float *x, int incx, int *result )
+{
+    return cublasIsamax( handle, n, x, incx, result );
+}
+
+// double overload of cublasIgamax: passes every argument through to cublasIdamax and returns its status.
+inline cublasStatus_t
+cublasIgamax( cublasHandle_t handle, int n, const double *x, int incx, int *result )
+{
+    return cublasIdamax( handle, n, x, incx, result );
+}
+
+
+// float overload of cublasIgamin: passes every argument through to cublasIsamin and returns its status.
+inline cublasStatus_t
+cublasIgamin( cublasHandle_t handle, int n, const float *x, int incx, int *result )
+{
+    return cublasIsamin( handle, n, x, incx, result );
+}
+
+// double overload of cublasIgamin: passes every argument through to cublasIdamin and returns its status.
+inline cublasStatus_t
+cublasIgamin( cublasHandle_t handle, int n, const double *x, int incx, int *result )
+{
+    return cublasIdamin( handle, n, x, incx, result );
+}
+
+
+// float overload of cublasGasum: passes every argument through to cublasSasum and returns its status.
+inline cublasStatus_t
+cublasGasum( cublasHandle_t handle, int n, const float *x, int incx, float *result )
+{
+    return cublasSasum( handle, n, x, incx, result );
+}
+
+// double overload of cublasGasum: passes every argument through to cublasDasum and returns its status.
+inline cublasStatus_t
+cublasGasum( cublasHandle_t handle, int n, const double *x, int incx, double *result )
+{
+    return cublasDasum( handle, n, x, incx, result );
+}
+
+
+// float overload of cublasGaxpy: hands every argument, unchanged and in the
+// same order, to cublasSaxpy and returns the status that call produces.
+inline cublasStatus_t
+cublasGaxpy( cublasHandle_t handle, int n, const float *alpha,
+             const float *x, int incx, float *y, int incy )
+{
+    return cublasSaxpy( handle, n, alpha, x, incx, y, incy );
+}
+
+// double overload of cublasGaxpy: hands every argument, unchanged and in the
+// same order, to cublasDaxpy and returns the status that call produces.
+inline cublasStatus_t
+cublasGaxpy( cublasHandle_t handle, int n, const double *alpha,
+             const double *x, int incx, double *y, int incy )
+{
+    return cublasDaxpy( handle, n, alpha, x, incx, y, incy );
+}
+
+
 inline cublasStatus_t
 cublasGdot( cublasHandle_t handle, int n,
             const float        *x, int incx,
@@ -23,5 +87,37 @@ cublasGdot( cublasHandle_t handle, int n,
     return cublasDdot( handle, n, x, incx, y, incy, result );
 }
 
+
+// float overload of cublasGnrm2: passes every argument through to cublasSnrm2 and returns its status.
+inline cublasStatus_t
+cublasGnrm2( cublasHandle_t handle, int n, const float *x, int incx, float *result )
+{
+    return cublasSnrm2( handle, n, x, incx, result );
+}
+
+// double overload of cublasGnrm2: passes every argument through to cublasDnrm2 and returns its status.
+inline cublasStatus_t
+cublasGnrm2( cublasHandle_t handle, int n, const double *x, int incx, double *result )
+{
+    return cublasDnrm2( handle, n, x, incx, result );
+}
+
+
+// float overload of cublasGscal: hands every argument, unchanged and in the
+// same order, to cublasSscal and returns the status that call produces.
+inline cublasStatus_t
+cublasGscal( cublasHandle_t handle, int n, const float *alpha, float *x, int incx )
+{
+    return cublasSscal( handle, n, alpha, x, incx );
+}
+
+// double overload of cublasGscal: hands every argument, unchanged and in the
+// same order, to cublasDscal and returns the status that call produces.
+inline cublasStatus_t
+cublasGscal( cublasHandle_t handle, int n, const double *alpha, double *x, int incx )
+{
+    return cublasDscal( handle, n, alpha, x, incx );
+}
+
 #endif
 #endif
diff --git a/tests/benchmarks/vector-operations.h b/tests/benchmarks/vector-operations.h
index 3876235c09163b7b79171d0ed8e465006de27e33..5d15af0ff13b2fb4d64f69e58b8fc6135cd7c9c6 100644
--- a/tests/benchmarks/vector-operations.h
+++ b/tests/benchmarks/vector-operations.h
@@ -71,10 +71,22 @@ benchmarkVectorOperations( Benchmark & benchmark,
     auto multiplyCuda = [&]() {
         deviceVector *= 0.5;
     };
+#ifdef HAVE_CUBLAS
+    // cuBLAS variant of the scalar multiplication: cublasGscal with factor 0.5 on deviceVector.
+    auto multiplyCublas = [&]() {
+        const Real factor = 0.5;
+        // increment 1; the status returned by cublasGscal is not inspected
+        cublasGscal( cublasHandle, size, &factor, deviceVector.getData(), 1 );
+    };
+#endif
     benchmark.setOperation( "scalar multiplication", 2 * datasetSize );
     benchmark.time( reset1,
                     "CPU", multiplyHost,
-                    "GPU", multiplyCuda );
+                    "GPU", multiplyCuda
+#ifdef HAVE_CUBLAS
+                  , "cuBLAS", multiplyCublas
+#endif
+                  );
 
 
     auto addVectorHost = [&]() {
@@ -83,10 +95,23 @@ benchmarkVectorOperations( Benchmark & benchmark,
     auto addVectorCuda = [&]() {
         deviceVector.addVector( deviceVector2 );
     };
+#ifdef HAVE_CUBLAS
+    // cuBLAS variant of the vector addition: cublasGaxpy with coefficient 1.0,
+    // x = deviceVector2 and y = deviceVector; the returned status is not inspected.
+    auto addVectorCublas = [&]() {
+        const Real one = 1.0;
+        cublasGaxpy( cublasHandle, size, &one,
+                     deviceVector2.getData(), 1, deviceVector.getData(), 1 );
+    };
+#endif
     benchmark.setOperation( "vector addition", 3 * datasetSize );
     benchmark.time( reset1,
                     "CPU", addVectorHost,
-                    "GPU", addVectorCuda );
+                    "GPU", addVectorCuda
+#ifdef HAVE_CUBLAS
+                  , "cuBLAS", addVectorCublas
+#endif
+                  );
 
 
     auto maxHost = [&]() {
@@ -119,10 +144,23 @@ benchmarkVectorOperations( Benchmark & benchmark,
     auto absMaxCuda = [&]() {
         resultDevice = deviceVector.absMax();
     };
+#ifdef HAVE_CUBLAS
+    // cuBLAS variant of absMax. cublasI<t>amax reports a 1-based position (Fortran
+    // convention), so it is shifted by one before it is used as an element index.
+    auto absMaxCublas = [&]() {
+        int index = 0;
+        cublasIgamax( cublasHandle, size, deviceVector.getData(), 1, &index );
+        resultDevice = deviceVector.getElement( index - 1 );  // NOTE(review): sign is kept - confirm this matches absMax()
+    };
+#endif
     benchmark.setOperation( "absMax", datasetSize );
     benchmark.time( reset1,
                     "CPU", absMaxHost,
-                    "GPU", absMaxCuda );
+                    "GPU", absMaxCuda
+#ifdef HAVE_CUBLAS
+                  , "cuBLAS", absMaxCublas
+#endif
+                  );
 
 
     auto absMinHost = [&]() {
@@ -131,10 +169,23 @@ benchmarkVectorOperations( Benchmark & benchmark,
     auto absMinCuda = [&]() {
         resultDevice = deviceVector.absMin();
     };
+#ifdef HAVE_CUBLAS
+    // cuBLAS variant of absMin. cublasI<t>amin reports a 1-based position (Fortran
+    // convention), so it is shifted by one before it is used as an element index.
+    auto absMinCublas = [&]() {
+        int index = 0;
+        cublasIgamin( cublasHandle, size, deviceVector.getData(), 1, &index );
+        resultDevice = deviceVector.getElement( index - 1 );  // NOTE(review): sign is kept - confirm this matches absMin()
+    };
+#endif
     benchmark.setOperation( "absMin", datasetSize );
     benchmark.time( reset1,
                     "CPU", absMinHost,
-                    "GPU", absMinCuda );
+                    "GPU", absMinCuda
+#ifdef HAVE_CUBLAS
+                  , "cuBLAS", absMinCublas
+#endif
+                  );
 
 
     auto sumHost = [&]() {
@@ -155,10 +206,21 @@ benchmarkVectorOperations( Benchmark & benchmark,
     auto l1normCuda = [&]() {
         resultDevice = deviceVector.lpNorm( 1.0 );
     };
+#ifdef HAVE_CUBLAS
+    // cuBLAS variant of the l1 norm: cublasGasum over deviceVector, result stored in resultDevice.
+    auto l1normCublas = [&]() {
+        auto * const sumOut = &resultDevice;
+        cublasGasum( cublasHandle, size, deviceVector.getData(), 1, sumOut );
+    };
+#endif
     benchmark.setOperation( "l1 norm", datasetSize );
     benchmark.time( reset1,
                     "CPU", l1normHost,
-                    "GPU", l1normCuda );
+                    "GPU", l1normCuda
+#ifdef HAVE_CUBLAS
+                  , "cuBLAS", l1normCublas
+#endif
+                  );
 
 
     auto l2normHost = [&]() {
@@ -167,10 +229,21 @@ benchmarkVectorOperations( Benchmark & benchmark,
     auto l2normCuda = [&]() {
         resultDevice = deviceVector.lpNorm( 2.0 );
     };
+#ifdef HAVE_CUBLAS
+    // cuBLAS variant of the l2 norm: cublasGnrm2 over deviceVector, result stored in resultDevice.
+    auto l2normCublas = [&]() {
+        auto * const normOut = &resultDevice;
+        cublasGnrm2( cublasHandle, size, deviceVector.getData(), 1, normOut );
+    };
+#endif
     benchmark.setOperation( "l2 norm", datasetSize );
     benchmark.time( reset1,
                     "CPU", l2normHost,
-                    "GPU", l2normCuda );
+                    "GPU", l2normCuda
+#ifdef HAVE_CUBLAS
+                  , "cuBLAS", l2normCublas
+#endif
+                  );
 
 
     auto l3normHost = [&]() {