diff --git a/src/Benchmarks/BLAS/cublasWrappers.h b/src/Benchmarks/BLAS/cublasWrappers.h
index 1e63e139d6faa513706ed1b18a207e87ea1a079d..f0d8952e633627d503cc310aa24ee51ef48dcc34 100644
--- a/src/Benchmarks/BLAS/cublasWrappers.h
+++ b/src/Benchmarks/BLAS/cublasWrappers.h
@@ -118,4 +118,29 @@ cublasGscal( cublasHandle_t handle, int n,
    return cublasDscal( handle, n, alpha, x, incx );
 }
 
+
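+// Type-generic GEMV wrappers: overload resolution dispatches to the single- or
+// double-precision cuBLAS routine, so templated benchmark code can call
+// cublasGemv uniformly without spelling out cublasSgemv/cublasDgemv.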
+inline cublasStatus_t
+cublasGemv( cublasHandle_t handle, cublasOperation_t trans,
+            int m, int n,
+            const float           *alpha,
+            const float           *A, int lda,
+            const float           *x, int incx,
+            const float           *beta,
+            float           *y, int incy )
+{
+   return cublasSgemv( handle, trans, m, n, alpha, A, lda, x, incx, beta, y, incy );
+}
+
+inline cublasStatus_t
+cublasGemv( cublasHandle_t handle, cublasOperation_t trans,
+            int m, int n,
+            const double          *alpha,
+            const double          *A, int lda,
+            const double          *x, int incx,
+            const double          *beta,
+            double          *y, int incy )
+{
+   return cublasDgemv( handle, trans, m, n, alpha, A, lda, x, incx, beta, y, incy );
+}
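+
+// Usage sketch (assuming a valid cuBLAS handle and device pointers A, x, y):
+//    Real alpha = 1.0, beta = 0.0;
+//    cublasGemv( handle, CUBLAS_OP_N, m, n, &alpha, A, lda, x, 1, &beta, y, 1 );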
+
 #endif
diff --git a/src/Benchmarks/BLAS/dense-mv.h b/src/Benchmarks/BLAS/dense-mv.h
new file mode 100644
index 0000000000000000000000000000000000000000..99978c55b85cc94dd96a10d30f428b969f5f358a
--- /dev/null
+++ b/src/Benchmarks/BLAS/dense-mv.h
@@ -0,0 +1,144 @@
+/***************************************************************************
+                          dense-mv.h  -  description
+                             -------------------
+    begin                : Jul 8, 2021
+    copyright            : (C) 2021 by Tomas Oberhuber et al.
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+// Implemented by: Jakub Klinkovsky
+
+#pragma once
+
+#include "../Benchmarks.h"
+#include "cublasWrappers.h"
+
+#include <TNL/Containers/Vector.h>
+#include <TNL/Pointers/DevicePointer.h>
+#include <TNL/Matrices/DenseMatrix.h>
+#include <TNL/Devices/Cuda.h>
+#include <TNL/Devices/Host.h>
+
+namespace TNL {
+namespace Benchmarks {
+
+template< typename Matrix >
+void setMatrix( Matrix& matrix )
+{
+   using RealType = typename Matrix::RealType;
+   using IndexType = typename Matrix::IndexType;
+   // The column index is passed twice so that the signature matches the
+   // sparse-matrix variant of forAllElements; every element is set to 1.
+   matrix.forAllElements( [] __cuda_callable__ ( IndexType rowIdx, IndexType columnIdx, IndexType columnIdx_, RealType& value ) {
+      value = 1.0;
+   } );
+}
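+
+// Note: with every matrix element equal to 1 and an all-ones input vector, each
+// entry of the product equals the matrix size, which makes the benchmark output
+// easy to sanity-check.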
+
+template< typename Real >
+void
+benchmarkDenseMVSynthetic( Benchmark<> & benchmark,
+                           const int & size )
+{
+   using HostMatrix = TNL::Matrices::DenseMatrix< Real, TNL::Devices::Host >;
+   using RowMajorCudaMatrix = TNL::Matrices::DenseMatrix< Real, TNL::Devices::Cuda, int, TNL::Algorithms::Segments::RowMajorOrder >;
+   using ColumnMajorCudaMatrix = TNL::Matrices::DenseMatrix< Real, TNL::Devices::Cuda >;
+   using HostVector = Containers::Vector< Real, Devices::Host, int >;
+   using CudaVector = Containers::Vector< Real, Devices::Cuda, int >;
+
+   HostMatrix hostMatrix;
+   RowMajorCudaMatrix rowMajorCudaMatrix;
+   ColumnMajorCudaMatrix columnMajorCudaMatrix;
+   HostVector inHostVector, outHostVector;
+   CudaVector inCudaVector, outCudaVector1, outCudaVector2;
+
+   // create benchmark group
+   const std::vector< String > parsedType = parseObjectType( getType< HostMatrix >() );
+#ifdef HAVE_CUDA
+   benchmark.createHorizontalGroup( parsedType[ 0 ], 2 );
+#else
+   benchmark.createHorizontalGroup( parsedType[ 0 ], 1 );
+#endif
+
+   hostMatrix.setDimensions( size, size );
+   inHostVector.setSize( size );
+   outHostVector.setSize( size );
+
+   setMatrix< HostMatrix >( hostMatrix );
+   // One pass over the data reads the dense matrix and the input vector and
+   // writes the output vector: size * size + 2 * size elements of type Real.
+   const double datasetSize = (double) ( size * size + 2 * size ) * sizeof( Real ) / oneGB;
+
+   // reset function
+   auto reset = [&]() {
+      inHostVector = 1.0;
+      outHostVector = 0.0;
+#ifdef HAVE_CUDA
+      inCudaVector = 1.0;
+      // the output vectors are fully overwritten by each product, so they do
+      // not need to be reset
+#endif
+   };
+
+   // compute functions
+   auto denseMvHost = [&]() {
+      hostMatrix.vectorProduct( inHostVector, outHostVector );
+   };
+   benchmark.setOperation( datasetSize );
+   benchmark.time< Devices::Host >( reset, "CPU", denseMvHost );
+
+#ifdef HAVE_CUDA
+   columnMajorCudaMatrix.setDimensions( size, size );
+   inCudaVector.setSize( size );
+   outCudaVector1.setSize( size );
+   outCudaVector2.setSize( size );
+   setMatrix< ColumnMajorCudaMatrix >( columnMajorCudaMatrix );
+
+   auto columnMajorMvCuda = [&]() {
+      columnMajorCudaMatrix.vectorProduct( inCudaVector, outCudaVector1 );
+   };
+   benchmark.time< Devices::Cuda >( reset, "GPU col", columnMajorMvCuda );
+
+   columnMajorCudaMatrix.reset();
+
+   rowMajorCudaMatrix.setDimensions( size, size );
+   setMatrix< RowMajorCudaMatrix >( rowMajorCudaMatrix );
+
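+   // the same product with row-major storage: throughput differs from the
+   // column-major case because the kernel's memory-access pattern depends on
+   // the element order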
+   auto rowMajorMvCuda = [&]() {
+      rowMajorCudaMatrix.vectorProduct( inCudaVector, outCudaVector2 );
+   };
+   benchmark.time< Devices::Cuda >( reset, "GPU row", rowMajorMvCuda );
+
+   // optional sanity check: the row- and column-major products should agree
+   //std::cerr << "Diff. = " << TNL::max( abs( outCudaVector2 - outCudaVector1 ) ) << std::endl;
+
+   rowMajorCudaMatrix.reset();
+   columnMajorCudaMatrix.setDimensions( size, size );
+   setMatrix< ColumnMajorCudaMatrix >( columnMajorCudaMatrix );
+
+   cublasHandle_t cublasHandle;
+   cublasCreate( &cublasHandle );
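+   // cuBLAS assumes column-major storage, so the column-major TNL matrix can be
+   // passed to gemv directly with CUBLAS_OP_N and lda equal to the number of rows.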
+   auto mvCublas = [&] () {
+      Real alpha = 1.0;
+      Real beta = 0.0;
+      cublasGemv( cublasHandle, CUBLAS_OP_N, size, size, &alpha,
+                  columnMajorCudaMatrix.getValues().getData(), size,
+                  inCudaVector.getData(), 1, &beta,
+                  outCudaVector1.getData(), 1 );
+   };
+   benchmark.time< Devices::Cuda >( reset, "GPU cublas", mvCublas );
+
+#endif
+}
+
+/*template< typename Real = double,
+          typename Index = int >
+void
+benchmarkDenseSynthetic( Benchmark<> & benchmark,
+                         const int & size )
+{
+   // TODO: benchmark all formats from tnl-benchmark-spmv (different parameters of the base formats)
+   // NOTE: CSR is disabled because it is very slow on GPU
+   //benchmarkSpMV< Real, SparseMatrixLegacy_CSR_Scalar >( benchmark, size, elementsPerRow );
+   benchmarkSpMV< Real, Benchmarks::SpMV::ReferenceFormats::Legacy::Ellpack >( benchmark, size, elementsPerRow );
+   benchmarkSpMV< Real, SlicedEllpack >( benchmark, size, elementsPerRow );
+   benchmarkSpMV< Real, Benchmarks::SpMV::ReferenceFormats::Legacy::ChunkedEllpack >( benchmark, size, elementsPerRow );
+}*/
+
+} // namespace Benchmarks
+} // namespace TNL
diff --git a/src/Benchmarks/BLAS/tnl-benchmark-blas.h b/src/Benchmarks/BLAS/tnl-benchmark-blas.h
index bc58aeb2c9e0d1210836c139727376a2c3422837..9b061adf65a752116bc75090d86af8509e4e764a 100644
--- a/src/Benchmarks/BLAS/tnl-benchmark-blas.h
+++ b/src/Benchmarks/BLAS/tnl-benchmark-blas.h
@@ -22,6 +22,8 @@
 #include "vector-operations.h"
 #include "triad.h"
 #include "spmv.h"
+#include "dense-mv.h"
+
 
 using namespace TNL;
 using namespace TNL::Benchmarks;
@@ -100,6 +102,18 @@ runBlasBenchmarks( Benchmark<> & benchmark,
       } ));
       benchmarkSpmvSynthetic< Real >( benchmark, size, elementsPerRow );
    }
+
+   // Dense matrix-vector multiplication
+   benchmark.newBenchmark( String("Dense matrix-vector multiplication (") + precision + ")",
+                           metadata );
+   for( std::size_t size = 10; size <= 20000; size *= 2 ) {
+      benchmark.setMetadataColumns( Benchmark<>::MetadataColumns({
+         { "rows", convertToString( size ) },
+         { "columns", convertToString( size ) }
+      } ));
+      benchmarkDenseMVSynthetic< Real >( benchmark, size );
+   }
+
 }
 
 void