diff --git a/src/Benchmarks/SpMV/OldSpMV/tnl-benchmark-old-spmv.cpp b/src/Benchmarks/SpMV/OldSpMV/tnl-benchmark-old-spmv.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..c9cd17cda0312c07bed4bcaa92c4ef4273704b35
--- /dev/null
+++ b/src/Benchmarks/SpMV/OldSpMV/tnl-benchmark-old-spmv.cpp
@@ -0,0 +1,14 @@
+                          tnl-benchmark-old-spmv.cpp  -  description
+                             -------------------
+    begin                : Jun 5, 2014
+    copyright            : (C) 2014 by Tomas Oberhuber
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+/* See Copyright Notice in tnl/Copyright */
+#include "tnl-benchmark-old-spmv.h"
diff --git a/src/Benchmarks/SpMV/OldSpMV/tnl-benchmark-old-spmv.cu b/src/Benchmarks/SpMV/OldSpMV/tnl-benchmark-old-spmv.cu
new file mode 100644
index 0000000000000000000000000000000000000000..433af970b6058e1ae03f480296da566a3cbb79b5
--- /dev/null
+++ b/src/Benchmarks/SpMV/OldSpMV/tnl-benchmark-old-spmv.cu
@@ -0,0 +1,12 @@
+                          tnl-benchmark-old-spmv.cu  -  description
+                             -------------------
+    begin                : Jun 5, 2014
+    copyright            : (C) 2014 by Tomas Oberhuber
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+/* See Copyright Notice in tnl/Copyright */
+#include "tnl-benchmark-old-spmv.h"
diff --git a/src/Benchmarks/SpMV/OldSpMV/tnl-benchmark-old-spmv.h b/src/Benchmarks/SpMV/OldSpMV/tnl-benchmark-old-spmv.h
new file mode 100644
index 0000000000000000000000000000000000000000..455c7d412f4f8ae4cc4af7bbd15ba0e47dda978a
--- /dev/null
+++ b/src/Benchmarks/SpMV/OldSpMV/tnl-benchmark-old-spmv.h
@@ -0,0 +1,925 @@
+                          tnl-benchmark-old-spmv.h  -  description
+                             -------------------
+    begin                : Jun 5, 2014
+    copyright            : (C) 2014 by Tomas Oberhuber
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+/* See Copyright Notice in tnl/Copyright */
+#pragma once
+#include <fstream>
+#include <iomanip>
+#include <unistd.h>
+#ifdef HAVE_CUDA
+#include <cusparse.h>
+#include <TNL/Config/ConfigDescription.h>
+#include <TNL/Config/ParameterContainer.h>
+#include <TNL/Matrices/CSR.h>
+#include <TNL/Matrices/AdEllpack.h>
+#include <TNL/Matrices/BiEllpack.h>
+#include <TNL/Matrices/BiEllpackSymmetric.h>
+#include <TNL/Matrices/Ellpack.h>
+#include <TNL/Matrices/EllpackSymmetric.h>
+#include <TNL/Matrices/EllpackSymmetricGraph.h>
+#include <TNL/Matrices/SlicedEllpack.h>
+#include <TNL/Matrices/SlicedEllpackSymmetric.h>
+#include <TNL/Matrices/SlicedEllpackSymmetricGraph.h>
+#include <TNL/Matrices/ChunkedEllpack.h>
+#include <TNL/Matrices/MatrixReader.h>
+#include <TNL/Timer.h>
+#include "tnlCusparseCSRMatrix.h"
+using namespace std;
+using namespace TNL;
+using namespace TNL::Matrices;
+void setupConfig( Config::ConfigDescription& config ) // Declares the configuration entries this benchmark reads: test, input-file, log-file, precision, stop-time, verbose.
+   config.addDelimiter                            ( "General settings:" );
+   config.addRequiredEntry< String >( "test" , "Test to be performed." ); // required entry; the two addEntryEnum calls that follow list "mtx" and "tnl" (presumably its permitted values - confirm against ConfigDescription)
+      config.addEntryEnum< String >( "mtx" );
+      config.addEntryEnum< String >( "tnl" );
+   config.addRequiredEntry< String >( "input-file" , "Input file name." ); // required entry
+   config.addEntry< String >( "log-file", "Log file name.", "tnl-benchmark-spmv.log"); // third argument is presumably the default value - confirm against ConfigDescription
+   config.addEntry< String >( "precision", "Precision of the arithmetics.", "double" );
+   config.addEntry< double >( "stop-time", "Seconds to iterate the SpMV operation.", 1.0 ); // read back as stopTime in setupBenchmark
+   config.addEntry< int >( "verbose", "Verbose mode.", 1 );
+bool initLogFile( std::fstream& logFile, const String& fileName ) // Opens logFile on fileName: when access() fails the file is created and a '#'-prefixed column legend is written, otherwise it is opened for appending. Returns false when the stream cannot be opened.
+   if( access( fileName.getString(), F_OK ) == -1 ) // access() returned -1 for F_OK: treat the log as new and write the legend
+   {
+      logFile.open( fileName.getString(), std::ios::out );
+      if( ! logFile )
+         return false;
+      const String fillingColoring = " : COLORING 0 #FFF8DC 20 #FFFF00 40 #FFD700 60 #FF8C00 80 #FF0000 100"; // every colour token is a 6-digit hex RGB value
+      const String speedupColoring = " : COLORING #0099FF 1 #FFFFFF 2 #00FF99 4 #33FF99 8 #33FF22 16 #FF9900";
+      const String paddingColoring = " : COLORING #FFFFFF 1 #FFFFCC 10 #FFFF99 100 #FFFF66 1000 #FFFF33 10000 #FFFF00";
+      logFile << "#Matrix file " << std::endl; // legend lines: leading spaces after '#' express the nesting of the columns
+      logFile << "#Rows" << std::endl;
+      logFile << "#Columns" << std::endl;
+      logFile << "#Non-zero elements" << std::endl;
+      logFile << "#Filling (in %)" << fillingColoring << std::endl;
+      logFile << "#CSR Format" << std::endl;
+      logFile << "# CPU" << std::endl;
+      logFile << "#  Gflops" << std::endl;
+      logFile << "#  Throughput" << std::endl;
+      logFile << "#  Speedup" << speedupColoring << std::endl;
+#ifdef HAVE_CUDA
+      logFile << "# Cusparse CSR" << std::endl;
+      logFile << "#  Gflops" << std::endl;
+      logFile << "#  Throughput" << std::endl;
+      logFile << "#  Speedup" << speedupColoring << " SORT - cusparse-csr-speedup.txt" << std::endl;
+      logFile << "# CUDA" << std::endl;
+      logFile << "#  Scalar" << std::endl;
+      logFile << "#   Gflops" << std::endl;
+      logFile << "#   Throughput" << std::endl;
+      logFile << "#   Speedup" << speedupColoring << " SORT - csr-scalar-cuda-speedup.txt" << std::endl;
+      logFile << "#  Vector" << std::endl;
+      logFile << "#   Warp Size 1" << std::endl;
+      logFile << "#    Gflops" << std::endl;
+      logFile << "#    Throughput" << std::endl;
+      logFile << "#    Speedup" << speedupColoring << " SORT - csr-vector-1-cuda-speedup.txt" << std::endl;
+      logFile << "#   Warp Size 2" << std::endl;
+      logFile << "#    Gflops" << std::endl;
+      logFile << "#    Throughput" << std::endl;
+      logFile << "#    Speedup" << speedupColoring << " SORT - csr-vector-2-cuda-speedup.txt" << std::endl;
+      logFile << "#   Warp Size 4" << std::endl;
+      logFile << "#    Gflops" << std::endl;
+      logFile << "#    Throughput" << std::endl;
+      logFile << "#    Speedup" << speedupColoring << " SORT - csr-vector-4-cuda-speedup.txt" << std::endl;
+      logFile << "#   Warp Size 8" << std::endl;
+      logFile << "#    Gflops" << std::endl;
+      logFile << "#    Throughput" << std::endl;
+      logFile << "#    Speedup" << speedupColoring << " SORT - csr-vector-8-cuda-speedup.txt" << std::endl;
+      logFile << "#   Warp Size 16" << std::endl;
+      logFile << "#    Gflops" << std::endl;
+      logFile << "#    Throughput" << std::endl;
+      logFile << "#    Speedup" << speedupColoring << " SORT - csr-vector-16-cuda-speedup.txt" << std::endl;
+      logFile << "#   Warp Size 32" << std::endl;
+      logFile << "#    Gflops" << std::endl;
+      logFile << "#    Throughput" << std::endl;
+      logFile << "#    Speedup" << speedupColoring << " SORT - csr-vector-32-cuda-speedup.txt" << std::endl;
+      logFile << "#  Hybrid" << std::endl;
+      logFile << "#   Split 2" << std::endl;
+      logFile << "#    Gflops" << std::endl;
+      logFile << "#    Throughput" << std::endl;
+      logFile << "#    Speedup" << speedupColoring << " SORT - csr-hybrid-2-cuda-speedup.txt" << std::endl;
+      logFile << "#   Split 4" << std::endl;
+      logFile << "#    Gflops" << std::endl;
+      logFile << "#    Throughput" << std::endl;
+      logFile << "#    Speedup" << speedupColoring << " SORT - csr-hybrid-4-cuda-speedup.txt" << std::endl;
+      logFile << "#   Split 8" << std::endl;
+      logFile << "#    Gflops" << std::endl;
+      logFile << "#    Throughput" << std::endl;
+      logFile << "#    Speedup" << speedupColoring << " SORT - csr-hybrid-8-cuda-speedup.txt" << std::endl;
+      logFile << "#   Split 16" << std::endl;
+      logFile << "#    Gflops" << std::endl;
+      logFile << "#    Throughput" << std::endl;
+      logFile << "#    Speedup" << speedupColoring << " SORT - csr-hybrid-16-cuda-speedup.txt" << std::endl;
+      logFile << "#   Split 32" << std::endl;
+      logFile << "#    Gflops" << std::endl;
+      logFile << "#    Throughput" << std::endl;
+      logFile << "#    Speedup" << speedupColoring << " SORT - csr-hybrid-32-cuda-speedup.txt" << std::endl;
+      logFile << "#   Split 64" << std::endl;
+      logFile << "#    Gflops" << std::endl;
+      logFile << "#    Throughput" << std::endl;
+      logFile << "#    Speedup" << speedupColoring << " SORT - csr-hybrid-64-cuda-speedup.txt" << std::endl;
+      logFile << "#Ellpack Format" << std::endl;
+      logFile << "# Padding (in %)" << paddingColoring << std::endl;
+      logFile << "# CPU" << std::endl;
+      logFile << "#  Gflops" << std::endl;
+      logFile << "#  Throughput" << std::endl;
+      logFile << "#  Speedup" << speedupColoring << " SORT - ellpack-host-speedup.txt" << std::endl;
+#ifdef HAVE_CUDA
+      logFile << "# CUDA" << std::endl;
+      logFile << "#  Gflops" << std::endl;
+      logFile << "#  Throughput" << std::endl;
+      logFile << "#  Speedup" << speedupColoring << " SORT - ellpack-cuda-speedup.txt" << std::endl;
+      logFile << "#SlicedEllpack Format" << std::endl;
+      logFile << "# Padding (in %)" << paddingColoring << std::endl;
+      logFile << "# CPU" << std::endl;
+      logFile << "#  Gflops" << std::endl;
+      logFile << "#  Throughput" << std::endl;
+      logFile << "#  Speedup" << speedupColoring << " SORT - sliced-ellpack-host-speedup.txt" << std::endl;
+#ifdef HAVE_CUDA
+      logFile << "# CUDA" << std::endl;
+      logFile << "#  Gflops" << std::endl;
+      logFile << "#  Throughput" << std::endl;
+      logFile << "#  Speedup" << speedupColoring << " SORT - sliced-ellpack-cuda-speedup.txt" << std::endl;
+      logFile << "#ChunkedEllpack Format" << std::endl;
+      logFile << "# Padding (in %)" << paddingColoring << std::endl;
+      logFile << "# CPU" << std::endl;
+      logFile << "#  Gflops" << std::endl;
+      logFile << "#  Throughput" << std::endl;
+      logFile << "#  Speedup" << speedupColoring << " SORT - chunked-ellpack-host-speedup.txt" << std::endl;
+#ifdef HAVE_CUDA
+      logFile << "# CUDA" << std::endl;
+      logFile << "#  Gflops" << std::endl;
+      logFile << "#  Throughput" << std::endl;
+      logFile << "#  Speedup" << speedupColoring << " SORT - chunked-ellpack-cuda-speedup.txt" << std::endl;
+      return true; // NOTE(review): the stream state is not re-checked after the legend has been written
+   }
+   logFile.open( fileName.getString(), std::ios::out | std::ios::app ); // access() succeeded: append to the existing log, no legend is written
+   //logFile << std::setprecision( 2 );
+   if( ! logFile )
+      return false;
+   return true;
+template< typename Matrix > // Matrix must provide getRows(), getColumns(), getNumberOfNonzeroMatrixElements() and getNumberOfMatrixElements()
+void printMatrixInfo( const String& inputFileName, // not referenced by the statements below
+                      const Matrix& matrix,
+                      std::ostream& str ) // Writes one summary line (rows, columns, nonzero count, filling in %) to str, followed by the column headings of the result table.
+   str << " Rows: " << std::setw( 8 ) << matrix.getRows();
+   str << " Columns: " << std::setw( 8 ) << matrix.getColumns();
+   str << " Nonzero Elements: " << std::setw( 10 ) << matrix.getNumberOfNonzeroMatrixElements();
+   const double fillingRatio = ( double ) matrix.getNumberOfNonzeroMatrixElements() / ( double ) matrix.getNumberOfMatrixElements(); // nonzero count divided by getNumberOfMatrixElements()
+   str << " Filling: " << std::setw( 5 ) << 100.0 * fillingRatio << "%" << std::endl;
+   str << std::setw( 25 ) << "Format" // heading widths (25, then 15 each) are the same ones benchmarkMatrix uses for its result rows
+       << std::setw( 15 ) << "Padding"
+       << std::setw( 15 ) << "Time"
+       << std::setw( 15 ) << "GFLOPS"
+       << std::setw( 15 ) << "Throughput"
+       << std::setw( 15 ) << "Speedup" << std::endl;
+template< typename Matrix > // Matrix must provide getRows(), getColumns(), getNumberOfNonzeroMatrixElements() and getNumberOfMatrixElements()
+bool writeMatrixInfo( const String& inputFileName,
+                      const Matrix& matrix,
+                      std::ostream& logFile ) // Starts a new log record: an empty line, the input file name, then rows, columns, nonzero count and filling in %, one value per line. Returns whether logFile is still good() afterwards.
+   logFile << std::endl; // empty line before the record
+   logFile << inputFileName << std::endl;
+   logFile << " " << matrix.getRows() << std::endl; // each value is prefixed with a single space
+   logFile << " " << matrix.getColumns() << std::endl;
+   logFile << " " << matrix.getNumberOfNonzeroMatrixElements() << std::endl;
+   const double fillingRatio = ( double ) matrix.getNumberOfNonzeroMatrixElements() / ( double ) matrix.getNumberOfMatrixElements(); // nonzero count divided by getNumberOfMatrixElements()
+   logFile << " " << 100.0 * fillingRatio << std::endl;
+   logFile << std::flush;
+   if( ! logFile.good() ) // report a failed write to the caller
+      return false;
+   return true;
+double computeGflops( const long int nonzeroElements, // element count used as the per-iteration work size
+                      const int iterations, // number of repetitions that were timed
+                      const double& time ) // elapsed time for all iterations (presumably seconds - confirm against Timer); returns 2 * iterations * nonzeroElements / time scaled by 1e-9
+   return ( double ) ( 2 * iterations * nonzeroElements ) / time * 1.0e-9; // NOTE(review): the product is evaluated in integer arithmetic before the cast to double, and time == 0 is not guarded
+template< typename Real > // Real only contributes sizeof( Real ) to the result
+double computeThroughput( const long int nonzeroElements,
+                          const int iterations,
+                          const int rows,
+                          const double& time ) // Returns ( 2 * nonzeroElements + rows ) * iterations * sizeof( Real ) / time scaled by 1e-9 (presumably GB/s when time is in seconds - confirm).
+   return ( double ) ( ( 2 * nonzeroElements + rows ) * iterations ) * sizeof( Real ) / time * 1.0e-9; // NOTE(review): the element count is computed in integer arithmetic before the cast; time == 0 is not guarded
+template< typename Matrix, // must provide vectorProduct( x, b ), getRows(), getNumberOfMatrixElements() and the nested types DeviceType and RealType
+          typename Vector >
+double benchmarkMatrix( const Matrix& matrix, // Repeats matrix.vectorProduct( x, b ) until the measured time reaches stopTime, reports the results and returns the measured gflops.
+                        const Vector& x, // input vector of the product
+                        Vector& b, // output vector, overwritten by every iteration
+                        const long int nonzeroElements, // work size passed on to computeGflops / computeThroughput
+                        const char* format, // label printed in the first column of the console row
+                        const double& stopTime, // the loop runs while the measured time is below this value
+                        const double& baseline, // gflops of the reference run; a zero value makes the speedup "N/A"
+                        int verbose, // non-zero: also print a formatted row to std::cout
+                        std::fstream& logFile ) // receives gflops, throughput and speedup, one per line
+   Timer timer;
+   timer.start();
+   double time( 0.0 );
+   int iterations( 0 );
+   while( time < stopTime ) // always performs at least one product when stopTime > 0
+   {
+      matrix.vectorProduct( x, b );
+#ifdef HAVE_CUDA
+      if( std::is_same< typename Matrix::DeviceType, Devices::Cuda >::value ) // wait for the device before reading the timer
+         cudaDeviceSynchronize();
+      time = timer.getRealTime(); // timer is never stopped; the elapsed value is sampled after each iteration
+      iterations++;
+   }
+   const double gflops = computeGflops( nonzeroElements, iterations, time );
+   const double throughput = computeThroughput< typename Matrix::RealType >( nonzeroElements, iterations, matrix.getRows(), time );
+   const long int allocatedElements = matrix.getNumberOfMatrixElements();
+   const double padding = ( double ) allocatedElements / ( double ) nonzeroElements * 100.0 - 100.0; // percentage by which getNumberOfMatrixElements() exceeds nonzeroElements
+   if( verbose )
+   {
+     std::cout << std::setw( 25 ) << format // column widths match the headings printed by printMatrixInfo
+           << std::setw( 15 ) << padding
+           << std::setw( 15 ) << time
+           << std::setw( 15 ) << gflops
+           << std::setw( 15 ) << throughput;
+      if( baseline ) // speedup is only defined against a non-zero baseline
+        std::cout << std::setw( 15 ) << gflops / baseline << std::endl;
+      else
+        std::cout << std::setw( 15 ) << "N/A" << std::endl;
+   }
+   logFile << "  " << gflops << std::endl; // log record: gflops, throughput, speedup - one value per line
+   logFile << "  " << throughput << std::endl;
+   if( baseline )
+      logFile << gflops / baseline << std::endl; // NOTE(review): unlike the two lines above, the speedup is written without the two-space prefix
+   else
+      logFile << "N/A" << std::endl;
+   return gflops; // callers use the host CSR result as the baseline for later runs
+void writeTestFailed( std::fstream& logFile, // log stream that receives the placeholder lines
+                      int repeat ) // number of "N/A" lines to write; nothing is written when repeat <= 0
+   for( int i = 0; i < repeat; i++ ) // one "N/A" line per log value that the failed test would have produced
+      logFile << "N/A" << std::endl;
+template< typename Real >
+bool setupBenchmark( const Config::ParameterContainer& parameters )
+   const String& test = parameters.getParameter< String >( "test" );
+   const String& inputFileName = parameters.getParameter< String >( "input-file" );
+   const String& logFileName = parameters.getParameter< String >( "log-file" );
+   const int verbose = parameters.getParameter< int >( "verbose" );
+   const double stopTime = parameters.getParameter< double >( "stop-time" );
+   std::fstream logFile;
+   if( ! initLogFile( logFile, logFileName ) )
+   {
+      std::cerr << "I am not able to open the file " << logFileName << "." << std::endl;
+      return false;
+   }
+   if( test == "mtx" )
+   {
+      typedef Matrices::CSR< Real, Devices::Host, int > CSRType;
+      CSRType csrMatrix;
+      try
+      {
+         if( ! MatrixReader< CSRType >::readMtxFile( inputFileName, csrMatrix ) )
+         {
+            std::cerr << "I am not able to read the matrix file " << inputFileName << "." << std::endl;
+            logFile << std::endl;
+            logFile << inputFileName << std::endl;
+            logFile << "Benchmark failed: Unable to read the matrix." << std::endl;
+            return false;
+         }
+      }
+      catch( std::bad_alloc )
+      {
+         std::cerr << "Not enough memory to read the matrix." << std::endl;
+         logFile << std::endl;
+         logFile << inputFileName << std::endl;
+         logFile << "Benchmark failed: Not enough memory." << std::endl;
+         return false;
+      }
+      if( verbose )
+         printMatrixInfo( inputFileName, csrMatrix,std::cout );
+      if( ! writeMatrixInfo( inputFileName, csrMatrix, logFile ) )
+      {
+         std::cerr << "I am not able to write new matrix to the log file." << std::endl;
+         return false;
+      }
+      const int rows = csrMatrix.getRows();
+      const long int nonzeroElements = csrMatrix.getNumberOfMatrixElements();
+      Containers::Vector< int, Devices::Host, int > rowLengthsHost;
+      rowLengthsHost.setSize( rows );
+      for( int row = 0; row < rows; row++ )
+         rowLengthsHost[ row ] = csrMatrix.getRowLength( row );
+      typedef Containers::Vector< Real, Devices::Host, int > HostVector;
+      HostVector hostX, hostB;
+      hostX.setSize( csrMatrix.getColumns() );
+      hostX.setValue( 1.0 );
+      hostB.setSize( csrMatrix.getRows() );
+#ifdef HAVE_CUDA
+      typedef Containers::Vector< Real, Devices::Cuda, int > CudaVector;
+      CudaVector cudaX, cudaB;
+      Containers::Vector< int, Devices::Cuda, int > rowLengthsCuda;
+      cudaX.setSize( csrMatrix.getColumns() );
+      cudaX.setValue( 1.0 );
+      cudaB.setSize( csrMatrix.getRows() );
+      rowLengthsCuda.setSize( csrMatrix.getRows() );
+      rowLengthsCuda = rowLengthsHost;
+      cusparseHandle_t cusparseHandle;
+      cusparseCreate( &cusparseHandle );
+      const double baseline = benchmarkMatrix( csrMatrix,
+                                               hostX,
+                                               hostB,
+                                               nonzeroElements,
+                                               "CSR Host",
+                                               stopTime,
+                                               0.0,
+                                               verbose,
+                                               logFile );
+#ifdef HAVE_CUDA
+      typedef CSR< Real, Devices::Cuda, int > CSRCudaType;
+      CSRCudaType cudaCSR;
+      //cout << "Copying matrix to GPU... ";
+      cudaCSR = csrMatrix;
+      TNL::CusparseCSR< Real > cusparseCSR;
+      cusparseCSR.init( cudaCSR, &cusparseHandle );
+      benchmarkMatrix( cusparseCSR,
+                       cudaX,
+                       cudaB,
+                       nonzeroElements,
+                       "Cusparse CSR",
+                       stopTime,
+                       baseline,
+                       verbose,
+                       logFile );
+      cusparseDestroy( cusparseHandle );
+      std::cout << " done.   \r";
+      /*cudaCSR.setCudaKernelType( CSRCudaType::scalar );
+      benchmarkMatrix( cudaCSR,
+                       cudaX,
+                       cudaB,
+                       nonzeroElements,
+                       "CSR Cuda Scalar",
+                       stopTime,
+                       baseline,
+                       verbose,
+                       logFile );
+      cudaCSR.setCudaKernelType( CSRCudaType::vector );
+      cudaCSR.setCudaWarpSize( 1 );
+      benchmarkMatrix( cudaCSR,
+                       cudaX,
+                       cudaB,
+                       nonzeroElements,
+                       "CSR Cuda Vector 1",
+                       stopTime,
+                       baseline,
+                       verbose,
+                       logFile );
+      cudaCSR.setCudaWarpSize( 2 );
+      benchmarkMatrix( cudaCSR,
+                       cudaX,
+                       cudaB,
+                       nonzeroElements,
+                       "CSR Cuda Vector 2",
+                       stopTime,
+                       baseline,
+                       verbose,
+                       logFile );
+      cudaCSR.setCudaWarpSize( 4 );
+      benchmarkMatrix( cudaCSR,
+                       cudaX,
+                       cudaB,
+                       nonzeroElements,
+                       "CSR Cuda Vector 4",
+                       stopTime,
+                       baseline,
+                       verbose,
+                       logFile );
+      cudaCSR.setCudaWarpSize( 8 );
+      benchmarkMatrix( cudaCSR,
+                       cudaX,
+                       cudaB,
+                       nonzeroElements,
+                       "CSR Cuda Vector 8",
+                       stopTime,
+                       baseline,
+                       verbose,
+                       logFile );
+      cudaCSR.setCudaWarpSize( 16 );
+      benchmarkMatrix( cudaCSR,
+                       cudaX,
+                       cudaB,
+                       nonzeroElements,
+                       "CSR Cuda Vector 16",
+                       stopTime,
+                       baseline,
+                       verbose,
+                       logFile );
+      cudaCSR.setCudaWarpSize( 32 );
+      benchmarkMatrix( cudaCSR,
+                       cudaX,
+                       cudaB,
+                       nonzeroElements,
+                       "CSR Cuda Vector 32",
+                       stopTime,
+                       baseline,
+                       verbose,
+                       logFile );
+      cudaCSR.setCudaKernelType( CSRCudaType::hybrid );
+      cudaCSR.setHybridModeSplit( 2 );
+      benchmarkMatrix( cudaCSR,
+                       cudaX,
+                       cudaB,
+                       nonzeroElements,
+                       "CSR Cuda Hyrbid 2",
+                       stopTime,
+                       baseline,
+                       verbose,
+                       logFile );
+      cudaCSR.setHybridModeSplit( 4 );
+      benchmarkMatrix( cudaCSR,
+                       cudaX,
+                       cudaB,
+                       nonzeroElements,
+                       "CSR Cuda Hyrbid 4",
+                       stopTime,
+                       baseline,
+                       verbose,
+                       logFile );
+      cudaCSR.setHybridModeSplit( 8 );
+      benchmarkMatrix( cudaCSR,
+                       cudaX,
+                       cudaB,
+                       nonzeroElements,
+                       "CSR Cuda Hyrbid 8",
+                       stopTime,
+                       baseline,
+                       verbose,
+                       logFile );
+      cudaCSR.setHybridModeSplit( 16 );
+      benchmarkMatrix( cudaCSR,
+                       cudaX,
+                       cudaB,
+                       nonzeroElements,
+                       "CSR Cuda Hyrbid 16",
+                       stopTime,
+                       baseline,
+                       verbose,
+                       logFile );
+      cudaCSR.setHybridModeSplit( 32 );
+      benchmarkMatrix( cudaCSR,
+                       cudaX,
+                       cudaB,
+                       nonzeroElements,
+                       "CSR Cuda Hyrbid 32",
+                       stopTime,
+                       baseline,
+                       verbose,
+                       logFile );
+      cudaCSR.setHybridModeSplit( 64 );
+      benchmarkMatrix( cudaCSR,
+                       cudaX,
+                       cudaB,
+                       nonzeroElements,
+                       "CSR Cuda Hyrbid 64",
+                       stopTime,
+                       baseline,
+                       verbose,
+                       logFile );*/
+      cudaCSR.reset();
+      long int allocatedElements;
+      double padding;
+      typedef Ellpack< Real, Devices::Host, int > EllpackType;
+      EllpackType ellpackMatrix;
+      Matrices::copySparseMatrix( ellpackMatrix, csrMatrix );
+      allocatedElements = ellpackMatrix.getNumberOfMatrixElements();
+      padding = ( double ) allocatedElements / ( double ) nonzeroElements * 100.0 - 100.0;
+      logFile << "    " << padding << std::endl;
+      benchmarkMatrix( ellpackMatrix,
+                       hostX,
+                       hostB,
+                       nonzeroElements,
+                       "Ellpack Host",
+                       stopTime,
+                       baseline,
+                       verbose,
+                       logFile );
+#ifdef HAVE_CUDA
+      typedef Ellpack< Real, Devices::Cuda, int > EllpackCudaType;
+      EllpackCudaType cudaEllpack;
+      std::cout << "Copying matrix to GPU... ";
+      cudaEllpack = ellpackMatrix;
+      std::cout << " done.   \r";
+      benchmarkMatrix( cudaEllpack,
+                       cudaX,
+                       cudaB,
+                       nonzeroElements,
+                       "Ellpack Cuda",
+                       stopTime,
+                       baseline,
+                       verbose,
+                       logFile );
+      cudaEllpack.reset();
+      ellpackMatrix.reset();
+      typedef Matrices::EllpackSymmetric< Real, Devices::Host, int > EllpackSymmetricType;
+      EllpackSymmetricType EllpackSymmetric;
+      if( ! MatrixReader< EllpackSymmetricType >::readMtxFile( inputFileName, EllpackSymmetric, verbose, true ) )
+         writeTestFailed( logFile, 7 );
+      else
+      {
+         allocatedElements = EllpackSymmetric.getNumberOfMatrixElements();
+         padding = ( double ) allocatedElements / ( double ) nonzeroElements * 100.0 - 100.0;
+         logFile << "    " << padding <<std::endl;
+         benchmarkMatrix( EllpackSymmetric,
+                          hostX,
+                          hostB,
+                          nonzeroElements,
+                          "EllpackSym Host",
+                          stopTime,
+                          baseline,
+                          verbose,
+                          logFile );
+         EllpackSymmetric.reset();
+#ifdef HAVE_CUDA
+         typedef Matrices::EllpackSymmetric< Real, Devices::Cuda, int > EllpackSymmetricCudaType;
+         EllpackSymmetricCudaType cudaEllpackSymmetric;
+        std::cout << "Copying matrix to GPU... ";
+         for( int i = 0; i < rowLengthsHost.getSize(); i++ )
+             rowLengthsHost[ i ] = EllpackSymmetric.getRowLength( i );
+         rowLengthsCuda = rowLengthsHost;
+         // TODO: fix this
+         //if( ! cudaEllpackSymmetric.copyFrom( EllpackSymmetric, rowLengthsCuda ) )
+         {
+           std::cerr << "I am not able to transfer the matrix on GPU." <<std::endl;
+            writeTestFailed( logFile, 3 );
+         }
+         //else
+         {
+           std::cout << " done.   \r";
+            benchmarkMatrix( cudaEllpackSymmetric,
+                             cudaX,
+                             cudaB,
+                             nonzeroElements,
+                             "EllpackSym Cuda",
+                             stopTime,
+                             baseline,
+                             verbose,
+                             logFile );
+         }
+         cudaEllpackSymmetric.reset();
+      }
+      typedef Matrices::SlicedEllpack< Real, Devices::Host, int > SlicedEllpackMatrixType;
+      SlicedEllpackMatrixType slicedEllpackMatrix;
+      if( ! Matrices::MatrixReader< SlicedEllpackMatrixType >::readMtxFile( inputFileName, slicedEllpackMatrix, verbose ) )
+         writeTestFailed( logFile, 7 );
+      else
+      {
+         allocatedElements = slicedEllpackMatrix.getNumberOfMatrixElements();
+         padding = ( double ) allocatedElements / ( double ) nonzeroElements * 100.0 - 100;
+         logFile << "    " << padding <<std::endl;
+         benchmarkMatrix( slicedEllpackMatrix,
+                          hostX,
+                          hostB,
+                          nonzeroElements,
+                          "SlicedEllpack Host",
+                          stopTime,
+                          baseline,
+                          verbose,
+                          logFile );
+#ifdef HAVE_CUDA
+         typedef Matrices::SlicedEllpack< Real, Devices::Cuda, int > SlicedEllpackMatrixCudaType;
+         SlicedEllpackMatrixCudaType cudaSlicedEllpackMatrix;
+         for( int i = 0; i < rowLengthsHost.getSize(); i++ )
+              rowLengthsHost[ i ] = slicedEllpackMatrix.getRowLength( i );
+         rowLengthsCuda = rowLengthsHost;
+         // TODO: fix
+         //if( ! cudaSlicedEllpackMatrix.copyFrom( slicedEllpackMatrix, rowLengthsCuda ) )
+         {
+            std::cerr << "Nejde zkopirovat" <<std::endl;
+             writeTestFailed( logFile, 3 );
+         }
+         //else
+         {
+           std::cout << " done.    \r";
+            benchmarkMatrix( cudaSlicedEllpackMatrix,
+                             cudaX,
+                             cudaB,
+                             nonzeroElements,
+                             "SlicedEllpack Cuda",
+                             stopTime,
+                             baseline,
+                             verbose,
+                             logFile );
+         }
+         cudaSlicedEllpackMatrix.reset();        
+      }
+      typedef Matrices::ChunkedEllpack< Real, Devices::Host, int > ChunkedEllpackType;
+      ChunkedEllpackType chunkedEllpack;
+      Matrices::copySparseMatrix( chunkedEllpack, csrMatrix );
+      allocatedElements = chunkedEllpack.getNumberOfMatrixElements();
+      padding = ( double ) allocatedElements / ( double ) nonzeroElements * 100.0 - 100.0;
+      logFile << "    " << padding << std::endl;
+      benchmarkMatrix( chunkedEllpack,
+                       hostX,
+                       hostB,
+                       nonzeroElements,
+                       "ChunkedEllpack Host",
+                       stopTime,
+                       baseline,
+                       verbose,
+                       logFile );
+#ifdef HAVE_CUDA
+      typedef Matrices::ChunkedEllpack< Real, Devices::Cuda, int > ChunkedEllpackCudaType;
+      ChunkedEllpackCudaType cudaChunkedEllpack;
+      std::cout << "Copying matrix to GPU... ";
+      cudaChunkedEllpack = chunkedEllpack;
+      std::cout << " done.    \r";
+      benchmarkMatrix( cudaChunkedEllpack,
+                       cudaX,
+                       cudaB,
+                       nonzeroElements,
+                       "ChunkedEllpack Cuda",
+                       stopTime,
+                       baseline,
+                       verbose,
+                       logFile );
+      cudaChunkedEllpack.reset();
+      typedef Matrices::BiEllpack< Real, Devices::Host, int > BiEllpackMatrixType;
+      BiEllpackMatrixType biEllpackMatrix;
+      // TODO: I did not check this during git merging, but I hope its gonna work
+      //   Tomas Oberhuber
+      //    copySparseMatrix( biEllpackMatrix, csrMatrix ); // TODO:Fix the getRow method to be compatible with othr formats
+      /*if( ! biEllpackMatrix.copyFrom( csrMatrix, rowLengthsHost ) )
+         writeTestFailed( logFile, 7 );
+      else*/
+      {
+         allocatedElements = biEllpackMatrix.getNumberOfMatrixElements();
+         padding = ( double ) allocatedElements / ( double ) nonzeroElements * 100.0 - 100.0;
+         logFile << "    " << padding <<std::endl;
+         benchmarkMatrix( biEllpackMatrix,
+                          hostX,
+                          hostB,
+                          nonzeroElements,
+                          "BiEllpack Host",
+                          stopTime,
+                          baseline,
+                          verbose,
+                          logFile );
+         biEllpackMatrix.reset();
+#ifdef HAVE_CUDA
+         typedef Matrices::BiEllpack< Real, Devices::Cuda, int > BiEllpackMatrixCudaType;
+         BiEllpackMatrixCudaType cudaBiEllpackMatrix;
+         // TODO: I did not check this during git merging, but I hope its gonna work
+         //   Tomas Oberhuber
+         //    copySparseMatrix( biEllpackMatrix, csrMatrix ); // TODO:Fix the getRow method to be compatible with othr formats
+        std::cout << "Copying matrix to GPU... ";
+         /*if( ! cudaBiEllpackMatrix.copyFrom( biEllpackMatrix, rowLengthsCuda ) )
+         {
+           std::cerr << "I am not able to transfer the matrix on GPU." <<std::endl;
+            writeTestFailed( logFile, 3 );
+         }
+         else*/
+         {
+           std::cout << " done.    \r";
+            benchmarkMatrix( cudaBiEllpackMatrix,
+                             cudaX,
+                             cudaB,
+                             nonzeroElements,
+                             "BiEllpack Cuda",
+                             stopTime,
+                             baseline,
+                             verbose,
+                             logFile );
+         }
+         cudaBiEllpackMatrix.reset();
+      }
+      typedef Matrices::SlicedEllpackSymmetric< Real, Devices::Host, int > SlicedEllpackSymmetricType;
+      SlicedEllpackSymmetricType slicedEllpackSymmetric;
+      if( ! Matrices::MatrixReader< SlicedEllpackSymmetricType >::readMtxFile( inputFileName, slicedEllpackSymmetric, verbose, true ) )
+         writeTestFailed( logFile, 7 );
+      else
+      {
+         allocatedElements = slicedEllpackSymmetric.getNumberOfMatrixElements();
+         padding = ( double ) allocatedElements / ( double ) nonzeroElements * 100.0 - 100.0;
+         logFile << "    " << padding <<std::endl;
+         benchmarkMatrix( slicedEllpackSymmetric,
+                          hostX,
+                          hostB,
+                          nonzeroElements,
+                          "SlicedEllpackSym Host",
+                          stopTime,
+                          baseline,
+                          verbose,
+                          logFile );
+         slicedEllpackSymmetric.reset();
+#ifdef HAVE_CUDA
+         typedef Matrices::SlicedEllpackSymmetric< Real, Devices::Cuda, int > SlicedEllpackSymmetricCudaType;
+         SlicedEllpackSymmetricCudaType cudaSlicedEllpackSymmetric;
+        std::cout << "Copying matrix to GPU... ";
+         for( int i = 0; i < rowLengthsHost.getSize(); i++ )
+             rowLengthsHost[ i ] = slicedEllpackSymmetric.getRowLength( i );
+         rowLengthsCuda = rowLengthsHost;
+         // TODO: fix the next line
+         //if( ! cudaSlicedEllpackSymmetric.copyFrom( slicedEllpackSymmetric, rowLengthsCuda ) )
+         {
+           std::cerr << "I am not able to transfer the matrix on GPU." <<std::endl;
+            writeTestFailed( logFile, 3 );
+         }
+         //else
+         {
+           std::cout << " done.   \r";
+            benchmarkMatrix( cudaSlicedEllpackSymmetric,
+                             cudaX,
+                             cudaB,
+                             nonzeroElements,
+                             "SlicedEllpackSym Cuda",
+                             stopTime,
+                             baseline,
+                             verbose,
+                             logFile );
+         }
+         cudaSlicedEllpackSymmetric.reset();
+      }
+      typedef Matrices::EllpackSymmetricGraph< Real, Devices::Host, int > EllpackSymmetricGraphMatrixType;
+      EllpackSymmetricGraphMatrixType EllpackSymmetricGraphMatrix;
+      if( ! Matrices::MatrixReader< EllpackSymmetricGraphMatrixType >::readMtxFile( inputFileName, EllpackSymmetricGraphMatrix, verbose, true ) ||
+          ! EllpackSymmetricGraphMatrix.help() )
+         writeTestFailed( logFile, 7 );
+      else
+      {
+         allocatedElements = EllpackSymmetricGraphMatrix.getNumberOfMatrixElements();
+         padding = ( double ) allocatedElements / ( double ) nonzeroElements * 100.0 - 100.0;
+         logFile << "    " << padding <<std::endl;
+         benchmarkMatrix( EllpackSymmetricGraphMatrix,
+                          hostX,
+                          hostB,
+                          nonzeroElements,
+                          "Ellpack Graph Host",
+                          stopTime,
+                          baseline,
+                          verbose,
+                          logFile );
+         EllpackSymmetricGraphMatrix.reset();
+#ifdef HAVE_CUDA
+         typedef Matrices::EllpackSymmetricGraph< Real, Devices::Cuda, int > EllpackSymmetricGraphMatrixCudaType;
+         EllpackSymmetricGraphMatrixCudaType cudaEllpackSymmetricGraphMatrix;
+        std::cout << "Copying matrix to GPU... ";
+         for( int i = 0; i < rowLengthsHost.getSize(); i++ )
+             rowLengthsHost[ i ] = EllpackSymmetricGraphMatrix.getRowLength( i );
+         rowLengthsCuda = rowLengthsHost;
+         // TODO: fix it
+         //if( ! cudaEllpackSymmetricGraphMatrix.copyFrom( EllpackSymmetricGraphMatrix, rowLengthsCuda ) ) 
+         {
+            writeTestFailed( logFile, 3 );
+         }
+         //else if( ! cudaEllpackSymmetricGraphMatrix.help() )
+         {
+            writeTestFailed( logFile, 3 );
+         } 
+         //else
+         {
+            std::cout << " done.   \r";
+            benchmarkMatrix( cudaEllpackSymmetricGraphMatrix,
+                             cudaX,
+                             cudaB,
+                             nonzeroElements,
+                             "Ellpack Graph Cuda",
+                             stopTime,
+                             baseline,
+                             verbose,
+                             logFile );
+         }
+         cudaEllpackSymmetricGraphMatrix.reset();
+      }
+        typedef Matrices::AdEllpack< Real, Devices::Host, int > AdEllpackMatrixType;
+        AdEllpackMatrixType adEllpackMatrix;
+         // TODO: I did not check this during git merging, but I hope its gonna work
+         //   Tomas Oberhuber
+        //copySparseMatrix( adEllpackMatrix, csrMatrix ); // TODO: Fix the getRow method to be compatible with other formats
+        /*if( ! adEllpackMatrix.copyFrom( csrMatrix, rowLengthsHost ) )
+           writeTestFailed( logFile, 7 );
+        else*/
+        {
+           allocatedElements = adEllpackMatrix.getNumberOfMatrixElements();
+           padding = ( double ) allocatedElements / ( double ) nonzeroElements * 100.0 - 100.0;
+           logFile << "    " << padding <<std::endl;
+           benchmarkMatrix( adEllpackMatrix,
+                            hostX,
+                            hostB,
+                            nonzeroElements,
+                            "AdEllpack Host",
+                            stopTime,
+                            baseline,
+                            verbose,
+                            logFile );
+           adEllpackMatrix.reset();
+        }
+#ifdef HAVE_CUDA
+         typedef Matrices::AdEllpack< Real, Devices::Cuda, int > AdEllpackMatrixCudaType;
+         AdEllpackMatrixCudaType cudaAdEllpackMatrix;
+         // TODO: I did not check this during git merging, but I hope its gonna work
+         //   Tomas Oberhuber
+        //copySparseMatrix( adEllpackMatrix, csrMatrix ); // TODO: Fix the getRow method to be compatible with other formats
+        std::cout << "Copying matrix to GPU... ";
+         /*if( ! cudaAdEllpackMatrix.copyFrom( csrMatrix, rowLengthsCuda ) )
+         {
+           std::cerr << "I am not able to transfer the matrix on GPU." <<std::endl;
+            writeTestFailed( logFile, 3 );
+         }
+         else*/
+         {
+	    allocatedElements = cudaAdEllpackMatrix.getNumberOfMatrixElements();
+	    padding = ( double ) allocatedElements / ( double ) nonzeroElements * 100.0 - 100.0;
+            logFile << "    " << padding <<std::endl;
+           std::cout << " done.    \r";
+            benchmarkMatrix( cudaAdEllpackMatrix,
+                             cudaX,
+                             cudaB,
+                             nonzeroElements,
+                             "AdEllpack Cuda",
+                             stopTime,
+                             baseline,
+                             verbose,
+                             logFile );
+           cudaAdEllpackMatrix.reset();
+	}
+   }
+   return true;
+// Entry point of the old SpMV benchmark.
+//
+// Builds the configuration description, parses the command line and runs
+// setupBenchmark for the arithmetic type selected by the "precision"
+// parameter. Returns a non-zero exit code when the command line cannot be
+// parsed, when the benchmark reports failure, or when the requested
+// precision is not one of the supported values.
+int main( int argc, char* argv[] )
+   Config::ParameterContainer parameters;
+   Config::ConfigDescription conf_desc;
+   setupConfig( conf_desc );
+   if( ! parseCommandLine( argc, argv, conf_desc, parameters ) )
+   {
+      conf_desc.printUsage( argv[ 0 ] );
+      return 1;
+   }
+   const String& precision = parameters.getParameter< String >( "precision" );
+   if( precision == "float" )
+      return setupBenchmark< float >( parameters ) ? EXIT_SUCCESS : EXIT_FAILURE;
+   if( precision == "double" )
+      return setupBenchmark< double >( parameters ) ? EXIT_SUCCESS : EXIT_FAILURE;
+   // Any other value used to fall through and report success without running
+   // a single benchmark; report it as an error instead.
+   std::cerr << "Unknown precision '" << precision << "'. Supported values are 'float' and 'double'." << std::endl;
+   return EXIT_FAILURE;
\ No newline at end of file
diff --git a/src/Benchmarks/SpMV/tnlCusparseCSRMatrix.h b/src/Benchmarks/SpMV/OldSpMV/tnlCusparseCSRMatrix.h
similarity index 99%
rename from src/Benchmarks/SpMV/tnlCusparseCSRMatrix.h
rename to src/Benchmarks/SpMV/OldSpMV/tnlCusparseCSRMatrix.h
index 8f6d376fe27ebed3cd67307bf8f24ea2c5d630d4..fbef4f9a2410669f8c91ef51bf6de404ab1bb7fc 100644
--- a/src/Benchmarks/SpMV/tnlCusparseCSRMatrix.h
+++ b/src/Benchmarks/SpMV/OldSpMV/tnlCusparseCSRMatrix.h
@@ -8,6 +8,8 @@
 /* See Copyright Notice in tnl/Copyright */
 #include <TNL/Assert.h>
 #include <TNL/Devices/Cuda.h>
 #ifdef HAVE_CUDA
@@ -157,3 +159,4 @@ class CusparseCSR< float > : public CusparseCSRBase< float >
 } // namespace TNL
\ No newline at end of file
diff --git a/src/Benchmarks/SpMV/spmv.h b/src/Benchmarks/SpMV/spmv.h
new file mode 100644
index 0000000000000000000000000000000000000000..2c28d57d31f87d24b1070ecba508eec184a9e340
--- /dev/null
+++ b/src/Benchmarks/SpMV/spmv.h
@@ -0,0 +1,189 @@
+                          spmv.h  -  description
+                             -------------------
+    begin                : Dec 30, 2015
+    copyright            : (C) 2015 by Tomas Oberhuber et al.
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+/* See Copyright Notice in tnl/Copyright */
+// Implemented by: Jakub Klinkovsky
+#pragma once
+#include "../Benchmarks.h"
+#include <TNL/Pointers/DevicePointer.h>
+#include <TNL/Matrices/CSR.h>
+#include <TNL/Matrices/Ellpack.h>
+#include <TNL/Matrices/SlicedEllpack.h>
+#include <TNL/Matrices/ChunkedEllpack.h>
+namespace TNL {
+namespace Benchmarks {
+// Alias that exposes Matrices::SlicedEllpack as a template taking exactly
+// three type parameters (Real, Device, Index), so it matches the number of
+// template parameters of the other formats.
+// NOTE(review): presumably Matrices::SlicedEllpack has further defaulted
+// template parameters - confirm against its declaration.
+template< typename Real, typename Device, typename Index >
+using SlicedEllpack = Matrices::SlicedEllpack< Real, Device, Index >;
+// Fills a host matrix with a synthetic test pattern.
+//
+// For every row, up to elementsPerRow consecutive entries are written,
+// starting at column (row - elementsPerRow / 2). Positions that fall outside
+// the range [0, size) are skipped. The value stored for the k-th entry of a
+// row is k + 1.
+//
+// matrix          - matrix to fill via setElement().
+// elementsPerRow  - number of candidate entries per row.
+// Returns the number of entries that were actually set.
+template< typename Matrix >
+int setHostTestMatrix( Matrix& matrix,
+                       const int elementsPerRow )
+   // The row count is also used as the column bound below, so this presumably
+   // expects a square matrix - TODO confirm against callers.
+   const int size = matrix.getRows();
+   int elements( 0 );
+   for( int row = 0; row < size; row++ ) {
+      // first candidate column of this row; may be negative near the top rows
+      int col = row - elementsPerRow / 2;
+      for( int element = 0; element < elementsPerRow; element++ ) {
+         if( col + element >= 0 &&
+            col + element < size )
+         {
+            matrix.setElement( row, col + element, element + 1 );
+            elements++;
+         }
+      }
+   }
+   return elements;
+#ifdef HAVE_CUDA
+// CUDA kernel writing the synthetic test pattern into one row per thread.
+//
+// matrix          - pointer to the matrix to fill (dereferenced in the kernel).
+// elementsPerRow  - number of candidate entries per row.
+// gridIdx         - index of the current launch; it offsets the block index
+//                   by gridIdx * Devices::Cuda::getMaxGridSize().
+//
+// Each thread writes up to elementsPerRow entries starting at column
+// (rowIdx - elementsPerRow / 2), skipping columns outside
+// [0, matrix->getColumns()). The k-th entry of a row gets the value k + 1.
+template< typename Matrix >
+__global__ void setCudaTestMatrixKernel( Matrix* matrix,
+                                         const int elementsPerRow,
+                                         const int gridIdx )
+   // row handled by this thread, taking the launch index into account
+   const int rowIdx = ( gridIdx * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
+   // threads past the last row have nothing to do
+   if( rowIdx >= matrix->getRows() )
+      return;
+   int col = rowIdx - elementsPerRow / 2;
+   for( int element = 0; element < elementsPerRow; element++ ) {
+      if( col + element >= 0 &&
+         col + element < matrix->getColumns() )
+         matrix->setElementFast( rowIdx, col + element, element + 1 );
+   }
+// Fills a matrix with the synthetic test pattern by launching
+// setCudaTestMatrixKernel, one thread per row.
+//
+// matrix          - matrix to fill; it is wrapped in a Pointers::DevicePointer
+//                   and handed to the kernel.
+// elementsPerRow  - number of candidate entries per row.
+//
+// The rows are covered by blocks of 256 threads. When more blocks are needed
+// than Devices::Cuda::getMaxGridSize(), the work is split into several
+// launches, each identified by gridIdx.
+template< typename Matrix >
+void setCudaTestMatrix( Matrix& matrix,
+                        const int elementsPerRow )
+#ifdef HAVE_CUDA
+   typedef typename Matrix::IndexType IndexType;
+   typedef typename Matrix::RealType RealType;
+   Pointers::DevicePointer< Matrix > kernel_matrix( matrix );
+   dim3 cudaBlockSize( 256 ), cudaGridSize( Devices::Cuda::getMaxGridSize() );
+   const IndexType cudaBlocks = roundUpDivision( matrix.getRows(), cudaBlockSize.x );
+   const IndexType cudaGrids = roundUpDivision( cudaBlocks, Devices::Cuda::getMaxGridSize() );
+   // Number of blocks left for the last launch. It is zero when cudaBlocks is
+   // an exact multiple of the maximum grid size; in that case the last launch
+   // must keep the full grid size, otherwise it would be launched with an
+   // empty grid and the corresponding rows would never be filled.
+   const IndexType lastGridBlocks = cudaBlocks % Devices::Cuda::getMaxGridSize();
+   for( IndexType gridIdx = 0; gridIdx < cudaGrids; gridIdx++ ) {
+      if( gridIdx == cudaGrids - 1 && lastGridBlocks != 0 )
+         cudaGridSize.x = lastGridBlocks;
+      setCudaTestMatrixKernel< Matrix >
+         <<< cudaGridSize, cudaBlockSize >>>
+         ( &kernel_matrix.template modifyData< Devices::Cuda >(), elementsPerRow, gridIdx );
+   }
+// TODO: rename as benchmark_SpMV_synthetic and move to spmv-synthetic.h
+// Benchmarks the sparse matrix-vector product for one matrix format on a
+// synthetic size x size matrix.
+//
+// Real            - arithmetic type of the matrix and vector entries.
+// Matrix          - matrix format template, instantiated for Devices::Host
+//                   and Devices::Cuda with int indices.
+// Vector          - not referenced in the visible body; the vectors below are
+//                   always Containers::Vector.
+// benchmark       - benchmark driver used to create the group and to time
+//                   the operations.
+// size            - number of rows and columns of the test matrix.
+// elementsPerRow  - row length requested for every row.
+//
+// Returns true unconditionally.
+//
+// NOTE(review): several preprocessor lines (#else / #endif) appear to be
+// missing from this view, so the exact extent of the HAVE_CUDA sections
+// cannot be told from here.
+template< typename Real,
+          template< typename, typename, typename > class Matrix,
+          template< typename, typename, typename > class Vector = Containers::Vector >
+benchmarkSpMV( Benchmark & benchmark,
+               const int & size,
+               const int elementsPerRow = 5 )
+   typedef Matrix< Real, Devices::Host, int > HostMatrix;
+   typedef Matrix< Real, Devices::Cuda, int > DeviceMatrix;
+   typedef Containers::Vector< Real, Devices::Host, int > HostVector;
+   typedef Containers::Vector< Real, Devices::Cuda, int > CudaVector;
+   HostMatrix hostMatrix;
+   DeviceMatrix deviceMatrix;
+   Containers::Vector< int, Devices::Host, int > hostRowLengths;
+   Containers::Vector< int, Devices::Cuda, int > deviceRowLengths;
+   HostVector hostVector, hostVector2;
+   CudaVector deviceVector, deviceVector2;
+   // create benchmark group
+   // the group is named after the first component of the parsed host matrix type
+   const std::vector< String > parsedType = parseObjectType( HostMatrix::getType() );
+#ifdef HAVE_CUDA
+   // presumably 2 columns with CUDA and 1 without (an #else seems to be
+   // missing between the next two lines) - TODO confirm
+   benchmark.createHorizontalGroup( parsedType[ 0 ], 2 );
+   benchmark.createHorizontalGroup( parsedType[ 0 ], 1 );
+   // allocate host-side data
+   hostRowLengths.setSize( size );
+   hostMatrix.setDimensions( size, size );
+   hostVector.setSize( size );
+   hostVector2.setSize( size );
+#ifdef HAVE_CUDA
+   // allocate device-side data
+   deviceRowLengths.setSize( size );
+   deviceMatrix.setDimensions( size, size );
+   deviceVector.setSize( size );
+   deviceVector2.setSize( size );
+   // every row gets the same requested length
+   hostRowLengths.setValue( elementsPerRow );
+#ifdef HAVE_CUDA
+   deviceRowLengths.setValue( elementsPerRow );
+   hostMatrix.setCompressedRowLengths( hostRowLengths );
+#ifdef HAVE_CUDA
+   deviceMatrix.setCompressedRowLengths( deviceRowLengths );
+   // fill both matrices with the same synthetic pattern; the host fill also
+   // yields the number of elements that were set
+   const int elements = setHostTestMatrix< HostMatrix >( hostMatrix, elementsPerRow );
+   setCudaTestMatrix< DeviceMatrix >( deviceMatrix, elementsPerRow );
+   // per set element: two values of type Real plus one int, expressed in
+   // units of oneGB
+   const double datasetSize = (double) elements * ( 2 * sizeof( Real ) + sizeof( int ) ) / oneGB;
+   // reset function
+   auto reset = [&]() {
+      hostVector.setValue( 1.0 );
+      hostVector2.setValue( 0.0 );
+#ifdef HAVE_CUDA
+      deviceVector.setValue( 1.0 );
+      deviceVector2.setValue( 0.0 );
+   };
+   // compute functions
+   auto spmvHost = [&]() {
+      hostMatrix.vectorProduct( hostVector, hostVector2 );
+   };
+   auto spmvCuda = [&]() {
+      deviceMatrix.vectorProduct( deviceVector, deviceVector2 );
+   };
+   benchmark.setOperation( datasetSize );
+   benchmark.time< Devices::Host >( reset, "CPU", spmvHost );
+#ifdef HAVE_CUDA
+   benchmark.time< Devices::Cuda >( reset, "GPU", spmvCuda );
+   return true;
+// Runs the synthetic SpMV benchmark for the enabled matrix formats.
+//
+// Real            - arithmetic type of the matrix and vector entries.
+// Index           - not referenced in the visible body.
+// benchmark       - benchmark driver passed on to benchmarkSpMV.
+// size            - number of rows and columns of the test matrix.
+// elementsPerRow  - requested number of elements per matrix row.
+//
+// Returns true only if every executed format benchmark returned true.
+template< typename Real = double,
+          typename Index = int >
+benchmarkSpmvSynthetic( Benchmark & benchmark,
+                        const int & size,
+                        const int & elementsPerRow )
+   bool result = true;
+   // TODO: benchmark all formats from tnl-benchmark-spmv (different parameters of the base formats)
+   // The results are combined with &=: starting from true, |= could never
+   // turn the result into false, so failures would go unnoticed.
+   result &= benchmarkSpMV< Real, Matrices::CSR >( benchmark, size, elementsPerRow );
+//   result &= benchmarkSpMV< Real, Matrices::Ellpack >( benchmark, size, elementsPerRow );
+//   result &= benchmarkSpMV< Real, SlicedEllpack >( benchmark, size, elementsPerRow );
+//   result &= benchmarkSpMV< Real, Matrices::ChunkedEllpack >( benchmark, size, elementsPerRow );
+   return result;
+} // namespace Benchmarks
+} // namespace TNL
diff --git a/src/Benchmarks/SpMV/tnl-benchmark-spmv.cpp b/src/Benchmarks/SpMV/tnl-benchmark-spmv.cpp
index fadbcca0ce9d04b01103d4f24b0df03c169fc1c7..466a56914e9097fc5f467332ef91290b481ca293 100644
--- a/src/Benchmarks/SpMV/tnl-benchmark-spmv.cpp
+++ b/src/Benchmarks/SpMV/tnl-benchmark-spmv.cpp
@@ -1,14 +1,11 @@
                           tnl-benchmark-spmv.cpp  -  description
-    begin                : Jun 5, 2014
-    copyright            : (C) 2014 by Tomas Oberhuber
+    begin                : March 3, 2019
+    copyright            : (C) 2019 by Tomas Oberhuber
     email                : tomas.oberhuber@fjfi.cvut.cz
 /* See Copyright Notice in tnl/Copyright */
 #include "tnl-benchmark-spmv.h"
diff --git a/src/Benchmarks/SpMV/tnl-benchmark-spmv.cu b/src/Benchmarks/SpMV/tnl-benchmark-spmv.cu
index fed383d86776e521dd31f299e3ba8baa9b0afdae..5a3a711ad22412b2998674cc820a80f2035d7fdc 100644
--- a/src/Benchmarks/SpMV/tnl-benchmark-spmv.cu
+++ b/src/Benchmarks/SpMV/tnl-benchmark-spmv.cu
@@ -1,12 +1,11 @@
                           tnl-benchmark-spmv.cu  -  description
-    begin                : Jun 5, 2014
-    copyright            : (C) 2014 by Tomas Oberhuber
+    begin                : March 3, 2019
+    copyright            : (C) 2019 by Tomas Oberhuber
     email                : tomas.oberhuber@fjfi.cvut.cz
 /* See Copyright Notice in tnl/Copyright */
 #include "tnl-benchmark-spmv.h"
diff --git a/src/Benchmarks/SpMV/tnl-benchmark-spmv.h b/src/Benchmarks/SpMV/tnl-benchmark-spmv.h
index c707018ad7e8da62e47249315eb80da37280bbbb..97e47f2a0fa5de68a58031cb3b07f182e64321bd 100644
--- a/src/Benchmarks/SpMV/tnl-benchmark-spmv.h
+++ b/src/Benchmarks/SpMV/tnl-benchmark-spmv.h
@@ -1,921 +1,138 @@
                           tnl-benchmark-spmv.h  -  description
-    begin                : Jun 5, 2014
-    copyright            : (C) 2014 by Tomas Oberhuber
+    begin                : March 3, 2019
+    copyright            : (C) 2019 by Tomas Oberhuber et al.
     email                : tomas.oberhuber@fjfi.cvut.cz
 /* See Copyright Notice in tnl/Copyright */
-#pragma once
+// Implemented by: Jakub Klinkovsky
-#include <fstream>
-#include <iomanip>
-#include <unistd.h>
-#ifdef HAVE_CUDA
-#include <cusparse.h>
+#pragma once
+#include <TNL/Devices/Host.h>
+#include <TNL/Devices/Cuda.h>
 #include <TNL/Config/ConfigDescription.h>
 #include <TNL/Config/ParameterContainer.h>
-#include <TNL/Matrices/CSR.h>
-#include <TNL/Matrices/AdEllpack.h>
-#include <TNL/Matrices/BiEllpack.h>
-#include <TNL/Matrices/BiEllpackSymmetric.h>
-#include <TNL/Matrices/Ellpack.h>
-#include <TNL/Matrices/EllpackSymmetric.h>
-#include <TNL/Matrices/EllpackSymmetricGraph.h>
-#include <TNL/Matrices/SlicedEllpack.h>
-#include <TNL/Matrices/SlicedEllpackSymmetric.h>
-#include <TNL/Matrices/SlicedEllpackSymmetricGraph.h>
-#include <TNL/Matrices/ChunkedEllpack.h>
-#include <TNL/Matrices/MatrixReader.h>
-#include <TNL/Timer.h>
-#include "tnlCusparseCSRMatrix.h"
-using namespace std;
+#include <Benchmarks/BLAS/array-operations.h>
+#include <Benchmarks/BLAS/vector-operations.h>
+#include "spmv.h"
 using namespace TNL;
-using namespace TNL::Matrices;
+using namespace TNL::Benchmarks;
-void setupConfig( Config::ConfigDescription& config )
-   config.addDelimiter                            ( "General settings:" );
-   config.addRequiredEntry< String >( "test" , "Test to be performed." );
-      config.addEntryEnum< String >( "mtx" );
-      config.addEntryEnum< String >( "tnl" );
-   config.addRequiredEntry< String >( "input-file" , "Input file name." );
-   config.addEntry< String >( "log-file", "Log file name.", "tnl-benchmark-spmv.log");
-   config.addEntry< String >( "precision", "Precision of the arithmetics.", "double" );
-   config.addEntry< double >( "stop-time", "Seconds to iterate the SpMV operation.", 1.0 );
-   config.addEntry< int >( "verbose", "Verbose mode.", 1 );
-bool initLogFile( std::fstream& logFile, const String& fileName )
+// Runs the benchmark suite for one arithmetic type.
+//
+// Real            - arithmetic type; its name is stored under the
+//                   "precision" key of the metadata.
+// benchmark       - benchmark driver on which the individual benchmarks are
+//                   created.
+// metadata        - metadata map, taken by value, so the "precision" entry
+//                   added here is not visible to the caller.
+// size            - size passed to the array and vector benchmarks and used
+//                   as the number of rows and columns of the sparse matrix.
+// elementsPerRow  - number of elements per row for the SpMV benchmark.
+//
+// Three benchmarks are started in this order: array operations, vector
+// operations and sparse matrix-vector multiplication.
+template< typename Real >
+runSpMVBenchmarks( Benchmark & benchmark,
+                   Benchmark::MetadataMap metadata,
+                   const std::size_t & size,
+                   const int & elementsPerRow )
-   if( access( fileName.getString(), F_OK ) == -1 )
-   {
-      logFile.open( fileName.getString(), std::ios::out );
-      if( ! logFile )
-         return false;
-      const String fillingColoring = " : COLORING 0 #FFF8DC 20 #FFFF00 40 #FFD700 60 #FF8C0 80 #FF0000 100";
-      const String speedupColoring = " : COLORING #0099FF 1 #FFFFFF 2 #00FF99 4 #33FF99 8 #33FF22 16 #FF9900";
-      const String paddingColoring = " : COLORING #FFFFFF 1 #FFFFCC 10 #FFFF99 100 #FFFF66 1000 #FFFF33 10000 #FFFF00";
-      logFile << "#Matrix file " << std::endl;
-      logFile << "#Rows" << std::endl;
-      logFile << "#Columns" << std::endl;
-      logFile << "#Non-zero elements" << std::endl;
-      logFile << "#Filling (in %)" << fillingColoring << std::endl;
-      logFile << "#CSR Format" << std::endl;
-      logFile << "# CPU" << std::endl;
-      logFile << "#  Gflops" << std::endl;
-      logFile << "#  Throughput" << std::endl;
-      logFile << "#  Speedup" << speedupColoring << std::endl;
-#ifdef HAVE_CUDA
-      logFile << "# Cusparse CSR" << std::endl;
-      logFile << "#  Gflops" << std::endl;
-      logFile << "#  Throughput" << std::endl;
-      logFile << "#  Speedup" << speedupColoring << " SORT - cusparse-csr-speedup.txt" << std::endl;
-      logFile << "# CUDA" << std::endl;
-      logFile << "#  Scalar" << std::endl;
-      logFile << "#   Gflops" << std::endl;
-      logFile << "#   Throughput" << std::endl;
-      logFile << "#   Speedup" << speedupColoring << " SORT - csr-scalar-cuda-speedup.txt" << std::endl;
-      logFile << "#  Vector" << std::endl;
-      logFile << "#   Warp Size 1" << std::endl;
-      logFile << "#    Gflops" << std::endl;
-      logFile << "#    Throughput" << std::endl;
-      logFile << "#    Speedup" << speedupColoring << " SORT - csr-vector-1-cuda-speedup.txt" << std::endl;
-      logFile << "#   Warp Size 2" << std::endl;
-      logFile << "#    Gflops" << std::endl;
-      logFile << "#    Throughput" << std::endl;
-      logFile << "#    Speedup" << speedupColoring << " SORT - csr-vector-2-cuda-speedup.txt" << std::endl;
-      logFile << "#   Warp Size 4" << std::endl;
-      logFile << "#    Gflops" << std::endl;
-      logFile << "#    Throughput" << std::endl;
-      logFile << "#    Speedup" << speedupColoring << " SORT - csr-vector-4-cuda-speedup.txt" << std::endl;
-      logFile << "#   Warp Size 8" << std::endl;
-      logFile << "#    Gflops" << std::endl;
-      logFile << "#    Throughput" << std::endl;
-      logFile << "#    Speedup" << speedupColoring << " SORT - csr-vector-8-cuda-speedup.txt" << std::endl;
-      logFile << "#   Warp Size 16" << std::endl;
-      logFile << "#    Gflops" << std::endl;
-      logFile << "#    Throughput" << std::endl;
-      logFile << "#    Speedup" << speedupColoring << " SORT - csr-vector-16-cuda-speedup.txt" << std::endl;
-      logFile << "#   Warp Size 32" << std::endl;
-      logFile << "#    Gflops" << std::endl;
-      logFile << "#    Throughput" << std::endl;
-      logFile << "#    Speedup" << speedupColoring << " SORT - csr-vector-32-cuda-speedup.txt" << std::endl;
-      logFile << "#  Hybrid" << std::endl;
-      logFile << "#   Split 2" << std::endl;
-      logFile << "#    Gflops" << std::endl;
-      logFile << "#    Throughput" << std::endl;
-      logFile << "#    Speedup" << speedupColoring << " SORT - csr-hybrid-2-cuda-speedup.txt" << std::endl;
-      logFile << "#   Split 4" << std::endl;
-      logFile << "#    Gflops" << std::endl;
-      logFile << "#    Throughput" << std::endl;
-      logFile << "#    Speedup" << speedupColoring << " SORT - csr-hybrid-4-cuda-speedup.txt" << std::endl;
-      logFile << "#   Split 8" << std::endl;
-      logFile << "#    Gflops" << std::endl;
-      logFile << "#    Throughput" << std::endl;
-      logFile << "#    Speedup" << speedupColoring << " SORT - csr-hybrid-8-cuda-speedup.txt" << std::endl;
-      logFile << "#   Split 16" << std::endl;
-      logFile << "#    Gflops" << std::endl;
-      logFile << "#    Throughput" << std::endl;
-      logFile << "#    Speedup" << speedupColoring << " SORT - csr-hybrid-16-cuda-speedup.txt" << std::endl;
-      logFile << "#   Split 32" << std::endl;
-      logFile << "#    Gflops" << std::endl;
-      logFile << "#    Throughput" << std::endl;
-      logFile << "#    Speedup" << speedupColoring << " SORT - csr-hybrid-32-cuda-speedup.txt" << std::endl;
-      logFile << "#   Split 64" << std::endl;
-      logFile << "#    Gflops" << std::endl;
-      logFile << "#    Throughput" << std::endl;
-      logFile << "#    Speedup" << speedupColoring << " SORT - csr-hybrid-64-cuda-speedup.txt" << std::endl;
-      logFile << "#Ellpack Format" << std::endl;
-      logFile << "# Padding (in %)" << paddingColoring << std::endl;
-      logFile << "# CPU" << std::endl;
-      logFile << "#  Gflops" << std::endl;
-      logFile << "#  Throughput" << std::endl;
-      logFile << "#  Speedup" << speedupColoring << " SORT - ellpack-host-speedup.txt" << std::endl;
-#ifdef HAVE_CUDA
-      logFile << "# CUDA" << std::endl;
-      logFile << "#  Gflops" << std::endl;
-      logFile << "#  Throughput" << std::endl;
-      logFile << "#  Speedup" << speedupColoring << " SORT - ellpack-cuda-speedup.txt" << std::endl;
-      logFile << "#SlicedEllpack Format" << std::endl;
-      logFile << "# Padding (in %)" << paddingColoring << std::endl;
-      logFile << "# CPU" << std::endl;
-      logFile << "#  Gflops" << std::endl;
-      logFile << "#  Throughput" << std::endl;
-      logFile << "#  Speedup" << speedupColoring << " SORT - sliced-ellpack-host-speedup.txt" << std::endl;
-#ifdef HAVE_CUDA
-      logFile << "# CUDA" << std::endl;
-      logFile << "#  Gflops" << std::endl;
-      logFile << "#  Throughput" << std::endl;
-      logFile << "#  Speedup" << speedupColoring << " SORT - sliced-ellpack-cuda-speedup.txt" << std::endl;
-      logFile << "#ChunkedEllpack Format" << std::endl;
-      logFile << "# Padding (in %)" << paddingColoring << std::endl;
-      logFile << "# CPU" << std::endl;
-      logFile << "#  Gflops" << std::endl;
-      logFile << "#  Throughput" << std::endl;
-      logFile << "#  Speedup" << speedupColoring << " SORT - chunked-ellpack-host-speedup.txt" << std::endl;
-#ifdef HAVE_CUDA
-      logFile << "# CUDA" << std::endl;
-      logFile << "#  Gflops" << std::endl;
-      logFile << "#  Throughput" << std::endl;
-      logFile << "#  Speedup" << speedupColoring << " SORT - chunked-ellpack-cuda-speedup.txt" << std::endl;
-      return true;
-   }
-   logFile.open( fileName.getString(), std::ios::out | std::ios::app );
-   //logFile << std::setprecision( 2 );
-   if( ! logFile )
-      return false;
-   return true;
+   const String precision = getType< Real >();
+   metadata["precision"] = precision;
+   // Array operations
+   benchmark.newBenchmark( String("Array operations (") + precision + ")",
+                           metadata );
+   benchmark.setMetadataColumns( Benchmark::MetadataColumns({
+         { "size", convertToString( size ) }, } ));
+   benchmarkArrayOperations< Real >( benchmark, size );
+   // Vector operations
+   benchmark.newBenchmark( String("Vector operations (") + precision + ")",
+                           metadata );
+   benchmark.setMetadataColumns( Benchmark::MetadataColumns({
+         { "size", convertToString( size ) }, } ));
+   benchmarkVectorOperations< Real >( benchmark, size );
+   // Sparse matrix-vector multiplication
+   benchmark.newBenchmark( String("Sparse matrix-vector multiplication (") + precision + ")",
+                           metadata );
+   benchmark.setMetadataColumns( Benchmark::MetadataColumns({
+         { "rows", convertToString( size ) },
+         { "columns", convertToString( size ) },
+         { "elements per row", convertToString( elementsPerRow ) },
+      } ));
+   benchmarkSpmvSynthetic< Real >( benchmark, size, elementsPerRow );
-template< typename Matrix >
-void printMatrixInfo( const String& inputFileName,
-                      const Matrix& matrix,
-                      std::ostream& str )
+setupConfig( Config::ConfigDescription & config )
-   str << " Rows: " << std::setw( 8 ) << matrix.getRows();
-   str << " Columns: " << std::setw( 8 ) << matrix.getColumns();
-   str << " Nonzero Elements: " << std::setw( 10 ) << matrix.getNumberOfNonzeroMatrixElements();
-   const double fillingRatio = ( double ) matrix.getNumberOfNonzeroMatrixElements() / ( double ) matrix.getNumberOfMatrixElements();
-   str << " Filling: " << std::setw( 5 ) << 100.0 * fillingRatio << "%" << std::endl;
-   str << std::setw( 25 ) << "Format"
-       << std::setw( 15 ) << "Padding"
-       << std::setw( 15 ) << "Time"
-       << std::setw( 15 ) << "GFLOPS"
-       << std::setw( 15 ) << "Throughput"
-       << std::setw( 15 ) << "Speedup" << std::endl;
+   // Command-line options of the benchmark itself. Each addEntryEnum call
+   // follows the addEntry call of the option it lists a value for.
+   config.addDelimiter( "Benchmark settings:" );
+   // The default log name is specific to this benchmark so that it does not
+   // clash with the log written by the BLAS benchmark.
+   config.addEntry< String >( "log-file", "Log file name.", "tnl-benchmark-spmv.log");
+   config.addEntry< String >( "output-mode", "Mode for opening the log file.", "overwrite" );
+   config.addEntryEnum( "append" );
+   config.addEntryEnum( "overwrite" );
+   config.addEntry< String >( "precision", "Precision of the arithmetics.", "double" );
+   config.addEntryEnum( "float" );
+   config.addEntryEnum( "double" );
+   config.addEntryEnum( "all" );
+   config.addEntry< int >( "size", "Size of arrays/vectors used in the benchmark.", 100000 );
+   config.addEntry< int >( "loops", "Number of iterations for every computation.", 10 );
+   config.addEntry< int >( "elements-per-row", "Number of elements per row of the sparse matrix used in the matrix-vector multiplication benchmark.", 5 );
+   config.addEntry< int >( "verbose", "Verbose mode.", 1 );
-template< typename Matrix >
-bool writeMatrixInfo( const String& inputFileName,
-                      const Matrix& matrix,
-                      std::ostream& logFile )
-   logFile << std::endl;
-   logFile << inputFileName << std::endl;
-   logFile << " " << matrix.getRows() << std::endl;
-   logFile << " " << matrix.getColumns() << std::endl;
-   logFile << " " << matrix.getNumberOfNonzeroMatrixElements() << std::endl;
-   const double fillingRatio = ( double ) matrix.getNumberOfNonzeroMatrixElements() / ( double ) matrix.getNumberOfMatrixElements();
-   logFile << " " << 100.0 * fillingRatio << std::endl;
-   logFile << std::flush;
-   if( ! logFile.good() )
-      return false;
-   return true;
+   config.addDelimiter( "Device settings:" );
+   Devices::Host::configSetup( config );
+   Devices::Cuda::configSetup( config );
-double computeGflops( const long int nonzeroElements,
-                      const int iterations,
-                      const double& time )
+main( int argc, char* argv[] )
-   return ( double ) ( 2 * iterations * nonzeroElements ) / time * 1.0e-9;
+   Config::ParameterContainer parameters;
+   Config::ConfigDescription conf_desc;
-template< typename Real >
-double computeThroughput( const long int nonzeroElements,
-                          const int iterations,
-                          const int rows,
-                          const double& time )
-   return ( double ) ( ( 2 * nonzeroElements + rows ) * iterations ) * sizeof( Real ) / time * 1.0e-9;
+   setupConfig( conf_desc );
-template< typename Matrix,
-          typename Vector >
-double benchmarkMatrix( const Matrix& matrix,
-                        const Vector& x,
-                        Vector& b,
-                        const long int nonzeroElements,
-                        const char* format,
-                        const double& stopTime,
-                        const double& baseline,
-                        int verbose,
-                        std::fstream& logFile )
-   Timer timer;
-   timer.start();
-   double time( 0.0 );
-   int iterations( 0 );
-   while( time < stopTime )
-   {
-      matrix.vectorProduct( x, b );
-#ifdef HAVE_CUDA
-      if( std::is_same< typename Matrix::DeviceType, Devices::Cuda >::value )
-         cudaDeviceSynchronize();
-      time = timer.getRealTime();
-      iterations++;
-   }
-   const double gflops = computeGflops( nonzeroElements, iterations, time );
-   const double throughput = computeThroughput< typename Matrix::RealType >( nonzeroElements, iterations, matrix.getRows(), time );
-   const long int allocatedElements = matrix.getNumberOfMatrixElements();
-   const double padding = ( double ) allocatedElements / ( double ) nonzeroElements * 100.0 - 100.0;
-   if( verbose )
-   {
-     std::cout << std::setw( 25 ) << format
-           << std::setw( 15 ) << padding
-           << std::setw( 15 ) << time
-           << std::setw( 15 ) << gflops
-           << std::setw( 15 ) << throughput;
-      if( baseline )
-        std::cout << std::setw( 15 ) << gflops / baseline << std::endl;
-      else
-        std::cout << std::setw( 15 ) << "N/A" << std::endl;
+   if( ! parseCommandLine( argc, argv, conf_desc, parameters ) ) {
+      conf_desc.printUsage( argv[ 0 ] );
+      return EXIT_FAILURE;
-   logFile << "  " << gflops << std::endl;
-   logFile << "  " << throughput << std::endl;
-   if( baseline )
-      logFile << gflops / baseline << std::endl;
-   else
-      logFile << "N/A" << std::endl;
-   return gflops;
-void writeTestFailed( std::fstream& logFile,
-                      int repeat )
-   for( int i = 0; i < repeat; i++ )
-      logFile << "N/A" << std::endl;
-template< typename Real >
-bool setupBenchmark( const Config::ParameterContainer& parameters )
-   const String& test = parameters.getParameter< String >( "test" );
-   const String& inputFileName = parameters.getParameter< String >( "input-file" );
-   const String& logFileName = parameters.getParameter< String >( "log-file" );
+   if( ! Devices::Host::setup( parameters ) ||
+       ! Devices::Cuda::setup( parameters ) )
+      return EXIT_FAILURE;
+   const String & logFileName = parameters.getParameter< String >( "log-file" );
+   const String & outputMode = parameters.getParameter< String >( "output-mode" );
+   const String & precision = parameters.getParameter< String >( "precision" );
+   // FIXME: getParameter< std::size_t >() does not work with parameters added with addEntry< int >(),
+   // which have a default value. The workaround below reads the value as int instead, so it is
+   // not possible to pass 64-bit integer values.
+//   const std::size_t minSize = parameters.getParameter< std::size_t >( "min-size" );
+//   const std::size_t maxSize = parameters.getParameter< std::size_t >( "max-size" );
+   const std::size_t size = parameters.getParameter< int >( "size" );
+   const int loops = parameters.getParameter< int >( "loops" );
+   const int elementsPerRow = parameters.getParameter< int >( "elements-per-row" );
    const int verbose = parameters.getParameter< int >( "verbose" );
-   const double stopTime = parameters.getParameter< double >( "stop-time" );
-   std::fstream logFile;
-   if( ! initLogFile( logFile, logFileName ) )
-   {
-      std::cerr << "I am not able to open the file " << logFileName << "." << std::endl;
-      return false;
-   }
-   if( test == "mtx" )
-   {
-      typedef Matrices::CSR< Real, Devices::Host, int > CSRType;
-      CSRType csrMatrix;
-      try
-      {
-         if( ! MatrixReader< CSRType >::readMtxFile( inputFileName, csrMatrix ) )
-         {
-            std::cerr << "I am not able to read the matrix file " << inputFileName << "." << std::endl;
-            logFile << std::endl;
-            logFile << inputFileName << std::endl;
-            logFile << "Benchmark failed: Unable to read the matrix." << std::endl;
-            return false;
-         }
-      }
-      catch( const std::bad_alloc& )
-      {
-         std::cerr << "Not enough memory to read the matrix." << std::endl;
-         logFile << std::endl;
-         logFile << inputFileName << std::endl;
-         logFile << "Benchmark failed: Not enough memory." << std::endl;
-         return false;
-      }
-      if( verbose )
-         printMatrixInfo( inputFileName, csrMatrix,std::cout );
-      if( ! writeMatrixInfo( inputFileName, csrMatrix, logFile ) )
-      {
-         std::cerr << "I am not able to write new matrix to the log file." << std::endl;
-         return false;
-      }
-      const int rows = csrMatrix.getRows();
-      const long int nonzeroElements = csrMatrix.getNumberOfMatrixElements();
-      Containers::Vector< int, Devices::Host, int > rowLengthsHost;
-      rowLengthsHost.setSize( rows );
-      for( int row = 0; row < rows; row++ )
-         rowLengthsHost[ row ] = csrMatrix.getRowLength( row );
-      typedef Containers::Vector< Real, Devices::Host, int > HostVector;
-      HostVector hostX, hostB;
-      hostX.setSize( csrMatrix.getColumns() );
-      hostX.setValue( 1.0 );
-      hostB.setSize( csrMatrix.getRows() );
-#ifdef HAVE_CUDA
-      typedef Containers::Vector< Real, Devices::Cuda, int > CudaVector;
-      CudaVector cudaX, cudaB;
-      Containers::Vector< int, Devices::Cuda, int > rowLengthsCuda;
-      cudaX.setSize( csrMatrix.getColumns() );
-      cudaX.setValue( 1.0 );
-      cudaB.setSize( csrMatrix.getRows() );
-      rowLengthsCuda.setSize( csrMatrix.getRows() );
-      rowLengthsCuda = rowLengthsHost;
-      cusparseHandle_t cusparseHandle;
-      cusparseCreate( &cusparseHandle );
-      const double baseline = benchmarkMatrix( csrMatrix,
-                                               hostX,
-                                               hostB,
-                                               nonzeroElements,
-                                               "CSR Host",
-                                               stopTime,
-                                               0.0,
-                                               verbose,
-                                               logFile );
-#ifdef HAVE_CUDA
-      typedef CSR< Real, Devices::Cuda, int > CSRCudaType;
-      CSRCudaType cudaCSR;
-      //cout << "Copying matrix to GPU... ";
-      cudaCSR = csrMatrix;
-      TNL::CusparseCSR< Real > cusparseCSR;
-      cusparseCSR.init( cudaCSR, &cusparseHandle );
-      benchmarkMatrix( cusparseCSR,
-                       cudaX,
-                       cudaB,
-                       nonzeroElements,
-                       "Cusparse CSR",
-                       stopTime,
-                       baseline,
-                       verbose,
-                       logFile );
-      cusparseDestroy( cusparseHandle );
-      std::cout << " done.   \r";
-      /*cudaCSR.setCudaKernelType( CSRCudaType::scalar );
-      benchmarkMatrix( cudaCSR,
-                       cudaX,
-                       cudaB,
-                       nonzeroElements,
-                       "CSR Cuda Scalar",
-                       stopTime,
-                       baseline,
-                       verbose,
-                       logFile );
-      cudaCSR.setCudaKernelType( CSRCudaType::vector );
-      cudaCSR.setCudaWarpSize( 1 );
-      benchmarkMatrix( cudaCSR,
-                       cudaX,
-                       cudaB,
-                       nonzeroElements,
-                       "CSR Cuda Vector 1",
-                       stopTime,
-                       baseline,
-                       verbose,
-                       logFile );
-      cudaCSR.setCudaWarpSize( 2 );
-      benchmarkMatrix( cudaCSR,
-                       cudaX,
-                       cudaB,
-                       nonzeroElements,
-                       "CSR Cuda Vector 2",
-                       stopTime,
-                       baseline,
-                       verbose,
-                       logFile );
-      cudaCSR.setCudaWarpSize( 4 );
-      benchmarkMatrix( cudaCSR,
-                       cudaX,
-                       cudaB,
-                       nonzeroElements,
-                       "CSR Cuda Vector 4",
-                       stopTime,
-                       baseline,
-                       verbose,
-                       logFile );
-      cudaCSR.setCudaWarpSize( 8 );
-      benchmarkMatrix( cudaCSR,
-                       cudaX,
-                       cudaB,
-                       nonzeroElements,
-                       "CSR Cuda Vector 8",
-                       stopTime,
-                       baseline,
-                       verbose,
-                       logFile );
-      cudaCSR.setCudaWarpSize( 16 );
-      benchmarkMatrix( cudaCSR,
-                       cudaX,
-                       cudaB,
-                       nonzeroElements,
-                       "CSR Cuda Vector 16",
-                       stopTime,
-                       baseline,
-                       verbose,
-                       logFile );
-      cudaCSR.setCudaWarpSize( 32 );
-      benchmarkMatrix( cudaCSR,
-                       cudaX,
-                       cudaB,
-                       nonzeroElements,
-                       "CSR Cuda Vector 32",
-                       stopTime,
-                       baseline,
-                       verbose,
-                       logFile );
-      cudaCSR.setCudaKernelType( CSRCudaType::hybrid );
-      cudaCSR.setHybridModeSplit( 2 );
-      benchmarkMatrix( cudaCSR,
-                       cudaX,
-                       cudaB,
-                       nonzeroElements,
-                       "CSR Cuda Hyrbid 2",
-                       stopTime,
-                       baseline,
-                       verbose,
-                       logFile );
-      cudaCSR.setHybridModeSplit( 4 );
-      benchmarkMatrix( cudaCSR,
-                       cudaX,
-                       cudaB,
-                       nonzeroElements,
-                       "CSR Cuda Hyrbid 4",
-                       stopTime,
-                       baseline,
-                       verbose,
-                       logFile );
-      cudaCSR.setHybridModeSplit( 8 );
-      benchmarkMatrix( cudaCSR,
-                       cudaX,
-                       cudaB,
-                       nonzeroElements,
-                       "CSR Cuda Hyrbid 8",
-                       stopTime,
-                       baseline,
-                       verbose,
-                       logFile );
-      cudaCSR.setHybridModeSplit( 16 );
-      benchmarkMatrix( cudaCSR,
-                       cudaX,
-                       cudaB,
-                       nonzeroElements,
-                       "CSR Cuda Hyrbid 16",
-                       stopTime,
-                       baseline,
-                       verbose,
-                       logFile );
-      cudaCSR.setHybridModeSplit( 32 );
-      benchmarkMatrix( cudaCSR,
-                       cudaX,
-                       cudaB,
-                       nonzeroElements,
-                       "CSR Cuda Hyrbid 32",
-                       stopTime,
-                       baseline,
-                       verbose,
-                       logFile );
-      cudaCSR.setHybridModeSplit( 64 );
-      benchmarkMatrix( cudaCSR,
-                       cudaX,
-                       cudaB,
-                       nonzeroElements,
-                       "CSR Cuda Hyrbid 64",
-                       stopTime,
-                       baseline,
-                       verbose,
-                       logFile );*/
-      cudaCSR.reset();
-      long int allocatedElements;
-      double padding;
-      typedef Ellpack< Real, Devices::Host, int > EllpackType;
-      EllpackType ellpackMatrix;
-      Matrices::copySparseMatrix( ellpackMatrix, csrMatrix );
-      allocatedElements = ellpackMatrix.getNumberOfMatrixElements();
-      padding = ( double ) allocatedElements / ( double ) nonzeroElements * 100.0 - 100.0;
-      logFile << "    " << padding << std::endl;
-      benchmarkMatrix( ellpackMatrix,
-                       hostX,
-                       hostB,
-                       nonzeroElements,
-                       "Ellpack Host",
-                       stopTime,
-                       baseline,
-                       verbose,
-                       logFile );
-#ifdef HAVE_CUDA
-      typedef Ellpack< Real, Devices::Cuda, int > EllpackCudaType;
-      EllpackCudaType cudaEllpack;
-      std::cout << "Copying matrix to GPU... ";
-      cudaEllpack = ellpackMatrix;
-      std::cout << " done.   \r";
-      benchmarkMatrix( cudaEllpack,
-                       cudaX,
-                       cudaB,
-                       nonzeroElements,
-                       "Ellpack Cuda",
-                       stopTime,
-                       baseline,
-                       verbose,
-                       logFile );
-      cudaEllpack.reset();
-      ellpackMatrix.reset();
-      typedef Matrices::EllpackSymmetric< Real, Devices::Host, int > EllpackSymmetricType;
-      EllpackSymmetricType EllpackSymmetric;
-      if( ! MatrixReader< EllpackSymmetricType >::readMtxFile( inputFileName, EllpackSymmetric, verbose, true ) )
-         writeTestFailed( logFile, 7 );
-      else
-      {
-         allocatedElements = EllpackSymmetric.getNumberOfMatrixElements();
-         padding = ( double ) allocatedElements / ( double ) nonzeroElements * 100.0 - 100.0;
-         logFile << "    " << padding <<std::endl;
-         benchmarkMatrix( EllpackSymmetric,
-                          hostX,
-                          hostB,
-                          nonzeroElements,
-                          "EllpackSym Host",
-                          stopTime,
-                          baseline,
-                          verbose,
-                          logFile );
-         EllpackSymmetric.reset();
-#ifdef HAVE_CUDA
-         typedef Matrices::EllpackSymmetric< Real, Devices::Cuda, int > EllpackSymmetricCudaType;
-         EllpackSymmetricCudaType cudaEllpackSymmetric;
-        std::cout << "Copying matrix to GPU... ";
-         for( int i = 0; i < rowLengthsHost.getSize(); i++ )
-             rowLengthsHost[ i ] = EllpackSymmetric.getRowLength( i );
-         rowLengthsCuda = rowLengthsHost;
-         // TODO: fix this
-         //if( ! cudaEllpackSymmetric.copyFrom( EllpackSymmetric, rowLengthsCuda ) )
-         {
-           std::cerr << "I am not able to transfer the matrix on GPU." <<std::endl;
-            writeTestFailed( logFile, 3 );
-         }
-         //else
-         {
-           std::cout << " done.   \r";
-            benchmarkMatrix( cudaEllpackSymmetric,
-                             cudaX,
-                             cudaB,
-                             nonzeroElements,
-                             "EllpackSym Cuda",
-                             stopTime,
-                             baseline,
-                             verbose,
-                             logFile );
-         }
-         cudaEllpackSymmetric.reset();
-      }
+   // open log file
+   auto mode = std::ios::out;
+   if( outputMode == "append" )
+       mode |= std::ios::app;
+   std::ofstream logFile( logFileName.getString(), mode );
-      typedef Matrices::SlicedEllpack< Real, Devices::Host, int > SlicedEllpackMatrixType;
-      SlicedEllpackMatrixType slicedEllpackMatrix;
-      if( ! Matrices::MatrixReader< SlicedEllpackMatrixType >::readMtxFile( inputFileName, slicedEllpackMatrix, verbose ) )
-         writeTestFailed( logFile, 7 );
-      else
-      {
-         allocatedElements = slicedEllpackMatrix.getNumberOfMatrixElements();
-         padding = ( double ) allocatedElements / ( double ) nonzeroElements * 100.0 - 100;
-         logFile << "    " << padding <<std::endl;
-         benchmarkMatrix( slicedEllpackMatrix,
-                          hostX,
-                          hostB,
-                          nonzeroElements,
-                          "SlicedEllpack Host",
-                          stopTime,
-                          baseline,
-                          verbose,
-                          logFile );
-#ifdef HAVE_CUDA
-         typedef Matrices::SlicedEllpack< Real, Devices::Cuda, int > SlicedEllpackMatrixCudaType;
-         SlicedEllpackMatrixCudaType cudaSlicedEllpackMatrix;
-         for( int i = 0; i < rowLengthsHost.getSize(); i++ )
-              rowLengthsHost[ i ] = slicedEllpackMatrix.getRowLength( i );
-         rowLengthsCuda = rowLengthsHost;
-         // TODO: fix
-         //if( ! cudaSlicedEllpackMatrix.copyFrom( slicedEllpackMatrix, rowLengthsCuda ) )
-         {
-            std::cerr << "Nejde zkopirovat" <<std::endl;
-             writeTestFailed( logFile, 3 );
-         }
-         //else
-         {
-           std::cout << " done.    \r";
-            benchmarkMatrix( cudaSlicedEllpackMatrix,
-                             cudaX,
-                             cudaB,
-                             nonzeroElements,
-                             "SlicedEllpack Cuda",
-                             stopTime,
-                             baseline,
-                             verbose,
-                             logFile );
-         }
-         cudaSlicedEllpackMatrix.reset();        
-      }
+   // init benchmark and common metadata
+   Benchmark benchmark( loops, verbose );
-      typedef Matrices::ChunkedEllpack< Real, Devices::Host, int > ChunkedEllpackType;
-      ChunkedEllpackType chunkedEllpack;
-      Matrices::copySparseMatrix( chunkedEllpack, csrMatrix );
-      allocatedElements = chunkedEllpack.getNumberOfMatrixElements();
-      padding = ( double ) allocatedElements / ( double ) nonzeroElements * 100.0 - 100.0;
-      logFile << "    " << padding << std::endl;
-      benchmarkMatrix( chunkedEllpack,
-                       hostX,
-                       hostB,
-                       nonzeroElements,
-                       "ChunkedEllpack Host",
-                       stopTime,
-                       baseline,
-                       verbose,
-                       logFile );
-#ifdef HAVE_CUDA
-      typedef Matrices::ChunkedEllpack< Real, Devices::Cuda, int > ChunkedEllpackCudaType;
-      ChunkedEllpackCudaType cudaChunkedEllpack;
-      std::cout << "Copying matrix to GPU... ";
-      cudaChunkedEllpack = chunkedEllpack;
-      std::cout << " done.    \r";
-      benchmarkMatrix( cudaChunkedEllpack,
-                       cudaX,
-                       cudaB,
-                       nonzeroElements,
-                       "ChunkedEllpack Cuda",
-                       stopTime,
-                       baseline,
-                       verbose,
-                       logFile );
-      cudaChunkedEllpack.reset();
+   // prepare global metadata
+   Benchmark::MetadataMap metadata = getHardwareMetadata();
-      typedef Matrices::BiEllpack< Real, Devices::Host, int > BiEllpackMatrixType;
-      BiEllpackMatrixType biEllpackMatrix;
-      // TODO: I did not check this during git merging, but I hope its gonna work
-      //   Tomas Oberhuber
-      //    copySparseMatrix( biEllpackMatrix, csrMatrix ); // TODO:Fix the getRow method to be compatible with othr formats
-      /*if( ! biEllpackMatrix.copyFrom( csrMatrix, rowLengthsHost ) )
-         writeTestFailed( logFile, 7 );
-      else*/
-      {
-         allocatedElements = biEllpackMatrix.getNumberOfMatrixElements();
-         padding = ( double ) allocatedElements / ( double ) nonzeroElements * 100.0 - 100.0;
-         logFile << "    " << padding <<std::endl;
-         benchmarkMatrix( biEllpackMatrix,
-                          hostX,
-                          hostB,
-                          nonzeroElements,
-                          "BiEllpack Host",
-                          stopTime,
-                          baseline,
-                          verbose,
-                          logFile );
-         biEllpackMatrix.reset();
+   if( precision == "all" || precision == "float" )
+      runSpMVBenchmarks< float >( benchmark, metadata, size, elementsPerRow );
+   if( precision == "all" || precision == "double" )
+      runSpMVBenchmarks< double >( benchmark, metadata, size, elementsPerRow );
-#ifdef HAVE_CUDA
-         typedef Matrices::BiEllpack< Real, Devices::Cuda, int > BiEllpackMatrixCudaType;
-         BiEllpackMatrixCudaType cudaBiEllpackMatrix;
-         // TODO: I did not check this during git merging, but I hope its gonna work
-         //   Tomas Oberhuber
-         //    copySparseMatrix( biEllpackMatrix, csrMatrix ); // TODO:Fix the getRow method to be compatible with othr formats
-        std::cout << "Copying matrix to GPU... ";
-         /*if( ! cudaBiEllpackMatrix.copyFrom( biEllpackMatrix, rowLengthsCuda ) )
-         {
-           std::cerr << "I am not able to transfer the matrix on GPU." <<std::endl;
-            writeTestFailed( logFile, 3 );
-         }
-         else*/
-         {
-           std::cout << " done.    \r";
-            benchmarkMatrix( cudaBiEllpackMatrix,
-                             cudaX,
-                             cudaB,
-                             nonzeroElements,
-                             "BiEllpack Cuda",
-                             stopTime,
-                             baseline,
-                             verbose,
-                             logFile );
-         }
-         cudaBiEllpackMatrix.reset();
-      }
-      typedef Matrices::SlicedEllpackSymmetric< Real, Devices::Host, int > SlicedEllpackSymmetricType;
-      SlicedEllpackSymmetricType slicedEllpackSymmetric;
-      if( ! Matrices::MatrixReader< SlicedEllpackSymmetricType >::readMtxFile( inputFileName, slicedEllpackSymmetric, verbose, true ) )
-         writeTestFailed( logFile, 7 );
-      else
-      {
-         allocatedElements = slicedEllpackSymmetric.getNumberOfMatrixElements();
-         padding = ( double ) allocatedElements / ( double ) nonzeroElements * 100.0 - 100.0;
-         logFile << "    " << padding <<std::endl;
-         benchmarkMatrix( slicedEllpackSymmetric,
-                          hostX,
-                          hostB,
-                          nonzeroElements,
-                          "SlicedEllpackSym Host",
-                          stopTime,
-                          baseline,
-                          verbose,
-                          logFile );
-         slicedEllpackSymmetric.reset();
-#ifdef HAVE_CUDA
-         typedef Matrices::SlicedEllpackSymmetric< Real, Devices::Cuda, int > SlicedEllpackSymmetricCudaType;
-         SlicedEllpackSymmetricCudaType cudaSlicedEllpackSymmetric;
-        std::cout << "Copying matrix to GPU... ";
-         for( int i = 0; i < rowLengthsHost.getSize(); i++ )
-             rowLengthsHost[ i ] = slicedEllpackSymmetric.getRowLength( i );
-         rowLengthsCuda = rowLengthsHost;
-         // TODO: fiox the nest line
-         //if( ! cudaSlicedEllpackSymmetric.copyFrom( slicedEllpackSymmetric, rowLengthsCuda ) )
-         {
-           std::cerr << "I am not able to transfer the matrix on GPU." <<std::endl;
-            writeTestFailed( logFile, 3 );
-         }
-         //else
-         {
-           std::cout << " done.   \r";
-            benchmarkMatrix( cudaSlicedEllpackSymmetric,
-                             cudaX,
-                             cudaB,
-                             nonzeroElements,
-                             "SlicedEllpackSym Cuda",
-                             stopTime,
-                             baseline,
-                             verbose,
-                             logFile );
-         }
-         cudaSlicedEllpackSymmetric.reset();
-      }
-      typedef Matrices::EllpackSymmetricGraph< Real, Devices::Host, int > EllpackSymmetricGraphMatrixType;
-      EllpackSymmetricGraphMatrixType EllpackSymmetricGraphMatrix;
-      if( ! Matrices::MatrixReader< EllpackSymmetricGraphMatrixType >::readMtxFile( inputFileName, EllpackSymmetricGraphMatrix, verbose, true ) ||
-          ! EllpackSymmetricGraphMatrix.help() )
-         writeTestFailed( logFile, 7 );
-      else
-      {
-         allocatedElements = EllpackSymmetricGraphMatrix.getNumberOfMatrixElements();
-         padding = ( double ) allocatedElements / ( double ) nonzeroElements * 100.0 - 100.0;
-         logFile << "    " << padding <<std::endl;
-         benchmarkMatrix( EllpackSymmetricGraphMatrix,
-                          hostX,
-                          hostB,
-                          nonzeroElements,
-                          "Ellpack Graph Host",
-                          stopTime,
-                          baseline,
-                          verbose,
-                          logFile );
-         EllpackSymmetricGraphMatrix.reset();
-#ifdef HAVE_CUDA
-         typedef Matrices::EllpackSymmetricGraph< Real, Devices::Cuda, int > EllpackSymmetricGraphMatrixCudaType;
-         EllpackSymmetricGraphMatrixCudaType cudaEllpackSymmetricGraphMatrix;
-        std::cout << "Copying matrix to GPU... ";
-         for( int i = 0; i < rowLengthsHost.getSize(); i++ )
-             rowLengthsHost[ i ] = EllpackSymmetricGraphMatrix.getRowLength( i );
-         rowLengthsCuda = rowLengthsHost;
-         // TODO: fix it
-         //if( ! cudaEllpackSymmetricGraphMatrix.copyFrom( EllpackSymmetricGraphMatrix, rowLengthsCuda ) ) 
-         {
-            writeTestFailed( logFile, 3 );
-         }
-         //else if( ! cudaEllpackSymmetricGraphMatrix.help() )
-         {
-            writeTestFailed( logFile, 3 );
-         } 
-         //else
-         {
-            std::cout << " done.   \r";
-            benchmarkMatrix( cudaEllpackSymmetricGraphMatrix,
-                             cudaX,
-                             cudaB,
-                             nonzeroElements,
-                             "Ellpack Graph Cuda",
-                             stopTime,
-                             baseline,
-                             verbose,
-                             logFile );
-         }
-         cudaEllpackSymmetricGraphMatrix.reset();
-      }
-        typedef Matrices::AdEllpack< Real, Devices::Host, int > AdEllpackMatrixType;
-        AdEllpackMatrixType adEllpackMatrix;
-         // TODO: I did not check this during git merging, but I hope its gonna work
-         //   Tomas Oberhuber
-        //copySparseMatrix( adEllpackMatrix, csrMatrix ); // TODO:Fix the getRow method to be compatible with othr formats
-        /*if( ! adEllpackMatrix.copyFrom( csrMatrix, rowLengthsHost ) )
-           writeTestFailed( logFile, 7 );
-        else*/
-        {
-           allocatedElements = adEllpackMatrix.getNumberOfMatrixElements();
-           padding = ( double ) allocatedElements / ( double ) nonzeroElements * 100.0 - 100.0;
-           logFile << "    " << padding <<std::endl;
-           benchmarkMatrix( adEllpackMatrix,
-                            hostX,
-                            hostB,
-                            nonzeroElements,
-                            "AdEllpack Host",
-                            stopTime,
-                            baseline,
-                            verbose,
-                            logFile );
-           adEllpackMatrix.reset();
-        }
-#ifdef HAVE_CUDA
-         typedef Matrices::AdEllpack< Real, Devices::Cuda, int > AdEllpackMatrixCudaType;
-         AdEllpackMatrixCudaType cudaAdEllpackMatrix;
-         // TODO: I did not check this during git merging, but I hope its gonna work
-         //   Tomas Oberhuber
-        //copySparseMatrix( adEllpackMatrix, csrMatrix ); // TODO:Fix the getRow method to be compatible with othr formats
-        std::cout << "Copying matrix to GPU... ";
-         /*if( ! cudaAdEllpackMatrix.copyFrom( csrMatrix, rowLengthsCuda ) )
-         {
-           std::cerr << "I am not able to transfer the matrix on GPU." <<std::endl;
-            writeTestFailed( logFile, 3 );
-         }
-         else*/
-         {
-	    allocatedElements = cudaAdEllpackMatrix.getNumberOfMatrixElements();
-	    padding = ( double ) allocatedElements / ( double ) nonzeroElements * 100.0 - 100.0;
-            logFile << "    " << padding <<std::endl;
-           std::cout << " done.    \r";
-            benchmarkMatrix( cudaAdEllpackMatrix,
-                             cudaX,
-                             cudaB,
-                             nonzeroElements,
-                             "AdEllpack Cuda",
-                             stopTime,
-                             baseline,
-                             verbose,
-                             logFile );
-           cudaAdEllpackMatrix.reset();
-	}
+   if( ! benchmark.save( logFile ) ) {
+      std::cerr << "Failed to write the benchmark results to file '" << parameters.getParameter< String >( "log-file" ) << "'." << std::endl;
+      return EXIT_FAILURE;
-   return true;
-int main( int argc, char* argv[] )
-   Config::ParameterContainer parameters;
-   Config::ConfigDescription conf_desc;
-   setupConfig( conf_desc );
-   if( ! parseCommandLine( argc, argv, conf_desc, parameters ) )
-   {
-      conf_desc.printUsage( argv[ 0 ] );
-      return 1;
-   }
-   const String& precision = parameters.getParameter< String >( "precision" );
-   if( precision == "float" )
-      if( ! setupBenchmark< float >( parameters ) )
-         return EXIT_FAILURE;
-   if( precision == "double" )
-      if( ! setupBenchmark< double >( parameters ) )
-         return EXIT_FAILURE;
    return EXIT_SUCCESS;