From 7611149633cbaca282c824937ce6cc332a7815a9 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= <klinkjak@fjfi.cvut.cz>
Date: Wed, 12 Sep 2018 10:24:09 +0200
Subject: [PATCH] Added tnl-benchmark-distributed-spmv

---
 src/Benchmarks/CMakeLists.txt                 |   1 +
 src/Benchmarks/DistSpMV/CMakeLists.txt        |  11 +
 src/Benchmarks/DistSpMV/ordering.h            | 133 ++++++
 .../tnl-benchmark-distributed-spmv.cpp        |  11 +
 .../tnl-benchmark-distributed-spmv.cu         |  11 +
 .../DistSpMV/tnl-benchmark-distributed-spmv.h | 395 ++++++++++++++++++
 6 files changed, 562 insertions(+)
 create mode 100644 src/Benchmarks/DistSpMV/CMakeLists.txt
 create mode 100644 src/Benchmarks/DistSpMV/ordering.h
 create mode 100644 src/Benchmarks/DistSpMV/tnl-benchmark-distributed-spmv.cpp
 create mode 100644 src/Benchmarks/DistSpMV/tnl-benchmark-distributed-spmv.cu
 create mode 100644 src/Benchmarks/DistSpMV/tnl-benchmark-distributed-spmv.h

diff --git a/src/Benchmarks/CMakeLists.txt b/src/Benchmarks/CMakeLists.txt
index e34ade5be3..e0637205f7 100644
--- a/src/Benchmarks/CMakeLists.txt
+++ b/src/Benchmarks/CMakeLists.txt
@@ -1,6 +1,7 @@
 add_subdirectory( HeatEquation )
 add_subdirectory( BLAS )
 add_subdirectory( SpMV )
+add_subdirectory( DistSpMV )
 add_subdirectory( LinearSolvers )
 
 set( headers
diff --git a/src/Benchmarks/DistSpMV/CMakeLists.txt b/src/Benchmarks/DistSpMV/CMakeLists.txt
new file mode 100644
index 0000000000..57ccdd7a9f
--- /dev/null
+++ b/src/Benchmarks/DistSpMV/CMakeLists.txt
@@ -0,0 +1,11 @@
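+# Build a CUDA-enabled variant of the benchmark when CUDA support is enabled.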
+if( BUILD_CUDA )
+   cuda_add_executable( tnl-benchmark-distributed-spmv-cuda tnl-benchmark-distributed-spmv.cu )
+   target_link_libraries( tnl-benchmark-distributed-spmv-cuda tnl )
+
+   install( TARGETS tnl-benchmark-distributed-spmv-cuda RUNTIME DESTINATION bin )
+endif()
+
+add_executable( tnl-benchmark-distributed-spmv tnl-benchmark-distributed-spmv.cpp )
+target_link_libraries( tnl-benchmark-distributed-spmv tnl )
+
+install( TARGETS tnl-benchmark-distributed-spmv RUNTIME DESTINATION bin )
diff --git a/src/Benchmarks/DistSpMV/ordering.h b/src/Benchmarks/DistSpMV/ordering.h
new file mode 100644
index 0000000000..5bd68a95bf
--- /dev/null
+++ b/src/Benchmarks/DistSpMV/ordering.h
@@ -0,0 +1,133 @@
+#pragma once
+
+#include <algorithm>
+
+#include <TNL/Devices/Host.h>
+#include <TNL/ParallelFor.h>
+
+using namespace TNL;
+
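+// Computes a trivial interleaving of the DOFs: rows i and i + N of the
+// original ordering (where N is half the number of rows) become rows 2*i and
+// 2*i + 1 of the new ordering. Note that this assumes an even number of rows.
+// For example (a sketch), for a matrix with 6 rows:
+//    perm  = { 0, 3, 1, 4, 2, 5 }   // new-to-old mapping
+//    iperm = { 0, 2, 4, 1, 3, 5 }   // old-to-new mapping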
+template< typename Matrix, typename PermutationVector >
+void
+getTrivialOrdering( const Matrix& matrix, PermutationVector& perm, PermutationVector& iperm )
+{
+   using IndexType = typename Matrix::IndexType;
+
+   // allocate permutation vectors
+   perm.setSize( matrix.getRows() );
+   iperm.setSize( matrix.getRows() );
+
+   const IndexType N = matrix.getRows() / 2;
+   for( IndexType i = 0; i < N; i++ ) {
+      perm[ 2 * i ] = i;
+      perm[ 2 * i + 1 ] = i + N;
+      iperm[ i ] = 2 * i;
+      iperm[ i + N ] = 2 * i + 1;
+   }
+}
+
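+// Permutes a vector: computes dest[ i ] = src[ perm[ i ] ] in parallel on the
+// vector's device via ParallelFor. Note that the permutation is stored in a
+// vector as well, so its element type is typename PermutationVector::RealType.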
+template< typename Vector, typename PermutationVector >
+void
+reorderVector( const Vector& src, Vector& dest, const PermutationVector& perm )
+{
+   TNL_ASSERT_EQ( src.getSize(), perm.getSize(),
+                  "Source vector and permutation must have the same size." );
+   using RealType = typename Vector::RealType;
+   using DeviceType = typename Vector::DeviceType;
+   using IndexType = typename Vector::IndexType;
+
+   auto kernel = [] __cuda_callable__
+      ( IndexType i,
+        const RealType* src,
+        RealType* dest,
+        const typename PermutationVector::RealType* perm )
+   {
+      dest[ i ] = src[ perm[ i ] ];
+   };
+
+   dest.setLike( src );
+
+   ParallelFor< DeviceType >::exec( (IndexType) 0, src.getSize(),
+                                    kernel,
+                                    src.getData(),
+                                    dest.getData(),
+                                    perm.getData() );
+}
+
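+// Permutes the rows and columns of a sparse matrix: row perm( i ) of matrix1
+// becomes row i of matrix2, column indices are remapped with iperm and sorted.
+// Padding entries (column index >= matrix1.getColumns()) are dropped.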
+template< typename Matrix, typename PermutationVector >
+void
+reorderMatrix( const Matrix& matrix1, Matrix& matrix2, const PermutationVector& _perm, const PermutationVector& _iperm )
+{
+   // TODO: implement on GPU
+   static_assert( std::is_same< typename Matrix::DeviceType, Devices::Host >::value, "matrix reordering is implemented only for host" );
+   static_assert( std::is_same< typename PermutationVector::DeviceType, Devices::Host >::value, "matrix reordering is implemented only for host" );
+
+   using namespace TNL;
+   using IndexType = typename Matrix::IndexType;
+
+   matrix2.setLike( matrix1 );
+
+   // general multidimensional accessors for permutation indices
+   // TODO: this depends on the specific layout of dofs, general reordering of NDArray is needed
+   auto perm = [&]( IndexType dof ) {
+      TNL_ASSERT_LT( dof, matrix1.getRows(), "invalid dof index" );
+      const IndexType i = dof / _perm.getSize();
+      return i * _perm.getSize() + _perm[ dof % _perm.getSize() ];
+   };
+   auto iperm = [&]( IndexType dof ) {
+      TNL_ASSERT_LT( dof, matrix1.getRows(), "invalid dof index" );
+      const IndexType i = dof / _iperm.getSize();
+      return i * _iperm.getSize() + _iperm[ dof % _iperm.getSize() ];
+   };
+
+   // set row lengths
+   typename Matrix::CompressedRowLengthsVector rowLengths;
+   rowLengths.setSize( matrix1.getRows() );
+   for( IndexType i = 0; i < matrix1.getRows(); i++ ) {
+      const IndexType maxLength = matrix1.getRowLength( perm( i ) );
+      const auto row = matrix1.getRow( perm( i ) );
+      IndexType length = 0;
+      for( IndexType j = 0; j < maxLength; j++ )
+         if( row.getElementColumn( j ) < matrix1.getColumns() )
+            length++;
+      rowLengths[ i ] = length;
+   }
+   matrix2.setCompressedRowLengths( rowLengths );
+
+   // set row elements
+   for( IndexType i = 0; i < matrix2.getRows(); i++ ) {
+      const IndexType rowLength = rowLengths[ i ];
+
+      // extract sparse row
+      const auto row1 = matrix1.getRow( perm( i ) );
+
+      // permute
+      typename Matrix::IndexType columns[ rowLength ];
+      typename Matrix::RealType values[ rowLength ];
+      for( IndexType j = 0; j < rowLength; j++ ) {
+         columns[ j ] = iperm( row1.getElementColumn( j ) );
+         values[ j ] = row1.getElementValue( j );
+      }
+
+      // sort
+      IndexType indices[ rowLength ];
+      for( IndexType j = 0; j < rowLength; j++ )
+         indices[ j ] = j;
+      // Note: nvcc does not allow lambdas to capture variable-length arrays, even
+      // in host code ("error: a variable captured by a lambda cannot have a type
+      // involving a variable-length array"), so the VLA is decayed to a plain
+      // pointer before being captured.
+      IndexType* _columns = columns;
+      auto comparator = [=]( IndexType a, IndexType b ) {
+         return _columns[ a ] < _columns[ b ];
+      };
+      std::sort( indices, indices + rowLength, comparator );
+
+      typename Matrix::IndexType sortedColumns[ rowLength ];
+      typename Matrix::RealType sortedValues[ rowLength ];
+      for( IndexType j = 0; j < rowLength; j++ ) {
+         sortedColumns[ j ] = columns[ indices[ j ] ];
+         sortedValues[ j ] = values[ indices[ j ] ];
+      }
+
+      matrix2.setRow( i, sortedColumns, sortedValues, rowLength );
+   }
+}
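+
+// A minimal usage sketch (with hypothetical type choices, assuming a host
+// matrix; the permutation's element type is the matrix index type):
+//
+//    using MatrixType = Matrices::SlicedEllpack< double, Devices::Host, int >;
+//    using PermutationVector = Containers::Vector< int, Devices::Host, int >;
+//    MatrixType matrix, reordered;
+//    // ... load or assemble matrix ...
+//    PermutationVector perm, iperm;
+//    getTrivialOrdering( matrix, perm, iperm );
+//    reorderMatrix( matrix, reordered, perm, iperm );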
diff --git a/src/Benchmarks/DistSpMV/tnl-benchmark-distributed-spmv.cpp b/src/Benchmarks/DistSpMV/tnl-benchmark-distributed-spmv.cpp
new file mode 100644
index 0000000000..63c02eab46
--- /dev/null
+++ b/src/Benchmarks/DistSpMV/tnl-benchmark-distributed-spmv.cpp
@@ -0,0 +1,11 @@
+/***************************************************************************
+                          tnl-benchmark-distributed-spmv.cpp  -  description
+                             -------------------
+    begin                : Sep 11, 2018
+    copyright            : (C) 2018 by Tomas Oberhuber et al.
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+#include "tnl-benchmark-distributed-spmv.h"
diff --git a/src/Benchmarks/DistSpMV/tnl-benchmark-distributed-spmv.cu b/src/Benchmarks/DistSpMV/tnl-benchmark-distributed-spmv.cu
new file mode 100644
index 0000000000..7e9094fbbc
--- /dev/null
+++ b/src/Benchmarks/DistSpMV/tnl-benchmark-distributed-spmv.cu
@@ -0,0 +1,11 @@
+/***************************************************************************
+                          tnl-benchmark-distributed-spmv.cu  -  description
+                             -------------------
+    begin                : Sep 11, 2018
+    copyright            : (C) 2018 by Tomas Oberhuber et al.
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+#include "tnl-benchmark-distributed-spmv.h"
diff --git a/src/Benchmarks/DistSpMV/tnl-benchmark-distributed-spmv.h b/src/Benchmarks/DistSpMV/tnl-benchmark-distributed-spmv.h
new file mode 100644
index 0000000000..1aed36ec8f
--- /dev/null
+++ b/src/Benchmarks/DistSpMV/tnl-benchmark-distributed-spmv.h
@@ -0,0 +1,395 @@
+/***************************************************************************
+                          tnl-benchmark-distributed-spmv.h  -  description
+                             -------------------
+    begin                : Sep 11, 2018
+    copyright            : (C) 2018 by Tomas Oberhuber et al.
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+// Implemented by: Jakub Klinkovsky
+
+#pragma once
+
+#ifndef NDEBUG
+#include <TNL/Debugging/FPE.h>
+#endif
+
+#include <TNL/Devices/Host.h>
+#include <TNL/Devices/Cuda.h>
+#include <TNL/Devices/SystemInfo.h>
+#include <TNL/Devices/CudaDeviceInfo.h>
+#include <TNL/Config/ConfigDescription.h>
+#include <TNL/Config/ParameterContainer.h>
+#include <TNL/Communicators/MpiCommunicator.h>
+#include <TNL/Communicators/NoDistrCommunicator.h>
+#include <TNL/Communicators/ScopedInitializer.h>
+#include <TNL/DistributedContainers/Partitioner.h>
+#include <TNL/DistributedContainers/DistributedVector.h>
+#include <TNL/DistributedContainers/DistributedMatrix.h>
+
+#include "../Benchmarks.h"
+#include "ordering.h"
+
+#include <TNL/Matrices/SlicedEllpack.h>
+
+using namespace TNL;
+using namespace TNL::Benchmarks;
+
+#ifdef HAVE_MPI
+using CommunicatorType = Communicators::MpiCommunicator;
+#else
+using CommunicatorType = Communicators::NoDistrCommunicator;
+#endif
+
+
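+// Benchmarks a non-distributed SpMV: the product y = A * x is timed repeatedly,
+// with y reset before each timed loop.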
+template< typename Matrix, typename Vector >
+void
+benchmarkSpmv( Benchmark& benchmark,
+               const Matrix& matrix,
+               const Vector& x,
+               const char* performer = "CPU" )
+{
+   Vector y;
+   y.setLike( x );
+
+   // reset function
+   auto reset = [&]() {
+      y = x;
+   };
+
+   // benchmark function
+   auto compute = [&]() {
+      matrix.vectorProduct( x, y );
+   };
+
+   benchmark.time( reset, performer, compute );
+}
+
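+// Copies the matrix and the input vector to the GPU and benchmarks the SpMV
+// there. The host-to-device copy of the matrix is timed and reported separately.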
+template< typename Matrix, typename Vector >
+void
+benchmarkSpmvCuda( Benchmark& benchmark,
+                   const Matrix& matrix,
+                   const Vector& x )
+{
+   using CudaMatrix = typename Matrix::CudaType;
+   using CudaVector = typename Vector::CudaType;
+
+   CudaVector cuda_x;
+   cuda_x = x;
+
+   Timer t;
+   t.start();
+
+   CudaMatrix cuda_matrix;
+   cuda_matrix = matrix;
+
+   t.stop();
+   std::cout << "--> Copying the matrix to the GPU took " << t.getRealTime() << " seconds." << std::endl;
+
+   benchmarkSpmv( benchmark, cuda_matrix, cuda_x, "GPU" );
+}
+
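+// Benchmarks a distributed SpMV. The barrier after each product ensures that
+// the measured time includes the communication and the slowest MPI rank.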
+template< typename Matrix, typename Vector >
+void
+benchmarkDistributedSpmv( Benchmark& benchmark,
+                          // TODO: cannot be const due to internal buffering
+//                          const Matrix& matrix,
+                          Matrix& matrix,
+                          const Vector& x,
+                          const char* performer = "CPU" )
+{
+   Vector y;
+   y.setLike( x );
+
+   // reset function
+   auto reset = [&]() {
+      y = x;
+   };
+
+   // benchmark function
+   auto compute = [&]() {
+      matrix.vectorProduct( x, y );
+      Matrix::CommunicatorType::Barrier( matrix.getCommunicationGroup() );
+   };
+
+   benchmark.time( reset, performer, compute );
+}
+
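+// Distributed counterpart of benchmarkSpmvCuda: copies the distributed matrix
+// and vector to the GPU and reruns the distributed benchmark there.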
+template< typename Matrix, typename Vector >
+void
+benchmarkDistributedSpmvCuda( Benchmark& benchmark,
+                              const Matrix& matrix,
+                              const Vector& x )
+{
+   using CudaMatrix = typename Matrix::CudaType;
+   using CudaVector = typename Vector::CudaType;
+
+   CudaVector cuda_x;
+   cuda_x = x;
+
+   Timer t;
+   t.start();
+
+   CudaMatrix cuda_matrix;
+   cuda_matrix = matrix;
+
+   t.stop();
+   std::cout << "--> Copying the matrix to the GPU took " << t.getRealTime() << " seconds." << std::endl;
+
+   benchmarkDistributedSpmv( benchmark, cuda_matrix, cuda_x, "GPU" );
+}
+
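+// Benchmark driver: loads the input matrix and vector, optionally reorders the
+// DOFs, and dispatches to the distributed or non-distributed benchmark.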
+template< typename MatrixType >
+struct SpmvBenchmark
+{
+   using RealType = typename MatrixType::RealType;
+   using DeviceType = typename MatrixType::DeviceType;
+   using IndexType = typename MatrixType::IndexType;
+   using VectorType = Containers::Vector< RealType, DeviceType, IndexType >;
+
+   using Partitioner = DistributedContainers::Partitioner< IndexType, CommunicatorType >;
+   using DistributedMatrix = DistributedContainers::DistributedMatrix< MatrixType, CommunicatorType >;
+   using DistributedVector = DistributedContainers::DistributedVector< RealType, DeviceType, IndexType, CommunicatorType >;
+   using DistributedRowLengths = typename DistributedMatrix::CompressedRowLengthsVector;
+
+   static bool
+   run( Benchmark& benchmark,
+        Benchmark::MetadataMap metadata,
+        const Config::ParameterContainer& parameters )
+   {
+      MatrixType matrix;
+      VectorType vector;
+      if( ! matrix.load( parameters.getParameter< String >( "input-matrix" ) ) ||
+          ! vector.load( parameters.getParameter< String >( "input-vector" ) ) )
+          return false;
+
+      typename MatrixType::CompressedRowLengthsVector rowLengths;
+      matrix.getCompressedRowLengths( rowLengths );
+      const IndexType maxRowLength = rowLengths.max();
+
+      const String name = String( (CommunicatorType::isDistributed()) ? "DistSpMV" : "SpMV" )
+                          + " (" + parameters.getParameter< String >( "name" ) + "): ";
+      benchmark.newBenchmark( name, metadata );
+      benchmark.setMetadataColumns( Benchmark::MetadataColumns({
+         // TODO: strip the device
+//         {"matrix type", matrix.getType()},
+         {"rows", matrix.getRows()},
+         {"columns", matrix.getColumns()},
+         // FIXME: getMaxRowLengths() returns 0 for matrices loaded from file
+//         {"max elements per row", matrix.getMaxRowLength()},
+         {"max elements per row", maxRowLength},
+      } ));
+
+      const bool reorder = parameters.getParameter< bool >( "reorder-dofs" );
+      if( reorder ) {
+         using PermutationVector = Containers::Vector< IndexType, DeviceType, IndexType >;
+         PermutationVector perm, iperm;
+         getTrivialOrdering( matrix, perm, iperm );
+         MatrixType matrix_perm;
+         reorderMatrix( matrix, matrix_perm, perm, iperm );
+         if( CommunicatorType::isDistributed() )
+            runDistributed( benchmark, metadata, parameters, matrix_perm, vector );
+         else
+            runNonDistributed( benchmark, metadata, parameters, matrix_perm, vector );
+      }
+      else {
+         if( CommunicatorType::isDistributed() )
+            runDistributed( benchmark, metadata, parameters, matrix, vector );
+         else
+            runNonDistributed( benchmark, metadata, parameters, matrix, vector );
+      }
+
+      return true;
+   }
+
+   static void
+   runNonDistributed( Benchmark& benchmark,
+                      Benchmark::MetadataMap metadata,
+                      const Config::ParameterContainer& parameters,
+                      MatrixType& matrix,
+                      VectorType& vector )
+   {
+      benchmarkSpmv( benchmark, matrix, vector );
+#ifdef HAVE_CUDA
+      benchmarkSpmvCuda( benchmark, matrix, vector );
+#endif
+   }
+
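+   // Distributes the global matrix and vector row-wise using the Partitioner
+   // and benchmarks the distributed SpMV. In debug builds, the result of the
+   // distributed SpMV is verified against the sequential product.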
+   static void
+   runDistributed( Benchmark& benchmark,
+                   Benchmark::MetadataMap metadata,
+                   const Config::ParameterContainer& parameters,
+                   MatrixType& matrix,
+                   VectorType& vector )
+   {
+      // set up the distributed matrix
+      const auto group = CommunicatorType::AllGroup;
+      const auto localRange = Partitioner::splitRange( matrix.getRows(), group );
+      DistributedMatrix distributedMatrix( localRange, matrix.getRows(), matrix.getColumns(), group );
+      DistributedVector distributedVector( localRange, matrix.getRows(), group );
+
+      // copy the row lengths from the global matrix to the distributed matrix
+      DistributedRowLengths distributedRowLengths( localRange, matrix.getRows(), group );
+      for( IndexType i = 0; i < distributedMatrix.getLocalMatrix().getRows(); i++ ) {
+         const auto gi = distributedMatrix.getLocalRowRange().getGlobalIndex( i );
+         distributedRowLengths[ gi ] = matrix.getRowLength( gi );
+      }
+      distributedMatrix.setCompressedRowLengths( distributedRowLengths );
+
+      // copy data from the global matrix/vector into the distributed matrix/vector
+      for( IndexType i = 0; i < distributedMatrix.getLocalMatrix().getRows(); i++ ) {
+         const auto gi = distributedMatrix.getLocalRowRange().getGlobalIndex( i );
+         distributedVector[ gi ] = vector[ gi ];
+
+         // note: the row length must be taken for the *global* row index gi
+         const IndexType rowLength = matrix.getRowLength( gi );
+         IndexType columns[ rowLength ];
+         RealType values[ rowLength ];
+         matrix.getRowFast( gi, columns, values );
+         distributedMatrix.setRowFast( gi, columns, values, rowLength );
+      }
+
+      benchmarkDistributedSpmv( benchmark, distributedMatrix, distributedVector );
+#ifdef HAVE_CUDA
+      benchmarkDistributedSpmvCuda( benchmark, distributedMatrix, distributedVector );
+#endif
+
+#ifndef NDEBUG
+      // compare results of normal and distributed spmv
+      VectorType y;
+      y.setLike( vector );
+      matrix.vectorProduct( vector, y );
+      DistributedVector distributedY;
+      distributedY.setLike( distributedVector );
+      distributedMatrix.vectorProduct( distributedVector, distributedY );
+      const int rank = CommunicatorType::GetRank( distributedMatrix.getCommunicationGroup() );
+      const int nproc = CommunicatorType::GetSize( distributedMatrix.getCommunicationGroup() );
+      VectorType subY( y,
+                       Partitioner::getOffset( matrix.getRows(), rank, nproc ),
+                       Partitioner::getSizeForRank( matrix.getRows(), rank, nproc ) );
+      TNL_ASSERT_EQ( distributedY.getLocalVectorView(), subY,
+                     "the result of the distributed SpMV does not match the sequential SpMV" );
+#endif
+   }
+};
+
+void
+configSetup( Config::ConfigDescription & config )
+{
+   config.addDelimiter( "Benchmark settings:" );
+   config.addEntry< String >( "log-file", "Log file name.", "tnl-benchmark-distributed-spmv.log");
+   config.addEntry< String >( "output-mode", "Mode for opening the log file.", "overwrite" );
+   config.addEntryEnum( "append" );
+   config.addEntryEnum( "overwrite" );
+   config.addEntry< int >( "loops", "Number of repetitions of the benchmark.", 10 );
+   config.addRequiredEntry< String >( "input-matrix", "File name of the input matrix (in binary TNL format)." );
+   config.addRequiredEntry< String >( "input-vector", "File name of the input vector (in binary TNL format)." );
+   config.addEntry< String >( "name", "Name of the matrix in the benchmark.", "" );
+   config.addEntry< int >( "verbose", "Verbose mode.", 1 );
+   config.addEntry< bool >( "reorder-dofs", "Reorder the matrix so that entries corresponding to the same DOF are grouped together.", false );
+
+   config.addDelimiter( "Device settings:" );
+   Devices::Host::configSetup( config );
+   Devices::Cuda::configSetup( config );
+   CommunicatorType::configSetup( config );
+}
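+
+// Example invocation (a sketch with hypothetical file names):
+//
+//    mpirun -np 4 tnl-benchmark-distributed-spmv \
+//           --input-matrix matrix.tnl --input-vector vector.tnl --name mymatrix
+//
+// Without MPI support (HAVE_MPI undefined), the benchmark falls back to the
+// non-distributed code path.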
+
+int
+main( int argc, char* argv[] )
+{
+#ifndef NDEBUG
+   Debugging::trackFloatingPointExceptions();
+#endif
+
+   Config::ParameterContainer parameters;
+   Config::ConfigDescription conf_desc;
+
+   configSetup( conf_desc );
+
+   Communicators::ScopedInitializer< CommunicatorType > scopedInit( argc, argv );
+   const int rank = CommunicatorType::GetRank( CommunicatorType::AllGroup );
+
+   if( ! parseCommandLine( argc, argv, conf_desc, parameters ) ) {
+      conf_desc.printUsage( argv[ 0 ] );
+      return EXIT_FAILURE;
+   }
+
+   if( ! Devices::Host::setup( parameters ) ||
+       ! Devices::Cuda::setup( parameters ) ||
+       ! CommunicatorType::setup( parameters ) )
+      return EXIT_FAILURE;
+
+   const String & logFileName = parameters.getParameter< String >( "log-file" );
+   const String & outputMode = parameters.getParameter< String >( "output-mode" );
+   const unsigned loops = parameters.getParameter< unsigned >( "loops" );
+   const unsigned verbose = (rank == 0) ? parameters.getParameter< unsigned >( "verbose" ) : 0;
+
+   // open log file
+   auto mode = std::ios::out;
+   if( outputMode == "append" )
+      mode |= std::ios::app;
+   std::ofstream logFile;
+   if( rank == 0 )
+      logFile.open( logFileName.getString(), mode );
+
+   // init benchmark and common metadata
+   Benchmark benchmark( loops, verbose );
+
+   // prepare global metadata
+   const int cpu_id = 0;
+   Devices::CacheSizes cacheSizes = Devices::SystemInfo::getCPUCacheSizes( cpu_id );
+   String cacheInfo = String( cacheSizes.L1data ) + ", "
+                       + String( cacheSizes.L1instruction ) + ", "
+                       + String( cacheSizes.L2 ) + ", "
+                       + String( cacheSizes.L3 );
+#ifdef HAVE_CUDA
+   const int activeGPU = Devices::CudaDeviceInfo::getActiveDevice();
+   const String deviceArch = String( Devices::CudaDeviceInfo::getArchitectureMajor( activeGPU ) ) + "." +
+                             String( Devices::CudaDeviceInfo::getArchitectureMinor( activeGPU ) );
+#endif
+   Benchmark::MetadataMap metadata {
+       { "host name", Devices::SystemInfo::getHostname() },
+       { "architecture", Devices::SystemInfo::getArchitecture() },
+       { "system", Devices::SystemInfo::getSystemName() },
+       { "system release", Devices::SystemInfo::getSystemRelease() },
+       { "start time", Devices::SystemInfo::getCurrentTime() },
+#ifdef HAVE_MPI
+       { "number of MPI processes", CommunicatorType::GetSize( CommunicatorType::AllGroup ) },
+#endif
+       { "OpenMP enabled", Devices::Host::isOMPEnabled() },
+       { "OpenMP threads", Devices::Host::getMaxThreadsCount() },
+       { "CPU model name", Devices::SystemInfo::getCPUModelName( cpu_id ) },
+       { "CPU cores", Devices::SystemInfo::getNumberOfCores( cpu_id ) },
+       { "CPU threads per core", Devices::SystemInfo::getNumberOfThreads( cpu_id ) / Devices::SystemInfo::getNumberOfCores( cpu_id ) },
+       { "CPU max frequency (MHz)", Devices::SystemInfo::getCPUMaxFrequency( cpu_id ) / 1e3 },
+       { "CPU cache sizes (L1d, L1i, L2, L3) (kiB)", cacheInfo },
+#ifdef HAVE_CUDA
+       { "GPU name", Devices::CudaDeviceInfo::getDeviceName( activeGPU ) },
+       { "GPU architecture", deviceArch },
+       { "GPU CUDA cores", Devices::CudaDeviceInfo::getCudaCores( activeGPU ) },
+       { "GPU clock rate (MHz)", (double) Devices::CudaDeviceInfo::getClockRate( activeGPU ) / 1e3 },
+       { "GPU global memory (GB)", (double) Devices::CudaDeviceInfo::getGlobalMemory( activeGPU ) / 1e9 },
+       { "GPU memory clock rate (MHz)", (double) Devices::CudaDeviceInfo::getMemoryClockRate( activeGPU ) / 1e3 },
+       { "GPU memory ECC enabled", Devices::CudaDeviceInfo::getECCEnabled( activeGPU ) },
+#endif
+   };
+
+   // TODO: implement resolveMatrixType
+//   return ! Matrices::resolveMatrixType< MainConfig,
+//                                         Devices::Host,
+//                                         SpmvBenchmark >( benchmark, metadata, parameters );
+   using MatrixType = Matrices::SlicedEllpack< double, Devices::Host, int >;
+   const bool status = SpmvBenchmark< MatrixType >::run( benchmark, metadata, parameters );
+
+   if( rank == 0 )
+      if( ! benchmark.save( logFile ) ) {
+         std::cerr << "Failed to write the benchmark results to file '" << logFileName << "'." << std::endl;
+         return EXIT_FAILURE;
+      }
+
+   return ! status;
+}
-- 
GitLab