From a147f47a056ff726a12b2383607f10eb6d55841a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Thu, 5 Mar 2020 22:11:57 +0100
Subject: [PATCH] Improving SpMV benchmark with benchmark results.

---
 src/Benchmarks/SpMV/SpmvBenchmarkResult.h |  52 +++++
 src/Benchmarks/SpMV/spmv.h                | 234 ++++++++++------------
 2 files changed, 154 insertions(+), 132 deletions(-)
 create mode 100644 src/Benchmarks/SpMV/SpmvBenchmarkResult.h

diff --git a/src/Benchmarks/SpMV/SpmvBenchmarkResult.h b/src/Benchmarks/SpMV/SpmvBenchmarkResult.h
new file mode 100644
index 0000000000..699be6efd1
--- /dev/null
+++ b/src/Benchmarks/SpMV/SpmvBenchmarkResult.h
@@ -0,0 +1,52 @@
+/***************************************************************************
+                          SpmvBenchmarkResult.h  -  description
+                             -------------------
+    begin                : Mar 5, 2020
+    copyright            : (C) 2020 by Tomas Oberhuber
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+#pragma once
+
+#include "../Benchmarks.h"
+
+namespace TNL {
+namespace Benchmarks {
+// Benchmark result for SpMV: reports the columns named in getTableHeader(), including max/L2 differences between the referenced result vectors.
+template< typename Real = double,
+          typename Index = int >
+struct SpmvBenchmarkResult
+: public BenchmarkResult
+{
+   using RealType = Real;
+   using IndexType = Index;
+   using HostVector = Containers::Vector< Real, Devices::Host, Index >;
+   using CudaVector = Containers::Vector< Real, Devices::Cuda, Index >;
+   // Binds references only (nothing is copied here); argument order is (cuda, host, cusparse), unlike the member declaration order.
+   SpmvBenchmarkResult( CudaVector& cudaResult, HostVector& hostResult, CudaVector& cusparseResult )
+   : hostResult( hostResult ), cudaResult( cudaResult), cusparseResult( cusparseResult ){};
+   // Column names, in the same order as the values produced by getRowElements().
+   virtual HeaderElements getTableHeader() const override
+   {
+      return HeaderElements({"time", "stddev", "stddev/time", "speedup", "Host.Diff.Max", "Host.Diff.L2", "Cusparse.Diff.Max", "Cusparse.Diff.L2"});
+   }
+   // Copies both Cuda-side vectors into host vectors, then reports max-abs and 2-norm of (cuda - host) and of (cuda - cusparse).
+   virtual RowElements getRowElements() const override
+   {
+      HostVector cudaCopy, cusparseCopy, a, b;
+      cudaCopy = cudaResult;
+      cusparseCopy = cusparseResult;
+      a = cudaCopy - hostResult;
+      b = cudaCopy - cusparseCopy;
+      return RowElements({ time, stddev, stddev/time, speedup, max( abs( a ) ), lpNorm( a, 2.0 ), max( abs( b ) ), lpNorm( b, 2.0 ) });  // time, stddev, speedup are not declared here -- presumably inherited from BenchmarkResult (TODO confirm)
+   }
+   // Referenced, not owned: the vector is read in getRowElements(), so it must still be alive at that point.
+   HostVector &hostResult;
+   // Also references; benchmarkSpMV in spmv.h passes deviceVector2 and cusparseVector here.
+   CudaVector &cudaResult, &cusparseResult;
+};
+   
+} //namespace Benchmarks
+} //namespace TNL
diff --git a/src/Benchmarks/SpMV/spmv.h b/src/Benchmarks/SpMV/spmv.h
index 3cd5c19d2a..8851e41143 100644
--- a/src/Benchmarks/SpMV/spmv.h
+++ b/src/Benchmarks/SpMV/spmv.h
@@ -15,6 +15,7 @@
 #pragma once
 
 #include "../Benchmarks.h"
+#include "SpmvBenchmarkResult.h"
 
 #include <TNL/Pointers/DevicePointer.h>
 #include <TNL/Matrices/Legacy/CSR.h>
@@ -113,168 +114,137 @@ benchmarkSpMV( Benchmark& benchmark,
                const String& inputFileName,
                bool verboseMR )
 {
-    // Setup CSR for cuSPARSE. It will compared to the format given as a template parameter to this function
-    typedef Matrices::Legacy::CSR< Real, Devices::Host, int > CSR_HostMatrix;
-    typedef Matrices::Legacy::CSR< Real, Devices::Cuda, int > CSR_DeviceMatrix;
+   // Setup CSR for cuSPARSE. It will be compared to the format given as a template parameter to this function
+   using CSR_HostMatrix = Matrices::Legacy::CSR< Real, Devices::Host, int >;
+   using CSR_DeviceMatrix = Matrices::Legacy::CSR< Real, Devices::Cuda, int >;
 
-    CSR_HostMatrix CSRhostMatrix;
-    CSR_DeviceMatrix CSRdeviceMatrix;
+   CSR_HostMatrix CSRhostMatrix;
+   CSR_DeviceMatrix CSRdeviceMatrix;
 
-    // Read the matrix for CSR, to set up cuSPARSE
-    MatrixReader< CSR_HostMatrix >::readMtxFile( inputFileName, CSRhostMatrix, verboseMR );
+   // Read the matrix for CSR, to set up cuSPARSE
+   MatrixReader< CSR_HostMatrix >::readMtxFile( inputFileName, CSRhostMatrix, verboseMR );
 
 #ifdef HAVE_CUDA
-    // cuSPARSE handle setup
-    cusparseHandle_t cusparseHandle;
-    cusparseCreate( &cusparseHandle );
+   // cuSPARSE handle setup
+   cusparseHandle_t cusparseHandle;
+   cusparseCreate( &cusparseHandle );
 
-    // cuSPARSE (in TNL's CSR) only works for device, copy the matrix from host to device
-    CSRdeviceMatrix = CSRhostMatrix;
+   // cuSPARSE (in TNL's CSR) only works for device, copy the matrix from host to device
+   CSRdeviceMatrix = CSRhostMatrix;
 
-    // Delete the CSRhostMatrix, so it doesn't take up unnecessary space
-    CSRhostMatrix.reset();
+   // Delete the CSRhostMatrix, so it doesn't take up unnecessary space
+   CSRhostMatrix.reset();
 
-    // Initialize the cusparseCSR matrix.
-    TNL::CusparseCSR< Real > cusparseCSR;
-    cusparseCSR.init( CSRdeviceMatrix, &cusparseHandle );
+   // Initialize the cusparseCSR matrix.
+   TNL::CusparseCSR< Real > cusparseCSR;
+   cusparseCSR.init( CSRdeviceMatrix, &cusparseHandle );
 #endif
 
-    // Setup the format which is given as a template parameter to this function
-    typedef Matrix< Real, Devices::Host, int > HostMatrix;
-    typedef Matrix< Real, Devices::Cuda, int > DeviceMatrix;
-    typedef Containers::Vector< Real, Devices::Host, int > HostVector;
-    typedef Containers::Vector< Real, Devices::Cuda, int > CudaVector;
+   // Setup the format which is given as a template parameter to this function
+   typedef Matrix< Real, Devices::Host, int > HostMatrix;
+   typedef Matrix< Real, Devices::Cuda, int > DeviceMatrix;
+   typedef Containers::Vector< Real, Devices::Host, int > HostVector;
+   typedef Containers::Vector< Real, Devices::Cuda, int > CudaVector;
 
-    HostMatrix hostMatrix;
-    DeviceMatrix deviceMatrix;
-    HostVector hostVector, hostVector2;
-    CudaVector deviceVector, deviceVector2;
+   HostMatrix hostMatrix;
+   DeviceMatrix deviceMatrix;
+   HostVector hostVector, hostVector2;
+   CudaVector deviceVector, deviceVector2, cusparseVector;
 
-    // Load the format
-    MatrixReader< HostMatrix >::readMtxFile( inputFileName, hostMatrix, verboseMR );
+   // Load the format
+   MatrixReader< HostMatrix >::readMtxFile( inputFileName, hostMatrix, verboseMR );
 
 
-    // Setup MetaData here (not in tnl-benchmark-spmv.h, as done in Benchmarks/BLAS),
-    //  because we need the matrix loaded first to get the rows and columns
-    benchmark.setMetadataColumns( Benchmark::MetadataColumns({
-          { "matrix name", convertToString( getMatrixFileName( inputFileName ) ) },
-          { "non-zeros", convertToString( hostMatrix.getNumberOfNonzeroMatrixElements() ) },
-          { "rows", convertToString( hostMatrix.getRows() ) },
-          { "columns", convertToString( hostMatrix.getColumns() ) },
-          { "matrix format", MatrixInfo< HostMatrix >::getFormat() } //convertToString( getType( hostMatrix ) ) }
-       } ));
+   // Setup MetaData here (not in tnl-benchmark-spmv.h, as done in Benchmarks/BLAS),
+   //  because we need the matrix loaded first to get the rows and columns
+   benchmark.setMetadataColumns( Benchmark::MetadataColumns({
+         { "matrix name", convertToString( getMatrixFileName( inputFileName ) ) },
+         { "non-zeros", convertToString( hostMatrix.getNumberOfNonzeroMatrixElements() ) },
+         { "rows", convertToString( hostMatrix.getRows() ) },
+         { "columns", convertToString( hostMatrix.getColumns() ) },
+         { "matrix format", MatrixInfo< HostMatrix >::getFormat() } //convertToString( getType( hostMatrix ) ) }
+      } ));
 
-    hostVector.setSize( hostMatrix.getColumns() );
-    hostVector2.setSize( hostMatrix.getRows() );
+   hostVector.setSize( hostMatrix.getColumns() );
+   hostVector2.setSize( hostMatrix.getRows() );
 
 #ifdef HAVE_CUDA
-    deviceMatrix = hostMatrix;
-    deviceVector.setSize( hostMatrix.getColumns() );
-    deviceVector2.setSize( hostMatrix.getRows() );
+   deviceMatrix = hostMatrix;
+   deviceVector.setSize( hostMatrix.getColumns() );
+   deviceVector2.setSize( hostMatrix.getRows() );
+   cusparseVector.setSize( hostMatrix.getRows() );
 #endif
 
-    // reset function
-    auto reset = [&]() {
-       hostVector.setValue( 1.0 );
-       hostVector2.setValue( 0.0 );
- #ifdef HAVE_CUDA
-       deviceVector.setValue( 1.0 );
-       deviceVector2.setValue( 0.0 );
+   // reset function
+   auto resetHostVectors = [&]() {
+      hostVector = 1.0;
+      hostVector2 = 0.0;
+   };
+#ifdef HAVE_CUDA
+   auto resetCudaVectors = [&]() {
+      deviceVector = 1.0;
+      deviceVector2 = 0.0;
+   };
+   auto resetCusparseVectors = [&]() {  // reset callback handed to benchmark.time for the cuSPARSE run
+      deviceVector = 1.0;  // SpMV input vector, shared with the spmvCuda run
+      cusparseVector = 0.0;  // assignment, not `==`: the comparison result was discarded and the output vector was never cleared
+   };
  #endif
-    };
-
-    const int elements = hostMatrix.getNumberOfNonzeroMatrixElements();
 
-    const double datasetSize = (double) elements * ( 2 * sizeof( Real ) + sizeof( int ) ) / oneGB;
+   const int elements = hostMatrix.getNumberOfNonzeroMatrixElements();
+   const double datasetSize = (double) elements * ( 2 * sizeof( Real ) + sizeof( int ) ) / oneGB;
 
     // compute functions
-    auto spmvHost = [&]() {
-       hostMatrix.vectorProduct( hostVector, hostVector2 );
-    };
+   auto spmvHost = [&]() {
+      hostMatrix.vectorProduct( hostVector, hostVector2 );
+   };
 #ifdef HAVE_CUDA
-    auto spmvCuda = [&]() {
-       deviceMatrix.vectorProduct( deviceVector, deviceVector2 );
-    };
+   auto spmvCuda = [&]() {
+      deviceMatrix.vectorProduct( deviceVector, deviceVector2 );
+   };
 
-    auto spmvCusparse = [&]() {
-        cusparseCSR.vectorProduct( deviceVector, deviceVector2 );
-    };
+   auto spmvCusparse = [&]() {
+       cusparseCSR.vectorProduct( deviceVector, cusparseVector );
+   };
 #endif
 
-    benchmark.setOperation( datasetSize );
-    benchmark.time< Devices::Host >( reset, "CPU", spmvHost );
+   benchmark.setOperation( datasetSize );
+   benchmark.time< Devices::Host >( resetHostVectors, "CPU", spmvHost );
 
-    // Initialize the host vector to be compared.
-    //  (The values in hostVector2 will be reset when spmvCuda starts)
-    HostVector resultHostVector2;
-    resultHostVector2.setSize( hostVector2.getSize() );
-    resultHostVector2.setValue( 0.0 );
+   // Initialize the host vector to be compared.
+   //  (Snapshot of hostVector2 taken right after the CPU run; resetCudaVectors does not touch hostVector2)
+   HostVector resultHostVector2;
+   resultHostVector2.setSize( hostVector2.getSize() );
+   resultHostVector2.setValue( 0.0 );
 
-    // Copy the values
-    resultHostVector2 = hostVector2;
+   // Copy the values
+   resultHostVector2 = hostVector2;
 
 #ifdef HAVE_CUDA
-    benchmark.time< Devices::Cuda >( reset, "GPU", spmvCuda );
-
-    // Initialize the device vector to be compared.
-    //  (The values in deviceVector2 will be reset when spmvCusparse starts)
-    HostVector resultDeviceVector2;
-    resultDeviceVector2.setSize( deviceVector2.getSize() );
-    resultDeviceVector2.setValue( 0.0 );
-
-    resultDeviceVector2 = deviceVector2;
-    
-    // Setup cuSPARSE MetaData, since it has the same header as CSR,
-    //  and therefore will not get its own headers (rows, cols, speedup etc.) in log.
-    //      * Not setting this up causes (among other undiscovered errors) the speedup from CPU to GPU on the input format to be overwritten.
-    benchmark.setMetadataColumns( Benchmark::MetadataColumns({
-          { "matrix name", convertToString( getMatrixFileName( inputFileName ) ) },
-          { "non-zeros", convertToString( hostMatrix.getNumberOfNonzeroMatrixElements() ) },
-          { "rows", convertToString( hostMatrix.getRows() ) },
-          { "columns", convertToString( hostMatrix.getColumns() ) },
-          { "matrix format", convertToString( "CSR-cuSPARSE-" + getFormatShort( hostMatrix ) ) }
-       } ));
-
-    benchmark.time< Devices::Cuda >( reset, "GPU", spmvCusparse );
-
-    HostVector resultcuSPARSEDeviceVector2;
-    resultcuSPARSEDeviceVector2.setSize( deviceVector2.getSize() );
-    resultcuSPARSEDeviceVector2.setValue( 0.0 );
-
-    resultcuSPARSEDeviceVector2 = deviceVector2;
-
-    // Difference between GPU (current format) and GPU-cuSPARSE results
-    //Real cuSparseDifferenceAbsMax = resultDeviceVector2.differenceAbsMax( resultcuSPARSEDeviceVector2 );
-    Real cuSparseDifferenceAbsMax = max( abs( resultDeviceVector2 - resultcuSPARSEDeviceVector2 ) );
-    //Real cuSparseDifferenceLpNorm = resultDeviceVector2.differenceLpNorm( resultcuSPARSEDeviceVector2, 1 );
-    Real cuSparseDifferenceLpNorm = lpNorm( resultDeviceVector2 - resultcuSPARSEDeviceVector2, 1 );
-
-    std::string GPUxGPUcuSparse_resultDifferenceAbsMax = "GPUxGPUcuSPARSE differenceAbsMax = " + std::to_string( cuSparseDifferenceAbsMax );
-    std::string GPUxGPUcuSparse_resultDifferenceLpNorm = "GPUxGPUcuSPARSE differenceLpNorm = " + std::to_string( cuSparseDifferenceLpNorm );
-
-    char *GPUcuSparse_absMax = &GPUxGPUcuSparse_resultDifferenceAbsMax[ 0u ];
-    char *GPUcuSparse_lpNorm = &GPUxGPUcuSparse_resultDifferenceLpNorm[ 0u ];
-
-
-    // Difference between CPU and GPU results for the current format
-    //Real differenceAbsMax = resultHostVector2.differenceAbsMax( resultDeviceVector2 );
-    Real differenceAbsMax = max( abs( resultHostVector2 - resultDeviceVector2 ) );
-    //Real differenceLpNorm = resultHostVector2.differenceLpNorm( resultDeviceVector2, 1 );
-    Real differenceLpNorm = lpNorm( resultHostVector2 - resultDeviceVector2, 1 );
-
-    std::string CPUxGPU_resultDifferenceAbsMax = "CPUxGPU differenceAbsMax = " + std::to_string( differenceAbsMax );
-    std::string CPUxGPU_resultDifferenceLpNorm = "CPUxGPU differenceLpNorm = " + std::to_string( differenceLpNorm );
-
-    char *CPUxGPU_absMax = &CPUxGPU_resultDifferenceAbsMax[ 0u ];
-    char *CPUxGPU_lpNorm = &CPUxGPU_resultDifferenceLpNorm[ 0u ];
-
-    // Print result differences of CPU and GPU of current format
-    std::cout << CPUxGPU_absMax << std::endl;
-    std::cout << CPUxGPU_lpNorm << std::endl;
-
-    // Print result differences of GPU of current format and GPU with cuSPARSE.
-    std::cout << GPUcuSparse_absMax << std::endl;
-    std::cout << GPUcuSparse_lpNorm << std::endl;
+   benchmark.time< Devices::Cuda >( resetCudaVectors, "GPU", spmvCuda );
+
+   // Initialize the device vector to be compared.
+   //  (Snapshot of deviceVector2 taken right after the GPU run; spmvCusparse writes to cusparseVector instead)
+   HostVector resultDeviceVector2;
+   resultDeviceVector2.setSize( deviceVector2.getSize() );
+   resultDeviceVector2.setValue( 0.0 );
+
+   resultDeviceVector2 = deviceVector2;
+   
+   // Setup cuSPARSE MetaData, since it has the same header as CSR,
+   //  and therefore will not get its own headers (rows, cols, speedup etc.) in log.
+   //      * Not setting this up causes (among other undiscovered errors) the speedup from CPU to GPU on the input format to be overwritten.
+   benchmark.setMetadataColumns( Benchmark::MetadataColumns({
+         { "matrix name", convertToString( getMatrixFileName( inputFileName ) ) },
+         { "non-zeros", convertToString( hostMatrix.getNumberOfNonzeroMatrixElements() ) },
+         { "rows", convertToString( hostMatrix.getRows() ) },
+         { "columns", convertToString( hostMatrix.getColumns() ) },
+         { "matrix format", convertToString( "CSR-cuSPARSE-" + getFormatShort( hostMatrix ) ) }
+      } ));
+
+   SpmvBenchmarkResult< Real, int > benchmarkResult( deviceVector2, hostVector2, cusparseVector );
+   benchmark.time< Devices::Cuda >( resetCusparseVectors, "GPU", spmvCusparse, benchmarkResult );
+
  #endif
 
     std::cout << std::endl;
-- 
GitLab