Commit a7aa9460 authored by Jakub Klinkovský's avatar Jakub Klinkovský
Browse files

Merge branch 'JK/benchmarks' into 'develop'

Benchmarks refactoring

Closes #96 and #62

See merge request !107
parents bc4c3e2f 521dd4af
Loading
Loading
Loading
Loading

scripts/eti.py

0 → 100755
+74 −0
Original line number Diff line number Diff line
#! /usr/bin/env python3

import os.path
import pathlib
import re
import sys

# Command line handling: exactly one argument is accepted and it has to name
# an existing regular file. Any violation is reported on stderr and the script
# terminates with exit status 1.
problem = None
if len(sys.argv) != 2:
    problem = f"usage: {sys.argv[0]} FILE\n\nwhere FILE is a C++ source code or header file."
elif not os.path.isfile(sys.argv[1]):
    problem = f"error: {sys.argv[1]} is not a valid file."
if problem is not None:
    print(problem, file=sys.stderr)
    sys.exit(1)

# src      -- the input path exactly as it was given on the command line
# basename -- file name of src without its directory and without its last extension
# dirname  -- output directory "<basename>.templates" (a relative path, so it is
#             resolved against the current working directory)
src = sys.argv[1]
basename, _unused_ext = os.path.splitext(os.path.basename(src))
dirname = basename + ".templates"

# create the output directory on first use
if not os.path.isdir(dirname):
    os.mkdir(dirname)

def get_source_code(namespaces, extern_template_instantiation):
    """Build the text of a source file holding one explicit template instantiation.

    Parameters:
        namespaces: names of the namespaces enclosing the declaration, ordered
            from the outermost to the innermost one.
        extern_template_instantiation: the source line that contains the
            ``extern template ...`` declaration.

    Returns:
        A string made of an ``#include`` of the module-level ``src`` file, one
        opening line per namespace, the declaration with its first ``extern ``
        removed, and one closing line per namespace (innermost first).
    """
    # dropping the first "extern " turns the declaration into an instantiation
    eti = extern_template_instantiation.strip().replace("extern ", "", 1)
    # use absolute path for the include when src is an absolute path
    # (e.g. when called by CMake, because relative include does not work with
    # its separate build dir structure)
    if src == os.path.abspath(src):
        include_path = src
    # use relative path for the include when src is relative
    else:
        include_path = os.path.relpath(src, dirname)

    lines = [f'#include "{include_path}"']
    lines.extend(f"namespace {ns} {{" for ns in namespaces)
    lines.append(eti)
    # Namespaces are closed innermost-first, so that every closing comment
    # names the scope which actually ends on that line.
    lines.extend(f"}} // namespace {ns}" for ns in reversed(namespaces))
    return "\n".join(lines) + "\n"

def check_write(content, fname):
    """Write ``content`` to ``fname`` unless the file already holds the same text.

    The comparison ignores leading and trailing whitespace on both sides. A
    file whose text matches is not opened for writing at all, so it is left
    exactly as it was (presumably so that build tools do not see it as
    modified -- confirm against the calling build system).

    Parameters:
        content: the text that the file should contain.
        fname: path of the file to create or update.
    """
    if os.path.isfile(fname):
        # the context manager closes the handle right after the comparison
        with open(fname, "r") as current:
            up_to_date = current.read().strip() == content.strip()
        if up_to_date:
            return

    with open(fname, "w") as out:
        out.write(content)

# Scan src line by line. The current namespace nesting is tracked with two
# regular-expression heuristics, and for every line starting with
# "extern template" a pair of files (<basename>.t<i>.cpp and <basename>.t<i>.cu)
# with the matching explicit instantiation is written into dirname.
i = 0
namespaces = []
file_names = set()
with open(src) as src_file:
    for line in src_file:
        # heuristics for namespaces
        ns_begin = re.search(r"^\s*namespace\s+(\w+)\s*\{$", line)
        if ns_begin:
            namespaces.append(ns_begin.group(1))
        ns_end = re.search(r"^\s*\}\s*\/\/\s*namespace\s+(\w+)$", line)
        # The opening and closing patterns are independent heuristics: a closing
        # comment may be found for a namespace whose opening was not recognized
        # (e.g. the brace is on its own line). Ignore it instead of crashing on
        # an empty stack.
        if ns_end and namespaces:
            namespaces.pop()

        if line.strip().startswith("extern template"):
            source_code = get_source_code(namespaces, line)
            for ext in ["cpp", "cu"]:
                # os.path.join uses the native separator, so the recorded name
                # has the same form as str(path) in the cleanup loop below
                fname = os.path.join(dirname, f"{basename}.t{i}.{ext}")
                check_write(source_code, fname)
                file_names.add(fname)
            i += 1

# remove extraneous files from the target directory
for path in pathlib.Path(dirname).iterdir():
    if str(path) not in file_names:
        path.unlink()
+3 −7
Original line number Diff line number Diff line
@@ -14,8 +14,7 @@

#include <cstring>

#include "../Benchmarks.h"

#include <TNL/Benchmarks/Benchmarks.h>
#include <TNL/Containers/Array.h>

namespace TNL {
@@ -116,10 +115,7 @@ benchmarkArrayOperations( Benchmark<> & benchmark,
      hostArray = hostArray2;
   };
   benchmark.setOperation( "copy (operator=)", 2 * datasetSize );
   // copyBasetime is used later inside HAVE_CUDA guard, so the compiler will
   // complain when compiling without CUDA
   const double copyBasetime = benchmark.time< Devices::Host >( reset1, "CPU", copyAssignHostHost );
   (void)copyBasetime;  // ignore unused variable
   benchmark.time< Devices::Host >( reset1, "CPU", copyAssignHostHost );
#ifdef HAVE_CUDA
   auto copyAssignCudaCuda = [&]() {
      deviceArray = deviceArray2;
@@ -135,7 +131,7 @@ benchmarkArrayOperations( Benchmark<> & benchmark,
   auto copyAssignCudaHost = [&]() {
      hostArray = deviceArray;
   };
   benchmark.setOperation( "copy (operator=)", datasetSize, copyBasetime );
   benchmark.setOperation( "copy (operator=)", datasetSize, benchmark.getBaseTime() );
   benchmark.time< Devices::Cuda >( reset1, "CPU->GPU", copyAssignHostCuda );
   benchmark.time< Devices::Cuda >( reset1, "GPU->CPU", copyAssignCudaHost );
#endif
+18 −45
Original line number Diff line number Diff line
/***************************************************************************
                          dense-mv.h  -  description
                          gemv.h  -  description
                             -------------------
    begin                : Jul 8, 2021
    copyright            : (C) 2021 by Tomas Oberhuber et al.
@@ -8,15 +8,14 @@

/* See Copyright Notice in tnl/Copyright */

// Implemented by: Jakub Klinkovsky
// Implemented by: Jakub Klinkovsky, Tomas Oberhuber

#pragma once

#include "../Benchmarks.h"
#include <TNL/Benchmarks/Benchmarks.h>
#include "cublasWrappers.h"

#include <TNL/Containers/Vector.h>
#include <TNL/Pointers/DevicePointer.h>
#include <TNL/Matrices/DenseMatrix.h>
#include <TNL/Devices/Cuda.h>
#include <TNL/Devices/Host.h>
@@ -27,16 +26,12 @@ namespace Benchmarks {
template< typename Matrix >
void setMatrix( Matrix& matrix )
{
   using RealType = typename Matrix::RealType;
   using IndexType = typename Matrix::IndexType;
   matrix.forAllElements( [] __cuda_callable__ ( IndexType rowIdx, IndexType columnIdx, IndexType columnIdx_, RealType& value ) {
       value = 1.0; } );
   matrix.setValue( 1.0 );
}

template< typename Real >
void
benchmarkDenseMVSynthetic( Benchmark<> & benchmark,
                           const int & size )
benchmarkGemv( Benchmark<> & benchmark, int rows, int columns )
{
   using HostMatrix = TNL::Matrices::DenseMatrix< Real, TNL::Devices::Host >;
   using RowMajorCudaMatrix = TNL::Matrices::DenseMatrix< Real, TNL::Devices::Cuda, int, TNL::Algorithms::Segments::RowMajorOrder >;
@@ -50,20 +45,13 @@ benchmarkDenseMVSynthetic( Benchmark<> & benchmark,
   HostVector inHostVector, outHostVector;
   CudaVector inCudaVector, outCudaVector1, outCudaVector2;

   // create benchmark group
   const std::vector< String > parsedType = parseObjectType( getType< HostMatrix >() );
#ifdef HAVE_CUDA
   benchmark.createHorizontalGroup( parsedType[ 0 ], 2 );
#else
   benchmark.createHorizontalGroup( parsedType[ 0 ], 1 );
#endif

   hostMatrix.setDimensions( size, size );
   inHostVector.setSize( size );
   outHostVector.setSize( size );
   hostMatrix.setDimensions( rows, columns );
   inHostVector.setSize( columns );
   outHostVector.setSize( rows );

   setMatrix< HostMatrix >( hostMatrix );
   const double datasetSize = (double) ( size * size ) * ( 2 * sizeof( Real ) + sizeof( int ) ) / oneGB;
   const double datasetSize = (double) ( rows * columns + rows + columns ) * sizeof(Real) / oneGB;
   benchmark.setOperation( "gemv", datasetSize );

   // reset function
   auto reset = [&]() {
@@ -80,14 +68,13 @@ benchmarkDenseMVSynthetic( Benchmark<> & benchmark,
   auto spmvHost = [&]() {
      hostMatrix.vectorProduct( inHostVector, outHostVector );
   };
   benchmark.setOperation( datasetSize );
   benchmark.time< Devices::Host >( reset, "CPU", spmvHost );

#ifdef HAVE_CUDA
   columnMajorCudaMatrix.setDimensions( size, size );
   inCudaVector.setSize( size );
   outCudaVector1.setSize( size );
   outCudaVector2.setSize( size );
   columnMajorCudaMatrix.setDimensions( rows, columns );
   inCudaVector.setSize( columns );
   outCudaVector1.setSize( rows );
   outCudaVector2.setSize( rows );
   setMatrix< ColumnMajorCudaMatrix >( columnMajorCudaMatrix );

   auto columnMajorMvCuda = [&]() {
@@ -97,7 +84,7 @@ benchmarkDenseMVSynthetic( Benchmark<> & benchmark,

   columnMajorCudaMatrix.reset();

   rowMajorCudaMatrix.setDimensions( size, size );
   rowMajorCudaMatrix.setDimensions( rows, columns );
   setMatrix< RowMajorCudaMatrix >( rowMajorCudaMatrix );

   auto rowMajorMvCuda = [&]() {
@@ -109,7 +96,7 @@ benchmarkDenseMVSynthetic( Benchmark<> & benchmark,
   //std::cerr << outCudaVector1 << std::endl << outCudaVector2 << std::endl;

   rowMajorCudaMatrix.reset();
   columnMajorCudaMatrix.setDimensions( size, size );
   columnMajorCudaMatrix.setDimensions( rows, columns );
   setMatrix< ColumnMajorCudaMatrix >( columnMajorCudaMatrix );

   cublasHandle_t cublasHandle;
@@ -117,8 +104,8 @@ benchmarkDenseMVSynthetic( Benchmark<> & benchmark,
   auto mvCublas = [&] () {
      Real alpha = 1.0;
      Real beta = 0.0;
      cublasGemv( cublasHandle, CUBLAS_OP_N, size, size, &alpha,
                  columnMajorCudaMatrix.getValues().getData(), size,
      cublasGemv( cublasHandle, CUBLAS_OP_N, rows, columns, &alpha,
                  columnMajorCudaMatrix.getValues().getData(), rows,
                  inCudaVector.getData(), 1, &beta,
                  outCudaVector1.getData(), 1 );
   };
@@ -128,19 +115,5 @@ benchmarkDenseMVSynthetic( Benchmark<> & benchmark,
#endif
}

/*template< typename Real = double,
          typename Index = int >
void
benchmarkDenseSynthetic( Benchmark<> & benchmark,
                         const int & size )
{
   // TODO: benchmark all formats from tnl-benchmark-spmv (different parameters of the base formats)
   // NOTE: CSR is disabled because it is very slow on GPU
   //benchmarkSpMV< Real, SparseMatrixLegacy_CSR_Scalar >( benchmark, size, elementsPerRow );
   benchmarkSpMV< Real, Benchmarks::SpMV::ReferenceFormats::Legacy::Ellpack >( benchmark, size, elementsPerRow );
   benchmarkSpMV< Real, SlicedEllpack >( benchmark, size, elementsPerRow );
   benchmarkSpMV< Real, Benchmarks::SpMV::ReferenceFormats::Legacy::ChunkedEllpack >( benchmark, size, elementsPerRow );
}*/

} // namespace Benchmarks
} // namespace TNL

src/Benchmarks/BLAS/spmv.h

deleted100644 → 0
+0 −189
Original line number Diff line number Diff line
/***************************************************************************
                          spmv.h  -  description
                             -------------------
    begin                : Dec 30, 2015
    copyright            : (C) 2015 by Tomas Oberhuber et al.
    email                : tomas.oberhuber@fjfi.cvut.cz
 ***************************************************************************/

/* See Copyright Notice in tnl/Copyright */

// Implemented by: Jakub Klinkovsky

#pragma once

#include "../Benchmarks.h"

#include <TNL/Pointers/DevicePointer.h>
#include <Benchmarks/SpMV/ReferenceFormats/Legacy/CSR.h>
#include <Benchmarks/SpMV/ReferenceFormats/Legacy/Ellpack.h>
#include <Benchmarks/SpMV/ReferenceFormats/Legacy/SlicedEllpack.h>
#include <Benchmarks/SpMV/ReferenceFormats/Legacy/ChunkedEllpack.h>

namespace TNL {
namespace Benchmarks {

// Alias exposing the legacy SlicedEllpack format with exactly three template
// parameters (Real, Device, Index), i.e. the same number of parameters as the
// other formats. benchmarkSpMV below takes the format as a template template
// parameter with three type parameters.
// NOTE(review): presumably the underlying template has additional (defaulted)
// parameters - confirm against Legacy/SlicedEllpack.h.
template< typename Real, typename Device, typename Index >
using SlicedEllpack = SpMV::ReferenceFormats::Legacy::SlicedEllpack< Real, Device, Index >;

// Legacy formats
// Three-parameter alias of the legacy CSR format with the CSRScalar variant
// fixed as its fourth template argument.
template< typename Real, typename Device, typename Index >
using SparseMatrixLegacy_CSR_Scalar = SpMV::ReferenceFormats::Legacy::CSR< Real, Device, Index, SpMV::ReferenceFormats::Legacy::CSRScalar >;


// Fills the matrix with a band of up to elementsPerRow entries per row.
//
// For each row the band starts at column (row - elementsPerRow / 2); the k-th
// entry of the band (counted from 0) gets the value k + 1. Entries whose column
// falls outside [0, matrix.getRows()) are skipped.
//
// \param matrix          matrix to fill; must provide getRows() and setElement()
// \param elementsPerRow  width of the band
// \return the number of entries that were actually set
template< typename Matrix >
int setHostTestMatrix( Matrix& matrix,
                       const int elementsPerRow )
{
   const int rows = matrix.getRows();
   int stored = 0;
   for( int row = 0; row < rows; ++row ) {
      const int bandStart = row - elementsPerRow / 2;
      for( int k = 0; k < elementsPerRow; ++k ) {
         const int column = bandStart + k;
         // the row count is used as the column bound as well
         if( column < 0 || column >= rows )
            continue;
         matrix.setElement( row, column, k + 1 );
         ++stored;
      }
   }
   return stored;
}

#ifdef HAVE_CUDA
// Kernel counterpart of setHostTestMatrix: each thread fills one matrix row.
//
// The row index is computed from the thread and block indices, shifted by
// gridIdx * Cuda::getMaxGridSize() blocks, so that consecutive launches with
// increasing gridIdx address consecutive ranges of rows. Threads whose row
// index is not below matrix->getRows() do nothing.
//
// For its row, a thread sets up to elementsPerRow entries starting at column
// (row - elementsPerRow / 2); the k-th entry gets the value k + 1 and columns
// outside [0, matrix->getColumns()) are skipped.
template< typename Matrix >
__global__ void setCudaTestMatrixKernel( Matrix* matrix,
                                         const int elementsPerRow,
                                         const int gridIdx )
{
   const int row = ( gridIdx * Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
   if( row < matrix->getRows() ) {
      const int bandStart = row - elementsPerRow / 2;
      for( int k = 0; k < elementsPerRow; k++ ) {
         const int column = bandStart + k;
         if( column >= 0 && column < matrix->getColumns() )
            matrix->setElementFast( row, column, k + 1 );
      }
   }
}
#endif

// Fills a matrix through setCudaTestMatrixKernel, one thread per row.
//
// The rows are covered by blocks of 256 threads. When more blocks are needed
// than Cuda::getMaxGridSize(), the work is split into several kernel launches;
// gridIdx tells the kernel which launch it belongs to. Without HAVE_CUDA the
// function does nothing.
//
// \param matrix          matrix to fill
// \param elementsPerRow  width of the band written into every row
template< typename Matrix >
void setCudaTestMatrix( Matrix& matrix,
                        const int elementsPerRow )
{
#ifdef HAVE_CUDA
   using IndexType = typename Matrix::IndexType;
   Pointers::DevicePointer< Matrix > kernel_matrix( matrix );
   dim3 cudaBlockSize( 256 ), cudaGridSize( Cuda::getMaxGridSize() );
   const IndexType cudaBlocks = roundUpDivision( matrix.getRows(), cudaBlockSize.x );
   const IndexType cudaGrids = roundUpDivision( cudaBlocks, Cuda::getMaxGridSize() );
   for( IndexType gridIdx = 0; gridIdx < cudaGrids; gridIdx++ ) {
      // The last launch gets only the blocks that are left. Subtracting the
      // blocks of the previous launches (rather than taking the remainder of
      // the division) yields a full grid instead of a zero-sized one when
      // cudaBlocks is an exact multiple of the maximum grid size.
      if( gridIdx == cudaGrids - 1 )
         cudaGridSize.x = cudaBlocks - gridIdx * Cuda::getMaxGridSize();
      setCudaTestMatrixKernel< Matrix >
         <<< cudaGridSize, cudaBlockSize >>>
         ( &kernel_matrix.template modifyData< Devices::Cuda >(), elementsPerRow, gridIdx );
      TNL_CHECK_CUDA_DEVICE;
   }
#endif
}


// TODO: rename as benchmark_SpMV_synthetic and move to spmv-synthetic.h
//
// Times the matrix-vector product of one sparse format on a synthetic square
// matrix of dimensions size x size, filled by setHostTestMatrix (host copy)
// and setCudaTestMatrix (device copy). The product is timed on the host
// ("CPU") and, with HAVE_CUDA, on the device ("GPU").
//
// \tparam Real    type of the matrix and vector elements
// \tparam Matrix  matrix format, instantiated as Matrix< Real, Device, int >
// \param benchmark       benchmark object collecting the measurements
// \param size            number of rows and columns of the matrix
// \param elementsPerRow  row capacity and width of the band of the test matrix
template< typename Real,
          template< typename, typename, typename > class Matrix >
void
benchmarkSpMV( Benchmark<> & benchmark,
               const int & size,
               const int elementsPerRow = 5 )
{
   using HostMatrix = Matrix< Real, Devices::Host, int >;
   using DeviceMatrix = Matrix< Real, Devices::Cuda, int >;
   using HostVector = Containers::Vector< Real, Devices::Host, int >;
   using CudaVector = Containers::Vector< Real, Devices::Cuda, int >;
   using HostRowLengths = Containers::Vector< int, Devices::Host, int >;
   using CudaRowLengths = Containers::Vector< int, Devices::Cuda, int >;

   HostMatrix hostMatrix;
   DeviceMatrix deviceMatrix;
   HostRowLengths hostRowLengths;
   CudaRowLengths deviceRowLengths;
   HostVector hostIn, hostOut;
   CudaVector deviceIn, deviceOut;

   // create benchmark group: one result column for the host and, with CUDA,
   // one more for the device
   const std::vector< String > parsedType = parseObjectType( getType< HostMatrix >() );
#ifdef HAVE_CUDA
   const int groupColumns = 2;
#else
   const int groupColumns = 1;
#endif
   benchmark.createHorizontalGroup( parsedType[ 0 ], groupColumns );

   // allocate the host data, then the device data
   hostRowLengths.setSize( size );
   hostMatrix.setDimensions( size, size );
   hostIn.setSize( size );
   hostOut.setSize( size );
#ifdef HAVE_CUDA
   deviceRowLengths.setSize( size );
   deviceMatrix.setDimensions( size, size );
   deviceIn.setSize( size );
   deviceOut.setSize( size );
#endif

   // every row gets the same capacity
   hostRowLengths.setValue( elementsPerRow );
#ifdef HAVE_CUDA
   deviceRowLengths.setValue( elementsPerRow );
#endif

   hostMatrix.setCompressedRowLengths( hostRowLengths );
#ifdef HAVE_CUDA
   deviceMatrix.setCompressedRowLengths( deviceRowLengths );
#endif

   // fill both matrices; the dataset size is derived from the number of
   // entries stored in the host matrix
   const int storedElements = setHostTestMatrix< HostMatrix >( hostMatrix, elementsPerRow );
   setCudaTestMatrix< DeviceMatrix >( deviceMatrix, elementsPerRow );
   const double datasetSize = static_cast< double >( storedElements ) * ( 2 * sizeof( Real ) + sizeof( int ) ) / oneGB;

   // reset function: input vectors are set to ones, output vectors to zeros
   auto resetVectors = [&]() {
      hostIn.setValue( 1.0 );
      hostOut.setValue( 0.0 );
#ifdef HAVE_CUDA
      deviceIn.setValue( 1.0 );
      deviceOut.setValue( 0.0 );
#endif
   };

   // compute functions
   auto multiplyOnHost = [&]() {
      hostMatrix.vectorProduct( hostIn, hostOut );
   };
   benchmark.setOperation( datasetSize );
   benchmark.time< Devices::Host >( resetVectors, "CPU", multiplyOnHost );
#ifdef HAVE_CUDA
   auto multiplyOnDevice = [&]() {
      deviceMatrix.vectorProduct( deviceIn, deviceOut );
   };
   benchmark.time< Devices::Cuda >( resetVectors, "GPU", multiplyOnDevice );
#endif
}

// Runs benchmarkSpMV on the synthetic test matrix for each of the enabled
// sparse formats: Ellpack, SlicedEllpack and ChunkedEllpack.
//
// \tparam Real   type of the matrix and vector elements
// \tparam Index  not used by the body
// \param benchmark       benchmark object collecting the measurements
// \param size            number of rows and columns of the test matrix
// \param elementsPerRow  forwarded to benchmarkSpMV
template< typename Real = double,
          typename Index = int >
void
benchmarkSpmvSynthetic( Benchmark<> & benchmark,
                        const int & size,
                        const int & elementsPerRow )
{
   namespace Legacy = Benchmarks::SpMV::ReferenceFormats::Legacy;

   // TODO: benchmark all formats from tnl-benchmark-spmv (different parameters of the base formats)
   // NOTE: CSR is disabled because it is very slow on GPU
   //benchmarkSpMV< Real, SparseMatrixLegacy_CSR_Scalar >( benchmark, size, elementsPerRow );
   benchmarkSpMV< Real, Legacy::Ellpack >( benchmark, size, elementsPerRow );
   benchmarkSpMV< Real, SlicedEllpack >( benchmark, size, elementsPerRow );
   benchmarkSpMV< Real, Legacy::ChunkedEllpack >( benchmark, size, elementsPerRow );
}

} // namespace Benchmarks
} // namespace TNL
+38 −52
Original line number Diff line number Diff line
@@ -21,8 +21,7 @@
#include "array-operations.h"
#include "vector-operations.h"
#include "triad.h"
#include "spmv.h"
#include "dense-mv.h"
#include "gemv.h"


using namespace TNL;
@@ -32,37 +31,39 @@ using namespace TNL::Benchmarks;
template< typename Real >
void
runBlasBenchmarks( Benchmark<> & benchmark,
                   Benchmark<>::MetadataMap metadata,
                   const std::size_t & minSize,
                   const std::size_t & maxSize,
                   const double & sizeStepFactor,
                   const int & elementsPerRow )
                   const double & sizeStepFactor )
{
   const String precision = getType< Real >();
   metadata["precision"] = precision;
   benchmark.setMetadataWidths({
      { "operation", 30 },
      { "performer", 21 },
      { "precision", 10 },
   });

   // Array operations
   benchmark.newBenchmark( String("Array operations (") + precision + ", host allocator = Host)",
                           metadata );
   std::cout << "\n== Array operations ==\n" << std::endl;
   for( std::size_t size = minSize; size <= maxSize; size *= 2 ) {
      benchmark.setMetadataColumns( Benchmark<>::MetadataColumns({
         { "precision", getType< Real >() },
         { "host allocator", "Host" },
         { "size", convertToString( size ) },
      } ));
      benchmarkArrayOperations< Real >( benchmark, size );
   }
#ifdef HAVE_CUDA
   benchmark.newBenchmark( String("Array operations (") + precision + ", host allocator = CudaHost)",
                           metadata );
   for( std::size_t size = minSize; size <= maxSize; size *= 2 ) {
      benchmark.setMetadataColumns( Benchmark<>::MetadataColumns({
         { "precision", getType< Real >() },
         { "host allocator", "CudaHost" },
         { "size", convertToString( size ) },
      } ));
      benchmarkArrayOperations< Real, int, Allocators::CudaHost >( benchmark, size );
   }
   benchmark.newBenchmark( String("Array operations (") + precision + ", host allocator = CudaManaged)",
                           metadata );
   for( std::size_t size = minSize; size <= maxSize; size *= 2 ) {
      benchmark.setMetadataColumns( Benchmark<>::MetadataColumns({
         { "precision", getType< Real >() },
         { "host allocator", "CudaManaged" },
         { "size", convertToString( size ) },
      } ));
      benchmarkArrayOperations< Real, int, Allocators::CudaManaged >( benchmark, size );
@@ -70,10 +71,10 @@ runBlasBenchmarks( Benchmark<> & benchmark,
#endif

   // Vector operations
   benchmark.newBenchmark( String("Vector operations (") + precision + ")",
                           metadata );
   std::cout << "\n== Vector operations ==\n" << std::endl;
   for( std::size_t size = minSize; size <= maxSize; size *= sizeStepFactor ) {
      benchmark.setMetadataColumns( Benchmark<>::MetadataColumns({
         { "precision", getType< Real >() },
         { "size", convertToString( size ) },
      } ));
      benchmarkVectorOperations< Real >( benchmark, size );
@@ -81,39 +82,30 @@ runBlasBenchmarks( Benchmark<> & benchmark,

   // Triad benchmark: copy from host, compute, copy to host
#ifdef HAVE_CUDA
   benchmark.newBenchmark( String("Triad benchmark (") + precision + ")",
                           metadata );
   std::cout << "\n== Triad ==\n" << std::endl;
   for( std::size_t size = minSize; size <= maxSize; size *= 2 ) {
      benchmark.setMetadataColumns( Benchmark<>::MetadataColumns({
         { "precision", getType< Real >() },
         { "size", convertToString( size ) },
      } ));
      benchmarkTriad< Real >( benchmark, size );
   }
#endif

   // Sparse matrix-vector multiplication
   benchmark.newBenchmark( String("Sparse matrix-vector multiplication (") + precision + ")",
                           metadata );
   for( std::size_t size = minSize; size <= maxSize; size *= 2 ) {
      benchmark.setMetadataColumns( Benchmark<>::MetadataColumns({
         { "rows", convertToString( size ) },
         { "columns", convertToString( size ) },
         { "elements per row", convertToString( elementsPerRow ) },
      } ));
      benchmarkSpmvSynthetic< Real >( benchmark, size, elementsPerRow );
   }

   // Dense matrix-vector multiplication
   benchmark.newBenchmark( String("Dense matrix-vector multiplication (") + precision + ")",
                           metadata );
   for( std::size_t size = 10; size <= 20000; size *= 2 ) {
   std::cout << "\n== Dense matrix-vector multiplication ==\n" << std::endl;
   for( std::size_t rows = 10; rows <= 20000 * 20000; rows *= 2 ) {
      for( std::size_t columns = 10; columns <= 20000 * 20000; columns *= 2 ) {
         if( rows * columns > 20000 * 20000 )
            break;
         benchmark.setMetadataColumns( Benchmark<>::MetadataColumns({
         { "rows", convertToString( size ) },
         { "columns", convertToString( size ) }
            { "precision", getType< Real >() },
            { "rows", convertToString( rows ) },
            { "columns", convertToString( columns ) }
         } ));
      benchmarkDenseMVSynthetic< Real >( benchmark, size );
         benchmarkGemv< Real >( benchmark, rows, columns );
      }
   }

}

void
@@ -132,7 +124,6 @@ setupConfig( Config::ConfigDescription & config )
   config.addEntry< int >( "max-size", "Minimum size of arrays/vectors used in the benchmark.", 10000000 );
   config.addEntry< int >( "size-step-factor", "Factor determining the size of arrays/vectors used in the benchmark. First size is min-size and each following size is stepFactor*previousSize, up to max-size.", 2 );
   config.addEntry< int >( "loops", "Number of iterations for every computation.", 10 );
   config.addEntry< int >( "elements-per-row", "Number of elements per row of the sparse matrix used in the matrix-vector multiplication benchmark.", 5 );
   config.addEntry< int >( "verbose", "Verbose mode.", 1 );

   config.addDelimiter( "Device settings:" );
@@ -167,7 +158,6 @@ main( int argc, char* argv[] )
   const std::size_t maxSize = parameters.getParameter< int >( "max-size" );
   const int sizeStepFactor = parameters.getParameter< int >( "size-step-factor" );
   const int loops = parameters.getParameter< int >( "loops" );
   const int elementsPerRow = parameters.getParameter< int >( "elements-per-row" );
   const int verbose = parameters.getParameter< int >( "verbose" );

   if( sizeStepFactor <= 1 ) {
@@ -179,23 +169,19 @@ main( int argc, char* argv[] )
   auto mode = std::ios::out;
   if( outputMode == "append" )
       mode |= std::ios::app;
   std::ofstream logFile( logFileName.getString(), mode );
   std::ofstream logFile( logFileName, mode );

   // init benchmark and common metadata
   Benchmark<> benchmark( loops, verbose );
   // init benchmark and set parameters
   Benchmark<> benchmark( logFile, loops, verbose );

   // prepare global metadata
   Benchmark<>::MetadataMap metadata = getHardwareMetadata< Logging >();
   // write global metadata into a separate file
   std::map< std::string, std::string > metadata = getHardwareMetadata();
   writeMapAsJson( metadata, logFileName, ".metadata.json" );

   if( precision == "all" || precision == "float" )
      runBlasBenchmarks< float >( benchmark, metadata, minSize, maxSize, sizeStepFactor, elementsPerRow );
      runBlasBenchmarks< float >( benchmark, minSize, maxSize, sizeStepFactor );
   if( precision == "all" || precision == "double" )
      runBlasBenchmarks< double >( benchmark, metadata, minSize, maxSize, sizeStepFactor, elementsPerRow );

   if( ! benchmark.save( logFile ) ) {
      std::cerr << "Failed to write the benchmark results to file '" << logFileName << "'." << std::endl;
      return EXIT_FAILURE;
   }
      runBlasBenchmarks< double >( benchmark, minSize, maxSize, sizeStepFactor );

   return EXIT_SUCCESS;
}
Loading