Merge branch 'JK/benchmarks' into 'develop' (a7aa9460) · Commits · TNL / tnl-dev

scripts/eti.py

0 → 100755

+74 −0

Original line number	Diff line number	Diff line
		#! /usr/bin/env python3

		import os.path
		import pathlib
		import re
		import sys

		# Validate the command line: exactly one argument is expected and it must
		# name an existing regular file.
		if len(sys.argv) != 2:
		    print(f"usage: {sys.argv[0]} FILE\n\nwhere FILE is a C++ source code or header file.", file=sys.stderr)
		    sys.exit(1)
		if not os.path.isfile(sys.argv[1]):
		    print(f"error: {sys.argv[1]} is not a valid file.", file=sys.stderr)
		    sys.exit(1)

		# src:      the input file exactly as given on the command line
		# basename: its file name without directory and without extension
		# dirname:  output directory "<basename>.templates", relative to the
		#           current working directory
		src = sys.argv[1]
		basename = os.path.splitext(os.path.basename(src))[0]
		dirname = f"{basename}.templates"

		# exist_ok=True avoids the check-then-create race of isdir() + mkdir()
		# when several instances of the script start at the same time; it still
		# raises FileExistsError when the path exists but is not a directory.
		os.makedirs(dirname, exist_ok=True)

		def get_source_code(namespaces, extern_template_instantiation):
		    """Return the C++ source of one explicit-instantiation translation unit.

		    The result includes the input file (module-level ``src``), re-opens
		    every namespace in ``namespaces`` (outermost first), emits the
		    declaration with its first ``"extern "`` removed, and closes the
		    namespaces again.

		    Args:
		        namespaces: names of the namespaces enclosing the declaration,
		            ordered from outermost to innermost.
		        extern_template_instantiation: the source line holding the
		            ``extern template ...`` declaration.

		    Returns:
		        The generated source code as a single newline-terminated string.
		    """
		    eti = extern_template_instantiation.strip().replace("extern ", "", 1)
		    if os.path.isabs(src):
		        # use absolute path for the include when src is an absolute path
		        # (e.g. when called by CMake, because relative include does not
		        # work with its separate build dir structure)
		        include_path = src
		    else:
		        # use relative path for the include when src is relative; the
		        # generated file lives in dirname, so the path is relative to it
		        include_path = os.path.relpath(src, dirname)

		    lines = [f"#include \"{include_path}\""]
		    lines.extend(f"namespace {ns} {{" for ns in namespaces)
		    lines.append(eti)
		    # Close innermost first so that every "// namespace X" comment sits on
		    # the brace that really closes X.
		    lines.extend(f"}} // namespace {ns}" for ns in reversed(namespaces))
		    return "\n".join(lines) + "\n"

		def check_write(content, fname):
		    """Write ``content`` to ``fname`` unless the file already holds it.

		    The existing file and the new content are compared after stripping
		    leading and trailing whitespace. When they are equal the file is left
		    untouched (presumably so that build tools do not see a fresh
		    modification time -- confirm against the build setup).

		    Args:
		        content: text to store.
		        fname: path of the target file.
		    """
		    if os.path.isfile(fname):
		        # "with" closes the handle deterministically; the comparison read
		        # used to leave it open.
		        with open(fname, "r") as existing:
		            if existing.read().strip() == content.strip():
		                return

		    with open(fname, "w") as out:
		        out.write(content)

		# Scan src line by line, keep a stack of the currently open namespaces and
		# emit one .cpp and one .cu file for every "extern template" declaration.
		i = 0                 # running index of the declarations found so far
		namespaces = []       # stack of open namespaces, outermost first
		file_names = set()    # every file that belongs to the current output

		# heuristics for namespaces: "namespace NAME {" opens a namespace and
		# "} // namespace NAME" closes one. Leading and trailing whitespace is
		# optional, so declarations at column 0 are recognized as well.
		ns_begin_re = re.compile(r"^\s*namespace\s+(\w+)\s*\{\s*$")
		ns_end_re = re.compile(r"^\s*\}\s*//\s*namespace\s+(\w+)\s*$")

		with open(src) as src_file:
		    for line in src_file:
		        ns_begin = ns_begin_re.search(line)
		        if ns_begin:
		            namespaces.append(ns_begin.group(1))
		        # an unmatched closing comment is ignored instead of raising
		        # IndexError on the empty stack
		        if ns_end_re.search(line) and namespaces:
		            namespaces.pop()

		        if line.strip().startswith("extern template"):
		            source_code = get_source_code(namespaces, line)
		            for ext in ["cpp", "cu"]:
		                fname = f"{dirname}/{basename}.t{i}.{ext}"
		                check_write(source_code, fname)
		                file_names.add(fname)
		            i += 1

		# remove extraneous files from the target directory; the name is rebuilt
		# in the same "dirname/name" form as above so that the comparison does
		# not depend on the platform's path separator
		for path in pathlib.Path(dirname).iterdir():
		    if f"{dirname}/{path.name}" not in file_names:
		        path.unlink()

src/Benchmarks/BLAS/array-operations.h

+3 −7

Original line number	Diff line number	Diff line
		@@ -14,8 +14,7 @@

		#include <cstring>

		#include "../Benchmarks.h"

		#include <TNL/Benchmarks/Benchmarks.h>
		#include <TNL/Containers/Array.h>

		namespace TNL {
		@@ -116,10 +115,7 @@ benchmarkArrayOperations( Benchmark<> & benchmark,
		hostArray = hostArray2;
		};
		benchmark.setOperation( "copy (operator=)", 2 * datasetSize );
		// copyBasetime is used later inside HAVE_CUDA guard, so the compiler will
		// complain when compiling without CUDA
		const double copyBasetime = benchmark.time< Devices::Host >( reset1, "CPU", copyAssignHostHost );
		(void)copyBasetime; // ignore unused variable
		benchmark.time< Devices::Host >( reset1, "CPU", copyAssignHostHost );
		#ifdef HAVE_CUDA
		auto copyAssignCudaCuda = [&]() {
		deviceArray = deviceArray2;
		@@ -135,7 +131,7 @@ benchmarkArrayOperations( Benchmark<> & benchmark,
		auto copyAssignCudaHost = [&]() {
		hostArray = deviceArray;
		};
		benchmark.setOperation( "copy (operator=)", datasetSize, copyBasetime );
		benchmark.setOperation( "copy (operator=)", datasetSize, benchmark.getBaseTime() );
		benchmark.time< Devices::Cuda >( reset1, "CPU->GPU", copyAssignHostCuda );
		benchmark.time< Devices::Cuda >( reset1, "GPU->CPU", copyAssignCudaHost );
		#endif

src/Benchmarks/BLAS/dense-mv.h→src/Benchmarks/BLAS/gemv.h

+18 −45

Original line number	Diff line number	Diff line
		/***************************************************************************
		dense-mv.h - description
		gemv.h - description
		-------------------
		begin : Jul 8, 2021
		copyright : (C) 2021 by Tomas Oberhuber et al.
		@@ -8,15 +8,14 @@

		/* See Copyright Notice in tnl/Copyright */

		// Implemented by: Jakub Klinkovsky
		// Implemented by: Jakub Klinkovsky, Tomas Oberhuber

		#pragma once

		#include "../Benchmarks.h"
		#include <TNL/Benchmarks/Benchmarks.h>
		#include "cublasWrappers.h"

		#include <TNL/Containers/Vector.h>
		#include <TNL/Pointers/DevicePointer.h>
		#include <TNL/Matrices/DenseMatrix.h>
		#include <TNL/Devices/Cuda.h>
		#include <TNL/Devices/Host.h>
		@@ -27,16 +26,12 @@ namespace Benchmarks {
		template< typename Matrix >
		void setMatrix( Matrix& matrix )
		{
		using RealType = typename Matrix::RealType;
		using IndexType = typename Matrix::IndexType;
		matrix.forAllElements( [] __cuda_callable__ ( IndexType rowIdx, IndexType columnIdx, IndexType columnIdx_, RealType& value ) {
		value = 1.0; } );
		matrix.setValue( 1.0 );
		}

		template< typename Real >
		void
		benchmarkDenseMVSynthetic( Benchmark<> & benchmark,
		const int & size )
		benchmarkGemv( Benchmark<> & benchmark, int rows, int columns )
		{
		using HostMatrix = TNL::Matrices::DenseMatrix< Real, TNL::Devices::Host >;
		using RowMajorCudaMatrix = TNL::Matrices::DenseMatrix< Real, TNL::Devices::Cuda, int, TNL::Algorithms::Segments::RowMajorOrder >;
		@@ -50,20 +45,13 @@ benchmarkDenseMVSynthetic( Benchmark<> & benchmark,
		HostVector inHostVector, outHostVector;
		CudaVector inCudaVector, outCudaVector1, outCudaVector2;

		// create benchmark group
		const std::vector< String > parsedType = parseObjectType( getType< HostMatrix >() );
		#ifdef HAVE_CUDA
		benchmark.createHorizontalGroup( parsedType[ 0 ], 2 );
		#else
		benchmark.createHorizontalGroup( parsedType[ 0 ], 1 );
		#endif

		hostMatrix.setDimensions( size, size );
		inHostVector.setSize( size );
		outHostVector.setSize( size );
		hostMatrix.setDimensions( rows, columns );
		inHostVector.setSize( columns );
		outHostVector.setSize( rows );

		setMatrix< HostMatrix >( hostMatrix );
		const double datasetSize = (double) ( size * size ) * ( 2 * sizeof( Real ) + sizeof( int ) ) / oneGB;
		const double datasetSize = (double) ( rows * columns + rows + columns ) * sizeof(Real) / oneGB;
		benchmark.setOperation( "gemv", datasetSize );

		// reset function
		auto reset = [&]() {
		@@ -80,14 +68,13 @@ benchmarkDenseMVSynthetic( Benchmark<> & benchmark,
		auto spmvHost = [&]() {
		hostMatrix.vectorProduct( inHostVector, outHostVector );
		};
		benchmark.setOperation( datasetSize );
		benchmark.time< Devices::Host >( reset, "CPU", spmvHost );

		#ifdef HAVE_CUDA
		columnMajorCudaMatrix.setDimensions( size, size );
		inCudaVector.setSize( size );
		outCudaVector1.setSize( size );
		outCudaVector2.setSize( size );
		columnMajorCudaMatrix.setDimensions( rows, columns );
		inCudaVector.setSize( columns );
		outCudaVector1.setSize( rows );
		outCudaVector2.setSize( rows );
		setMatrix< ColumnMajorCudaMatrix >( columnMajorCudaMatrix );

		auto columnMajorMvCuda = [&]() {
		@@ -97,7 +84,7 @@ benchmarkDenseMVSynthetic( Benchmark<> & benchmark,

		columnMajorCudaMatrix.reset();

		rowMajorCudaMatrix.setDimensions( size, size );
		rowMajorCudaMatrix.setDimensions( rows, columns );
		setMatrix< RowMajorCudaMatrix >( rowMajorCudaMatrix );

		auto rowMajorMvCuda = [&]() {
		@@ -109,7 +96,7 @@ benchmarkDenseMVSynthetic( Benchmark<> & benchmark,
		//std::cerr << outCudaVector1 << std::endl << outCudaVector2 << std::endl;

		rowMajorCudaMatrix.reset();
		columnMajorCudaMatrix.setDimensions( size, size );
		columnMajorCudaMatrix.setDimensions( rows, columns );
		setMatrix< ColumnMajorCudaMatrix >( columnMajorCudaMatrix );

		cublasHandle_t cublasHandle;
		@@ -117,8 +104,8 @@ benchmarkDenseMVSynthetic( Benchmark<> & benchmark,
		auto mvCublas = [&] () {
		Real alpha = 1.0;
		Real beta = 0.0;
		cublasGemv( cublasHandle, CUBLAS_OP_N, size, size, &alpha,
		columnMajorCudaMatrix.getValues().getData(), size,
		cublasGemv( cublasHandle, CUBLAS_OP_N, rows, columns, &alpha,
		columnMajorCudaMatrix.getValues().getData(), rows,
		inCudaVector.getData(), 1, &beta,
		outCudaVector1.getData(), 1 );
		};
		@@ -128,19 +115,5 @@ benchmarkDenseMVSynthetic( Benchmark<> & benchmark,
		#endif
		}

		/*template< typename Real = double,
		typename Index = int >
		void
		benchmarkDenseSynthetic( Benchmark<> & benchmark,
		const int & size )
		{
		// TODO: benchmark all formats from tnl-benchmark-spmv (different parameters of the base formats)
		// NOTE: CSR is disabled because it is very slow on GPU
		//benchmarkSpMV< Real, SparseMatrixLegacy_CSR_Scalar >( benchmark, size, elementsPerRow );
		benchmarkSpMV< Real, Benchmarks::SpMV::ReferenceFormats::Legacy::Ellpack >( benchmark, size, elementsPerRow );
		benchmarkSpMV< Real, SlicedEllpack >( benchmark, size, elementsPerRow );
		benchmarkSpMV< Real, Benchmarks::SpMV::ReferenceFormats::Legacy::ChunkedEllpack >( benchmark, size, elementsPerRow );
		}*/

		} // namespace Benchmarks
		} // namespace TNL

src/Benchmarks/BLAS/spmv.h

deleted100644 → 0

+0 −189

Original line number	Diff line number	Diff line
		/***************************************************************************
		spmv.h - description
		-------------------
		begin : Dec 30, 2015
		copyright : (C) 2015 by Tomas Oberhuber et al.
		email : tomas.oberhuber@fjfi.cvut.cz
		***************************************************************************/

		/* See Copyright Notice in tnl/Copyright */

		// Implemented by: Jakub Klinkovsky

		#pragma once

		#include "../Benchmarks.h"

		#include <TNL/Pointers/DevicePointer.h>
		#include <Benchmarks/SpMV/ReferenceFormats/Legacy/CSR.h>
		#include <Benchmarks/SpMV/ReferenceFormats/Legacy/Ellpack.h>
		#include <Benchmarks/SpMV/ReferenceFormats/Legacy/SlicedEllpack.h>
		#include <Benchmarks/SpMV/ReferenceFormats/Legacy/ChunkedEllpack.h>

		namespace TNL {
		namespace Benchmarks {

		// silly alias to match the number of template parameters with other formats
		template< typename Real, typename Device, typename Index >
		using SlicedEllpack = SpMV::ReferenceFormats::Legacy::SlicedEllpack< Real, Device, Index >;

		// Legacy formats
		template< typename Real, typename Device, typename Index >
		using SparseMatrixLegacy_CSR_Scalar = SpMV::ReferenceFormats::Legacy::CSR< Real, Device, Index, SpMV::ReferenceFormats::Legacy::CSRScalar >;


		// Fills a band of up to `elementsPerRow` entries around the diagonal of
		// `matrix` using matrix.setElement(). In every row the band starts at
		// column (row - elementsPerRow / 2); the k-th entry of the band gets the
		// value k + 1, and entries whose column falls outside [0, getRows()) are
		// skipped. Returns the number of entries that were actually set.
		template< typename Matrix >
		int setHostTestMatrix( Matrix& matrix,
		                       const int elementsPerRow )
		{
		   const int rowCount = matrix.getRows();
		   int stored = 0;
		   for( int row = 0; row < rowCount; ++row ) {
		      const int firstColumn = row - elementsPerRow / 2;
		      for( int offset = 0; offset < elementsPerRow; ++offset ) {
		         const int column = firstColumn + offset;
		         // the column range is checked against the number of rows
		         if( column < 0 || column >= rowCount )
		            continue;
		         matrix.setElement( row, column, offset + 1 );
		         ++stored;
		      }
		   }
		   return stored;
		}

		#ifdef HAVE_CUDA
		// CUDA kernel: each thread handles one matrix row and sets up to
		// `elementsPerRow` entries of that row with setElementFast().
		//   matrix         - pointer to the matrix to fill (dereferenced by the kernel)
		//   elementsPerRow - number of entries attempted per row
		//   gridIdx        - index of the current launch; it offsets the row index
		//                    by gridIdx * Cuda::getMaxGridSize() blocks
		template< typename Matrix >
		__global__ void setCudaTestMatrixKernel( Matrix* matrix,
		const int elementsPerRow,
		const int gridIdx )
		{
		// global row index of this thread across all launches
		const int rowIdx = ( gridIdx * Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
		// threads beyond the last row have nothing to do
		if( rowIdx >= matrix->getRows() )
		return;
		// first column of the band for this row
		int col = rowIdx - elementsPerRow / 2;
		for( int element = 0; element < elementsPerRow; element++ ) {
		// skip entries whose column is outside [0, getColumns())
		if( col + element >= 0 &&
		col + element < matrix->getColumns() )
		matrix->setElementFast( rowIdx, col + element, element + 1 );
		}
		}
		#endif

		// Fills `matrix` on the CUDA device by launching setCudaTestMatrixKernel
		// with 256 threads per block, one thread per row. The blocks are split
		// into as many launches as needed so that no launch exceeds
		// Cuda::getMaxGridSize() blocks. Without HAVE_CUDA the function does
		// nothing.
		//   matrix         - matrix to fill
		//   elementsPerRow - forwarded to the kernel
		template< typename Matrix >
		void setCudaTestMatrix( Matrix& matrix,
		                        const int elementsPerRow )
		{
		#ifdef HAVE_CUDA
		   typedef typename Matrix::IndexType IndexType;
		   Pointers::DevicePointer< Matrix > kernel_matrix( matrix );
		   dim3 cudaBlockSize( 256 ), cudaGridSize( Cuda::getMaxGridSize() );
		   const IndexType cudaBlocks = roundUpDivision( matrix.getRows(), cudaBlockSize.x );
		   const IndexType cudaGrids = roundUpDivision( cudaBlocks, Cuda::getMaxGridSize() );
		   for( IndexType gridIdx = 0; gridIdx < cudaGrids; gridIdx++ ) {
		      // The last launch covers the blocks that are left. Computing the
		      // remainder by subtraction (instead of cudaBlocks % maxGridSize)
		      // keeps the grid size non-zero when cudaBlocks is an exact
		      // multiple of the maximum grid size.
		      if( gridIdx == cudaGrids - 1 )
		         cudaGridSize.x = cudaBlocks - gridIdx * Cuda::getMaxGridSize();
		      setCudaTestMatrixKernel< Matrix >
		         <<< cudaGridSize, cudaBlockSize >>>
		         ( &kernel_matrix.template modifyData< Devices::Cuda >(), elementsPerRow, gridIdx );
		      TNL_CHECK_CUDA_DEVICE;
		   }
		#endif
		}


		// TODO: rename as benchmark_SpMV_synthetic and move to spmv-synthetic.h
		// Times Matrix::vectorProduct for a size x size matrix of the given format
		// on the host and, when HAVE_CUDA is defined, on the CUDA device.
		//   Real           - value type of the matrix and of the vectors
		//   Matrix         - matrix format template taking (Real, Device, Index)
		//   benchmark      - benchmark object receiving the group, the operation
		//                    and the timings
		//   size           - number of rows and columns, and the vector size
		//   elementsPerRow - row length requested for every row (default 5)
		template< typename Real,
		template< typename, typename, typename > class Matrix >
		void
		benchmarkSpMV( Benchmark<> & benchmark,
		const int & size,
		const int elementsPerRow = 5 )
		{
		typedef Matrix< Real, Devices::Host, int > HostMatrix;
		typedef Matrix< Real, Devices::Cuda, int > DeviceMatrix;
		typedef Containers::Vector< Real, Devices::Host, int > HostVector;
		typedef Containers::Vector< Real, Devices::Cuda, int > CudaVector;

		HostMatrix hostMatrix;
		DeviceMatrix deviceMatrix;
		Containers::Vector< int, Devices::Host, int > hostRowLengths;
		Containers::Vector< int, Devices::Cuda, int > deviceRowLengths;
		HostVector hostVector, hostVector2;
		CudaVector deviceVector, deviceVector2;

		// create benchmark group
		// (named after the first entry of the parsed host matrix type; the second
		// argument is 2 with CUDA and 1 without -- presumably one entry per timed
		// device, confirm against Benchmark::createHorizontalGroup)
		const std::vector< String > parsedType = parseObjectType( getType< HostMatrix >() );
		#ifdef HAVE_CUDA
		benchmark.createHorizontalGroup( parsedType[ 0 ], 2 );
		#else
		benchmark.createHorizontalGroup( parsedType[ 0 ], 1 );
		#endif

		// allocate the matrices, the vectors and the row-length vectors
		hostRowLengths.setSize( size );
		hostMatrix.setDimensions( size, size );
		hostVector.setSize( size );
		hostVector2.setSize( size );
		#ifdef HAVE_CUDA
		deviceRowLengths.setSize( size );
		deviceMatrix.setDimensions( size, size );
		deviceVector.setSize( size );
		deviceVector2.setSize( size );
		#endif

		// every row gets the same length
		hostRowLengths.setValue( elementsPerRow );
		#ifdef HAVE_CUDA
		deviceRowLengths.setValue( elementsPerRow );
		#endif

		hostMatrix.setCompressedRowLengths( hostRowLengths );
		#ifdef HAVE_CUDA
		deviceMatrix.setCompressedRowLengths( deviceRowLengths );
		#endif

		// fill both matrices; setCudaTestMatrix has an empty body without HAVE_CUDA
		const int elements = setHostTestMatrix< HostMatrix >( hostMatrix, elementsPerRow );
		setCudaTestMatrix< DeviceMatrix >( deviceMatrix, elementsPerRow );
		// dataset size in units of oneGB: two Real values and one int are counted
		// per element set in the host matrix
		const double datasetSize = (double) elements * ( 2 * sizeof( Real ) + sizeof( int ) ) / oneGB;

		// reset function
		// (passed to benchmark.time together with the compute function; it sets
		// the input vectors to 1 and the output vectors to 0)
		auto reset = [&]() {
		hostVector.setValue( 1.0 );
		hostVector2.setValue( 0.0 );
		#ifdef HAVE_CUDA
		deviceVector.setValue( 1.0 );
		deviceVector2.setValue( 0.0 );
		#endif
		};

		// compute functions
		auto spmvHost = [&]() {
		hostMatrix.vectorProduct( hostVector, hostVector2 );
		};
		benchmark.setOperation( datasetSize );
		benchmark.time< Devices::Host >( reset, "CPU", spmvHost );
		#ifdef HAVE_CUDA
		auto spmvCuda = [&]() {
		deviceMatrix.vectorProduct( deviceVector, deviceVector2 );
		};
		benchmark.time< Devices::Cuda >( reset, "GPU", spmvCuda );
		#endif
		}

		// Runs benchmarkSpMV for the Ellpack, SlicedEllpack and ChunkedEllpack
		// legacy formats, in that order, with the given matrix size and row length.
		// The Index template parameter is not used by the body.
		template< typename Real = double,
		          typename Index = int >
		void
		benchmarkSpmvSynthetic( Benchmark<> & benchmark,
		                        const int & size,
		                        const int & elementsPerRow )
		{
		   // local shorthand for the namespace holding the legacy reference formats
		   namespace Legacy = Benchmarks::SpMV::ReferenceFormats::Legacy;

		   // TODO: benchmark all formats from tnl-benchmark-spmv (different parameters of the base formats)
		   // NOTE: CSR (SparseMatrixLegacy_CSR_Scalar) is not benchmarked here because it is very slow on GPU
		   benchmarkSpMV< Real, Legacy::Ellpack >( benchmark, size, elementsPerRow );
		   benchmarkSpMV< Real, SlicedEllpack >( benchmark, size, elementsPerRow );
		   benchmarkSpMV< Real, Legacy::ChunkedEllpack >( benchmark, size, elementsPerRow );
		}

		} // namespace Benchmarks
		} // namespace TNL

src/Benchmarks/BLAS/tnl-benchmark-blas.h

+38 −52

Original line number	Diff line number	Diff line
		@@ -21,8 +21,7 @@
		#include "array-operations.h"
		#include "vector-operations.h"
		#include "triad.h"
		#include "spmv.h"
		#include "dense-mv.h"
		#include "gemv.h"


		using namespace TNL;
		@@ -32,37 +31,39 @@ using namespace TNL::Benchmarks;
		template< typename Real >
		void
		runBlasBenchmarks( Benchmark<> & benchmark,
		Benchmark<>::MetadataMap metadata,
		const std::size_t & minSize,
		const std::size_t & maxSize,
		const double & sizeStepFactor,
		const int & elementsPerRow )
		const double & sizeStepFactor )
		{
		const String precision = getType< Real >();
		metadata["precision"] = precision;
		benchmark.setMetadataWidths({
		{ "operation", 30 },
		{ "performer", 21 },
		{ "precision", 10 },
		});

		// Array operations
		benchmark.newBenchmark( String("Array operations (") + precision + ", host allocator = Host)",
		metadata );
		std::cout << "\n== Array operations ==\n" << std::endl;
		for( std::size_t size = minSize; size <= maxSize; size *= 2 ) {
		benchmark.setMetadataColumns( Benchmark<>::MetadataColumns({
		{ "precision", getType< Real >() },
		{ "host allocator", "Host" },
		{ "size", convertToString( size ) },
		} ));
		benchmarkArrayOperations< Real >( benchmark, size );
		}
		#ifdef HAVE_CUDA
		benchmark.newBenchmark( String("Array operations (") + precision + ", host allocator = CudaHost)",
		metadata );
		for( std::size_t size = minSize; size <= maxSize; size *= 2 ) {
		benchmark.setMetadataColumns( Benchmark<>::MetadataColumns({
		{ "precision", getType< Real >() },
		{ "host allocator", "CudaHost" },
		{ "size", convertToString( size ) },
		} ));
		benchmarkArrayOperations< Real, int, Allocators::CudaHost >( benchmark, size );
		}
		benchmark.newBenchmark( String("Array operations (") + precision + ", host allocator = CudaManaged)",
		metadata );
		for( std::size_t size = minSize; size <= maxSize; size *= 2 ) {
		benchmark.setMetadataColumns( Benchmark<>::MetadataColumns({
		{ "precision", getType< Real >() },
		{ "host allocator", "CudaManaged" },
		{ "size", convertToString( size ) },
		} ));
		benchmarkArrayOperations< Real, int, Allocators::CudaManaged >( benchmark, size );
		@@ -70,10 +71,10 @@ runBlasBenchmarks( Benchmark<> & benchmark,
		#endif

		// Vector operations
		benchmark.newBenchmark( String("Vector operations (") + precision + ")",
		metadata );
		std::cout << "\n== Vector operations ==\n" << std::endl;
		for( std::size_t size = minSize; size <= maxSize; size *= sizeStepFactor ) {
		benchmark.setMetadataColumns( Benchmark<>::MetadataColumns({
		{ "precision", getType< Real >() },
		{ "size", convertToString( size ) },
		} ));
		benchmarkVectorOperations< Real >( benchmark, size );
		@@ -81,39 +82,30 @@ runBlasBenchmarks( Benchmark<> & benchmark,

		// Triad benchmark: copy from host, compute, copy to host
		#ifdef HAVE_CUDA
		benchmark.newBenchmark( String("Triad benchmark (") + precision + ")",
		metadata );
		std::cout << "\n== Triad ==\n" << std::endl;
		for( std::size_t size = minSize; size <= maxSize; size *= 2 ) {
		benchmark.setMetadataColumns( Benchmark<>::MetadataColumns({
		{ "precision", getType< Real >() },
		{ "size", convertToString( size ) },
		} ));
		benchmarkTriad< Real >( benchmark, size );
		}
		#endif

		// Sparse matrix-vector multiplication
		benchmark.newBenchmark( String("Sparse matrix-vector multiplication (") + precision + ")",
		metadata );
		for( std::size_t size = minSize; size <= maxSize; size *= 2 ) {
		benchmark.setMetadataColumns( Benchmark<>::MetadataColumns({
		{ "rows", convertToString( size ) },
		{ "columns", convertToString( size ) },
		{ "elements per row", convertToString( elementsPerRow ) },
		} ));
		benchmarkSpmvSynthetic< Real >( benchmark, size, elementsPerRow );
		}

		// Dense matrix-vector multiplication
		benchmark.newBenchmark( String("Dense matrix-vector multiplication (") + precision + ")",
		metadata );
		for( std::size_t size = 10; size <= 20000; size *= 2 ) {
		std::cout << "\n== Dense matrix-vector multiplication ==\n" << std::endl;
		for( std::size_t rows = 10; rows <= 20000 * 20000; rows *= 2 ) {
		for( std::size_t columns = 10; columns <= 20000 * 20000; columns *= 2 ) {
		if( rows * columns > 20000 * 20000 )
		break;
		benchmark.setMetadataColumns( Benchmark<>::MetadataColumns({
		{ "rows", convertToString( size ) },
		{ "columns", convertToString( size ) }
		{ "precision", getType< Real >() },
		{ "rows", convertToString( rows ) },
		{ "columns", convertToString( columns ) }
		} ));
		benchmarkDenseMVSynthetic< Real >( benchmark, size );
		benchmarkGemv< Real >( benchmark, rows, columns );
		}
		}

		}

		void
		@@ -132,7 +124,6 @@ setupConfig( Config::ConfigDescription & config )
		config.addEntry< int >( "max-size", "Minimum size of arrays/vectors used in the benchmark.", 10000000 );
		config.addEntry< int >( "size-step-factor", "Factor determining the size of arrays/vectors used in the benchmark. First size is min-size and each following size is stepFactor*previousSize, up to max-size.", 2 );
		config.addEntry< int >( "loops", "Number of iterations for every computation.", 10 );
		config.addEntry< int >( "elements-per-row", "Number of elements per row of the sparse matrix used in the matrix-vector multiplication benchmark.", 5 );
		config.addEntry< int >( "verbose", "Verbose mode.", 1 );

		config.addDelimiter( "Device settings:" );
		@@ -167,7 +158,6 @@ main( int argc, char* argv[] )
		const std::size_t maxSize = parameters.getParameter< int >( "max-size" );
		const int sizeStepFactor = parameters.getParameter< int >( "size-step-factor" );
		const int loops = parameters.getParameter< int >( "loops" );
		const int elementsPerRow = parameters.getParameter< int >( "elements-per-row" );
		const int verbose = parameters.getParameter< int >( "verbose" );

		if( sizeStepFactor <= 1 ) {
		@@ -179,23 +169,19 @@ main( int argc, char* argv[] )
		auto mode = std::ios::out;
		if( outputMode == "append" )
		mode |= std::ios::app;
		std::ofstream logFile( logFileName.getString(), mode );
		std::ofstream logFile( logFileName, mode );

		// init benchmark and common metadata
		Benchmark<> benchmark( loops, verbose );
		// init benchmark and set parameters
		Benchmark<> benchmark( logFile, loops, verbose );

		// prepare global metadata
		Benchmark<>::MetadataMap metadata = getHardwareMetadata< Logging >();
		// write global metadata into a separate file
		std::map< std::string, std::string > metadata = getHardwareMetadata();
		writeMapAsJson( metadata, logFileName, ".metadata.json" );

		if( precision == "all" || precision == "float" )
		runBlasBenchmarks< float >( benchmark, metadata, minSize, maxSize, sizeStepFactor, elementsPerRow );
		runBlasBenchmarks< float >( benchmark, minSize, maxSize, sizeStepFactor );
		if( precision == "all" || precision == "double" )
		runBlasBenchmarks< double >( benchmark, metadata, minSize, maxSize, sizeStepFactor, elementsPerRow );

		if( ! benchmark.save( logFile ) ) {
		std::cerr << "Failed to write the benchmark results to file '" << logFileName << "'." << std::endl;
		return EXIT_FAILURE;
		}
		runBlasBenchmarks< double >( benchmark, minSize, maxSize, sizeStepFactor );

		return EXIT_SUCCESS;
		}