Benchmarks: rewritten objectively, with support for output into a log file (77e7a205) · Commits · TNL / tnl-dev

tests/benchmarks/array-operations.h

0 → 100644

+92 −0

Original line number	Diff line number	Diff line
		#pragma once

		#include "benchmarks.h"

		#include <core/arrays/tnlArray.h>

		namespace tnl
		{
		namespace benchmarks
		{

		// Benchmarks basic tnlArray operations on the host and on the CUDA device:
		// comparison (operator==), copy-assignment within one device and
		// copy-assignment across devices. Results are recorded through `benchmark`.
		//
		// benchmark - receives setOperation() and time() calls for every operation
		// loops     - number of repetitions; used here only to compute the dataset size
		// size      - number of elements of each benchmarked array
		//
		// Returns false when setSize()/setLike() fails for a device array,
		// true after all operations have been timed.
		template< typename Real = double,
		          typename Index = int >
		bool
		benchmarkArrayOperations( Benchmark & benchmark,
		                          const int & loops,
		                          const int & size )
		{
		   using HostArray = tnlArray< Real, tnlHost, Index >;
		   using CudaArray = tnlArray< Real, tnlCuda, Index >;

		   // Convert to double before multiplying, loops * size may not fit into int.
		   const double datasetSize = static_cast< double >( loops ) * size * sizeof( Real ) / oneGB;

		   HostArray hostArray, hostArray2;
		   CudaArray deviceArray, deviceArray2;
		   hostArray.setSize( size );
		   if( ! deviceArray.setSize( size ) )
		      return false;
		   hostArray2.setLike( hostArray );
		   if( ! deviceArray2.setLike( deviceArray ) )
		      return false;

		   // sinks for the comparison results, written by the lambdas below
		   Real resultHost = 0, resultDevice = 0;


		   // reset functions
		   auto reset1 = [&]() {
		      hostArray.setValue( 1.0 );
		      deviceArray.setValue( 1.0 );
		   };
		   auto reset2 = [&]() {
		      hostArray2.setValue( 1.0 );
		      deviceArray2.setValue( 1.0 );
		   };
		   auto reset12 = [&]() {
		      reset1();
		      reset2();
		   };


		   reset12();


		   // The parentheses are essential: a cast binds tighter than ==, so without
		   // them the array itself would be cast instead of the comparison result.
		   auto compareHost = [&]() {
		      resultHost = static_cast< int >( hostArray == hostArray2 );
		   };
		   auto compareCuda = [&]() {
		      resultDevice = static_cast< int >( deviceArray == deviceArray2 );
		   };
		   benchmark.setOperation( "comparison (operator==)", 2 * datasetSize );
		   benchmark.time( reset1,
		                   "CPU", compareHost,
		                   "GPU", compareCuda );


		   auto copyAssignHostHost = [&]() {
		      hostArray = hostArray2;
		   };
		   auto copyAssignCudaCuda = [&]() {
		      deviceArray = deviceArray2;
		   };
		   benchmark.setOperation( "copy (operator=)", 2 * datasetSize );
		   // the value returned by time() is reused as the baseline of the cross-device copies
		   const double basetime = benchmark.time( reset1,
		                                           "CPU", copyAssignHostHost,
		                                           "GPU", copyAssignCudaCuda );


		   auto copyAssignHostCuda = [&]() {
		      deviceArray = hostArray;
		   };
		   auto copyAssignCudaHost = [&]() {
		      hostArray = deviceArray;
		   };
		   benchmark.setOperation( "copy (operator=)", datasetSize, basetime );
		   benchmark.time( reset1,
		                   "CPU->GPU", copyAssignHostCuda,
		                   "GPU->CPU", copyAssignCudaHost );

		   // the function is declared bool, so it must not flow off the end
		   return true;
		}

		} // namespace benchmarks
		} // namespace tnl

tests/benchmarks/benchmarks.h

+325 −62

Original line number	Diff line number	Diff line
		@@ -2,8 +2,11 @@

		#include <iostream>
		#include <iomanip>
		#include <map>
		#include <vector>

		#include <core/tnlTimerRT.h>
		#include <core/tnlString.h>

		namespace tnl
		{
		@@ -17,18 +20,15 @@ template< typename ComputeFunction,
		double
		timeFunction( ComputeFunction compute,
		ResetFunction reset,
		const int & loops,
		const double & datasetSize, // in GB
		const double & baseTime, // in seconds (baseline for speedup calculation)
		const char* performer )
		const int & loops )
		{
		// the timer is constructed zero-initialized and stopped
		tnlTimerRT timer;

		reset();
		for(int i = 0; i < loops; ++i) {
		// TODO: not necessary for host computations
		// Explicit synchronization of the CUDA device
		// TODO: not necessary for host computations
		#ifdef HAVE_CUDA
		cudaDeviceSynchronize();
		#endif
		@@ -42,70 +42,333 @@ timeFunction( ComputeFunction compute,
		reset();
		}

		const double time = timer.getTime();
		const double bandwidth = datasetSize / time;
		return timer.getTime();
		}


		// Empty tag type; nothing in the visible code uses it
		// (presumably meant to be thrown on internal errors -- TODO confirm).
		struct InternalError {};


		class Logging
		{
		public:
		using MetadataElement = std::pair< const char*, tnlString >;
		using MetadataMap = std::map< const char*, tnlString >;
		using MetadataColumns = std::vector<MetadataElement>;

		using HeaderElements = std::initializer_list< tnlString >;
		using RowElements = std::initializer_list< double >;

		// Creates a logger. The `verbose` flag is consulted by the write* methods
		// before they print anything to stdout.
		Logging( bool verbose = true )
		   : verbose{ verbose }
		{}

		// Prints the title banner to stdout (verbose mode only) and appends
		// a ": title = <title>" record to the log buffer.
		// TODO: fix spacing (blank lines)
		void
		writeTitle( const tnlString & title )
		{
		   if( verbose ) {
		      std::cout << std::endl;
		      std::cout << "== " << title << " ==" << std::endl;
		      std::cout << std::endl;
		   }
		   log << ": title = " << title << std::endl;
		}

		// Appends every metadata entry to the log buffer as ": <key> = <value>".
		// In verbose mode the entries are also listed on stdout under
		// a "properties:" heading, followed by a blank line.
		void
		writeMetadata( const MetadataMap & metadata )
		{
		   if( verbose )
		      std::cout << "properties:" << std::endl;

		   for( auto & entry : metadata ) {
		      const auto & key = entry.first;
		      const auto & value = entry.second;
		      if( verbose )
		         std::cout << " " << key << " = " << value << std::endl;
		      log << ": " << key << " = " << value << std::endl;
		   }

		   if( verbose )
		      std::cout << std::endl;
		}

		// Writes the header for one table block.
		//
		// stdout (only in verbose mode and only when header_changed is set):
		// one line with the metadata column names, an empty cell in place of the
		// spanning element, and the sub-element names.
		//
		// log buffer (always): an empty line, then one line per header entry, each
		// prefixed with a run of '!' characters (header_indent) -- metadata column
		// names, the stacked horizontal groups that still have sub-columns left,
		// the spanning element, and finally the sub-elements one '!' deeper.
		//
		// spanningElement - name of the column spanning `subElements`
		// subElements     - names of the sub-columns
		void
		writeTableHeader( const tnlString & spanningElement,
		                  const HeaderElements & subElements )
		{
		   using namespace std;

		   if( verbose && header_changed ) {
		      for( auto & it : metadataColumns ) {
		         cout << setw( 20 ) << it.first;
		      }

		      // spanning element is printed as usual column to stdout,
		      // but is excluded from header
		      cout << setw( 15 ) << "";

		      for( auto & it : subElements ) {
		         cout << setw( 15 ) << it;
		      }
		      cout << endl;

		      header_changed = false;
		   }

		   // initial indent string
		   header_indent = "!";
		   log << endl;
		   for( auto & it : metadataColumns ) {
		      log << header_indent << " " << it.first << endl;
		   }

		   // Drop exhausted spanning columns from the top of the stack. Both
		   // containers are checked for emptiness: back()/pop_back() on an empty
		   // vector or string is undefined behaviour.
		   while( ! horizontalGroups.empty() && horizontalGroups.back().second <= 0 ) {
		      horizontalGroups.pop_back();
		      if( ! header_indent.empty() )
		         header_indent.pop_back();
		   }

		   // dump stacked spanning columns
		   for( const auto & group : horizontalGroups ) {
		      if( group.second > 0 ) {
		         log << header_indent << " " << group.first << endl;
		         header_indent += "!";
		      }
		   }

		   log << header_indent << " " << spanningElement << endl;
		   for( auto & it : subElements ) {
		      log << header_indent << "! " << it << endl;
		   }

		   // one sub-column of the innermost group has been consumed
		   if( ! horizontalGroups.empty() ) {
		      horizontalGroups.back().second--;
		      header_indent.pop_back();
		   }
		}

		// Writes one table row.
		//
		// stdout (verbose mode only): the metadata column values, the spanning
		// element and the sub-element values on a single line.
		// log buffer (always): every metadata column value on its own line,
		// followed by every sub-element value on its own indented line.
		// A value equal to 0.0 is written as "N/A" in both outputs.
		void
		writeTableRow( const tnlString & spanningElement,
		               const RowElements & subElements )
		{
		   using namespace std;

		   if( verbose ) {
		      for( auto & column : metadataColumns )
		         cout << setw( 20 ) << column.second;

		      // spanning element is printed as usual column to stdout
		      cout << setw( 15 ) << spanningElement;

		      for( auto & value : subElements ) {
		         cout << setw( 15 );
		         if( value == 0.0 )
		            cout << "N/A";
		         else
		            cout << value;
		      }
		      cout << endl;
		   }

		   // print each element on separate line
		   for( auto & column : metadataColumns )
		      log << column.second << endl;

		   // benchmark data are indented
		   const tnlString indent = " ";
		   for( auto & value : subElements ) {
		      log << indent;
		      if( value == 0.0 )
		         log << "N/A";
		      else
		         log << value;
		      log << endl;
		   }
		}

		// Ends the current table: empties both indent strings and marks the
		// header as changed.
		void
		closeTable()
		{
		   body_indent.clear();
		   header_indent.clear();
		   header_changed = true;
		}

		// Closes the current table and writes the accumulated log into `logFile`.
		//
		// Returns true when the stream is still good() after the write; in that
		// case the internal buffer is emptied, so the same data is not written
		// again by a later save(). Returns false otherwise and keeps the buffer.
		bool save( std::ostream & logFile )
		{
		   closeTable();
		   logFile << log.str();
		   if( logFile.good() ) {
		      // str() without arguments returns a copy, assigning to it has no
		      // effect; the setter overload is needed to reset the buffer.
		      log.str( "" );
		      return true;
		   }
		   return false;
		}

		protected:

		// Formats a double into a tnlString through a string stream.
		// num       - the number to format
		// precision - passed to std::setprecision when non-zero
		// fixed     - selects std::fixed notation when true
		static tnlString
		_to_string( const double & num, const int & precision = 0, bool fixed = false )
		{
		   std::stringstream stream;
		   if( fixed )
		      stream << std::fixed;
		   if( precision != 0 )
		      stream << std::setprecision( precision );
		   stream << num;
		   const std::string text = stream.str();
		   return tnlString( text.data() );
		}

		std::stringstream log;
		std::string header_indent;
		std::string body_indent;

		bool verbose;
		MetadataColumns metadataColumns;
		bool header_changed = true;
		std::vector< std::pair< tnlString, int > > horizontalGroups;
		};


		class Benchmark
		: protected Logging
		{
		public:
		using Logging::MetadataElement;
		using Logging::MetadataMap;
		using Logging::MetadataColumns;

		// Creates a benchmark which stores `loops` and forwards `verbose`
		// to the Logging base class.
		Benchmark( const int & loops = 10,
		           bool verbose = true )
		   : Logging( verbose ),
		     loops( loops )
		{}

		// TODO: ensure that this is not called in the middle of the benchmark
		// (or just remove it completely?)
		void
		setLoops( const int & loops )
		{
		this->loops = loops;
		}

		// Marks the start of a new benchmark
		void
		newBenchmark( const tnlString & title )
		{
		closeTable();
		writeTitle( title );
		}

		// Marks the start of a new benchmark (with custom metadata)
		void
		newBenchmark( const tnlString & title,
		MetadataMap metadata )
		{
		closeTable();
		writeTitle( title );
		// add loops to metadata
		metadata["loops"] = tnlString(loops);
		writeMetadata( metadata );
		}

		// Stores the metadata columns -- their values are used for all subsequent
		// rows until the next call to this function. The header is marked as
		// changed when the new columns differ from the stored ones.
		void
		setMetadataColumns( const MetadataColumns & metadata )
		{
		   const bool differs = ( metadataColumns != metadata );
		   metadataColumns = metadata;
		   if( differs )
		      header_changed = true;
		}

		// TODO: maybe should be renamed to createVerticalGroup and ensured that vertical and horizontal groups are not used within the same "Benchmark"
		// Sets the current operation -- operations expand the table vertically.
		// The name is kept in the first metadata column, called "operation":
		// the column is updated when it already exists and inserted at the
		// front otherwise. The header is always marked as changed.
		// - baseTime should be reset to 0.0 for most operations, but sometimes
		// it is useful to override it
		// - Order of operations inside a "Benchmark" does not matter, rows can be
		// easily sorted while converting to HTML.
		void
		setOperation( const tnlString & operation,
		              const double & datasetSize = 0.0, // in GB
		              const double & baseTime = 0.0 )
		{
		   const bool hasOperationColumn =
		      ! metadataColumns.empty() && tnlString( metadataColumns[ 0 ].first ) == "operation";

		   if( hasOperationColumn )
		      metadataColumns.front().second = operation;
		   else
		      metadataColumns.insert( metadataColumns.begin(), {"operation", operation} );

		   setOperation( datasetSize, baseTime );
		   header_changed = true;
		}

		void
		setOperation( const double & datasetSize = 0.0,
		const double & baseTime = 0.0 )
		{
		this->datasetSize = datasetSize;
		this->baseTime = baseTime;
		}

		// Creates new horizontal groups inside a benchmark -- increases the number
		// of columns in the "Benchmark", implies column spanning.
		// (Useful e.g. for SpMV formats, different configurations etc.)
		//
		// A new group is pushed when the stack is empty, or when the top group has
		// a different name and still has sub-columns left; otherwise the top group
		// is overwritten with the given name and sub-column count.
		void
		createHorizontalGroup( const tnlString & name,
		                       const int & subcolumns )
		{
		   const bool pushNew =
		      horizontalGroups.empty()
		      || ( horizontalGroups.back().first != name && horizontalGroups.back().second > 0 );

		   if( pushNew ) {
		      horizontalGroups.push_back( {name, subcolumns} );
		      return;
		   }

		   auto & top = horizontalGroups.back();
		   top.first = name;
		   top.second = subcolumns;
		}

		// NOTE(review): this span appears to interleave two versions of the code --
		// the benchmarkNextOperation signature and the two benchmarkNextOperation
		// calls look like leftovers of the previous implementation, while the
		// time( reset, performer, compute ) signature and the statements from
		// timeFunction( compute, reset, loops ) onwards form the current one.
		// Confirm against the repository before relying on it.
		//
		// Recursive template function to deal with benchmarks involving multiple computations
		// Times a single ComputeFunction. Subsequent calls implicitly split
		// the current "horizontal group" into sub-columns identified by
		// "performer", which are further split into "bandwidth", "time" and
		// "speedup" columns.
		// TODO: allow custom columns bound to lambda functions (e.g. for Gflops calculation)
		// Also terminates the recursion of the following variadic template.
		//
		// Returns the baseline time (this->baseTime), not the time measured by this call.
		template< typename ResetFunction,
		typename ComputeFunction,
		typename... NextComputations >
		inline void
		benchmarkNextOperation( const double & datasetSize,
		const int & loops,
		ResetFunction reset,
		const double & baseTime,
		const char* performer,
		ComputeFunction compute,
		NextComputations & ... nextComputations )
		typename ComputeFunction >
		double
		time( ResetFunction reset,
		const tnlString & performer,
		ComputeFunction & compute )
		{
		benchmarkNextOperation( datasetSize, loops, reset, baseTime, performer, compute );
		benchmarkNextOperation( datasetSize, loops, reset, baseTime, nextComputations... );
		// measured time as returned by timeFunction
		const double time = timeFunction( compute, reset, loops );
		const double bandwidth = datasetSize / time;
		// computed before the baseline is updated below, so it is 0 while no
		// baseline is set (for a non-zero time); writeTableRow prints 0.0 as "N/A"
		const double speedup = this->baseTime / time;
		// the first timing taken while the baseline is 0.0 becomes the baseline
		if( this->baseTime == 0.0 )
		this->baseTime = time;

		writeTableHeader( performer, HeaderElements({"bandwidth", "time", "speedup"}) );
		writeTableRow( performer, RowElements({ bandwidth, time, speedup }) );

		return this->baseTime;
		}

		// NOTE(review): this span appears to interleave two versions of the code --
		// the benchmarkOperation signature and the statements up to
		// std::cout << std::endl; look like leftovers of the previous
		// implementation, while the variadic time( ... ) signature and the last
		// three statements form the current one. Confirm against the repository.
		//
		// Main function for benchmarking
		// Recursive template function to deal with multiple computations with the
		// same reset function.
		// The arguments after `reset` are consumed as (performer, compute) pairs:
		// the first pair is timed, the remaining arguments are passed to a further
		// time() call. Returns the baseline time (this->baseTime).
		template< typename ResetFunction,
		typename ComputeFunction,
		typename... NextComputations >
		void
		benchmarkOperation( const char* operation,
		const double & datasetSize,
		const int & loops,
		ResetFunction reset,
		const char* performer,
		ComputeFunction computeBase,
		NextComputations... nextComputations )
		inline double
		time( ResetFunction reset,
		const tnlString & performer,
		ComputeFunction & compute,
		NextComputations & ... nextComputations )
		{
		cout << "Benchmarking " << operation << ":" << endl;
		double baseTime = timeFunction( computeBase, reset, loops, datasetSize, 0.0, performer );
		benchmarkNextOperation( datasetSize, loops, reset, baseTime, nextComputations... );
		std::cout << std::endl;
		// time the first (performer, compute) pair ...
		time( reset, performer, compute );
		// ... and pass the remaining arguments on
		time( reset, nextComputations... );
		return this->baseTime;
		}

		using Logging::save;

		protected:
		int loops;
		double datasetSize = 0.0;
		double baseTime = 0.0;
		};

		} // namespace benchmarks
		} // namespace tnl

tests/benchmarks/tnl-cuda-benchmarks.h

+55 −14

Original line number	Diff line number	Diff line
		@@ -18,12 +18,12 @@
		// Include guard: the macro defined here must be the one tested by #ifndef,
		// otherwise the guard has no effect.
		#ifndef TNLCUDABENCHMARKS_H_
		#define TNLCUDABENCHMARKS_H_

		#include <tnlConfig.h>
		#include <core/tnlList.h>
		#include <matrices/tnlSlicedEllpackMatrix.h>
		#include <matrices/tnlEllpackMatrix.h>
		#include <matrices/tnlCSRMatrix.h>

		#include "array-operations.h"
		#include "vector-operations.h"

		using namespace tnl::benchmarks;
		@@ -31,6 +31,7 @@ using namespace tnl::benchmarks;

		// TODO: should benchmarks check the result of the computation?


		// Alias exposing tnlSlicedEllpackMatrix with exactly three template
		// parameters (Real, Device, Index), to match the number of template
		// parameters with the other formats.
		template< typename Real, typename Device, typename Index >
		using SlicedEllpackMatrix = tnlSlicedEllpackMatrix< Real, Device, Index >;
		@@ -43,8 +44,6 @@ int setHostTestMatrix( Matrix& matrix,
		int elements( 0 );
		for( int row = 0; row < size; row++ )
		{
		if( row % 100 == 0 )
		cout << "Filling row " << row << "/" << size << " \r" << flush;
		int col = row - elementsPerRow / 2;
		for( int element = 0; element < elementsPerRow; element++ )
		{
		@@ -56,7 +55,6 @@ int setHostTestMatrix( Matrix& matrix,
		}
		}
		}
		cout << endl;
		return elements;
		}

		@@ -104,7 +102,8 @@ template< typename Real,
		template< typename, typename, typename > class Matrix,
		template< typename, typename, typename > class Vector = tnlVector >
		bool
		benchmarkSpMV( const int & loops,
		benchmarkSpMV( Benchmark & benchmark,
		const int & loops,
		const int & size,
		const int elementsPerRow = 5 )
		{
		@@ -149,8 +148,7 @@ benchmarkSpMV( const int & loops,

		tnlList< tnlString > parsedType;
		parseObjectType( HostMatrix::getType(), parsedType );
		tnlString operationDescription = tnlString("SpMV (matrix type: ") + parsedType[ 0 ]
		+ ", rows: " + tnlString(size) + ", elements per row: " + tnlString(elementsPerRow) + ")";
		benchmark.createHorizontalGroup( parsedType[ 0 ], 2 );

		const int elements = setHostTestMatrix< HostMatrix >( hostMatrix, elementsPerRow );
		setCudaTestMatrix< DeviceMatrix >( deviceMatrix, elementsPerRow );
		@@ -172,7 +170,8 @@ benchmarkSpMV( const int & loops,
		deviceMatrix.vectorProduct( deviceVector, deviceVector2 );
		};

		benchmarkOperation( operationDescription.getString(), datasetSize, loops, reset,
		benchmark.setOperation( datasetSize );
		benchmark.time( reset,
		"CPU", spmvHost,
		"GPU", spmvCuda );

		@@ -184,6 +183,7 @@ int main( int argc, char* argv[] )
		#ifdef HAVE_CUDA

		typedef double Real;
		tnlString precision = getType< Real >();

		/****
		* The first argument of this program is the size of data set to be reduced.
		@@ -199,11 +199,52 @@ int main( int argc, char* argv[] )
		if( argc > 3 )
		elementsPerRow = atoi( argv[ 3 ] );

		benchmarkVectorOperations< Real >( loops, size );
		ofstream logFile( "tnl-cuda-benchmarks.log" );
		Benchmark benchmark( loops, true );
		// ostream & logFile = cout;
		// Benchmark benchmark( loops, false );

		// TODO: add hostname, CPU info, GPU info, date, ...
		Benchmark::MetadataMap metadata {
		{"precision", precision},
		};
		// TODO: loop over sizes


		// Array operations
		benchmark.newBenchmark( tnlString("Array operations (") + precision + ")",
		metadata );
		benchmark.setMetadataColumns( Benchmark::MetadataColumns({
		{"size", size},
		} ));
		benchmarkArrayOperations< Real >( benchmark, loops, size );


		benchmarkSpMV< Real, tnlEllpackMatrix >( loops, size, elementsPerRow );
		benchmarkSpMV< Real, SlicedEllpackMatrix >( loops, size, elementsPerRow );
		benchmarkSpMV< Real, tnlCSRMatrix >( loops, size, elementsPerRow );
		// Vector operations
		benchmark.newBenchmark( tnlString("Vector operations (") + precision + ")",
		metadata );
		benchmark.setMetadataColumns( Benchmark::MetadataColumns({
		{"size", size},
		} ));
		benchmarkVectorOperations< Real >( benchmark, loops, size );


		// SpMV
		benchmark.newBenchmark( tnlString("SpMV (") + precision + ")",
		metadata );
		benchmark.setMetadataColumns( Benchmark::MetadataColumns({
		{"rows", size},
		{"columns", size},
		{"elements per row", elementsPerRow},
		} ));

		benchmarkSpMV< Real, tnlEllpackMatrix >( benchmark, loops, size, elementsPerRow );
		benchmarkSpMV< Real, SlicedEllpackMatrix >( benchmark, loops, size, elementsPerRow );
		benchmarkSpMV< Real, tnlCSRMatrix >( benchmark, loops, size, elementsPerRow );


		if( ! benchmark.save( logFile ) )
		return EXIT_FAILURE;

		return EXIT_SUCCESS;
		#else

tests/benchmarks/vector-operations.h

+48 −69

Original line number	Diff line number	Diff line
		@@ -16,7 +16,8 @@ namespace benchmarks
		template< typename Real = double,
		typename Index = int >
		bool
		benchmarkVectorOperations( const int & loops,
		benchmarkVectorOperations( Benchmark & benchmark,
		const int & loops,
		const int & size )
		{
		typedef tnlVector< Real, tnlHost, Index > HostVector;
		@@ -63,46 +64,14 @@ benchmarkVectorOperations( const int & loops,
		reset12();


		auto copyAssignHostCuda = [&]() {
		deviceVector = hostVector;
		};
		auto copyAssignCudaHost = [&]() {
		hostVector = deviceVector;
		};
		benchmarkOperation( "copy assigment (cross-device)", datasetSize, loops, reset1,
		"CPU->GPU", copyAssignHostCuda,
		"GPU->CPU", copyAssignCudaHost );


		auto copyAssignHostHost = [&]() {
		hostVector = hostVector2;
		};
		auto copyAssignCudaCuda = [&]() {
		deviceVector = deviceVector2;
		};
		benchmarkOperation( "copy assigment", 2 * datasetSize, loops, reset1,
		"CPU->CPU", copyAssignHostHost,
		"GPU->GPU", copyAssignCudaCuda );


		auto compareHost = [&]() {
		resultHost = (int) hostVector == hostVector2;
		};
		auto compareCuda = [&]() {
		resultDevice = (int) deviceVector == deviceVector2;
		};
		benchmarkOperation( "comparison (operator==)", 2 * datasetSize, loops, reset1,
		"CPU", compareHost,
		"GPU", compareCuda );


		auto multiplyHost = [&]() {
		hostVector *= 0.5;
		};
		auto multiplyCuda = [&]() {
		deviceVector *= 0.5;
		};
		benchmarkOperation( "scalar multiplication", 2 * datasetSize, loops, reset1,
		benchmark.setOperation( "scalar multiplication", 2 * datasetSize );
		benchmark.time( reset1,
		"CPU", multiplyHost,
		"GPU", multiplyCuda );

		@@ -113,7 +82,8 @@ benchmarkVectorOperations( const int & loops,
		auto addVectorCuda = [&]() {
		deviceVector.addVector( deviceVector2 );
		};
		benchmarkOperation( "vector addition", 3 * datasetSize, loops, reset1,
		benchmark.setOperation( "vector addition", 3 * datasetSize );
		benchmark.time( reset1,
		"CPU", addVectorHost,
		"GPU", addVectorCuda );

		@@ -124,7 +94,8 @@ benchmarkVectorOperations( const int & loops,
		auto maxCuda = [&]() {
		resultDevice = deviceVector.max();
		};
		benchmarkOperation( "max", datasetSize, loops, reset1,
		benchmark.setOperation( "max", datasetSize );
		benchmark.time( reset1,
		"CPU", maxHost,
		"GPU", maxCuda );

		@@ -135,7 +106,8 @@ benchmarkVectorOperations( const int & loops,
		auto minCuda = [&]() {
		resultDevice = deviceVector.min();
		};
		benchmarkOperation( "min", datasetSize, loops, reset1,
		benchmark.setOperation( "min", datasetSize );
		benchmark.time( reset1,
		"CPU", minHost,
		"GPU", minCuda );

		@@ -146,7 +118,8 @@ benchmarkVectorOperations( const int & loops,
		auto absMaxCuda = [&]() {
		resultDevice = deviceVector.absMax();
		};
		benchmarkOperation( "absMax", datasetSize, loops, reset1,
		benchmark.setOperation( "absMax", datasetSize );
		benchmark.time( reset1,
		"CPU", absMaxHost,
		"GPU", absMaxCuda );

		@@ -157,7 +130,8 @@ benchmarkVectorOperations( const int & loops,
		auto absMinCuda = [&]() {
		resultDevice = deviceVector.absMin();
		};
		benchmarkOperation( "absMin", datasetSize, loops, reset1,
		benchmark.setOperation( "absMin", datasetSize );
		benchmark.time( reset1,
		"CPU", absMinHost,
		"GPU", absMinCuda );

		@@ -168,7 +142,8 @@ benchmarkVectorOperations( const int & loops,
		auto sumCuda = [&]() {
		resultDevice = deviceVector.sum();
		};
		benchmarkOperation( "sum", datasetSize, loops, reset1,
		benchmark.setOperation( "sum", datasetSize );
		benchmark.time( reset1,
		"CPU", sumHost,
		"GPU", sumCuda );

		@@ -179,7 +154,8 @@ benchmarkVectorOperations( const int & loops,
		auto l1normCuda = [&]() {
		resultDevice = deviceVector.lpNorm( 1.0 );
		};
		benchmarkOperation( "l1 norm", datasetSize, loops, reset1,
		benchmark.setOperation( "l1 norm", datasetSize );
		benchmark.time( reset1,
		"CPU", l1normHost,
		"GPU", l1normCuda );

		@@ -190,7 +166,8 @@ benchmarkVectorOperations( const int & loops,
		auto l2normCuda = [&]() {
		resultDevice = deviceVector.lpNorm( 2.0 );
		};
		benchmarkOperation( "l2 norm", datasetSize, loops, reset1,
		benchmark.setOperation( "l2 norm", datasetSize );
		benchmark.time( reset1,
		"CPU", l2normHost,
		"GPU", l2normCuda );

		@@ -201,7 +178,8 @@ benchmarkVectorOperations( const int & loops,
		auto l3normCuda = [&]() {
		resultDevice = deviceVector.lpNorm( 3.0 );
		};
		benchmarkOperation( "l3 norm", datasetSize, loops, reset1,
		benchmark.setOperation( "l3 norm", datasetSize );
		benchmark.time( reset1,
		"CPU", l3normHost,
		"GPU", l3normCuda );

		@@ -220,7 +198,8 @@ benchmarkVectorOperations( const int & loops,
		&resultDevice );
		};
		#endif
		benchmarkOperation( "scalar product", 2 * datasetSize, loops, reset1,
		benchmark.setOperation( "scalar product", 2 * datasetSize );
		benchmark.time( reset1,
		"CPU", scalarProductHost,
		"GPU", scalarProductCuda
		#ifdef HAVE_CUBLAS