Commit cbd05a45 authored by Jakub Klinkovský

Merge branch 'JK/multireduction' into 'develop'

Reduction and multireduction refactoring

Brief summary:

- rewrote multireduction using lambda functions
- avoided `volatile` by using `__syncwarp()` (a generic illustration of this technique follows below, before the diff)
- reduction functions now use `return a + b` instead of `a += b` (see the call-pattern sketch right after this list)
- replaced custom lambda functions with `std::plus`, `std::multiplies`, `std::logical_and`, `std::logical_or`, etc.
- optimized OpenMP thread counts for reduction and multireduction
- added computation of the sample standard deviation to benchmarks
- implemented parallel prefix sum with OpenMP
- implemented distributed prefix sum
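
For illustration, here is a minimal sketch of the new call pattern used throughout the diff below. It only demonstrates the changed `Reduction::reduce` interface; the type aliases (`RealType`, `IndexType`, `DeviceType`) follow the benchmark sources shown in the diff, and the function itself is an example, not new library code.

```cpp
#include <functional>
#include <limits>
#include <utility>

#include <TNL/Containers/Algorithms/Reduction.h>

// Example only: mirrors getVectorSum() and getVectorMax() from the diff below.
template< typename Vector >
auto sumAndMax( const Vector& v )
{
   using ResultType = typename Vector::RealType;
   using IndexType  = typename Vector::IndexType;
   using DeviceType = typename Vector::DeviceType;

   const auto* data = v.getData();
   auto fetch = [=] __cuda_callable__ ( IndexType i ) -> ResultType { return data[ i ]; };

   // sum: a standard functor replaces the old `a += b` lambda and its volatile twin
   const ResultType sum = TNL::Containers::Algorithms::Reduction< DeviceType >::reduce(
      v.getSize(), std::plus<>{}, fetch, ( ResultType ) 0 );

   // max: the reduction lambda now returns the combined value instead of assigning to `a`
   auto reduction = [] __cuda_callable__ ( const ResultType& a, const ResultType& b ) { return TNL::max( a, b ); };
   const ResultType max = TNL::Containers::Algorithms::Reduction< DeviceType >::reduce(
      v.getSize(), reduction, fetch, std::numeric_limits< ResultType >::lowest() );

   return std::make_pair( sum, max );
}
```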

See merge request !37
parents 95b2d990 d13a2d18
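The CUDA kernel changes behind the `volatile`/`__syncwarp()` point are in the collapsed file at the end of this page. As a generic illustration of the technique (not the actual TNL kernel), the warp-level tail of a shared-memory reduction can drop the `volatile` qualifier by making every step explicit with `__syncwarp()`:

```cpp
// Generic sketch, not TNL code: tail of a shared-memory block reduction,
// executed by the first warp (tid < 32), assuming blockDim.x >= 64 and
// sdata[] holding one partial result per thread of the block.
template< typename Result, typename Reduction >
__device__ void warpReduceTail( Result* sdata, unsigned int tid, Reduction reduction )
{
   if( tid < 32 ) {
      #pragma unroll
      for( int offset = 32; offset > 0; offset /= 2 ) {
         // The read-combine and the write are separated by __syncwarp(), so no
         // thread reads a slot while its owner is still writing it -- the
         // guarantee the old volatile, implicitly warp-synchronous code relied on.
         const Result v = reduction( sdata[ tid ], sdata[ tid + offset ] );
         __syncwarp();
         sdata[ tid ] = v;
         __syncwarp();
      }
   }
}
```

After the loop, the block's result sits in `sdata[ 0 ]`.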
+0 −2
@@ -10,8 +10,6 @@

#pragma once

-#include <TNL/Containers/Algorithms/PrefixSumType.h>

namespace TNL {
namespace Benchmarks {

+36 −63
@@ -11,7 +11,6 @@
#pragma once

#include <TNL/Containers/Algorithms/Reduction.h>
-#include <TNL/Containers/Algorithms/PrefixSum.h>
#include "CommonVectorOperations.h"

namespace TNL {
@@ -30,9 +29,8 @@ getVectorMax( const Vector& v )

   const auto* data = v.getData();
   auto fetch = [=] __cuda_callable__ ( IndexType i ) -> ResultType { return data[ i ]; };
-   auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a = TNL::max( a, b ); };
-   auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a = TNL::max( a, b ); };
-   return Containers::Algorithms::Reduction< DeviceType >::reduce( v.getSize(), reduction, volatileReduction, fetch, std::numeric_limits< ResultType >::lowest() );
+   auto reduction = [] __cuda_callable__ ( const ResultType& a, const ResultType& b ) { return TNL::max( a, b ); };
+   return Containers::Algorithms::Reduction< DeviceType >::reduce( v.getSize(), reduction, fetch, std::numeric_limits< ResultType >::lowest() );
}

template< typename Device >
@@ -48,9 +46,8 @@ getVectorMin( const Vector& v )

   const auto* data = v.getData();
   auto fetch = [=] __cuda_callable__ ( IndexType i ) -> RealType { return data[ i ]; };
-   auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a =  TNL::min( a, b ); };
-   auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a =  TNL::min( a, b ); };
-   return Containers::Algorithms::Reduction< DeviceType >::reduce( v.getSize(), reduction, volatileReduction, fetch, std::numeric_limits< ResultType >::max() );
+   auto reduction = [] __cuda_callable__ ( const ResultType& a, const ResultType& b ) { return TNL::min( a, b ); };
+   return Containers::Algorithms::Reduction< DeviceType >::reduce( v.getSize(), reduction, fetch, std::numeric_limits< ResultType >::max() );
}

template< typename Device >
@@ -66,9 +63,8 @@ getVectorAbsMax( const Vector& v )

   const auto* data = v.getData();
   auto fetch = [=] __cuda_callable__ ( IndexType i ) { return TNL::abs( data[ i ] ); };
-   auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a = TNL::max( a, b ); };
-   auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a = TNL::max( a, b ); };
-   return Containers::Algorithms::Reduction< DeviceType >::reduce( v.getSize(), reduction, volatileReduction, fetch, std::numeric_limits< ResultType >::lowest() );
+   auto reduction = [] __cuda_callable__ ( const ResultType& a, const ResultType& b ) { return TNL::max( a, b ); };
+   return Containers::Algorithms::Reduction< DeviceType >::reduce( v.getSize(), reduction, fetch, std::numeric_limits< ResultType >::lowest() );
}

template< typename Device >
@@ -84,9 +80,8 @@ getVectorAbsMin( const Vector& v )

   const auto* data = v.getData();
   auto fetch = [=] __cuda_callable__ ( IndexType i ) { return TNL::abs( data[ i ] ); };
-   auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a = TNL::min( a, b ); };
-   auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a = TNL::min( a, b ); };
-   return Containers::Algorithms::Reduction< DeviceType >::reduce( v.getSize(), reduction, volatileReduction, fetch, std::numeric_limits< ResultType >::max() );
+   auto reduction = [] __cuda_callable__ ( const ResultType& a, const ResultType& b ) { return TNL::min( a, b ); };
+   return Containers::Algorithms::Reduction< DeviceType >::reduce( v.getSize(), reduction, fetch, std::numeric_limits< ResultType >::max() );
}

template< typename Device >
@@ -102,9 +97,7 @@ getVectorL1Norm( const Vector& v )

   const auto* data = v.getData();
   auto fetch = [=] __cuda_callable__ ( IndexType i ) { return TNL::abs( data[ i ] ); };
-   auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a += b; };
-   auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a += b; };
-   return Containers::Algorithms::Reduction< DeviceType >::reduce( v.getSize(), reduction, volatileReduction, fetch, ( ResultType ) 0 );
+   return Containers::Algorithms::Reduction< DeviceType >::reduce( v.getSize(), std::plus<>{}, fetch, ( ResultType ) 0 );
}

template< typename Device >
@@ -120,9 +113,7 @@ getVectorL2Norm( const Vector& v )

   const auto* data = v.getData();
   auto fetch = [=] __cuda_callable__ ( IndexType i ) { return data[ i ] * data[ i ]; };
-   auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a += b; };
-   auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a += b; };
-   return std::sqrt( Containers::Algorithms::Reduction< DeviceType >::reduce( v.getSize(), reduction, volatileReduction, fetch, ( ResultType ) 0 ) );
+   return std::sqrt( Containers::Algorithms::Reduction< DeviceType >::reduce( v.getSize(), std::plus<>{}, fetch, ( ResultType ) 0 ) );
}

template< typename Device >
@@ -145,9 +136,7 @@ getVectorLpNorm( const Vector& v,

   const auto* data = v.getData();
   auto fetch = [=] __cuda_callable__ ( IndexType i ) { return TNL::pow( TNL::abs( data[ i ] ), p ); };
-   auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a += b; };
-   auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a += b; };
-   return std::pow( Containers::Algorithms::Reduction< DeviceType >::reduce( v.getSize(), reduction, volatileReduction, fetch, ( ResultType ) 0 ), 1.0 / p );
+   return std::pow( Containers::Algorithms::Reduction< DeviceType >::reduce( v.getSize(), std::plus<>{}, fetch, ( ResultType ) 0 ), 1.0 / p );
}

template< typename Device >
@@ -166,9 +155,7 @@ getVectorSum( const Vector& v )

   const auto* data = v.getData();
   auto fetch = [=] __cuda_callable__ ( IndexType i )  -> ResultType { return data[ i ]; };
-   auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a += b; };
-   auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a += b; };
-   return Containers::Algorithms::Reduction< DeviceType >::reduce( v.getSize(), reduction, volatileReduction, fetch, ( ResultType ) 0 );
+   return Containers::Algorithms::Reduction< DeviceType >::reduce( v.getSize(), std::plus<>{}, fetch, ( ResultType ) 0 );
}

template< typename Device >
@@ -187,9 +174,8 @@ getVectorDifferenceMax( const Vector1& v1,
   const auto* data1 = v1.getData();
   const auto* data2 = v2.getData();
   auto fetch = [=] __cuda_callable__ ( IndexType i ) { return data1[ i ] - data2[ i ]; };
-   auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a = TNL::max( a, b ); };
-   auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a = TNL::max( a, b ); };
-   return Containers::Algorithms::Reduction< DeviceType >::reduce( v1.getSize(), reduction, volatileReduction, fetch, std::numeric_limits< ResultType >::lowest() );
+   auto reduction = [] __cuda_callable__ ( const ResultType& a, const ResultType& b ) { return TNL::max( a, b ); };
+   return Containers::Algorithms::Reduction< DeviceType >::reduce( v1.getSize(), reduction, fetch, std::numeric_limits< ResultType >::lowest() );
}

template< typename Device >
@@ -208,9 +194,8 @@ getVectorDifferenceMin( const Vector1& v1,
   const auto* data1 = v1.getData();
   const auto* data2 = v2.getData();
   auto fetch = [=] __cuda_callable__ ( IndexType i ) { return data1[ i ] - data2[ i ]; };
-   auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a = TNL::min( a, b ); };
-   auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a = TNL::min( a, b ); };
-   return Containers::Algorithms::Reduction< DeviceType >::reduce( v1.getSize(), reduction, volatileReduction, fetch, std::numeric_limits< ResultType >::max() );
+   auto reduction = [] __cuda_callable__ ( const ResultType& a, const ResultType& b ) { return TNL::min( a, b ); };
+   return Containers::Algorithms::Reduction< DeviceType >::reduce( v1.getSize(), reduction, fetch, std::numeric_limits< ResultType >::max() );
}

template< typename Device >
@@ -229,9 +214,8 @@ getVectorDifferenceAbsMax( const Vector1& v1,
   const auto* data1 = v1.getData();
   const auto* data2 = v2.getData();
   auto fetch = [=] __cuda_callable__ ( IndexType i ) { return TNL::abs( data1[ i ] - data2[ i ] ); };
-   auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a = TNL::max( a, b ); };
-   auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a = TNL::max( a, b ); };
-   return Containers::Algorithms::Reduction< DeviceType >::reduce( v1.getSize(), reduction, volatileReduction, fetch, std::numeric_limits< ResultType >::lowest() );
+   auto reduction = [] __cuda_callable__ ( const ResultType& a, const ResultType& b ) { return TNL::max( a, b ); };
+   return Containers::Algorithms::Reduction< DeviceType >::reduce( v1.getSize(), reduction, fetch, std::numeric_limits< ResultType >::lowest() );
}

template< typename Device >
@@ -250,9 +234,8 @@ getVectorDifferenceAbsMin( const Vector1& v1,
   const auto* data1 = v1.getData();
   const auto* data2 = v2.getData();
   auto fetch = [=] __cuda_callable__ ( IndexType i ) { return TNL::abs( data1[ i ] - data2[ i ] ); };
-   auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a = TNL::min( a, b ); };
-   auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a = TNL::min( a, b ); };
-   return Containers::Algorithms::Reduction< DeviceType >::reduce( v1.getSize(), reduction, volatileReduction, fetch, std::numeric_limits< ResultType >::max() );
+   auto reduction = [] __cuda_callable__ ( const ResultType& a, const ResultType& b ) { return TNL::min( a, b ); };
+   return Containers::Algorithms::Reduction< DeviceType >::reduce( v1.getSize(), reduction, fetch, std::numeric_limits< ResultType >::max() );
}

template< typename Device >
@@ -271,9 +254,7 @@ getVectorDifferenceL1Norm( const Vector1& v1,
   const auto* data1 = v1.getData();
   const auto* data2 = v2.getData();
   auto fetch = [=] __cuda_callable__ ( IndexType i ) { return TNL::abs( data1[ i ] - data2[ i ] ); };
-   auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a += b; };
-   auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a += b; };
-   return Containers::Algorithms::Reduction< DeviceType >::reduce( v1.getSize(), reduction, volatileReduction, fetch, ( ResultType ) 0 );
+   return Containers::Algorithms::Reduction< DeviceType >::reduce( v1.getSize(), std::plus<>{}, fetch, ( ResultType ) 0 );
}

template< typename Device >
@@ -295,9 +276,7 @@ getVectorDifferenceL2Norm( const Vector1& v1,
      auto diff = data1[ i ] - data2[ i ];
      return diff * diff;
   };
-   auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a += b; };
-   auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a += b; };
-   return std::sqrt( Containers::Algorithms::Reduction< DeviceType >::reduce( v1.getSize(), reduction, volatileReduction, fetch, ( ResultType ) 0 ) );
+   return std::sqrt( Containers::Algorithms::Reduction< DeviceType >::reduce( v1.getSize(), std::plus<>{}, fetch, ( ResultType ) 0 ) );
}

template< typename Device >
@@ -323,9 +302,7 @@ getVectorDifferenceLpNorm( const Vector1& v1,
   const auto* data1 = v1.getData();
   const auto* data2 = v2.getData();
   auto fetch = [=] __cuda_callable__ ( IndexType i ) { return TNL::pow( TNL::abs( data1[ i ] - data2[ i ] ), p ); };
-   auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a += b; };
-   auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a += b; };
-   return std::pow( Containers::Algorithms::Reduction< DeviceType >::reduce( v1.getSize(), reduction, volatileReduction, fetch, ( ResultType ) 0 ), 1.0 / p );
+   return std::pow( Containers::Algorithms::Reduction< DeviceType >::reduce( v1.getSize(), std::plus<>{}, fetch, ( ResultType ) 0 ), 1.0 / p );
}

template< typename Device >
@@ -344,9 +321,7 @@ getVectorDifferenceSum( const Vector1& v1,
   const auto* data1 = v1.getData();
   const auto* data2 = v2.getData();
   auto fetch = [=] __cuda_callable__ ( IndexType i ) { return data1[ i ] - data2[ i ]; };
-   auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a += b; };
-   auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a += b; };
-   return Containers::Algorithms::Reduction< DeviceType >::reduce( v1.getSize(), reduction, volatileReduction, fetch, ( ResultType ) 0 );
+   return Containers::Algorithms::Reduction< DeviceType >::reduce( v1.getSize(), std::plus<>{}, fetch, ( ResultType ) 0 );
}

template< typename Device >
@@ -365,9 +340,7 @@ getScalarProduct( const Vector1& v1,
   const auto* data1 = v1.getData();
   const auto* data2 = v2.getData();
   auto fetch = [=] __cuda_callable__ ( IndexType i ) { return data1[ i ] * data2[ i ]; };
-   auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a += b; };
-   auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a += b; };
-   return Containers::Algorithms::Reduction< DeviceType >::reduce( v1.getSize(), reduction, volatileReduction, fetch, ( ResultType ) 0 );
+   return Containers::Algorithms::Reduction< DeviceType >::reduce( v1.getSize(), std::plus<>{}, fetch, ( ResultType ) 0 );
}

} // namespace Benchmarks
+36 −34
@@ -346,7 +346,6 @@ benchmarkVectorOperations( Benchmark & benchmark,
   auto l3normCudaET = [&]() {
      resultDevice = lpNorm( deviceView, 3.0 );
   };

   benchmark.setOperation( "l3 norm", datasetSize );
   benchmark.time< Devices::Host >( reset1, "CPU legacy", l3normHost );
   benchmark.time< Devices::Host >( reset1, "CPU ET", l3normHostET );
@@ -369,7 +368,6 @@ benchmarkVectorOperations( Benchmark & benchmark,
   auto scalarProductCudaET = [&]() {
      resultDevice = ( deviceView, deviceView2 );
   };

#ifdef HAVE_BLAS
   auto scalarProductBlas = [&]() {
      resultHost = blasGdot( size, hostVector.getData(), 1, hostVector2.getData(), 1 );
@@ -395,38 +393,6 @@ benchmarkVectorOperations( Benchmark & benchmark,
   benchmark.time< Devices::Cuda >( reset1, "cuBLAS", scalarProductCublas );
#endif

-   ////
-   // Prefix sum
-   /*
-   std::cout << "Benchmarking prefix-sum:" << std::endl;
-   timer.reset();
-   timer.start();
-   hostVector.computePrefixSum();
-   timer.stop();
-   timeHost = timer.getTime();
-   bandwidth = 2 * datasetSize / timer.getTime();
-   std::cout << "  CPU: bandwidth: " << bandwidth << " GB/sec, time: " << timer.getTime() << " sec." << std::endl;
-
-   timer.reset();
-   timer.start();
-   deviceVector.computePrefixSum();
-   timer.stop();
-   timeDevice = timer.getTime();
-   bandwidth = 2 * datasetSize / timer.getTime();
-   std::cout << "  GPU: bandwidth: " << bandwidth << " GB/sec, time: " << timer.getTime() << " sec." << std::endl;
-   std::cout << "  CPU/GPU speedup: " << timeHost / timeDevice << std::endl;
-
-   HostVector auxHostVector;
-   auxHostVector.setLike( deviceVector );
-   auxHostVector = deviceVector;
-   for( int i = 0; i < size; i++ )
-      if( hostVector.getElement( i ) != auxHostVector.getElement( i ) )
-      {
-         std::cerr << "Error in prefix sum at position " << i << ":  " << hostVector.getElement( i ) << " != " << auxHostVector.getElement( i ) << std::endl;
-      }
-   */
-
-
   ////
   // Scalar multiplication
   auto multiplyHost = [&]() {
@@ -435,6 +401,11 @@ benchmarkVectorOperations( Benchmark & benchmark,
   auto multiplyCuda = [&]() {
      deviceVector *= 0.5;
   };
+#ifdef HAVE_BLAS
+   auto multiplyBlas = [&]() {
+      blasGscal( hostVector.getSize(), (Real) 0.5, hostVector.getData(), 1 );
+   };
+#endif
#ifdef HAVE_CUDA
   auto multiplyCublas = [&]() {
      const Real alpha = 0.5;
@@ -445,6 +416,9 @@ benchmarkVectorOperations( Benchmark & benchmark,
#endif
   benchmark.setOperation( "scalar multiplication", 2 * datasetSize );
   benchmark.time< Devices::Host >( reset1, "CPU ET", multiplyHost );
+#ifdef HAVE_BLAS
+   benchmark.time< Devices::Host >( reset1, "CPU BLAS", multiplyBlas );
+#endif
#ifdef HAVE_CUDA
   benchmark.time< Devices::Cuda >( reset1, "GPU ET", multiplyCuda );
   benchmark.time< Devices::Cuda >( reset1, "cuBLAS", multiplyCublas );
@@ -606,6 +580,34 @@ benchmarkVectorOperations( Benchmark & benchmark,
   benchmark.time< Devices::Cuda >( resetAll, "cuBLAS", addThreeVectorsCublas );
#endif

+   ////
+   // Inclusive prefix sum
+   auto inclusivePrefixSumHost = [&]() {
+      hostVector.prefixSum();
+   };
+   auto inclusivePrefixSumCuda = [&]() {
+      deviceVector.prefixSum();
+   };
+   benchmark.setOperation( "inclusive prefix sum", 2 * datasetSize );
+   benchmark.time< Devices::Host >( reset1, "CPU ET", inclusivePrefixSumHost );
+#ifdef HAVE_CUDA
+   benchmark.time< Devices::Cuda >( reset1, "GPU ET", inclusivePrefixSumCuda );
+#endif
+
+   ////
+   // Exclusive prefix sum
+   auto exclusivePrefixSumHost = [&]() {
+      hostVector.template prefixSum< Containers::Algorithms::PrefixSumType::Exclusive >();
+   };
+   auto exclusivePrefixSumCuda = [&]() {
+      deviceVector.template prefixSum< Containers::Algorithms::PrefixSumType::Exclusive >();
+   };
+   benchmark.setOperation( "exclusive prefix sum", 2 * datasetSize );
+   benchmark.time< Devices::Host >( reset1, "CPU ET", exclusivePrefixSumHost );
+#ifdef HAVE_CUDA
+   benchmark.time< Devices::Cuda >( reset1, "GPU ET", exclusivePrefixSumCuda );
+#endif
+
#ifdef HAVE_CUDA
   cublasDestroy( cublasHandle );
#endif
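
The benchmark above only calls `prefixSum()`; the OpenMP implementation mentioned in the summary is not part of this preview. As a generic sketch of the technique (two passes: block-wise local scan, scan of the block sums, then a per-block shift; the function name and partitioning are illustrative, not TNL's code):

```cpp
#include <algorithm>
#include <cstddef>
#include <vector>
#include <omp.h>

// Illustrative only: in-place inclusive prefix sum parallelized with OpenMP.
void inclusivePrefixSum( double* data, std::size_t size )
{
   const int threads = omp_get_max_threads();
   std::vector< double > blockSums( threads, 0.0 );

   #pragma omp parallel num_threads( threads )
   {
      const int tid = omp_get_thread_num();
      const std::size_t blockSize = ( size + threads - 1 ) / threads;
      const std::size_t begin = std::min( (std::size_t) tid * blockSize, size );
      const std::size_t end = std::min( begin + blockSize, size );

      // pass 1: local inclusive scan of this thread's block
      double sum = 0.0;
      for( std::size_t i = begin; i < end; i++ ) {
         sum += data[ i ];
         data[ i ] = sum;
      }
      blockSums[ tid ] = sum;

      #pragma omp barrier
      // exclusive scan of the block sums (cheap, done redundantly by each thread)
      double shift = 0.0;
      for( int t = 0; t < tid; t++ )
         shift += blockSums[ t ];

      // pass 2: add the total of all preceding blocks
      for( std::size_t i = begin; i < end; i++ )
         data[ i ] += shift;
   }
}
```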
+19 −40
@@ -17,7 +17,6 @@
#include "Logging.h"

#include <iostream>
#include <iomanip>
#include <exception>
#include <limits>

@@ -35,24 +34,24 @@ namespace Benchmarks {
const double oneGB = 1024.0 * 1024.0 * 1024.0;



struct BenchmarkResult
{
   using HeaderElements = Logging::HeaderElements;
   using RowElements = Logging::RowElements;

-   double bandwidth = std::numeric_limits<double>::quiet_NaN();
   double time = std::numeric_limits<double>::quiet_NaN();
+   double stddev = std::numeric_limits<double>::quiet_NaN();
+   double bandwidth = std::numeric_limits<double>::quiet_NaN();
   double speedup = std::numeric_limits<double>::quiet_NaN();

   virtual HeaderElements getTableHeader() const
   {
-      return HeaderElements({"bandwidth", "time", "speedup"});
+      return HeaderElements({ "time", "stddev", "stddev/time", "bandwidth", "speedup" });
   }

   virtual RowElements getRowElements() const
   {
-      return RowElements({ bandwidth, time, speedup });
+      return RowElements({ time, stddev, stddev / time, bandwidth, speedup });
   }
};

@@ -76,7 +75,6 @@ public:
      config.addEntry< int >( "loops", "Number of iterations for every computation.", 10 );
      config.addEntry< bool >( "reset", "Call reset function between loops.", true );
      config.addEntry< double >( "min-time", "Minimal real time in seconds for every computation.", 0.0 );
-      config.addEntry< bool >( "timing", "Turns off (or on) the timing (for the purpose of profiling).", true );
      config.addEntry< int >( "verbose", "Verbose mode, the higher number the more verbosity.", 1 );
   }

@@ -85,7 +83,6 @@ public:
      this->loops = parameters.getParameter< int >( "loops" );
      this->reset = parameters.getParameter< bool >( "reset" );
      this->minTime = parameters.getParameter< double >( "min-time" );
-      this->timing = parameters.getParameter< bool >( "timing" );
      const int verbose = parameters.getParameter< int >( "verbose" );
      Logging::setVerbose( verbose );
   }
@@ -121,7 +118,6 @@ public:
      metadata["loops"] = convertToString(loops);
      metadata["reset"] = convertToString( reset );
      metadata["minimal test time"] = convertToString( minTime );
-      metadata["timing"] = convertToString( timing );
      writeMetadata( metadata );
   }

@@ -203,33 +199,22 @@ public:
         BenchmarkResult & result )
   {
      result.time = std::numeric_limits<double>::quiet_NaN();
+      result.stddev = std::numeric_limits<double>::quiet_NaN();
      FunctionTimer< Device > functionTimer;
      try {
         if( verbose > 1 ) {
            // run the monitor main loop
            Solvers::SolverMonitorThread monitor_thread( monitor );
-            if( this->timing )
-               if( this->reset )
-                  result.time = functionTimer. template timeFunction< true >( compute, reset, loops, minTime, verbose, monitor );
-               else
-                  result.time = functionTimer. template timeFunction< true >( compute, loops, minTime, verbose, monitor );
-            else
            if( this->reset )
-                  result.time = functionTimer. template timeFunction< false >( compute, reset, loops, minTime, verbose, monitor );
+               std::tie( result.time, result.stddev ) = functionTimer.timeFunction( compute, reset, loops, minTime, verbose, monitor );
            else
-                  result.time = functionTimer. template timeFunction< false >( compute, loops, minTime, verbose, monitor );
+               std::tie( result.time, result.stddev ) = functionTimer.timeFunction( compute, loops, minTime, verbose, monitor );
         }
         else {
-            if( this->timing )
            if( this->reset )
-                  result.time = functionTimer. template timeFunction< true >( compute, reset, loops, minTime, verbose, monitor );
+               std::tie( result.time, result.stddev ) = functionTimer.timeFunction( compute, reset, loops, minTime, verbose, monitor );
            else
-                  result.time = functionTimer. template timeFunction< true >( compute, loops, minTime, verbose, monitor );
-            else
-               if( this->reset )
-                  result.time = functionTimer. template timeFunction< false >( compute, reset, loops, minTime, verbose, monitor );
-               else
-                  result.time = functionTimer. template timeFunction< false >( compute, loops, minTime, verbose, monitor );
+               std::tie( result.time, result.stddev ) = functionTimer.timeFunction( compute, loops, minTime, verbose, monitor );
         }
         this->performedLoops = functionTimer.getPerformedLoops();
      }
@@ -272,21 +257,16 @@ public:
         BenchmarkResult & result )
   {
      result.time = std::numeric_limits<double>::quiet_NaN();
+      result.stddev = std::numeric_limits<double>::quiet_NaN();
      FunctionTimer< Device > functionTimer;
      try {
         if( verbose > 1 ) {
            // run the monitor main loop
            Solvers::SolverMonitorThread monitor_thread( monitor );
-            if( this->timing )
-               result.time = functionTimer. template timeFunction< true >( compute, loops, minTime, verbose, monitor );
-            else
-               result.time = functionTimer. template timeFunction< false >( compute, loops, minTime, verbose, monitor );
+            std::tie( result.time, result.stddev ) = functionTimer.timeFunction( compute, loops, minTime, verbose, monitor );
         }
         else {
-            if( this->timing )
-               result.time = functionTimer. template timeFunction< true >( compute, loops, minTime, verbose, monitor );
-            else
-               result.time = functionTimer. template timeFunction< false >( compute, loops, minTime, verbose, monitor );
+            std::tie( result.time, result.stddev ) = functionTimer.timeFunction( compute, loops, minTime, verbose, monitor );
         }
      }
      catch ( const std::exception& e ) {
@@ -345,7 +325,6 @@ protected:
   double minTime = 0.0;
   double datasetSize = 0.0;
   double baseTime = 0.0;
-   bool timing = true;
   bool reset = true;
   SolverMonitorType monitor;
};
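
The `std::tie( result.time, result.stddev ) = functionTimer.timeFunction( ... )` calls above indicate that `FunctionTimer::timeFunction` now returns both the average time per loop and its sample standard deviation; the timer itself is changed in the collapsed file below. A simplified sketch of such a measurement loop (hypothetical code, not the actual FunctionTimer):

```cpp
#include <chrono>
#include <cmath>
#include <utility>
#include <vector>

// Hypothetical sketch: time a callable `loops` times and return the mean time
// per loop together with the sample standard deviation,
// s = sqrt( sum( (t_i - mean)^2 ) / (n - 1) ).
template< typename Compute >
std::pair< double, double > timeFunction( Compute compute, int loops )
{
   std::vector< double > times;
   times.reserve( loops );
   for( int i = 0; i < loops; i++ ) {
      const auto start = std::chrono::steady_clock::now();
      compute();
      const auto stop = std::chrono::steady_clock::now();
      times.push_back( std::chrono::duration< double >( stop - start ).count() );
   }

   double mean = 0.0;
   for( double t : times )
      mean += t;
   mean /= times.size();

   double var = 0.0;
   for( double t : times )
      var += ( t - mean ) * ( t - mean );
   const double stddev = ( times.size() > 1 ) ? std::sqrt( var / ( times.size() - 1 ) ) : 0.0;

   return { mean, stddev };
}
```

The `stddev/time` column added to the table header above is then simply the relative spread of the measured times.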
+78 −93

Preview size limit exceeded, changes collapsed.