Commit cbd05a45 authored by Jakub Klinkovský

Merge branch 'JK/multireduction' into 'develop'

Reduction and multireduction refactoring

Brief summary:

- rewrote multireduction using lambda functions
- avoided `volatile` by using `__syncwarp()` (a generic illustration of this technique follows below, before the diff)
- reduction functions now use `return a + b` instead of `a += b` (see the call-pattern sketch right after this list)
- replaced custom lambda functions with `std::plus`, `std::multiplies`, `std::logical_and`, `std::logical_or`, etc.
- optimized OpenMP thread counts for reduction and multireduction
- added computation of the sample standard deviation to benchmarks
- implemented parallel prefix sum with OpenMP
- implemented distributed prefix sum
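
For illustration, here is a minimal sketch of the new call pattern used throughout the diff below. It only demonstrates the changed `Reduction::reduce` interface; the type aliases (`RealType`, `IndexType`, `DeviceType`) follow the benchmark sources shown in the diff, and the function itself is an example, not new library code.

```cpp
#include <functional>
#include <limits>
#include <utility>

#include <TNL/Containers/Algorithms/Reduction.h>

// Example only: mirrors getVectorSum() and getVectorMax() from the diff below.
template< typename Vector >
auto sumAndMax( const Vector& v )
{
   using ResultType = typename Vector::RealType;
   using IndexType  = typename Vector::IndexType;
   using DeviceType = typename Vector::DeviceType;

   const auto* data = v.getData();
   auto fetch = [=] __cuda_callable__ ( IndexType i ) -> ResultType { return data[ i ]; };

   // sum: a standard functor replaces the old `a += b` lambda and its volatile twin
   const ResultType sum = TNL::Containers::Algorithms::Reduction< DeviceType >::reduce(
      v.getSize(), std::plus<>{}, fetch, ( ResultType ) 0 );

   // max: the reduction lambda now returns the combined value instead of assigning to `a`
   auto reduction = [] __cuda_callable__ ( const ResultType& a, const ResultType& b ) { return TNL::max( a, b ); };
   const ResultType max = TNL::Containers::Algorithms::Reduction< DeviceType >::reduce(
      v.getSize(), reduction, fetch, std::numeric_limits< ResultType >::lowest() );

   return std::make_pair( sum, max );
}
```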

See merge request !37
parents 95b2d990 d13a2d18
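The CUDA kernel changes behind the `volatile`/`__syncwarp()` point are in the collapsed file at the end of this page. As a generic illustration of the technique (not the actual TNL kernel), the warp-level tail of a shared-memory reduction can drop the `volatile` qualifier by making every step explicit with `__syncwarp()`:

```cpp
// Generic sketch, not TNL code: tail of a shared-memory block reduction,
// executed by the first warp (tid < 32), assuming blockDim.x >= 64 and
// sdata[] holding one partial result per thread of the block.
template< typename Result, typename Reduction >
__device__ void warpReduceTail( Result* sdata, unsigned int tid, Reduction reduction )
{
   if( tid < 32 ) {
      #pragma unroll
      for( int offset = 32; offset > 0; offset /= 2 ) {
         // The read-combine and the write are separated by __syncwarp(), so no
         // thread reads a slot while its owner is still writing it -- the
         // guarantee the old volatile, implicitly warp-synchronous code relied on.
         const Result v = reduction( sdata[ tid ], sdata[ tid + offset ] );
         __syncwarp();
         sdata[ tid ] = v;
         __syncwarp();
      }
   }
}
```

After the loop, the block's result sits in `sdata[ 0 ]`.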
+0 −2
@@ -10,8 +10,6 @@

#pragma once

-#include <TNL/Containers/Algorithms/PrefixSumType.h>

namespace TNL {
namespace Benchmarks {

+36 −63
@@ -11,7 +11,6 @@
#pragma once

#include <TNL/Containers/Algorithms/Reduction.h>
-#include <TNL/Containers/Algorithms/PrefixSum.h>
#include "CommonVectorOperations.h"

namespace TNL {
@@ -30,9 +29,8 @@ getVectorMax( const Vector& v )

   const auto* data = v.getData();
   auto fetch = [=] __cuda_callable__ ( IndexType i ) -> ResultType { return data[ i ]; };
-   auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a = TNL::max( a, b ); };
-   auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a = TNL::max( a, b ); };
-   return Containers::Algorithms::Reduction< DeviceType >::reduce( v.getSize(), reduction, volatileReduction, fetch, std::numeric_limits< ResultType >::lowest() );
+   auto reduction = [] __cuda_callable__ ( const ResultType& a, const ResultType& b ) { return TNL::max( a, b ); };
+   return Containers::Algorithms::Reduction< DeviceType >::reduce( v.getSize(), reduction, fetch, std::numeric_limits< ResultType >::lowest() );
}

template< typename Device >
@@ -48,9 +46,8 @@ getVectorMin( const Vector& v )

   const auto* data = v.getData();
   auto fetch = [=] __cuda_callable__ ( IndexType i ) -> RealType { return data[ i ]; };
-   auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a =  TNL::min( a, b ); };
-   auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a =  TNL::min( a, b ); };
-   return Containers::Algorithms::Reduction< DeviceType >::reduce( v.getSize(), reduction, volatileReduction, fetch, std::numeric_limits< ResultType >::max() );
+   auto reduction = [] __cuda_callable__ ( const ResultType& a, const ResultType& b ) { return TNL::min( a, b ); };
+   return Containers::Algorithms::Reduction< DeviceType >::reduce( v.getSize(), reduction, fetch, std::numeric_limits< ResultType >::max() );
}

template< typename Device >
@@ -66,9 +63,8 @@ getVectorAbsMax( const Vector& v )

   const auto* data = v.getData();
   auto fetch = [=] __cuda_callable__ ( IndexType i ) { return TNL::abs( data[ i ] ); };
-   auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a = TNL::max( a, b ); };
-   auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a = TNL::max( a, b ); };
-   return Containers::Algorithms::Reduction< DeviceType >::reduce( v.getSize(), reduction, volatileReduction, fetch, std::numeric_limits< ResultType >::lowest() );
+   auto reduction = [] __cuda_callable__ ( const ResultType& a, const ResultType& b ) { return TNL::max( a, b ); };
+   return Containers::Algorithms::Reduction< DeviceType >::reduce( v.getSize(), reduction, fetch, std::numeric_limits< ResultType >::lowest() );
}

template< typename Device >
@@ -84,9 +80,8 @@ getVectorAbsMin( const Vector& v )

   const auto* data = v.getData();
   auto fetch = [=] __cuda_callable__ ( IndexType i ) { return TNL::abs( data[ i ] ); };
-   auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a = TNL::min( a, b ); };
-   auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a = TNL::min( a, b ); };
-   return Containers::Algorithms::Reduction< DeviceType >::reduce( v.getSize(), reduction, volatileReduction, fetch, std::numeric_limits< ResultType >::max() );
+   auto reduction = [] __cuda_callable__ ( const ResultType& a, const ResultType& b ) { return TNL::min( a, b ); };
+   return Containers::Algorithms::Reduction< DeviceType >::reduce( v.getSize(), reduction, fetch, std::numeric_limits< ResultType >::max() );
}

template< typename Device >
@@ -102,9 +97,7 @@ getVectorL1Norm( const Vector& v )

   const auto* data = v.getData();
   auto fetch = [=] __cuda_callable__ ( IndexType i ) { return TNL::abs( data[ i ] ); };
-   auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a += b; };
-   auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a += b; };
-   return Containers::Algorithms::Reduction< DeviceType >::reduce( v.getSize(), reduction, volatileReduction, fetch, ( ResultType ) 0 );
+   return Containers::Algorithms::Reduction< DeviceType >::reduce( v.getSize(), std::plus<>{}, fetch, ( ResultType ) 0 );
}

template< typename Device >
@@ -120,9 +113,7 @@ getVectorL2Norm( const Vector& v )

   const auto* data = v.getData();
   auto fetch = [=] __cuda_callable__ ( IndexType i ) { return data[ i ] * data[ i ]; };
-   auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a += b; };
-   auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a += b; };
-   return std::sqrt( Containers::Algorithms::Reduction< DeviceType >::reduce( v.getSize(), reduction, volatileReduction, fetch, ( ResultType ) 0 ) );
+   return std::sqrt( Containers::Algorithms::Reduction< DeviceType >::reduce( v.getSize(), std::plus<>{}, fetch, ( ResultType ) 0 ) );
}

template< typename Device >
@@ -145,9 +136,7 @@ getVectorLpNorm( const Vector& v,

   const auto* data = v.getData();
   auto fetch = [=] __cuda_callable__ ( IndexType i ) { return TNL::pow( TNL::abs( data[ i ] ), p ); };
-   auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a += b; };
-   auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a += b; };
-   return std::pow( Containers::Algorithms::Reduction< DeviceType >::reduce( v.getSize(), reduction, volatileReduction, fetch, ( ResultType ) 0 ), 1.0 / p );
+   return std::pow( Containers::Algorithms::Reduction< DeviceType >::reduce( v.getSize(), std::plus<>{}, fetch, ( ResultType ) 0 ), 1.0 / p );
}

template< typename Device >
@@ -166,9 +155,7 @@ getVectorSum( const Vector& v )

   const auto* data = v.getData();
   auto fetch = [=] __cuda_callable__ ( IndexType i )  -> ResultType { return data[ i ]; };
-   auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a += b; };
-   auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a += b; };
-   return Containers::Algorithms::Reduction< DeviceType >::reduce( v.getSize(), reduction, volatileReduction, fetch, ( ResultType ) 0 );
+   return Containers::Algorithms::Reduction< DeviceType >::reduce( v.getSize(), std::plus<>{}, fetch, ( ResultType ) 0 );
}

template< typename Device >
@@ -187,9 +174,8 @@ getVectorDifferenceMax( const Vector1& v1,
   const auto* data1 = v1.getData();
   const auto* data2 = v2.getData();
   auto fetch = [=] __cuda_callable__ ( IndexType i ) { return data1[ i ] - data2[ i ]; };
-   auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a = TNL::max( a, b ); };
-   auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a = TNL::max( a, b ); };
-   return Containers::Algorithms::Reduction< DeviceType >::reduce( v1.getSize(), reduction, volatileReduction, fetch, std::numeric_limits< ResultType >::lowest() );
+   auto reduction = [] __cuda_callable__ ( const ResultType& a, const ResultType& b ) { return TNL::max( a, b ); };
+   return Containers::Algorithms::Reduction< DeviceType >::reduce( v1.getSize(), reduction, fetch, std::numeric_limits< ResultType >::lowest() );
}

template< typename Device >
@@ -208,9 +194,8 @@ getVectorDifferenceMin( const Vector1& v1,
   const auto* data1 = v1.getData();
   const auto* data2 = v2.getData();
   auto fetch = [=] __cuda_callable__ ( IndexType i ) { return data1[ i ] - data2[ i ]; };
-   auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a = TNL::min( a, b ); };
-   auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a = TNL::min( a, b ); };
-   return Containers::Algorithms::Reduction< DeviceType >::reduce( v1.getSize(), reduction, volatileReduction, fetch, std::numeric_limits< ResultType >::max() );
+   auto reduction = [] __cuda_callable__ ( const ResultType& a, const ResultType& b ) { return TNL::min( a, b ); };
+   return Containers::Algorithms::Reduction< DeviceType >::reduce( v1.getSize(), reduction, fetch, std::numeric_limits< ResultType >::max() );
}

template< typename Device >
@@ -229,9 +214,8 @@ getVectorDifferenceAbsMax( const Vector1& v1,
   const auto* data1 = v1.getData();
   const auto* data2 = v2.getData();
   auto fetch = [=] __cuda_callable__ ( IndexType i ) { return TNL::abs( data1[ i ] - data2[ i ] ); };
-   auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a = TNL::max( a, b ); };
-   auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a = TNL::max( a, b ); };
-   return Containers::Algorithms::Reduction< DeviceType >::reduce( v1.getSize(), reduction, volatileReduction, fetch, std::numeric_limits< ResultType >::lowest() );
+   auto reduction = [] __cuda_callable__ ( const ResultType& a, const ResultType& b ) { return TNL::max( a, b ); };
+   return Containers::Algorithms::Reduction< DeviceType >::reduce( v1.getSize(), reduction, fetch, std::numeric_limits< ResultType >::lowest() );
}

template< typename Device >
@@ -250,9 +234,8 @@ getVectorDifferenceAbsMin( const Vector1& v1,
   const auto* data1 = v1.getData();
   const auto* data2 = v2.getData();
   auto fetch = [=] __cuda_callable__ ( IndexType i ) { return TNL::abs( data1[ i ] - data2[ i ] ); };
-   auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a = TNL::min( a, b ); };
-   auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a = TNL::min( a, b ); };
-   return Containers::Algorithms::Reduction< DeviceType >::reduce( v1.getSize(), reduction, volatileReduction, fetch, std::numeric_limits< ResultType >::max() );
+   auto reduction = [] __cuda_callable__ ( const ResultType& a, const ResultType& b ) { return TNL::min( a, b ); };
+   return Containers::Algorithms::Reduction< DeviceType >::reduce( v1.getSize(), reduction, fetch, std::numeric_limits< ResultType >::max() );
}

template< typename Device >
@@ -271,9 +254,7 @@ getVectorDifferenceL1Norm( const Vector1& v1,
   const auto* data1 = v1.getData();
   const auto* data2 = v2.getData();
   auto fetch = [=] __cuda_callable__ ( IndexType i ) { return TNL::abs( data1[ i ] - data2[ i ] ); };
-   auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a += b; };
-   auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a += b; };
-   return Containers::Algorithms::Reduction< DeviceType >::reduce( v1.getSize(), reduction, volatileReduction, fetch, ( ResultType ) 0 );
+   return Containers::Algorithms::Reduction< DeviceType >::reduce( v1.getSize(), std::plus<>{}, fetch, ( ResultType ) 0 );
}

template< typename Device >
@@ -295,9 +276,7 @@ getVectorDifferenceL2Norm( const Vector1& v1,
      auto diff = data1[ i ] - data2[ i ];
      return diff * diff;
   };
-   auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a += b; };
-   auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a += b; };
-   return std::sqrt( Containers::Algorithms::Reduction< DeviceType >::reduce( v1.getSize(), reduction, volatileReduction, fetch, ( ResultType ) 0 ) );
+   return std::sqrt( Containers::Algorithms::Reduction< DeviceType >::reduce( v1.getSize(), std::plus<>{}, fetch, ( ResultType ) 0 ) );
}

template< typename Device >
@@ -323,9 +302,7 @@ getVectorDifferenceLpNorm( const Vector1& v1,
   const auto* data1 = v1.getData();
   const auto* data2 = v2.getData();
   auto fetch = [=] __cuda_callable__ ( IndexType i ) { return TNL::pow( TNL::abs( data1[ i ] - data2[ i ] ), p ); };
-   auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a += b; };
-   auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a += b; };
-   return std::pow( Containers::Algorithms::Reduction< DeviceType >::reduce( v1.getSize(), reduction, volatileReduction, fetch, ( ResultType ) 0 ), 1.0 / p );
+   return std::pow( Containers::Algorithms::Reduction< DeviceType >::reduce( v1.getSize(), std::plus<>{}, fetch, ( ResultType ) 0 ), 1.0 / p );
}

template< typename Device >
@@ -344,9 +321,7 @@ getVectorDifferenceSum( const Vector1& v1,
   const auto* data1 = v1.getData();
   const auto* data2 = v2.getData();
   auto fetch = [=] __cuda_callable__ ( IndexType i ) { return data1[ i ] - data2[ i ]; };
-   auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a += b; };
-   auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a += b; };
-   return Containers::Algorithms::Reduction< DeviceType >::reduce( v1.getSize(), reduction, volatileReduction, fetch, ( ResultType ) 0 );
+   return Containers::Algorithms::Reduction< DeviceType >::reduce( v1.getSize(), std::plus<>{}, fetch, ( ResultType ) 0 );
}

template< typename Device >
@@ -365,9 +340,7 @@ getScalarProduct( const Vector1& v1,
   const auto* data1 = v1.getData();
   const auto* data2 = v2.getData();
   auto fetch = [=] __cuda_callable__ ( IndexType i ) { return data1[ i ] * data2[ i ]; };
-   auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a += b; };
-   auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a += b; };
-   return Containers::Algorithms::Reduction< DeviceType >::reduce( v1.getSize(), reduction, volatileReduction, fetch, ( ResultType ) 0 );
+   return Containers::Algorithms::Reduction< DeviceType >::reduce( v1.getSize(), std::plus<>{}, fetch, ( ResultType ) 0 );
}

} // namespace Benchmarks
+36 −34
@@ -346,7 +346,6 @@ benchmarkVectorOperations( Benchmark & benchmark,
   auto l3normCudaET = [&]() {
      resultDevice = lpNorm( deviceView, 3.0 );
   };

   benchmark.setOperation( "l3 norm", datasetSize );
   benchmark.time< Devices::Host >( reset1, "CPU legacy", l3normHost );
   benchmark.time< Devices::Host >( reset1, "CPU ET", l3normHostET );
@@ -369,7 +368,6 @@ benchmarkVectorOperations( Benchmark & benchmark,
   auto scalarProductCudaET = [&]() {
      resultDevice = ( deviceView, deviceView2 );
   };

#ifdef HAVE_BLAS
   auto scalarProductBlas = [&]() {
      resultHost = blasGdot( size, hostVector.getData(), 1, hostVector2.getData(), 1 );
@@ -395,38 +393,6 @@ benchmarkVectorOperations( Benchmark & benchmark,
   benchmark.time< Devices::Cuda >( reset1, "cuBLAS", scalarProductCublas );
#endif

-   ////
-   // Prefix sum
-   /*
-   std::cout << "Benchmarking prefix-sum:" << std::endl;
-   timer.reset();
-   timer.start();
-   hostVector.computePrefixSum();
-   timer.stop();
-   timeHost = timer.getTime();
-   bandwidth = 2 * datasetSize / timer.getTime();
-   std::cout << "  CPU: bandwidth: " << bandwidth << " GB/sec, time: " << timer.getTime() << " sec." << std::endl;
-
-   timer.reset();
-   timer.start();
-   deviceVector.computePrefixSum();
-   timer.stop();
-   timeDevice = timer.getTime();
-   bandwidth = 2 * datasetSize / timer.getTime();
-   std::cout << "  GPU: bandwidth: " << bandwidth << " GB/sec, time: " << timer.getTime() << " sec." << std::endl;
-   std::cout << "  CPU/GPU speedup: " << timeHost / timeDevice << std::endl;
-
-   HostVector auxHostVector;
-   auxHostVector.setLike( deviceVector );
-   auxHostVector = deviceVector;
-   for( int i = 0; i < size; i++ )
-      if( hostVector.getElement( i ) != auxHostVector.getElement( i ) )
-      {
-         std::cerr << "Error in prefix sum at position " << i << ":  " << hostVector.getElement( i ) << " != " << auxHostVector.getElement( i ) << std::endl;
-      }
-   */
-
-
   ////
   // Scalar multiplication
   auto multiplyHost = [&]() {
@@ -435,6 +401,11 @@ benchmarkVectorOperations( Benchmark & benchmark,
   auto multiplyCuda = [&]() {
      deviceVector *= 0.5;
   };
+#ifdef HAVE_BLAS
+   auto multiplyBlas = [&]() {
+      blasGscal( hostVector.getSize(), (Real) 0.5, hostVector.getData(), 1 );
+   };
+#endif
#ifdef HAVE_CUDA
   auto multiplyCublas = [&]() {
      const Real alpha = 0.5;
@@ -445,6 +416,9 @@ benchmarkVectorOperations( Benchmark & benchmark,
#endif
   benchmark.setOperation( "scalar multiplication", 2 * datasetSize );
   benchmark.time< Devices::Host >( reset1, "CPU ET", multiplyHost );
+#ifdef HAVE_BLAS
+   benchmark.time< Devices::Host >( reset1, "CPU BLAS", multiplyBlas );
+#endif
#ifdef HAVE_CUDA
   benchmark.time< Devices::Cuda >( reset1, "GPU ET", multiplyCuda );
   benchmark.time< Devices::Cuda >( reset1, "cuBLAS", multiplyCublas );
@@ -606,6 +580,34 @@ benchmarkVectorOperations( Benchmark & benchmark,
   benchmark.time< Devices::Cuda >( resetAll, "cuBLAS", addThreeVectorsCublas );
#endif

+   ////
+   // Inclusive prefix sum
+   auto inclusivePrefixSumHost = [&]() {
+      hostVector.prefixSum();
+   };
+   auto inclusivePrefixSumCuda = [&]() {
+      deviceVector.prefixSum();
+   };
+   benchmark.setOperation( "inclusive prefix sum", 2 * datasetSize );
+   benchmark.time< Devices::Host >( reset1, "CPU ET", inclusivePrefixSumHost );
+#ifdef HAVE_CUDA
+   benchmark.time< Devices::Cuda >( reset1, "GPU ET", inclusivePrefixSumCuda );
+#endif
+
+   ////
+   // Exclusive prefix sum
+   auto exclusivePrefixSumHost = [&]() {
+      hostVector.template prefixSum< Containers::Algorithms::PrefixSumType::Exclusive >();
+   };
+   auto exclusivePrefixSumCuda = [&]() {
+      deviceVector.template prefixSum< Containers::Algorithms::PrefixSumType::Exclusive >();
+   };
+   benchmark.setOperation( "exclusive prefix sum", 2 * datasetSize );
+   benchmark.time< Devices::Host >( reset1, "CPU ET", exclusivePrefixSumHost );
+#ifdef HAVE_CUDA
+   benchmark.time< Devices::Cuda >( reset1, "GPU ET", exclusivePrefixSumCuda );
+#endif
+
#ifdef HAVE_CUDA
   cublasDestroy( cublasHandle );
#endif
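
The benchmark above only calls `prefixSum()`; the OpenMP implementation mentioned in the summary is not part of this preview. As a generic sketch of the technique (two passes: block-wise local scan, scan of the block sums, then a per-block shift; the function name and partitioning are illustrative, not TNL's code):

```cpp
#include <algorithm>
#include <cstddef>
#include <vector>
#include <omp.h>

// Illustrative only: in-place inclusive prefix sum parallelized with OpenMP.
void inclusivePrefixSum( double* data, std::size_t size )
{
   const int threads = omp_get_max_threads();
   std::vector< double > blockSums( threads, 0.0 );

   #pragma omp parallel num_threads( threads )
   {
      const int tid = omp_get_thread_num();
      const std::size_t blockSize = ( size + threads - 1 ) / threads;
      const std::size_t begin = std::min( (std::size_t) tid * blockSize, size );
      const std::size_t end = std::min( begin + blockSize, size );

      // pass 1: local inclusive scan of this thread's block
      double sum = 0.0;
      for( std::size_t i = begin; i < end; i++ ) {
         sum += data[ i ];
         data[ i ] = sum;
      }
      blockSums[ tid ] = sum;

      #pragma omp barrier
      // exclusive scan of the block sums (cheap, done redundantly by each thread)
      double shift = 0.0;
      for( int t = 0; t < tid; t++ )
         shift += blockSums[ t ];

      // pass 2: add the total of all preceding blocks
      for( std::size_t i = begin; i < end; i++ )
         data[ i ] += shift;
   }
}
```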
+19 −40
@@ -17,7 +17,6 @@
#include "Logging.h"

#include <iostream>
#include <iomanip>
#include <exception>
#include <limits>

@@ -35,24 +34,24 @@ namespace Benchmarks {
const double oneGB = 1024.0 * 1024.0 * 1024.0;



struct BenchmarkResult
{
   using HeaderElements = Logging::HeaderElements;
   using RowElements = Logging::RowElements;

-   double bandwidth = std::numeric_limits<double>::quiet_NaN();
   double time = std::numeric_limits<double>::quiet_NaN();
+   double stddev = std::numeric_limits<double>::quiet_NaN();
+   double bandwidth = std::numeric_limits<double>::quiet_NaN();
   double speedup = std::numeric_limits<double>::quiet_NaN();

   virtual HeaderElements getTableHeader() const
   {
-      return HeaderElements({"bandwidth", "time", "speedup"});
+      return HeaderElements({ "time", "stddev", "stddev/time", "bandwidth", "speedup" });
   }

   virtual RowElements getRowElements() const
   {
-      return RowElements({ bandwidth, time, speedup });
+      return RowElements({ time, stddev, stddev / time, bandwidth, speedup });
   }
};

@@ -76,7 +75,6 @@ public:
      config.addEntry< int >( "loops", "Number of iterations for every computation.", 10 );
      config.addEntry< bool >( "reset", "Call reset function between loops.", true );
      config.addEntry< double >( "min-time", "Minimal real time in seconds for every computation.", 0.0 );
-      config.addEntry< bool >( "timing", "Turns off (or on) the timing (for the purpose of profiling).", true );
      config.addEntry< int >( "verbose", "Verbose mode, the higher number the more verbosity.", 1 );
   }

@@ -85,7 +83,6 @@ public:
      this->loops = parameters.getParameter< int >( "loops" );
      this->reset = parameters.getParameter< bool >( "reset" );
      this->minTime = parameters.getParameter< double >( "min-time" );
-      this->timing = parameters.getParameter< bool >( "timing" );
      const int verbose = parameters.getParameter< int >( "verbose" );
      Logging::setVerbose( verbose );
   }
@@ -121,7 +118,6 @@ public:
      metadata["loops"] = convertToString(loops);
      metadata["reset"] = convertToString( reset );
      metadata["minimal test time"] = convertToString( minTime );
-      metadata["timing"] = convertToString( timing );
      writeMetadata( metadata );
   }

@@ -203,33 +199,22 @@ public:
         BenchmarkResult & result )
   {
      result.time = std::numeric_limits<double>::quiet_NaN();
+      result.stddev = std::numeric_limits<double>::quiet_NaN();
      FunctionTimer< Device > functionTimer;
      try {
         if( verbose > 1 ) {
            // run the monitor main loop
            Solvers::SolverMonitorThread monitor_thread( monitor );
-            if( this->timing )
-               if( this->reset )
-                  result.time = functionTimer. template timeFunction< true >( compute, reset, loops, minTime, verbose, monitor );
-               else
-                  result.time = functionTimer. template timeFunction< true >( compute, loops, minTime, verbose, monitor );
-            else
            if( this->reset )
-                  result.time = functionTimer. template timeFunction< false >( compute, reset, loops, minTime, verbose, monitor );
+               std::tie( result.time, result.stddev ) = functionTimer.timeFunction( compute, reset, loops, minTime, verbose, monitor );
            else
-                  result.time = functionTimer. template timeFunction< false >( compute, loops, minTime, verbose, monitor );
+               std::tie( result.time, result.stddev ) = functionTimer.timeFunction( compute, loops, minTime, verbose, monitor );
         }
         else {
-            if( this->timing )
            if( this->reset )
-                  result.time = functionTimer. template timeFunction< true >( compute, reset, loops, minTime, verbose, monitor );
+               std::tie( result.time, result.stddev ) = functionTimer.timeFunction( compute, reset, loops, minTime, verbose, monitor );
            else
-                  result.time = functionTimer. template timeFunction< true >( compute, loops, minTime, verbose, monitor );
-            else
-               if( this->reset )
-                  result.time = functionTimer. template timeFunction< false >( compute, reset, loops, minTime, verbose, monitor );
-               else
-                  result.time = functionTimer. template timeFunction< false >( compute, loops, minTime, verbose, monitor );
+               std::tie( result.time, result.stddev ) = functionTimer.timeFunction( compute, loops, minTime, verbose, monitor );
         }
         this->performedLoops = functionTimer.getPerformedLoops();
      }
@@ -272,21 +257,16 @@ public:
         BenchmarkResult & result )
   {
      result.time = std::numeric_limits<double>::quiet_NaN();
+      result.stddev = std::numeric_limits<double>::quiet_NaN();
      FunctionTimer< Device > functionTimer;
      try {
         if( verbose > 1 ) {
            // run the monitor main loop
            Solvers::SolverMonitorThread monitor_thread( monitor );
-            if( this->timing )
-               result.time = functionTimer. template timeFunction< true >( compute, loops, minTime, verbose, monitor );
-            else
-               result.time = functionTimer. template timeFunction< false >( compute, loops, minTime, verbose, monitor );
+            std::tie( result.time, result.stddev ) = functionTimer.timeFunction( compute, loops, minTime, verbose, monitor );
         }
         else {
-            if( this->timing )
-               result.time = functionTimer. template timeFunction< true >( compute, loops, minTime, verbose, monitor );
-            else
-               result.time = functionTimer. template timeFunction< false >( compute, loops, minTime, verbose, monitor );
+            std::tie( result.time, result.stddev ) = functionTimer.timeFunction( compute, loops, minTime, verbose, monitor );
         }
      }
      catch ( const std::exception& e ) {
@@ -345,7 +325,6 @@ protected:
   double minTime = 0.0;
   double datasetSize = 0.0;
   double baseTime = 0.0;
-   bool timing = true;
   bool reset = true;
   SolverMonitorType monitor;
};
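
The `std::tie( result.time, result.stddev ) = functionTimer.timeFunction( ... )` calls above indicate that `FunctionTimer::timeFunction` now returns both the average time per loop and its sample standard deviation; the timer itself is changed in the collapsed file below. A simplified sketch of such a measurement loop (hypothetical code, not the actual FunctionTimer):

```cpp
#include <chrono>
#include <cmath>
#include <utility>
#include <vector>

// Hypothetical sketch: time a callable `loops` times and return the mean time
// per loop together with the sample standard deviation,
// s = sqrt( sum( (t_i - mean)^2 ) / (n - 1) ).
template< typename Compute >
std::pair< double, double > timeFunction( Compute compute, int loops )
{
   std::vector< double > times;
   times.reserve( loops );
   for( int i = 0; i < loops; i++ ) {
      const auto start = std::chrono::steady_clock::now();
      compute();
      const auto stop = std::chrono::steady_clock::now();
      times.push_back( std::chrono::duration< double >( stop - start ).count() );
   }

   double mean = 0.0;
   for( double t : times )
      mean += t;
   mean /= times.size();

   double var = 0.0;
   for( double t : times )
      var += ( t - mean ) * ( t - mean );
   const double stddev = ( times.size() > 1 ) ? std::sqrt( var / ( times.size() - 1 ) ) : 0.0;

   return { mean, stddev };
}
```

The `stddev/time` column added to the table header above is then simply the relative spread of the measured times.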
+78 −93

Preview size limit exceeded, changes collapsed.