Commit 77e7a205 authored by Jakub Klinkovský's avatar Jakub Klinkovský
Browse files

Benchmarks: rewritten objectively, with support for output into a log file

parent a241fcba
Loading
Loading
Loading
Loading
+92 −0
Original line number Diff line number Diff line
#pragma once

#include "benchmarks.h"

#include <core/arrays/tnlArray.h>

namespace tnl
{
namespace benchmarks
{

// Benchmarks elementary tnlArray operations -- comparison, same-device copy
// and cross-device copy -- on the host and on the CUDA device.
//
//  - benchmark: object which times the operations and logs the results
//  - loops:     number of repetitions; used here only to compute the data
//               set size (presumably the same value the benchmark object
//               was constructed with -- confirm at the call site)
//  - size:      number of elements of each array
//
// Returns false if setting the size of any array fails, true otherwise.
template< typename Real = double,
          typename Index = int >
bool
benchmarkArrayOperations( Benchmark & benchmark,
                          const int & loops,
                          const int & size )
{
    typedef tnlArray< Real, tnlHost, Index > HostArray;
    typedef tnlArray< Real, tnlCuda, Index > CudaArray;

    // Convert to double before multiplying so that loops * size cannot
    // overflow the int range for large data sets.
    const double datasetSize = ( double ) loops * ( double ) size * sizeof( Real ) / oneGB;

    HostArray hostArray, hostArray2;
    CudaArray deviceArray, deviceArray2;
    if( ! hostArray.setSize( size ) )
        return false;
    if( ! deviceArray.setSize( size ) )
        return false;
    if( ! hostArray2.setLike( hostArray ) )
        return false;
    if( ! deviceArray2.setLike( deviceArray ) )
        return false;

    // The results are stored only so that the timed expressions have
    // an observable effect.
    Real resultHost, resultDevice;


    // reset functions
    auto reset1 = [&]() {
        hostArray.setValue( 1.0 );
        deviceArray.setValue( 1.0 );
    };
    auto reset2 = [&]() {
        hostArray2.setValue( 1.0 );
        deviceArray2.setValue( 1.0 );
    };
    auto reset12 = [&]() {
        reset1();
        reset2();
    };


    reset12();


    // The cast binds tighter than operator==, so the comparison has to be
    // parenthesized -- otherwise the array itself would be converted to int
    // and the element-wise comparison would not be what is timed.
    auto compareHost = [&]() {
        resultHost = (int) ( hostArray == hostArray2 );
    };
    auto compareCuda = [&]() {
        resultDevice = (int) ( deviceArray == deviceArray2 );
    };
    benchmark.setOperation( "comparison (operator==)", 2 * datasetSize );
    benchmark.time( reset1,
                    "CPU", compareHost,
                    "GPU", compareCuda );


    auto copyAssignHostHost = [&]() {
        hostArray = hostArray2;
    };
    auto copyAssignCudaCuda = [&]() {
        deviceArray = deviceArray2;
    };
    benchmark.setOperation( "copy (operator=)", 2 * datasetSize );
    // time() returns the base time of the operation, i.e. the time of the
    // first performer (host-to-host copy); it is reused below as the
    // baseline for the cross-device copies.
    const double basetime = benchmark.time( reset1,
                    "CPU", copyAssignHostHost,
                    "GPU", copyAssignCudaCuda );


    auto copyAssignHostCuda = [&]() {
        deviceArray = hostArray;
    };
    auto copyAssignCudaHost = [&]() {
        hostArray = deviceArray;
    };
    benchmark.setOperation( "copy (operator=)", datasetSize, basetime );
    benchmark.time( reset1,
                    "CPU->GPU", copyAssignHostCuda,
                    "GPU->CPU", copyAssignCudaHost );

    // All benchmarks were executed; the function is declared to return bool,
    // so flowing off the end without a value would be undefined behaviour.
    return true;
}

} // namespace benchmarks
} // namespace tnl
+325 −62
Original line number Diff line number Diff line
@@ -2,8 +2,11 @@

#include <iostream>
#include <iomanip>
#include <map>
#include <vector>

#include <core/tnlTimerRT.h>
#include <core/tnlString.h>

namespace tnl
{
@@ -17,18 +20,15 @@ template< typename ComputeFunction,
double
timeFunction( ComputeFunction compute,
              ResetFunction reset,
              const int & loops,
              const double & datasetSize, // in GB
              const double & baseTime, // in seconds (baseline for speedup calculation)
              const char* performer )
              const int & loops )
{
    // the timer is constructed zero-initialized and stopped
    tnlTimerRT timer;

    reset();
    for(int i = 0; i < loops; ++i) {
        // TODO: not necessary for host computations
        // Explicit synchronization of the CUDA device
        // TODO: not necessary for host computations
#ifdef HAVE_CUDA
        cudaDeviceSynchronize();
#endif
@@ -42,70 +42,333 @@ timeFunction( ComputeFunction compute,
        reset();
    }

    const double time = timer.getTime();
    const double bandwidth = datasetSize / time;
    return timer.getTime();
}


struct InternalError {};


class Logging
{
public:
    using MetadataElement = std::pair< const char*, tnlString >;
    using MetadataMap = std::map< const char*, tnlString >;
    using MetadataColumns = std::vector<MetadataElement>;

    using HeaderElements = std::initializer_list< tnlString >;
    using RowElements = std::initializer_list< double >;

    // Creates a logger. When `verbose` is true, the write* methods also
    // print a human-readable version of the data to std::cout.
    Logging( bool verbose = true )
    : verbose( verbose )
    {}

    // Records the title of a benchmark in the log (as a ": title = ..." line)
    // and, in verbose mode, prints it to std::cout surrounded by blank lines.
    // TODO: fix spacing (blank lines)
    void
    writeTitle( const tnlString & title )
    {
        log << ": title = " << title << std::endl;

        if( ! verbose )
            return;

        std::cout << std::endl;
        std::cout << "== " << title << " ==" << std::endl;
        std::cout << std::endl;
    }

    // Writes all key/value pairs of `metadata` into the log, one
    // ": key = value" line per entry. In verbose mode the same pairs are
    // printed to std::cout under a "properties:" heading, followed by
    // a blank line.
    void
    writeMetadata( const MetadataMap & metadata )
    {
        if( verbose )
            std::cout << "properties:" << std::endl;

        for( const auto & entry : metadata ) {
            log << ": " << entry.first << " = " << entry.second << std::endl;
            if( verbose )
                std::cout << "   " << entry.first << " = " << entry.second << std::endl;
        }

        if( verbose )
            std::cout << std::endl;
    }

    // Writes the header for one group of result columns.
    //
    //  - spanningElement: name of the column group (e.g. the performer)
    //  - subElements:     names of the columns inside the group
    //
    // stdout (verbose mode, only if the header changed since the last call):
    // a single line with the names of the metadata columns, an empty cell in
    // place of the spanning element and the names of the sub-columns.
    //
    // log: one name per line, prefixed with a run of '!' characters whose
    // length denotes the nesting level -- metadata columns first, then the
    // active horizontal groups, then the spanning element and its
    // sub-elements one level deeper.
    //
    // Each call consumes one sub-column of the innermost horizontal group.
    void
    writeTableHeader( const tnlString & spanningElement,
                      const HeaderElements & subElements )
    {
        using namespace std;

        if( verbose && header_changed ) {
            for( auto & it : metadataColumns ) {
                cout << setw( 20 ) << it.first;
            }

            // spanning element is printed as usual column to stdout,
            // but is excluded from header
            cout << setw( 15 ) << "";

            for( auto & it : subElements ) {
                cout << setw( 15 ) << it;
            }
            cout << endl;

            header_changed = false;
        }

        // initial indent string
        header_indent = "!";
        log << endl;
        for( auto & it : metadataColumns ) {
            log << header_indent << " " << it.first << endl;
        }

        // dump stacked spanning columns
        // Groups with no remaining sub-columns are dropped first. The
        // emptiness checks are necessary: calling back() on an empty vector
        // or pop_back() on an empty string is undefined behaviour.
        while( ! horizontalGroups.empty() && horizontalGroups.back().second <= 0 ) {
            horizontalGroups.pop_back();
            if( ! header_indent.empty() )
                header_indent.pop_back();
        }
        for( const auto & group : horizontalGroups ) {
            if( group.second > 0 ) {
                log << header_indent << " " << group.first << endl;
                header_indent += "!";
            }
        }

        log << header_indent << " " << spanningElement << endl;
        for( auto & it : subElements ) {
            log << header_indent << "! " << it << endl;
        }

        if( ! horizontalGroups.empty() ) {
            // The innermost group is still active here, so the loop above
            // has appended at least one '!' and header_indent is not empty.
            horizontalGroups.back().second--;
            header_indent.pop_back();
        }
    }

    // Writes one row of results.
    //
    //  - spanningElement: name of the column group; printed only to stdout
    //  - subElements:     numeric values of the row; a value equal to 0.0
    //                     is reported as "N/A"
    //
    // In verbose mode the row is printed to std::cout as a single line of
    // fixed-width columns. In the log, the values of the metadata columns
    // are written one per line, followed by the indented result values.
    void
    writeTableRow( const tnlString & spanningElement,
                   const RowElements & subElements )
    {
        using namespace std;

        if( verbose ) {
            for( const auto & column : metadataColumns )
                cout << setw( 20 ) << column.second;

            // spanning element is printed as usual column to stdout
            cout << setw( 15 ) << spanningElement;

            for( const auto & value : subElements ) {
                cout << setw( 15 );
                if( value == 0.0 )
                    cout << "N/A";
                else
                    cout << value;
            }
            cout << endl;
        }

        // only when changed (the header has been already adjusted)
        // print each element on separate line
        for( const auto & column : metadataColumns )
            log << column.second << endl;

        // benchmark data are indented
        const tnlString indent = "    ";
        for( const auto & value : subElements ) {
            if( value == 0.0 )
                log << indent << "N/A" << endl;
            else
                log << indent << value << endl;
        }
    }

    // Ends the current table: both indentation strings are emptied and the
    // header is marked as changed, so that the next writeTableHeader() call
    // prints it to stdout again (in verbose mode).
    void
    closeTable()
    {
        header_changed = true;
        body_indent.clear();
        header_indent.clear();
    }

    // Closes the current table and writes the accumulated log into
    // `logFile`. Returns true and empties the internal log buffer if the
    // stream is in a good state after the write; otherwise returns false and
    // keeps the buffer, so that saving can be retried.
    bool save( std::ostream & logFile )
    {
        closeTable();
        logFile << log.str();
        if( ! logFile.good() )
            return false;

        // str() without arguments returns a copy of the buffer, so assigning
        // to its result would leave the log untouched; the setter overload
        // has to be used to actually discard the saved content.
        log.str( "" );
        log.clear();
        return true;
    }

protected:

    // Converts a double to tnlString by formatting it through a string
    // stream. A non-zero `precision` is passed to std::setprecision and
    // `fixed` selects the std::fixed notation; otherwise the stream defaults
    // are used.
    static tnlString
    _to_string( const double & num, const int & precision = 0, bool fixed = false )
    {
        std::stringstream buffer;
        if( fixed )
            buffer << std::fixed;
        if( precision != 0 )
            buffer << std::setprecision( precision );
        buffer << num;

        const std::string text = buffer.str();
        return tnlString( text.data() );
    }

    std::stringstream log;
    std::string header_indent;
    std::string body_indent;

    bool verbose;
    MetadataColumns metadataColumns;
    bool header_changed = true;
    std::vector< std::pair< tnlString, int > > horizontalGroups;
};


class Benchmark
    : protected Logging
{
public:
    using Logging::MetadataElement;
    using Logging::MetadataMap;
    using Logging::MetadataColumns;

    // Creates a benchmark which repeats every timed computation `loops`
    // times; `verbose` is forwarded to the Logging base class.
    Benchmark( const int & loops = 10,
               bool verbose = true )
    : Logging( verbose ),
      loops( loops )
    {}

    // TODO: ensure that this is not called in the middle of the benchmark
    // (or just remove it completely?)
    void
    setLoops( const int & loops )
    {
        this->loops = loops;
    }

    // Marks the start of a new benchmark
    void
    newBenchmark( const tnlString & title )
    {
        closeTable();
        writeTitle( title );
    }

    // Marks the start of a new benchmark (with custom metadata)
    void
    newBenchmark( const tnlString & title,
                  MetadataMap metadata )
    {
        closeTable();
        writeTitle( title );
        // add loops to metadata
        metadata["loops"] = tnlString(loops);
        writeMetadata( metadata );
    }

    // Replaces the metadata columns, i.e. the values which are written with
    // every subsequent row until this function is called again. The header
    // is marked as changed if the new columns differ from the current ones.
    void
    setMetadataColumns( const MetadataColumns & metadata )
    {
        const bool columnsDiffer = ( metadataColumns != metadata );
        metadataColumns = metadata;
        header_changed = header_changed || columnsDiffer;
    }

    // TODO: maybe should be renamed to createVerticalGroup and ensured that vertical and horizontal groups are not used within the same "Benchmark"
    // Selects the current operation -- operations expand the table
    // vertically. The name is stored in the "operation" metadata column,
    // which is kept as the first column (updated if present, inserted
    // otherwise), and the header is marked as changed.
    //  - baseTime should be reset to 0.0 for most operations, but sometimes
    //    it is useful to override it
    //  - Order of operations inside a "Benchmark" does not matter, rows can be
    //    easily sorted while converting to HTML.
    void
    setOperation( const tnlString & operation,
                  const double & datasetSize = 0.0, // in GB
                  const double & baseTime = 0.0 )
    {
        const bool hasOperationColumn =
            metadataColumns.size() > 0 &&
            tnlString( metadataColumns[ 0 ].first ) == "operation";

        if( hasOperationColumn )
            metadataColumns[ 0 ].second = operation;
        else
            metadataColumns.insert( metadataColumns.begin(), {"operation", operation} );

        setOperation( datasetSize, baseTime );
        header_changed = true;
    }

    void
    setOperation( const double & datasetSize = 0.0,
                  const double & baseTime = 0.0 )
    {
        this->datasetSize = datasetSize;
        this->baseTime = baseTime;
    }

    // Creates new horizontal groups inside a benchmark -- increases the number
    // of columns in the "Benchmark", implies column spanning.
    // (Useful e.g. for SpMV formats, different configurations etc.)
    //
    // A new group is pushed when there is no group yet, or when the last
    // group has a different name and still has sub-columns left; otherwise
    // the last group is overwritten with the new name and sub-column count.
    void
    createHorizontalGroup( const tnlString & name,
                           const int & subcolumns )
    {
        if( horizontalGroups.size() == 0 ) {
            horizontalGroups.push_back( {name, subcolumns} );
            return;
        }

        auto & current = horizontalGroups.back();
        const bool nestInsideCurrent = current.first != name && current.second > 0;
        if( nestInsideCurrent ) {
            horizontalGroups.push_back( {name, subcolumns} );
            return;
        }

        current.first = name;
        current.second = subcolumns;
    }

// Recursive template function to deal with benchmarks involving multiple computations
    // Times a single ComputeFunction. Subsequent calls implicitly split
    // the current "horizontal group" into sub-columns identified by
    // "performer", which are further split into "bandwidth", "time" and
    // "speedup" columns.
    // TODO: allow custom columns bound to lambda functions (e.g. for Gflops calculation)
    // Also terminates the recursion of the following variadic template.
    template< typename ResetFunction,
          typename ComputeFunction,
          typename... NextComputations >
inline void
benchmarkNextOperation( const double & datasetSize,
                        const int & loops,
                        ResetFunction reset,
                        const double & baseTime,
                        const char* performer,
                        ComputeFunction compute,
                        NextComputations & ... nextComputations )
              typename ComputeFunction >
    double
    time( ResetFunction reset,
          const tnlString & performer,
          ComputeFunction & compute )
    {
    benchmarkNextOperation( datasetSize, loops, reset, baseTime, performer, compute );
    benchmarkNextOperation( datasetSize, loops, reset, baseTime, nextComputations... );
        const double time = timeFunction( compute, reset, loops );
        const double bandwidth = datasetSize / time;
        const double speedup = this->baseTime / time;
        if( this->baseTime == 0.0 )
            this->baseTime = time;

        writeTableHeader( performer, HeaderElements({"bandwidth", "time", "speedup"}) );
        writeTableRow( performer, RowElements({ bandwidth, time, speedup }) );

        return this->baseTime;
    }

// Main function for benchmarking
    // Recursive template function to deal with multiple computations with the
    // same reset function.
    template< typename ResetFunction,
              typename ComputeFunction,
              typename... NextComputations >
void
benchmarkOperation( const char* operation,
                    const double & datasetSize,
                    const int & loops,
                    ResetFunction reset,
                    const char* performer,
                    ComputeFunction computeBase,
                    NextComputations... nextComputations )
    inline double
    time( ResetFunction reset,
          const tnlString & performer,
          ComputeFunction & compute,
          NextComputations & ... nextComputations )
    {
    cout << "Benchmarking " << operation << ":" << endl;
    double baseTime = timeFunction( computeBase, reset, loops, datasetSize, 0.0, performer );
    benchmarkNextOperation( datasetSize, loops, reset, baseTime, nextComputations... );
    std::cout << std::endl;
        time( reset, performer, compute );
        time( reset, nextComputations... );
        return this->baseTime;
    }

    using Logging::save;

protected:
    int loops;
    double datasetSize = 0.0;
    double baseTime = 0.0;
};

} // namespace benchmarks
} // namespace tnl
+55 −14
Original line number Diff line number Diff line
@@ -18,12 +18,12 @@
// Include guard: the macro defined here must be spelled exactly like the one
// tested by #ifndef, otherwise repeated inclusion is not prevented.
#ifndef TNLCUDABENCHMARKS_H_
#define TNLCUDABENCHMARKS_H_

#include <tnlConfig.h>
#include <core/tnlList.h>
#include <matrices/tnlSlicedEllpackMatrix.h>
#include <matrices/tnlEllpackMatrix.h>
#include <matrices/tnlCSRMatrix.h>

#include "array-operations.h"
#include "vector-operations.h"

using namespace tnl::benchmarks;
@@ -31,6 +31,7 @@ using namespace tnl::benchmarks;

// TODO: should benchmarks check the result of the computation?


// silly alias to match the number of template parameters with other formats
template< typename Real, typename Device, typename Index >
using SlicedEllpackMatrix = tnlSlicedEllpackMatrix< Real, Device, Index >;
@@ -43,8 +44,6 @@ int setHostTestMatrix( Matrix& matrix,
   int elements( 0 );
   for( int row = 0; row < size; row++ )
   {
      if( row % 100 == 0 )
         cout << "Filling row " << row << "/" << size << "     \r" << flush;
      int col = row - elementsPerRow / 2;
      for( int element = 0; element < elementsPerRow; element++ )
      {
@@ -56,7 +55,6 @@ int setHostTestMatrix( Matrix& matrix,
         }
      }      
   }
   cout << endl;
   return elements;
}

@@ -104,7 +102,8 @@ template< typename Real,
          template< typename, typename, typename > class Matrix,
          template< typename, typename, typename > class Vector = tnlVector >
bool
benchmarkSpMV( const int & loops,
benchmarkSpMV( Benchmark & benchmark,
               const int & loops,
               const int & size,
               const int elementsPerRow = 5 )
{
@@ -149,8 +148,7 @@ benchmarkSpMV( const int & loops,

   tnlList< tnlString > parsedType;
   parseObjectType( HostMatrix::getType(), parsedType );
   tnlString operationDescription = tnlString("SpMV (matrix type: ") + parsedType[ 0 ]
        + ", rows: " + tnlString(size) + ", elements per row: " + tnlString(elementsPerRow) + ")";
   benchmark.createHorizontalGroup( parsedType[ 0 ], 2 );

   const int elements = setHostTestMatrix< HostMatrix >( hostMatrix, elementsPerRow );
   setCudaTestMatrix< DeviceMatrix >( deviceMatrix, elementsPerRow );
@@ -172,7 +170,8 @@ benchmarkSpMV( const int & loops,
      deviceMatrix.vectorProduct( deviceVector, deviceVector2 );
   };

   benchmarkOperation( operationDescription.getString(), datasetSize, loops, reset,
   benchmark.setOperation( datasetSize );
   benchmark.time( reset,
                   "CPU", spmvHost,
                   "GPU", spmvCuda );

@@ -184,6 +183,7 @@ int main( int argc, char* argv[] )
#ifdef HAVE_CUDA
   
   typedef double Real;
   tnlString precision = getType< Real >();
   
   /****
    * The first argument of this program is the size of the data set to be reduced.
@@ -199,11 +199,52 @@ int main( int argc, char* argv[] )
   if( argc > 3 )
      elementsPerRow = atoi( argv[ 3 ] );

   benchmarkVectorOperations< Real >( loops, size );
   ofstream logFile( "tnl-cuda-benchmarks.log" );
   Benchmark benchmark( loops, true );
//   ostream & logFile = cout;
//   Benchmark benchmark( loops, false );
   
   // TODO: add hostname, CPU info, GPU info, date, ...
   Benchmark::MetadataMap metadata {
      {"precision", precision},
   };
   // TODO: loop over sizes
   

   // Array operations
   benchmark.newBenchmark( tnlString("Array operations (") + precision + ")",
                           metadata );
   benchmark.setMetadataColumns( Benchmark::MetadataColumns({
      {"size", size},
   } ));
   benchmarkArrayOperations< Real >( benchmark, loops, size );


   benchmarkSpMV< Real, tnlEllpackMatrix >( loops, size, elementsPerRow );
   benchmarkSpMV< Real, SlicedEllpackMatrix >( loops, size, elementsPerRow );
   benchmarkSpMV< Real, tnlCSRMatrix >( loops, size, elementsPerRow );
   // Vector operations
   benchmark.newBenchmark( tnlString("Vector operations (") + precision + ")",
                           metadata );
   benchmark.setMetadataColumns( Benchmark::MetadataColumns({
      {"size", size},
   } ));
   benchmarkVectorOperations< Real >( benchmark, loops, size );


   // SpMV
   benchmark.newBenchmark( tnlString("SpMV (") + precision + ")",
                           metadata );
   benchmark.setMetadataColumns( Benchmark::MetadataColumns({
      {"rows", size},
      {"columns", size},
      {"elements per row", elementsPerRow},
   } ));

   benchmarkSpMV< Real, tnlEllpackMatrix >( benchmark, loops, size, elementsPerRow );
   benchmarkSpMV< Real, SlicedEllpackMatrix >( benchmark, loops, size, elementsPerRow );
   benchmarkSpMV< Real, tnlCSRMatrix >( benchmark, loops, size, elementsPerRow );


   if( ! benchmark.save( logFile ) )
       return EXIT_FAILURE;
   
   return EXIT_SUCCESS;
#else
+48 −69
Original line number Diff line number Diff line
@@ -16,7 +16,8 @@ namespace benchmarks
template< typename Real = double,
          typename Index = int >
bool
benchmarkVectorOperations( const int & loops,
benchmarkVectorOperations( Benchmark & benchmark,
                           const int & loops,
                           const int & size )
{
    typedef tnlVector< Real, tnlHost, Index > HostVector;
@@ -63,46 +64,14 @@ benchmarkVectorOperations( const int & loops,
    reset12();


    auto copyAssignHostCuda = [&]() {
        deviceVector = hostVector;
    };
    auto copyAssignCudaHost = [&]() {
        hostVector = deviceVector;
    };
    benchmarkOperation( "copy assigment (cross-device)", datasetSize, loops, reset1,
                        "CPU->GPU", copyAssignHostCuda,
                        "GPU->CPU", copyAssignCudaHost );


    auto copyAssignHostHost = [&]() {
        hostVector = hostVector2;
    };
    auto copyAssignCudaCuda = [&]() {
        deviceVector = deviceVector2;
    };
    benchmarkOperation( "copy assigment", 2 * datasetSize, loops, reset1,
                        "CPU->CPU", copyAssignHostHost,
                        "GPU->GPU", copyAssignCudaCuda );


    auto compareHost = [&]() {
        resultHost = (int) hostVector == hostVector2;
    };
    auto compareCuda = [&]() {
        resultDevice = (int) deviceVector == deviceVector2;
    };
    benchmarkOperation( "comparison (operator==)", 2 * datasetSize, loops, reset1,
                        "CPU", compareHost,
                        "GPU", compareCuda );


    auto multiplyHost = [&]() {
        hostVector *= 0.5;
    };
    auto multiplyCuda = [&]() {
        deviceVector *= 0.5;
    };
    benchmarkOperation( "scalar multiplication", 2 * datasetSize, loops, reset1,
    benchmark.setOperation( "scalar multiplication", 2 * datasetSize );
    benchmark.time( reset1,
                    "CPU", multiplyHost,
                    "GPU", multiplyCuda );

@@ -113,7 +82,8 @@ benchmarkVectorOperations( const int & loops,
    auto addVectorCuda = [&]() {
        deviceVector.addVector( deviceVector2 );
    };
    benchmarkOperation( "vector addition", 3 * datasetSize, loops, reset1,
    benchmark.setOperation( "vector addition", 3 * datasetSize );
    benchmark.time( reset1,
                    "CPU", addVectorHost,
                    "GPU", addVectorCuda );

@@ -124,7 +94,8 @@ benchmarkVectorOperations( const int & loops,
    auto maxCuda = [&]() {
        resultDevice = deviceVector.max();
    };
    benchmarkOperation( "max", datasetSize, loops, reset1,
    benchmark.setOperation( "max", datasetSize );
    benchmark.time( reset1,
                    "CPU", maxHost,
                    "GPU", maxCuda );

@@ -135,7 +106,8 @@ benchmarkVectorOperations( const int & loops,
    auto minCuda = [&]() {
        resultDevice = deviceVector.min();
    };
    benchmarkOperation( "min", datasetSize, loops, reset1,
    benchmark.setOperation( "min", datasetSize );
    benchmark.time( reset1,
                    "CPU", minHost,
                    "GPU", minCuda );

@@ -146,7 +118,8 @@ benchmarkVectorOperations( const int & loops,
    auto absMaxCuda = [&]() {
        resultDevice = deviceVector.absMax();
    };
    benchmarkOperation( "absMax", datasetSize, loops, reset1,
    benchmark.setOperation( "absMax", datasetSize );
    benchmark.time( reset1,
                    "CPU", absMaxHost,
                    "GPU", absMaxCuda );

@@ -157,7 +130,8 @@ benchmarkVectorOperations( const int & loops,
    auto absMinCuda = [&]() {
        resultDevice = deviceVector.absMin();
    };
    benchmarkOperation( "absMin", datasetSize, loops, reset1,
    benchmark.setOperation( "absMin", datasetSize );
    benchmark.time( reset1,
                    "CPU", absMinHost,
                    "GPU", absMinCuda );

@@ -168,7 +142,8 @@ benchmarkVectorOperations( const int & loops,
    auto sumCuda = [&]() {
        resultDevice = deviceVector.sum();
    };
    benchmarkOperation( "sum", datasetSize, loops, reset1,
    benchmark.setOperation( "sum", datasetSize );
    benchmark.time( reset1,
                    "CPU", sumHost,
                    "GPU", sumCuda );

@@ -179,7 +154,8 @@ benchmarkVectorOperations( const int & loops,
    auto l1normCuda = [&]() {
        resultDevice = deviceVector.lpNorm( 1.0 );
    };
    benchmarkOperation( "l1 norm", datasetSize, loops, reset1,
    benchmark.setOperation( "l1 norm", datasetSize );
    benchmark.time( reset1,
                    "CPU", l1normHost,
                    "GPU", l1normCuda );

@@ -190,7 +166,8 @@ benchmarkVectorOperations( const int & loops,
    auto l2normCuda = [&]() {
        resultDevice = deviceVector.lpNorm( 2.0 );
    };
    benchmarkOperation( "l2 norm", datasetSize, loops, reset1,
    benchmark.setOperation( "l2 norm", datasetSize );
    benchmark.time( reset1,
                    "CPU", l2normHost,
                    "GPU", l2normCuda );

@@ -201,7 +178,8 @@ benchmarkVectorOperations( const int & loops,
    auto l3normCuda = [&]() {
        resultDevice = deviceVector.lpNorm( 3.0 );
    };
    benchmarkOperation( "l3 norm", datasetSize, loops, reset1,
    benchmark.setOperation( "l3 norm", datasetSize );
    benchmark.time( reset1,
                    "CPU", l3normHost,
                    "GPU", l3normCuda );

@@ -220,7 +198,8 @@ benchmarkVectorOperations( const int & loops,
                    &resultDevice );
    };
#endif
    benchmarkOperation( "scalar product", 2 * datasetSize, loops, reset1,
    benchmark.setOperation( "scalar product", 2 * datasetSize );
    benchmark.time( reset1,
                    "CPU", scalarProductHost,
                    "GPU", scalarProductCuda
#ifdef HAVE_CUBLAS