Loading tests/benchmarks/array-operations.h 0 → 100644 +92 −0 Original line number Diff line number Diff line #pragma once #include "benchmarks.h" #include <core/arrays/tnlArray.h> namespace tnl { namespace benchmarks { template< typename Real = double, typename Index = int > bool benchmarkArrayOperations( Benchmark & benchmark, const int & loops, const int & size ) { typedef tnlArray< Real, tnlHost, Index > HostArray; typedef tnlArray< Real, tnlCuda, Index > CudaArray; using namespace std; double datasetSize = ( double ) ( loops * size ) * sizeof( Real ) / oneGB; HostArray hostArray, hostArray2; CudaArray deviceArray, deviceArray2; hostArray.setSize( size ); if( ! deviceArray.setSize( size ) ) return false; hostArray2.setLike( hostArray ); if( ! deviceArray2.setLike( deviceArray ) ) return false; Real resultHost, resultDevice; // reset functions auto reset1 = [&]() { hostArray.setValue( 1.0 ); deviceArray.setValue( 1.0 ); }; auto reset2 = [&]() { hostArray2.setValue( 1.0 ); deviceArray2.setValue( 1.0 ); }; auto reset12 = [&]() { reset1(); reset2(); }; reset12(); auto compareHost = [&]() { resultHost = (int) hostArray == hostArray2; }; auto compareCuda = [&]() { resultDevice = (int) deviceArray == deviceArray2; }; benchmark.setOperation( "comparison (operator==)", 2 * datasetSize ); benchmark.time( reset1, "CPU", compareHost, "GPU", compareCuda ); auto copyAssignHostHost = [&]() { hostArray = hostArray2; }; auto copyAssignCudaCuda = [&]() { deviceArray = deviceArray2; }; benchmark.setOperation( "copy (operator=)", 2 * datasetSize ); double basetime = benchmark.time( reset1, "CPU", copyAssignHostHost, "GPU", copyAssignCudaCuda ); auto copyAssignHostCuda = [&]() { deviceArray = hostArray; }; auto copyAssignCudaHost = [&]() { hostArray = deviceArray; }; benchmark.setOperation( "copy (operator=)", datasetSize, basetime ); benchmark.time( reset1, "CPU->GPU", copyAssignHostCuda, "GPU->CPU", copyAssignCudaHost ); } } // namespace benchmarks } // namespace tnl tests/benchmarks/benchmarks.h +325 −62 Original line number Diff line number Diff line Loading @@ -2,8 +2,11 @@ #include <iostream> #include <iomanip> #include <map> #include <vector> #include <core/tnlTimerRT.h> #include <core/tnlString.h> namespace tnl { Loading @@ -17,18 +20,15 @@ template< typename ComputeFunction, double timeFunction( ComputeFunction compute, ResetFunction reset, const int & loops, const double & datasetSize, // in GB const double & baseTime, // in seconds (baseline for speedup calculation) const char* performer ) const int & loops ) { // the timer is constructed zero-initialized and stopped tnlTimerRT timer; reset(); for(int i = 0; i < loops; ++i) { // TODO: not necessary for host computations // Explicit synchronization of the CUDA device // TODO: not necessary for host computations #ifdef HAVE_CUDA cudaDeviceSynchronize(); #endif Loading @@ -42,70 +42,333 @@ timeFunction( ComputeFunction compute, reset(); } const double time = timer.getTime(); const double bandwidth = datasetSize / time; return timer.getTime(); } struct InternalError {}; class Logging { public: using MetadataElement = std::pair< const char*, tnlString >; using MetadataMap = std::map< const char*, tnlString >; using MetadataColumns = std::vector<MetadataElement>; using HeaderElements = std::initializer_list< tnlString >; using RowElements = std::initializer_list< double >; Logging( bool verbose = true ) : verbose(verbose) { } // TODO: fix spacing (blank lines) void writeTitle( const tnlString & title ) { if( verbose ) std::cout << std::endl << "== " << title << " ==" << std::endl << std::endl; log << ": title = " << title << std::endl; } void writeMetadata( const MetadataMap & metadata ) { if( verbose ) std::cout << "properties:" << std::endl; for( auto & it : metadata ) { if( verbose ) std::cout << " " << it.first << " = " << it.second << std::endl; log << ": " << it.first << " = " << it.second << std::endl; } if( verbose ) std::cout << std::endl; } void writeTableHeader( const tnlString & spanningElement, const HeaderElements & subElements ) { using namespace std; cout << " " << performer << ": bandwidth: " << setw( 8 ) << bandwidth << " GB/sec, time: " << setw( 8 ) << time << " sec, speedup: "; if( baseTime ) cout << baseTime / time << endl; else cout << "N/A" << endl; return time; if( verbose && header_changed ) { for( auto & it : metadataColumns ) { cout << setw( 20 ) << it.first; } // This specialization terminates the recursion template< typename ResetFunction, typename ComputeFunction > inline void benchmarkNextOperation( const double & datasetSize, const int & loops, ResetFunction reset, const double & baseTime, const char* performer, ComputeFunction compute ) // spanning element is printed as usual column to stdout, // but is excluded from header cout << setw( 15 ) << ""; for( auto & it : subElements ) { cout << setw( 15 ) << it; } cout << endl; header_changed = false; } // initial indent string header_indent = "!"; log << endl; for( auto & it : metadataColumns ) { log << header_indent << " " << it.first << endl; } // dump stacked spanning columns if( horizontalGroups.size() > 0 ) while( horizontalGroups.back().second <= 0 ) { horizontalGroups.pop_back(); header_indent.pop_back(); } for( int i = 0; i < horizontalGroups.size(); i++ ) { if( horizontalGroups[ i ].second > 0 ) { log << header_indent << " " << horizontalGroups[ i ].first << endl; header_indent += "!"; } } log << header_indent << " " << spanningElement << endl; for( auto & it : subElements ) { log << header_indent << "! " << it << endl; } if( horizontalGroups.size() > 0 ) { horizontalGroups.back().second--; header_indent.pop_back(); } } void writeTableRow( const tnlString & spanningElement, const RowElements & subElements ) { using namespace std; if( verbose ) { for( auto & it : metadataColumns ) { cout << setw( 20 ) << it.second; } // spanning element is printed as usual column to stdout cout << setw( 15 ) << spanningElement; for( auto & it : subElements ) { cout << setw( 15 ); if( it != 0.0 ) cout << it; else cout << "N/A"; } cout << endl; } // only when changed (the header has been already adjusted) // print each element on separate line for( auto & it : metadataColumns ) { log << it.second << endl; } // benchmark data are indented const tnlString indent = " "; for( auto & it : subElements ) { if( it != 0.0 ) log << indent << it << endl; else log << indent << "N/A" << endl; } } void closeTable() { header_indent = body_indent = ""; header_changed = true; } bool save( std::ostream & logFile ) { timeFunction( compute, reset, loops, datasetSize, baseTime, performer ); closeTable(); logFile << log.str(); if( logFile.good() ) { log.str() =""; return true; } return false; } protected: // manual double -> tnlString conversion with fixed precision static tnlString _to_string( const double & num, const int & precision = 0, bool fixed = false ) { std::stringstream str; if( fixed ) str << std::fixed; if( precision ) str << std::setprecision( precision ); str << num; return tnlString( str.str().data() ); } std::stringstream log; std::string header_indent; std::string body_indent; bool verbose; MetadataColumns metadataColumns; bool header_changed = true; std::vector< std::pair< tnlString, int > > horizontalGroups; }; class Benchmark : protected Logging { public: using Logging::MetadataElement; using Logging::MetadataMap; using Logging::MetadataColumns; Benchmark( const int & loops = 10, bool verbose = true ) : Logging(verbose), loops(loops) { } // TODO: ensure that this is not called in the middle of the benchmark // (or just remove it completely?) void setLoops( const int & loops ) { this->loops = loops; } // Marks the start of a new benchmark void newBenchmark( const tnlString & title ) { closeTable(); writeTitle( title ); } // Marks the start of a new benchmark (with custom metadata) void newBenchmark( const tnlString & title, MetadataMap metadata ) { closeTable(); writeTitle( title ); // add loops to metadata metadata["loops"] = tnlString(loops); writeMetadata( metadata ); } // Sets metadata columns -- values used for all subsequent rows until // the next call to this function. void setMetadataColumns( const MetadataColumns & metadata ) { if( metadataColumns != metadata ) header_changed = true; metadataColumns = metadata; } // TODO: maybe should be renamed to createVerticalGroup and ensured that vertical and horizontal groups are not used within the same "Benchmark" // Sets current operation -- operations expand the table vertically // - baseTime should be reset to 0.0 for most operations, but sometimes // it is useful to override it // - Order of operations inside a "Benchmark" does not matter, rows can be // easily sorted while converting to HTML.) void setOperation( const tnlString & operation, const double & datasetSize = 0.0, // in GB const double & baseTime = 0.0 ) { if( metadataColumns.size() > 0 && tnlString(metadataColumns[ 0 ].first) == "operation" ) { metadataColumns[ 0 ].second = operation; } else { metadataColumns.insert( metadataColumns.begin(), {"operation", operation} ); } setOperation( datasetSize, baseTime ); header_changed = true; } void setOperation( const double & datasetSize = 0.0, const double & baseTime = 0.0 ) { this->datasetSize = datasetSize; this->baseTime = baseTime; } // Creates new horizontal groups inside a benchmark -- increases the number // of columns in the "Benchmark", implies column spanning. // (Useful e.g. for SpMV formats, different configurations etc.) void createHorizontalGroup( const tnlString & name, const int & subcolumns ) { if( horizontalGroups.size() == 0 ) { horizontalGroups.push_back( {name, subcolumns} ); } else { auto & last = horizontalGroups.back(); if( last.first != name && last.second > 0 ) { horizontalGroups.push_back( {name, subcolumns} ); } else { last.first = name; last.second = subcolumns; } } } // Recursive template function to deal with benchmarks involving multiple computations // Times a single ComputeFunction. Subsequent calls implicitly split // the current "horizontal group" into sub-columns identified by // "performer", which are further split into "bandwidth", "time" and // "speedup" columns. // TODO: allow custom columns bound to lambda functions (e.g. for Gflops calculation) // Also terminates the recursion of the following variadic template. template< typename ResetFunction, typename ComputeFunction, typename... NextComputations > inline void benchmarkNextOperation( const double & datasetSize, const int & loops, ResetFunction reset, const double & baseTime, const char* performer, ComputeFunction compute, NextComputations & ... nextComputations ) typename ComputeFunction > double time( ResetFunction reset, const tnlString & performer, ComputeFunction & compute ) { benchmarkNextOperation( datasetSize, loops, reset, baseTime, performer, compute ); benchmarkNextOperation( datasetSize, loops, reset, baseTime, nextComputations... ); const double time = timeFunction( compute, reset, loops ); const double bandwidth = datasetSize / time; const double speedup = this->baseTime / time; if( this->baseTime == 0.0 ) this->baseTime = time; writeTableHeader( performer, HeaderElements({"bandwidth", "time", "speedup"}) ); writeTableRow( performer, RowElements({ bandwidth, time, speedup }) ); return this->baseTime; } // Main function for benchmarking // Recursive template function to deal with multiple computations with the // same reset function. template< typename ResetFunction, typename ComputeFunction, typename... NextComputations > void benchmarkOperation( const char* operation, const double & datasetSize, const int & loops, ResetFunction reset, const char* performer, ComputeFunction computeBase, NextComputations... nextComputations ) inline double time( ResetFunction reset, const tnlString & performer, ComputeFunction & compute, NextComputations & ... nextComputations ) { cout << "Benchmarking " << operation << ":" << endl; double baseTime = timeFunction( computeBase, reset, loops, datasetSize, 0.0, performer ); benchmarkNextOperation( datasetSize, loops, reset, baseTime, nextComputations... ); std::cout << std::endl; time( reset, performer, compute ); time( reset, nextComputations... ); return this->baseTime; } using Logging::save; protected: int loops; double datasetSize = 0.0; double baseTime = 0.0; }; } // namespace benchmarks } // namespace tnl tests/benchmarks/tnl-cuda-benchmarks.h +55 −14 Original line number Diff line number Diff line Loading @@ -18,12 +18,12 @@ #ifndef TNLCUDABENCHMARKS_H_ #define TNLCUDBENCHMARKS_H_ #include <tnlConfig.h> #include <core/tnlList.h> #include <matrices/tnlSlicedEllpackMatrix.h> #include <matrices/tnlEllpackMatrix.h> #include <matrices/tnlCSRMatrix.h> #include "array-operations.h" #include "vector-operations.h" using namespace tnl::benchmarks; Loading @@ -31,6 +31,7 @@ using namespace tnl::benchmarks; // TODO: should benchmarks check the result of the computation? // silly alias to match the number of template parameters with other formats template< typename Real, typename Device, typename Index > using SlicedEllpackMatrix = tnlSlicedEllpackMatrix< Real, Device, Index >; Loading @@ -43,8 +44,6 @@ int setHostTestMatrix( Matrix& matrix, int elements( 0 ); for( int row = 0; row < size; row++ ) { if( row % 100 == 0 ) cout << "Filling row " << row << "/" << size << " \r" << flush; int col = row - elementsPerRow / 2; for( int element = 0; element < elementsPerRow; element++ ) { Loading @@ -56,7 +55,6 @@ int setHostTestMatrix( Matrix& matrix, } } } cout << endl; return elements; } Loading Loading @@ -104,7 +102,8 @@ template< typename Real, template< typename, typename, typename > class Matrix, template< typename, typename, typename > class Vector = tnlVector > bool benchmarkSpMV( const int & loops, benchmarkSpMV( Benchmark & benchmark, const int & loops, const int & size, const int elementsPerRow = 5 ) { Loading Loading @@ -149,8 +148,7 @@ benchmarkSpMV( const int & loops, tnlList< tnlString > parsedType; parseObjectType( HostMatrix::getType(), parsedType ); tnlString operationDescription = tnlString("SpMV (matrix type: ") + parsedType[ 0 ] + ", rows: " + tnlString(size) + ", elements per row: " + tnlString(elementsPerRow) + ")"; benchmark.createHorizontalGroup( parsedType[ 0 ], 2 ); const int elements = setHostTestMatrix< HostMatrix >( hostMatrix, elementsPerRow ); setCudaTestMatrix< DeviceMatrix >( deviceMatrix, elementsPerRow ); Loading @@ -172,7 +170,8 @@ benchmarkSpMV( const int & loops, deviceMatrix.vectorProduct( deviceVector, deviceVector2 ); }; benchmarkOperation( operationDescription.getString(), datasetSize, loops, reset, benchmark.setOperation( datasetSize ); benchmark.time( reset, "CPU", spmvHost, "GPU", spmvCuda ); Loading @@ -184,6 +183,7 @@ int main( int argc, char* argv[] ) #ifdef HAVE_CUDA typedef double Real; tnlString precision = getType< Real >(); /**** * The first argument of this program is the size od data set to be reduced. Loading @@ -199,11 +199,52 @@ int main( int argc, char* argv[] ) if( argc > 3 ) elementsPerRow = atoi( argv[ 3 ] ); benchmarkVectorOperations< Real >( loops, size ); ofstream logFile( "tnl-cuda-benchmarks.log" ); Benchmark benchmark( loops, true ); // ostream & logFile = cout; // Benchmark benchmark( loops, false ); // TODO: add hostname, CPU info, GPU info, date, ... Benchmark::MetadataMap metadata { {"precision", precision}, }; // TODO: loop over sizes // Array operations benchmark.newBenchmark( tnlString("Array operations (") + precision + ")", metadata ); benchmark.setMetadataColumns( Benchmark::MetadataColumns({ {"size", size}, } )); benchmarkArrayOperations< Real >( benchmark, loops, size ); benchmarkSpMV< Real, tnlEllpackMatrix >( loops, size, elementsPerRow ); benchmarkSpMV< Real, SlicedEllpackMatrix >( loops, size, elementsPerRow ); benchmarkSpMV< Real, tnlCSRMatrix >( loops, size, elementsPerRow ); // Vector operations benchmark.newBenchmark( tnlString("Vector operations (") + precision + ")", metadata ); benchmark.setMetadataColumns( Benchmark::MetadataColumns({ {"size", size}, } )); benchmarkVectorOperations< Real >( benchmark, loops, size ); // SpMV benchmark.newBenchmark( tnlString("SpMV (") + precision + ")", metadata ); benchmark.setMetadataColumns( Benchmark::MetadataColumns({ {"rows", size}, {"columns", size}, {"elements per row", elementsPerRow}, } )); benchmarkSpMV< Real, tnlEllpackMatrix >( benchmark, loops, size, elementsPerRow ); benchmarkSpMV< Real, SlicedEllpackMatrix >( benchmark, loops, size, elementsPerRow ); benchmarkSpMV< Real, tnlCSRMatrix >( benchmark, loops, size, elementsPerRow ); if( ! benchmark.save( logFile ) ) return EXIT_FAILURE; return EXIT_SUCCESS; #else Loading tests/benchmarks/vector-operations.h +48 −69 Original line number Diff line number Diff line Loading @@ -16,7 +16,8 @@ namespace benchmarks template< typename Real = double, typename Index = int > bool benchmarkVectorOperations( const int & loops, benchmarkVectorOperations( Benchmark & benchmark, const int & loops, const int & size ) { typedef tnlVector< Real, tnlHost, Index > HostVector; Loading Loading @@ -63,46 +64,14 @@ benchmarkVectorOperations( const int & loops, reset12(); auto copyAssignHostCuda = [&]() { deviceVector = hostVector; }; auto copyAssignCudaHost = [&]() { hostVector = deviceVector; }; benchmarkOperation( "copy assigment (cross-device)", datasetSize, loops, reset1, "CPU->GPU", copyAssignHostCuda, "GPU->CPU", copyAssignCudaHost ); auto copyAssignHostHost = [&]() { hostVector = hostVector2; }; auto copyAssignCudaCuda = [&]() { deviceVector = deviceVector2; }; benchmarkOperation( "copy assigment", 2 * datasetSize, loops, reset1, "CPU->CPU", copyAssignHostHost, "GPU->GPU", copyAssignCudaCuda ); auto compareHost = [&]() { resultHost = (int) hostVector == hostVector2; }; auto compareCuda = [&]() { resultDevice = (int) deviceVector == deviceVector2; }; benchmarkOperation( "comparison (operator==)", 2 * datasetSize, loops, reset1, "CPU", compareHost, "GPU", compareCuda ); auto multiplyHost = [&]() { hostVector *= 0.5; }; auto multiplyCuda = [&]() { deviceVector *= 0.5; }; benchmarkOperation( "scalar multiplication", 2 * datasetSize, loops, reset1, benchmark.setOperation( "scalar multiplication", 2 * datasetSize ); benchmark.time( reset1, "CPU", multiplyHost, "GPU", multiplyCuda ); Loading @@ -113,7 +82,8 @@ benchmarkVectorOperations( const int & loops, auto addVectorCuda = [&]() { deviceVector.addVector( deviceVector2 ); }; benchmarkOperation( "vector addition", 3 * datasetSize, loops, reset1, benchmark.setOperation( "vector addition", 3 * datasetSize ); benchmark.time( reset1, "CPU", addVectorHost, "GPU", addVectorCuda ); Loading @@ -124,7 +94,8 @@ benchmarkVectorOperations( const int & loops, auto maxCuda = [&]() { resultDevice = deviceVector.max(); }; benchmarkOperation( "max", datasetSize, loops, reset1, benchmark.setOperation( "max", datasetSize ); benchmark.time( reset1, "CPU", maxHost, "GPU", maxCuda ); Loading @@ -135,7 +106,8 @@ benchmarkVectorOperations( const int & loops, auto minCuda = [&]() { resultDevice = deviceVector.min(); }; benchmarkOperation( "min", datasetSize, loops, reset1, benchmark.setOperation( "min", datasetSize ); benchmark.time( reset1, "CPU", minHost, "GPU", minCuda ); Loading @@ -146,7 +118,8 @@ benchmarkVectorOperations( const int & loops, auto absMaxCuda = [&]() { resultDevice = deviceVector.absMax(); }; benchmarkOperation( "absMax", datasetSize, loops, reset1, benchmark.setOperation( "absMax", datasetSize ); benchmark.time( reset1, "CPU", absMaxHost, "GPU", absMaxCuda ); Loading @@ -157,7 +130,8 @@ benchmarkVectorOperations( const int & loops, auto absMinCuda = [&]() { resultDevice = deviceVector.absMin(); }; benchmarkOperation( "absMin", datasetSize, loops, reset1, benchmark.setOperation( "absMin", datasetSize ); benchmark.time( reset1, "CPU", absMinHost, "GPU", absMinCuda ); Loading @@ -168,7 +142,8 @@ benchmarkVectorOperations( const int & loops, auto sumCuda = [&]() { resultDevice = deviceVector.sum(); }; benchmarkOperation( "sum", datasetSize, loops, reset1, benchmark.setOperation( "sum", datasetSize ); benchmark.time( reset1, "CPU", sumHost, "GPU", sumCuda ); Loading @@ -179,7 +154,8 @@ benchmarkVectorOperations( const int & loops, auto l1normCuda = [&]() { resultDevice = deviceVector.lpNorm( 1.0 ); }; benchmarkOperation( "l1 norm", datasetSize, loops, reset1, benchmark.setOperation( "l1 norm", datasetSize ); benchmark.time( reset1, "CPU", l1normHost, "GPU", l1normCuda ); Loading @@ -190,7 +166,8 @@ benchmarkVectorOperations( const int & loops, auto l2normCuda = [&]() { resultDevice = deviceVector.lpNorm( 2.0 ); }; benchmarkOperation( "l2 norm", datasetSize, loops, reset1, benchmark.setOperation( "l2 norm", datasetSize ); benchmark.time( reset1, "CPU", l2normHost, "GPU", l2normCuda ); Loading @@ -201,7 +178,8 @@ benchmarkVectorOperations( const int & loops, auto l3normCuda = [&]() { resultDevice = deviceVector.lpNorm( 3.0 ); }; benchmarkOperation( "l3 norm", datasetSize, loops, reset1, benchmark.setOperation( "l3 norm", datasetSize ); benchmark.time( reset1, "CPU", l3normHost, "GPU", l3normCuda ); Loading @@ -220,7 +198,8 @@ benchmarkVectorOperations( const int & loops, &resultDevice ); }; #endif benchmarkOperation( "scalar product", 2 * datasetSize, loops, reset1, benchmark.setOperation( "scalar product", 2 * datasetSize ); benchmark.time( reset1, "CPU", scalarProductHost, "GPU", scalarProductCuda #ifdef HAVE_CUBLAS Loading Loading
tests/benchmarks/array-operations.h 0 → 100644 +92 −0 Original line number Diff line number Diff line #pragma once #include "benchmarks.h" #include <core/arrays/tnlArray.h> namespace tnl { namespace benchmarks { template< typename Real = double, typename Index = int > bool benchmarkArrayOperations( Benchmark & benchmark, const int & loops, const int & size ) { typedef tnlArray< Real, tnlHost, Index > HostArray; typedef tnlArray< Real, tnlCuda, Index > CudaArray; using namespace std; double datasetSize = ( double ) ( loops * size ) * sizeof( Real ) / oneGB; HostArray hostArray, hostArray2; CudaArray deviceArray, deviceArray2; hostArray.setSize( size ); if( ! deviceArray.setSize( size ) ) return false; hostArray2.setLike( hostArray ); if( ! deviceArray2.setLike( deviceArray ) ) return false; Real resultHost, resultDevice; // reset functions auto reset1 = [&]() { hostArray.setValue( 1.0 ); deviceArray.setValue( 1.0 ); }; auto reset2 = [&]() { hostArray2.setValue( 1.0 ); deviceArray2.setValue( 1.0 ); }; auto reset12 = [&]() { reset1(); reset2(); }; reset12(); auto compareHost = [&]() { resultHost = (int) hostArray == hostArray2; }; auto compareCuda = [&]() { resultDevice = (int) deviceArray == deviceArray2; }; benchmark.setOperation( "comparison (operator==)", 2 * datasetSize ); benchmark.time( reset1, "CPU", compareHost, "GPU", compareCuda ); auto copyAssignHostHost = [&]() { hostArray = hostArray2; }; auto copyAssignCudaCuda = [&]() { deviceArray = deviceArray2; }; benchmark.setOperation( "copy (operator=)", 2 * datasetSize ); double basetime = benchmark.time( reset1, "CPU", copyAssignHostHost, "GPU", copyAssignCudaCuda ); auto copyAssignHostCuda = [&]() { deviceArray = hostArray; }; auto copyAssignCudaHost = [&]() { hostArray = deviceArray; }; benchmark.setOperation( "copy (operator=)", datasetSize, basetime ); benchmark.time( reset1, "CPU->GPU", copyAssignHostCuda, "GPU->CPU", copyAssignCudaHost ); } } // namespace benchmarks } // namespace tnl
tests/benchmarks/benchmarks.h +325 −62 Original line number Diff line number Diff line Loading @@ -2,8 +2,11 @@ #include <iostream> #include <iomanip> #include <map> #include <vector> #include <core/tnlTimerRT.h> #include <core/tnlString.h> namespace tnl { Loading @@ -17,18 +20,15 @@ template< typename ComputeFunction, double timeFunction( ComputeFunction compute, ResetFunction reset, const int & loops, const double & datasetSize, // in GB const double & baseTime, // in seconds (baseline for speedup calculation) const char* performer ) const int & loops ) { // the timer is constructed zero-initialized and stopped tnlTimerRT timer; reset(); for(int i = 0; i < loops; ++i) { // TODO: not necessary for host computations // Explicit synchronization of the CUDA device // TODO: not necessary for host computations #ifdef HAVE_CUDA cudaDeviceSynchronize(); #endif Loading @@ -42,70 +42,333 @@ timeFunction( ComputeFunction compute, reset(); } const double time = timer.getTime(); const double bandwidth = datasetSize / time; return timer.getTime(); } struct InternalError {}; class Logging { public: using MetadataElement = std::pair< const char*, tnlString >; using MetadataMap = std::map< const char*, tnlString >; using MetadataColumns = std::vector<MetadataElement>; using HeaderElements = std::initializer_list< tnlString >; using RowElements = std::initializer_list< double >; Logging( bool verbose = true ) : verbose(verbose) { } // TODO: fix spacing (blank lines) void writeTitle( const tnlString & title ) { if( verbose ) std::cout << std::endl << "== " << title << " ==" << std::endl << std::endl; log << ": title = " << title << std::endl; } void writeMetadata( const MetadataMap & metadata ) { if( verbose ) std::cout << "properties:" << std::endl; for( auto & it : metadata ) { if( verbose ) std::cout << " " << it.first << " = " << it.second << std::endl; log << ": " << it.first << " = " << it.second << std::endl; } if( verbose ) std::cout << std::endl; } void writeTableHeader( const tnlString & spanningElement, const HeaderElements & subElements ) { using namespace std; cout << " " << performer << ": bandwidth: " << setw( 8 ) << bandwidth << " GB/sec, time: " << setw( 8 ) << time << " sec, speedup: "; if( baseTime ) cout << baseTime / time << endl; else cout << "N/A" << endl; return time; if( verbose && header_changed ) { for( auto & it : metadataColumns ) { cout << setw( 20 ) << it.first; } // This specialization terminates the recursion template< typename ResetFunction, typename ComputeFunction > inline void benchmarkNextOperation( const double & datasetSize, const int & loops, ResetFunction reset, const double & baseTime, const char* performer, ComputeFunction compute ) // spanning element is printed as usual column to stdout, // but is excluded from header cout << setw( 15 ) << ""; for( auto & it : subElements ) { cout << setw( 15 ) << it; } cout << endl; header_changed = false; } // initial indent string header_indent = "!"; log << endl; for( auto & it : metadataColumns ) { log << header_indent << " " << it.first << endl; } // dump stacked spanning columns if( horizontalGroups.size() > 0 ) while( horizontalGroups.back().second <= 0 ) { horizontalGroups.pop_back(); header_indent.pop_back(); } for( int i = 0; i < horizontalGroups.size(); i++ ) { if( horizontalGroups[ i ].second > 0 ) { log << header_indent << " " << horizontalGroups[ i ].first << endl; header_indent += "!"; } } log << header_indent << " " << spanningElement << endl; for( auto & it : subElements ) { log << header_indent << "! " << it << endl; } if( horizontalGroups.size() > 0 ) { horizontalGroups.back().second--; header_indent.pop_back(); } } void writeTableRow( const tnlString & spanningElement, const RowElements & subElements ) { using namespace std; if( verbose ) { for( auto & it : metadataColumns ) { cout << setw( 20 ) << it.second; } // spanning element is printed as usual column to stdout cout << setw( 15 ) << spanningElement; for( auto & it : subElements ) { cout << setw( 15 ); if( it != 0.0 ) cout << it; else cout << "N/A"; } cout << endl; } // only when changed (the header has been already adjusted) // print each element on separate line for( auto & it : metadataColumns ) { log << it.second << endl; } // benchmark data are indented const tnlString indent = " "; for( auto & it : subElements ) { if( it != 0.0 ) log << indent << it << endl; else log << indent << "N/A" << endl; } } void closeTable() { header_indent = body_indent = ""; header_changed = true; } bool save( std::ostream & logFile ) { timeFunction( compute, reset, loops, datasetSize, baseTime, performer ); closeTable(); logFile << log.str(); if( logFile.good() ) { log.str() =""; return true; } return false; } protected: // manual double -> tnlString conversion with fixed precision static tnlString _to_string( const double & num, const int & precision = 0, bool fixed = false ) { std::stringstream str; if( fixed ) str << std::fixed; if( precision ) str << std::setprecision( precision ); str << num; return tnlString( str.str().data() ); } std::stringstream log; std::string header_indent; std::string body_indent; bool verbose; MetadataColumns metadataColumns; bool header_changed = true; std::vector< std::pair< tnlString, int > > horizontalGroups; }; class Benchmark : protected Logging { public: using Logging::MetadataElement; using Logging::MetadataMap; using Logging::MetadataColumns; Benchmark( const int & loops = 10, bool verbose = true ) : Logging(verbose), loops(loops) { } // TODO: ensure that this is not called in the middle of the benchmark // (or just remove it completely?) void setLoops( const int & loops ) { this->loops = loops; } // Marks the start of a new benchmark void newBenchmark( const tnlString & title ) { closeTable(); writeTitle( title ); } // Marks the start of a new benchmark (with custom metadata) void newBenchmark( const tnlString & title, MetadataMap metadata ) { closeTable(); writeTitle( title ); // add loops to metadata metadata["loops"] = tnlString(loops); writeMetadata( metadata ); } // Sets metadata columns -- values used for all subsequent rows until // the next call to this function. void setMetadataColumns( const MetadataColumns & metadata ) { if( metadataColumns != metadata ) header_changed = true; metadataColumns = metadata; } // TODO: maybe should be renamed to createVerticalGroup and ensured that vertical and horizontal groups are not used within the same "Benchmark" // Sets current operation -- operations expand the table vertically // - baseTime should be reset to 0.0 for most operations, but sometimes // it is useful to override it // - Order of operations inside a "Benchmark" does not matter, rows can be // easily sorted while converting to HTML.) void setOperation( const tnlString & operation, const double & datasetSize = 0.0, // in GB const double & baseTime = 0.0 ) { if( metadataColumns.size() > 0 && tnlString(metadataColumns[ 0 ].first) == "operation" ) { metadataColumns[ 0 ].second = operation; } else { metadataColumns.insert( metadataColumns.begin(), {"operation", operation} ); } setOperation( datasetSize, baseTime ); header_changed = true; } void setOperation( const double & datasetSize = 0.0, const double & baseTime = 0.0 ) { this->datasetSize = datasetSize; this->baseTime = baseTime; } // Creates new horizontal groups inside a benchmark -- increases the number // of columns in the "Benchmark", implies column spanning. // (Useful e.g. for SpMV formats, different configurations etc.) void createHorizontalGroup( const tnlString & name, const int & subcolumns ) { if( horizontalGroups.size() == 0 ) { horizontalGroups.push_back( {name, subcolumns} ); } else { auto & last = horizontalGroups.back(); if( last.first != name && last.second > 0 ) { horizontalGroups.push_back( {name, subcolumns} ); } else { last.first = name; last.second = subcolumns; } } } // Recursive template function to deal with benchmarks involving multiple computations // Times a single ComputeFunction. Subsequent calls implicitly split // the current "horizontal group" into sub-columns identified by // "performer", which are further split into "bandwidth", "time" and // "speedup" columns. // TODO: allow custom columns bound to lambda functions (e.g. for Gflops calculation) // Also terminates the recursion of the following variadic template. template< typename ResetFunction, typename ComputeFunction, typename... NextComputations > inline void benchmarkNextOperation( const double & datasetSize, const int & loops, ResetFunction reset, const double & baseTime, const char* performer, ComputeFunction compute, NextComputations & ... nextComputations ) typename ComputeFunction > double time( ResetFunction reset, const tnlString & performer, ComputeFunction & compute ) { benchmarkNextOperation( datasetSize, loops, reset, baseTime, performer, compute ); benchmarkNextOperation( datasetSize, loops, reset, baseTime, nextComputations... ); const double time = timeFunction( compute, reset, loops ); const double bandwidth = datasetSize / time; const double speedup = this->baseTime / time; if( this->baseTime == 0.0 ) this->baseTime = time; writeTableHeader( performer, HeaderElements({"bandwidth", "time", "speedup"}) ); writeTableRow( performer, RowElements({ bandwidth, time, speedup }) ); return this->baseTime; } // Main function for benchmarking // Recursive template function to deal with multiple computations with the // same reset function. template< typename ResetFunction, typename ComputeFunction, typename... NextComputations > void benchmarkOperation( const char* operation, const double & datasetSize, const int & loops, ResetFunction reset, const char* performer, ComputeFunction computeBase, NextComputations... nextComputations ) inline double time( ResetFunction reset, const tnlString & performer, ComputeFunction & compute, NextComputations & ... nextComputations ) { cout << "Benchmarking " << operation << ":" << endl; double baseTime = timeFunction( computeBase, reset, loops, datasetSize, 0.0, performer ); benchmarkNextOperation( datasetSize, loops, reset, baseTime, nextComputations... ); std::cout << std::endl; time( reset, performer, compute ); time( reset, nextComputations... ); return this->baseTime; } using Logging::save; protected: int loops; double datasetSize = 0.0; double baseTime = 0.0; }; } // namespace benchmarks } // namespace tnl
tests/benchmarks/tnl-cuda-benchmarks.h +55 −14 Original line number Diff line number Diff line Loading @@ -18,12 +18,12 @@ #ifndef TNLCUDABENCHMARKS_H_ #define TNLCUDBENCHMARKS_H_ #include <tnlConfig.h> #include <core/tnlList.h> #include <matrices/tnlSlicedEllpackMatrix.h> #include <matrices/tnlEllpackMatrix.h> #include <matrices/tnlCSRMatrix.h> #include "array-operations.h" #include "vector-operations.h" using namespace tnl::benchmarks; Loading @@ -31,6 +31,7 @@ using namespace tnl::benchmarks; // TODO: should benchmarks check the result of the computation? // silly alias to match the number of template parameters with other formats template< typename Real, typename Device, typename Index > using SlicedEllpackMatrix = tnlSlicedEllpackMatrix< Real, Device, Index >; Loading @@ -43,8 +44,6 @@ int setHostTestMatrix( Matrix& matrix, int elements( 0 ); for( int row = 0; row < size; row++ ) { if( row % 100 == 0 ) cout << "Filling row " << row << "/" << size << " \r" << flush; int col = row - elementsPerRow / 2; for( int element = 0; element < elementsPerRow; element++ ) { Loading @@ -56,7 +55,6 @@ int setHostTestMatrix( Matrix& matrix, } } } cout << endl; return elements; } Loading Loading @@ -104,7 +102,8 @@ template< typename Real, template< typename, typename, typename > class Matrix, template< typename, typename, typename > class Vector = tnlVector > bool benchmarkSpMV( const int & loops, benchmarkSpMV( Benchmark & benchmark, const int & loops, const int & size, const int elementsPerRow = 5 ) { Loading Loading @@ -149,8 +148,7 @@ benchmarkSpMV( const int & loops, tnlList< tnlString > parsedType; parseObjectType( HostMatrix::getType(), parsedType ); tnlString operationDescription = tnlString("SpMV (matrix type: ") + parsedType[ 0 ] + ", rows: " + tnlString(size) + ", elements per row: " + tnlString(elementsPerRow) + ")"; benchmark.createHorizontalGroup( parsedType[ 0 ], 2 ); const int elements = setHostTestMatrix< HostMatrix >( hostMatrix, elementsPerRow ); setCudaTestMatrix< DeviceMatrix >( deviceMatrix, elementsPerRow ); Loading @@ -172,7 +170,8 @@ benchmarkSpMV( const int & loops, deviceMatrix.vectorProduct( deviceVector, deviceVector2 ); }; benchmarkOperation( operationDescription.getString(), datasetSize, loops, reset, benchmark.setOperation( datasetSize ); benchmark.time( reset, "CPU", spmvHost, "GPU", spmvCuda ); Loading @@ -184,6 +183,7 @@ int main( int argc, char* argv[] ) #ifdef HAVE_CUDA typedef double Real; tnlString precision = getType< Real >(); /**** * The first argument of this program is the size od data set to be reduced. Loading @@ -199,11 +199,52 @@ int main( int argc, char* argv[] ) if( argc > 3 ) elementsPerRow = atoi( argv[ 3 ] ); benchmarkVectorOperations< Real >( loops, size ); ofstream logFile( "tnl-cuda-benchmarks.log" ); Benchmark benchmark( loops, true ); // ostream & logFile = cout; // Benchmark benchmark( loops, false ); // TODO: add hostname, CPU info, GPU info, date, ... Benchmark::MetadataMap metadata { {"precision", precision}, }; // TODO: loop over sizes // Array operations benchmark.newBenchmark( tnlString("Array operations (") + precision + ")", metadata ); benchmark.setMetadataColumns( Benchmark::MetadataColumns({ {"size", size}, } )); benchmarkArrayOperations< Real >( benchmark, loops, size ); benchmarkSpMV< Real, tnlEllpackMatrix >( loops, size, elementsPerRow ); benchmarkSpMV< Real, SlicedEllpackMatrix >( loops, size, elementsPerRow ); benchmarkSpMV< Real, tnlCSRMatrix >( loops, size, elementsPerRow ); // Vector operations benchmark.newBenchmark( tnlString("Vector operations (") + precision + ")", metadata ); benchmark.setMetadataColumns( Benchmark::MetadataColumns({ {"size", size}, } )); benchmarkVectorOperations< Real >( benchmark, loops, size ); // SpMV benchmark.newBenchmark( tnlString("SpMV (") + precision + ")", metadata ); benchmark.setMetadataColumns( Benchmark::MetadataColumns({ {"rows", size}, {"columns", size}, {"elements per row", elementsPerRow}, } )); benchmarkSpMV< Real, tnlEllpackMatrix >( benchmark, loops, size, elementsPerRow ); benchmarkSpMV< Real, SlicedEllpackMatrix >( benchmark, loops, size, elementsPerRow ); benchmarkSpMV< Real, tnlCSRMatrix >( benchmark, loops, size, elementsPerRow ); if( ! benchmark.save( logFile ) ) return EXIT_FAILURE; return EXIT_SUCCESS; #else Loading
tests/benchmarks/vector-operations.h +48 −69 Original line number Diff line number Diff line Loading @@ -16,7 +16,8 @@ namespace benchmarks template< typename Real = double, typename Index = int > bool benchmarkVectorOperations( const int & loops, benchmarkVectorOperations( Benchmark & benchmark, const int & loops, const int & size ) { typedef tnlVector< Real, tnlHost, Index > HostVector; Loading Loading @@ -63,46 +64,14 @@ benchmarkVectorOperations( const int & loops, reset12(); auto copyAssignHostCuda = [&]() { deviceVector = hostVector; }; auto copyAssignCudaHost = [&]() { hostVector = deviceVector; }; benchmarkOperation( "copy assigment (cross-device)", datasetSize, loops, reset1, "CPU->GPU", copyAssignHostCuda, "GPU->CPU", copyAssignCudaHost ); auto copyAssignHostHost = [&]() { hostVector = hostVector2; }; auto copyAssignCudaCuda = [&]() { deviceVector = deviceVector2; }; benchmarkOperation( "copy assigment", 2 * datasetSize, loops, reset1, "CPU->CPU", copyAssignHostHost, "GPU->GPU", copyAssignCudaCuda ); auto compareHost = [&]() { resultHost = (int) hostVector == hostVector2; }; auto compareCuda = [&]() { resultDevice = (int) deviceVector == deviceVector2; }; benchmarkOperation( "comparison (operator==)", 2 * datasetSize, loops, reset1, "CPU", compareHost, "GPU", compareCuda ); auto multiplyHost = [&]() { hostVector *= 0.5; }; auto multiplyCuda = [&]() { deviceVector *= 0.5; }; benchmarkOperation( "scalar multiplication", 2 * datasetSize, loops, reset1, benchmark.setOperation( "scalar multiplication", 2 * datasetSize ); benchmark.time( reset1, "CPU", multiplyHost, "GPU", multiplyCuda ); Loading @@ -113,7 +82,8 @@ benchmarkVectorOperations( const int & loops, auto addVectorCuda = [&]() { deviceVector.addVector( deviceVector2 ); }; benchmarkOperation( "vector addition", 3 * datasetSize, loops, reset1, benchmark.setOperation( "vector addition", 3 * datasetSize ); benchmark.time( reset1, "CPU", addVectorHost, "GPU", addVectorCuda ); Loading @@ -124,7 +94,8 @@ benchmarkVectorOperations( const int & loops, auto maxCuda = [&]() { resultDevice = deviceVector.max(); }; benchmarkOperation( "max", datasetSize, loops, reset1, benchmark.setOperation( "max", datasetSize ); benchmark.time( reset1, "CPU", maxHost, "GPU", maxCuda ); Loading @@ -135,7 +106,8 @@ benchmarkVectorOperations( const int & loops, auto minCuda = [&]() { resultDevice = deviceVector.min(); }; benchmarkOperation( "min", datasetSize, loops, reset1, benchmark.setOperation( "min", datasetSize ); benchmark.time( reset1, "CPU", minHost, "GPU", minCuda ); Loading @@ -146,7 +118,8 @@ benchmarkVectorOperations( const int & loops, auto absMaxCuda = [&]() { resultDevice = deviceVector.absMax(); }; benchmarkOperation( "absMax", datasetSize, loops, reset1, benchmark.setOperation( "absMax", datasetSize ); benchmark.time( reset1, "CPU", absMaxHost, "GPU", absMaxCuda ); Loading @@ -157,7 +130,8 @@ benchmarkVectorOperations( const int & loops, auto absMinCuda = [&]() { resultDevice = deviceVector.absMin(); }; benchmarkOperation( "absMin", datasetSize, loops, reset1, benchmark.setOperation( "absMin", datasetSize ); benchmark.time( reset1, "CPU", absMinHost, "GPU", absMinCuda ); Loading @@ -168,7 +142,8 @@ benchmarkVectorOperations( const int & loops, auto sumCuda = [&]() { resultDevice = deviceVector.sum(); }; benchmarkOperation( "sum", datasetSize, loops, reset1, benchmark.setOperation( "sum", datasetSize ); benchmark.time( reset1, "CPU", sumHost, "GPU", sumCuda ); Loading @@ -179,7 +154,8 @@ benchmarkVectorOperations( const int & loops, auto l1normCuda = [&]() { resultDevice = deviceVector.lpNorm( 1.0 ); }; benchmarkOperation( "l1 norm", datasetSize, loops, reset1, benchmark.setOperation( "l1 norm", datasetSize ); benchmark.time( reset1, "CPU", l1normHost, "GPU", l1normCuda ); Loading @@ -190,7 +166,8 @@ benchmarkVectorOperations( const int & loops, auto l2normCuda = [&]() { resultDevice = deviceVector.lpNorm( 2.0 ); }; benchmarkOperation( "l2 norm", datasetSize, loops, reset1, benchmark.setOperation( "l2 norm", datasetSize ); benchmark.time( reset1, "CPU", l2normHost, "GPU", l2normCuda ); Loading @@ -201,7 +178,8 @@ benchmarkVectorOperations( const int & loops, auto l3normCuda = [&]() { resultDevice = deviceVector.lpNorm( 3.0 ); }; benchmarkOperation( "l3 norm", datasetSize, loops, reset1, benchmark.setOperation( "l3 norm", datasetSize ); benchmark.time( reset1, "CPU", l3normHost, "GPU", l3normCuda ); Loading @@ -220,7 +198,8 @@ benchmarkVectorOperations( const int & loops, &resultDevice ); }; #endif benchmarkOperation( "scalar product", 2 * datasetSize, loops, reset1, benchmark.setOperation( "scalar product", 2 * datasetSize ); benchmark.time( reset1, "CPU", scalarProductHost, "GPU", scalarProductCuda #ifdef HAVE_CUBLAS Loading