diff --git a/scripts/eti.py b/scripts/eti.py new file mode 100755 index 0000000000000000000000000000000000000000..89d78876dccdc6149e38fe74391bfccfcb468d2f --- /dev/null +++ b/scripts/eti.py @@ -0,0 +1,74 @@ +#! /usr/bin/env python3 + +import os.path +import pathlib +import re +import sys + +if len(sys.argv) != 2: + print(f"usage: {sys.argv[0]} FILE\n\nwhere FILE is a C++ source code or header file.", file=sys.stderr) + sys.exit(1) +if not os.path.isfile(sys.argv[1]): + print(f"error: {sys.argv[1]} is not a valid file.", file=sys.stderr) + sys.exit(1) + +src = sys.argv[1] +basename = os.path.splitext(os.path.basename(src))[0] +dirname = f"{basename}.templates" + +if not os.path.isdir(dirname): + os.mkdir(dirname) + +def get_source_code(namespaces, extern_template_instantiation): + eti = extern_template_instantiation.strip().replace("extern ", "", 1) + # use absolute path for the include when src is an absolute path + # (e.g. when called by CMake, because relative include does not work with + # its separate build dir structure) + if src == os.path.abspath(src): + source_code = f"#include \"{src}\"\n" + # use relative path for the include when src is relative + else: + relpath = os.path.relpath(src, dirname) + source_code = f"#include \"{relpath}\"\n" + for ns in namespaces: + source_code += f"namespace {ns} {{\n" + source_code += eti + "\n" + for ns in namespaces: + source_code += f"}} // namespace {ns}\n" + return source_code + +def check_write(content, fname): + write = False + if os.path.isfile(fname): + write = open(fname, "r").read().strip() != content.strip() + else: + write = True + + if write is True: + with open(fname, "w") as out: + out.write(content) + +i = 0 +namespaces = [] +file_names = set() +for line in open(src).readlines(): + # heuristics for namespaces + ns_begin = re.search(r"^\s*namespace\s+(\w+)\s*\{$", line) + if ns_begin: + namespaces.append(ns_begin.group(1)) + ns_end = re.search(r"^\s*\}\s*\/\/\s*namespace\s+(\w+)$", line) + if ns_end: + 
namespaces.pop(-1) + + if line.strip().startswith("extern template"): + source_code = get_source_code(namespaces, line) + for ext in ["cpp", "cu"]: + fname = f"{dirname}/{basename}.t{i}.{ext}" + check_write(source_code, fname) + file_names.add(fname) + i += 1 + +# remove extraneous files from the target directory +for path in pathlib.Path(dirname).iterdir(): + if str(path) not in file_names: + path.unlink() diff --git a/src/Benchmarks/BLAS/array-operations.h b/src/Benchmarks/BLAS/array-operations.h index 38a58c4312b14bcc3231d3c097ff4ae6bd171e6b..271819de7faa5a3107f88e9c7cff9d68ebd2d281 100644 --- a/src/Benchmarks/BLAS/array-operations.h +++ b/src/Benchmarks/BLAS/array-operations.h @@ -14,8 +14,7 @@ #include -#include "../Benchmarks.h" - +#include #include namespace TNL { @@ -116,10 +115,7 @@ benchmarkArrayOperations( Benchmark<> & benchmark, hostArray = hostArray2; }; benchmark.setOperation( "copy (operator=)", 2 * datasetSize ); - // copyBasetime is used later inside HAVE_CUDA guard, so the compiler will - // complain when compiling without CUDA - const double copyBasetime = benchmark.time< Devices::Host >( reset1, "CPU", copyAssignHostHost ); - (void)copyBasetime; // ignore unused variable + benchmark.time< Devices::Host >( reset1, "CPU", copyAssignHostHost ); #ifdef HAVE_CUDA auto copyAssignCudaCuda = [&]() { deviceArray = deviceArray2; @@ -135,7 +131,7 @@ benchmarkArrayOperations( Benchmark<> & benchmark, auto copyAssignCudaHost = [&]() { hostArray = deviceArray; }; - benchmark.setOperation( "copy (operator=)", datasetSize, copyBasetime ); + benchmark.setOperation( "copy (operator=)", datasetSize, benchmark.getBaseTime() ); benchmark.time< Devices::Cuda >( reset1, "CPU->GPU", copyAssignHostCuda ); benchmark.time< Devices::Cuda >( reset1, "GPU->CPU", copyAssignCudaHost ); #endif diff --git a/src/Benchmarks/BLAS/dense-mv.h b/src/Benchmarks/BLAS/gemv.h similarity index 58% rename from src/Benchmarks/BLAS/dense-mv.h rename to src/Benchmarks/BLAS/gemv.h index 
1204257cce7f2b6fa354a045245129282992c1b3..93147d6baa7c1a04b9d6a037eeec4d0050e824c5 100644 --- a/src/Benchmarks/BLAS/dense-mv.h +++ b/src/Benchmarks/BLAS/gemv.h @@ -1,5 +1,5 @@ /*************************************************************************** - dense-mv.h - description + gemv.h - description ------------------- begin : Jul 8, 2021 copyright : (C) 2021 by Tomas Oberhuber et al. @@ -8,15 +8,14 @@ /* See Copyright Notice in tnl/Copyright */ -// Implemented by: Jakub Klinkovsky +// Implemented by: Jakub Klinkovsky, Tomas Oberhuber #pragma once -#include "../Benchmarks.h" +#include #include "cublasWrappers.h" #include -#include #include #include #include @@ -27,16 +26,12 @@ namespace Benchmarks { template< typename Matrix > void setMatrix( Matrix& matrix ) { - using RealType = typename Matrix::RealType; - using IndexType = typename Matrix::IndexType; - matrix.forAllElements( [] __cuda_callable__ ( IndexType rowIdx, IndexType columnIdx, IndexType columnIdx_, RealType& value ) { - value = 1.0; } ); + matrix.setValue( 1.0 ); } template< typename Real > void -benchmarkDenseMVSynthetic( Benchmark<> & benchmark, - const int & size ) +benchmarkGemv( Benchmark<> & benchmark, int rows, int columns ) { using HostMatrix = TNL::Matrices::DenseMatrix< Real, TNL::Devices::Host >; using RowMajorCudaMatrix = TNL::Matrices::DenseMatrix< Real, TNL::Devices::Cuda, int, TNL::Algorithms::Segments::RowMajorOrder >; @@ -50,20 +45,13 @@ benchmarkDenseMVSynthetic( Benchmark<> & benchmark, HostVector inHostVector, outHostVector; CudaVector inCudaVector, outCudaVector1, outCudaVector2; - // create benchmark group - const std::vector< String > parsedType = parseObjectType( getType< HostMatrix >() ); -#ifdef HAVE_CUDA - benchmark.createHorizontalGroup( parsedType[ 0 ], 2 ); -#else - benchmark.createHorizontalGroup( parsedType[ 0 ], 1 ); -#endif - - hostMatrix.setDimensions( size, size ); - inHostVector.setSize( size ); - outHostVector.setSize( size ); + hostMatrix.setDimensions( rows, 
columns ); + inHostVector.setSize( columns ); + outHostVector.setSize( rows ); setMatrix< HostMatrix >( hostMatrix ); - const double datasetSize = (double) ( size * size ) * ( 2 * sizeof( Real ) + sizeof( int ) ) / oneGB; + const double datasetSize = (double) ( rows * columns + rows + columns ) * sizeof(Real) / oneGB; + benchmark.setOperation( "gemv", datasetSize ); // reset function auto reset = [&]() { @@ -80,14 +68,13 @@ benchmarkDenseMVSynthetic( Benchmark<> & benchmark, auto spmvHost = [&]() { hostMatrix.vectorProduct( inHostVector, outHostVector ); }; - benchmark.setOperation( datasetSize ); benchmark.time< Devices::Host >( reset, "CPU", spmvHost ); #ifdef HAVE_CUDA - columnMajorCudaMatrix.setDimensions( size, size ); - inCudaVector.setSize( size ); - outCudaVector1.setSize( size ); - outCudaVector2.setSize( size ); + columnMajorCudaMatrix.setDimensions( rows, columns ); + inCudaVector.setSize( columns ); + outCudaVector1.setSize( rows ); + outCudaVector2.setSize( rows ); setMatrix< ColumnMajorCudaMatrix >( columnMajorCudaMatrix ); auto columnMajorMvCuda = [&]() { @@ -97,7 +84,7 @@ benchmarkDenseMVSynthetic( Benchmark<> & benchmark, columnMajorCudaMatrix.reset(); - rowMajorCudaMatrix.setDimensions( size, size ); + rowMajorCudaMatrix.setDimensions( rows, columns ); setMatrix< RowMajorCudaMatrix >( rowMajorCudaMatrix ); auto rowMajorMvCuda = [&]() { @@ -109,7 +96,7 @@ benchmarkDenseMVSynthetic( Benchmark<> & benchmark, //std::cerr << outCudaVector1 << std::endl << outCudaVector2 << std::endl; rowMajorCudaMatrix.reset(); - columnMajorCudaMatrix.setDimensions( size, size ); + columnMajorCudaMatrix.setDimensions( rows, columns ); setMatrix< ColumnMajorCudaMatrix >( columnMajorCudaMatrix ); cublasHandle_t cublasHandle; @@ -117,8 +104,8 @@ benchmarkDenseMVSynthetic( Benchmark<> & benchmark, auto mvCublas = [&] () { Real alpha = 1.0; Real beta = 0.0; - cublasGemv( cublasHandle, CUBLAS_OP_N, size, size, &alpha, - columnMajorCudaMatrix.getValues().getData(), size, + 
cublasGemv( cublasHandle, CUBLAS_OP_N, rows, columns, &alpha, + columnMajorCudaMatrix.getValues().getData(), rows, inCudaVector.getData(), 1, &beta, outCudaVector1.getData(), 1 ); }; @@ -128,19 +115,5 @@ benchmarkDenseMVSynthetic( Benchmark<> & benchmark, #endif } -/*template< typename Real = double, - typename Index = int > -void -benchmarkDenseSynthetic( Benchmark<> & benchmark, - const int & size ) -{ - // TODO: benchmark all formats from tnl-benchmark-spmv (different parameters of the base formats) - // NOTE: CSR is disabled because it is very slow on GPU - //benchmarkSpMV< Real, SparseMatrixLegacy_CSR_Scalar >( benchmark, size, elementsPerRow ); - benchmarkSpMV< Real, Benchmarks::SpMV::ReferenceFormats::Legacy::Ellpack >( benchmark, size, elementsPerRow ); - benchmarkSpMV< Real, SlicedEllpack >( benchmark, size, elementsPerRow ); - benchmarkSpMV< Real, Benchmarks::SpMV::ReferenceFormats::Legacy::ChunkedEllpack >( benchmark, size, elementsPerRow ); -}*/ - } // namespace Benchmarks } // namespace TNL diff --git a/src/Benchmarks/BLAS/spmv.h b/src/Benchmarks/BLAS/spmv.h deleted file mode 100644 index 6cd669dc085f2bbfc74e75f8c1f93fb5fb6682a6..0000000000000000000000000000000000000000 --- a/src/Benchmarks/BLAS/spmv.h +++ /dev/null @@ -1,189 +0,0 @@ -/*************************************************************************** - spmv.h - description - ------------------- - begin : Dec 30, 2015 - copyright : (C) 2015 by Tomas Oberhuber et al. 
- email : tomas.oberhuber@fjfi.cvut.cz - ***************************************************************************/ - -/* See Copyright Notice in tnl/Copyright */ - -// Implemented by: Jakub Klinkovsky - -#pragma once - -#include "../Benchmarks.h" - -#include -#include -#include -#include -#include - -namespace TNL { -namespace Benchmarks { - -// silly alias to match the number of template parameters with other formats -template< typename Real, typename Device, typename Index > -using SlicedEllpack = SpMV::ReferenceFormats::Legacy::SlicedEllpack< Real, Device, Index >; - -// Legacy formats -template< typename Real, typename Device, typename Index > -using SparseMatrixLegacy_CSR_Scalar = SpMV::ReferenceFormats::Legacy::CSR< Real, Device, Index, SpMV::ReferenceFormats::Legacy::CSRScalar >; - - -template< typename Matrix > -int setHostTestMatrix( Matrix& matrix, - const int elementsPerRow ) -{ - const int size = matrix.getRows(); - int elements( 0 ); - for( int row = 0; row < size; row++ ) { - int col = row - elementsPerRow / 2; - for( int element = 0; element < elementsPerRow; element++ ) { - if( col + element >= 0 && - col + element < size ) - { - matrix.setElement( row, col + element, element + 1 ); - elements++; - } - } - } - return elements; -} - -#ifdef HAVE_CUDA -template< typename Matrix > -__global__ void setCudaTestMatrixKernel( Matrix* matrix, - const int elementsPerRow, - const int gridIdx ) -{ - const int rowIdx = ( gridIdx * Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x; - if( rowIdx >= matrix->getRows() ) - return; - int col = rowIdx - elementsPerRow / 2; - for( int element = 0; element < elementsPerRow; element++ ) { - if( col + element >= 0 && - col + element < matrix->getColumns() ) - matrix->setElementFast( rowIdx, col + element, element + 1 ); - } -} -#endif - -template< typename Matrix > -void setCudaTestMatrix( Matrix& matrix, - const int elementsPerRow ) -{ -#ifdef HAVE_CUDA - typedef typename Matrix::IndexType IndexType; - 
typedef typename Matrix::RealType RealType; - Pointers::DevicePointer< Matrix > kernel_matrix( matrix ); - dim3 cudaBlockSize( 256 ), cudaGridSize( Cuda::getMaxGridSize() ); - const IndexType cudaBlocks = roundUpDivision( matrix.getRows(), cudaBlockSize.x ); - const IndexType cudaGrids = roundUpDivision( cudaBlocks, Cuda::getMaxGridSize() ); - for( IndexType gridIdx = 0; gridIdx < cudaGrids; gridIdx++ ) { - if( gridIdx == cudaGrids - 1 ) - cudaGridSize.x = cudaBlocks % Cuda::getMaxGridSize(); - setCudaTestMatrixKernel< Matrix > - <<< cudaGridSize, cudaBlockSize >>> - ( &kernel_matrix.template modifyData< Devices::Cuda >(), elementsPerRow, gridIdx ); - TNL_CHECK_CUDA_DEVICE; - } -#endif -} - - -// TODO: rename as benchmark_SpMV_synthetic and move to spmv-synthetic.h -template< typename Real, - template< typename, typename, typename > class Matrix > -void -benchmarkSpMV( Benchmark<> & benchmark, - const int & size, - const int elementsPerRow = 5 ) -{ - typedef Matrix< Real, Devices::Host, int > HostMatrix; - typedef Matrix< Real, Devices::Cuda, int > DeviceMatrix; - typedef Containers::Vector< Real, Devices::Host, int > HostVector; - typedef Containers::Vector< Real, Devices::Cuda, int > CudaVector; - - HostMatrix hostMatrix; - DeviceMatrix deviceMatrix; - Containers::Vector< int, Devices::Host, int > hostRowLengths; - Containers::Vector< int, Devices::Cuda, int > deviceRowLengths; - HostVector hostVector, hostVector2; - CudaVector deviceVector, deviceVector2; - - // create benchmark group - const std::vector< String > parsedType = parseObjectType( getType< HostMatrix >() ); -#ifdef HAVE_CUDA - benchmark.createHorizontalGroup( parsedType[ 0 ], 2 ); -#else - benchmark.createHorizontalGroup( parsedType[ 0 ], 1 ); -#endif - - hostRowLengths.setSize( size ); - hostMatrix.setDimensions( size, size ); - hostVector.setSize( size ); - hostVector2.setSize( size ); -#ifdef HAVE_CUDA - deviceRowLengths.setSize( size ); - deviceMatrix.setDimensions( size, size ); - 
deviceVector.setSize( size ); - deviceVector2.setSize( size ); -#endif - - hostRowLengths.setValue( elementsPerRow ); -#ifdef HAVE_CUDA - deviceRowLengths.setValue( elementsPerRow ); -#endif - - hostMatrix.setCompressedRowLengths( hostRowLengths ); -#ifdef HAVE_CUDA - deviceMatrix.setCompressedRowLengths( deviceRowLengths ); -#endif - - const int elements = setHostTestMatrix< HostMatrix >( hostMatrix, elementsPerRow ); - setCudaTestMatrix< DeviceMatrix >( deviceMatrix, elementsPerRow ); - const double datasetSize = (double) elements * ( 2 * sizeof( Real ) + sizeof( int ) ) / oneGB; - - // reset function - auto reset = [&]() { - hostVector.setValue( 1.0 ); - hostVector2.setValue( 0.0 ); -#ifdef HAVE_CUDA - deviceVector.setValue( 1.0 ); - deviceVector2.setValue( 0.0 ); -#endif - }; - - // compute functions - auto spmvHost = [&]() { - hostMatrix.vectorProduct( hostVector, hostVector2 ); - }; - benchmark.setOperation( datasetSize ); - benchmark.time< Devices::Host >( reset, "CPU", spmvHost ); -#ifdef HAVE_CUDA - auto spmvCuda = [&]() { - deviceMatrix.vectorProduct( deviceVector, deviceVector2 ); - }; - benchmark.time< Devices::Cuda >( reset, "GPU", spmvCuda ); -#endif -} - -template< typename Real = double, - typename Index = int > -void -benchmarkSpmvSynthetic( Benchmark<> & benchmark, - const int & size, - const int & elementsPerRow ) -{ - // TODO: benchmark all formats from tnl-benchmark-spmv (different parameters of the base formats) - // NOTE: CSR is disabled because it is very slow on GPU - //benchmarkSpMV< Real, SparseMatrixLegacy_CSR_Scalar >( benchmark, size, elementsPerRow ); - benchmarkSpMV< Real, Benchmarks::SpMV::ReferenceFormats::Legacy::Ellpack >( benchmark, size, elementsPerRow ); - benchmarkSpMV< Real, SlicedEllpack >( benchmark, size, elementsPerRow ); - benchmarkSpMV< Real, Benchmarks::SpMV::ReferenceFormats::Legacy::ChunkedEllpack >( benchmark, size, elementsPerRow ); -} - -} // namespace Benchmarks -} // namespace TNL diff --git 
a/src/Benchmarks/BLAS/tnl-benchmark-blas.h b/src/Benchmarks/BLAS/tnl-benchmark-blas.h index 9b061adf65a752116bc75090d86af8509e4e764a..ca9ffbb00aca570798c2e83dead535dc8ad1ca30 100644 --- a/src/Benchmarks/BLAS/tnl-benchmark-blas.h +++ b/src/Benchmarks/BLAS/tnl-benchmark-blas.h @@ -21,8 +21,7 @@ #include "array-operations.h" #include "vector-operations.h" #include "triad.h" -#include "spmv.h" -#include "dense-mv.h" +#include "gemv.h" using namespace TNL; @@ -32,37 +31,39 @@ using namespace TNL::Benchmarks; template< typename Real > void runBlasBenchmarks( Benchmark<> & benchmark, - Benchmark<>::MetadataMap metadata, const std::size_t & minSize, const std::size_t & maxSize, - const double & sizeStepFactor, - const int & elementsPerRow ) + const double & sizeStepFactor ) { - const String precision = getType< Real >(); - metadata["precision"] = precision; + benchmark.setMetadataWidths({ + { "operation", 30 }, + { "performer", 21 }, + { "precision", 10 }, + }); // Array operations - benchmark.newBenchmark( String("Array operations (") + precision + ", host allocator = Host)", - metadata ); + std::cout << "\n== Array operations ==\n" << std::endl; for( std::size_t size = minSize; size <= maxSize; size *= 2 ) { benchmark.setMetadataColumns( Benchmark<>::MetadataColumns({ + { "precision", getType< Real >() }, + { "host allocator", "Host" }, { "size", convertToString( size ) }, } )); benchmarkArrayOperations< Real >( benchmark, size ); } #ifdef HAVE_CUDA - benchmark.newBenchmark( String("Array operations (") + precision + ", host allocator = CudaHost)", - metadata ); for( std::size_t size = minSize; size <= maxSize; size *= 2 ) { benchmark.setMetadataColumns( Benchmark<>::MetadataColumns({ + { "precision", getType< Real >() }, + { "host allocator", "CudaHost" }, { "size", convertToString( size ) }, } )); benchmarkArrayOperations< Real, int, Allocators::CudaHost >( benchmark, size ); } - benchmark.newBenchmark( String("Array operations (") + precision + ", host allocator = 
CudaManaged)", - metadata ); for( std::size_t size = minSize; size <= maxSize; size *= 2 ) { benchmark.setMetadataColumns( Benchmark<>::MetadataColumns({ + { "precision", getType< Real >() }, + { "host allocator", "CudaManaged" }, { "size", convertToString( size ) }, } )); benchmarkArrayOperations< Real, int, Allocators::CudaManaged >( benchmark, size ); @@ -70,10 +71,10 @@ runBlasBenchmarks( Benchmark<> & benchmark, #endif // Vector operations - benchmark.newBenchmark( String("Vector operations (") + precision + ")", - metadata ); + std::cout << "\n== Vector operations ==\n" << std::endl; for( std::size_t size = minSize; size <= maxSize; size *= sizeStepFactor ) { benchmark.setMetadataColumns( Benchmark<>::MetadataColumns({ + { "precision", getType< Real >() }, { "size", convertToString( size ) }, } )); benchmarkVectorOperations< Real >( benchmark, size ); @@ -81,39 +82,30 @@ runBlasBenchmarks( Benchmark<> & benchmark, // Triad benchmark: copy from host, compute, copy to host #ifdef HAVE_CUDA - benchmark.newBenchmark( String("Triad benchmark (") + precision + ")", - metadata ); + std::cout << "\n== Triad ==\n" << std::endl; for( std::size_t size = minSize; size <= maxSize; size *= 2 ) { benchmark.setMetadataColumns( Benchmark<>::MetadataColumns({ + { "precision", getType< Real >() }, { "size", convertToString( size ) }, } )); benchmarkTriad< Real >( benchmark, size ); } #endif - // Sparse matrix-vector multiplication - benchmark.newBenchmark( String("Sparse matrix-vector multiplication (") + precision + ")", - metadata ); - for( std::size_t size = minSize; size <= maxSize; size *= 2 ) { - benchmark.setMetadataColumns( Benchmark<>::MetadataColumns({ - { "rows", convertToString( size ) }, - { "columns", convertToString( size ) }, - { "elements per row", convertToString( elementsPerRow ) }, - } )); - benchmarkSpmvSynthetic< Real >( benchmark, size, elementsPerRow ); - } - // Dense matrix-vector multiplication - benchmark.newBenchmark( String("Dense matrix-vector 
multiplication (") + precision + ")", - metadata ); - for( std::size_t size = 10; size <= 20000; size *= 2 ) { - benchmark.setMetadataColumns( Benchmark<>::MetadataColumns({ - { "rows", convertToString( size ) }, - { "columns", convertToString( size ) } - } )); - benchmarkDenseMVSynthetic< Real >( benchmark, size ); + std::cout << "\n== Dense matrix-vector multiplication ==\n" << std::endl; + for( std::size_t rows = 10; rows <= 20000 * 20000; rows *= 2 ) { + for( std::size_t columns = 10; columns <= 20000 * 20000; columns *= 2 ) { + if( rows * columns > 20000 * 20000 ) + break; + benchmark.setMetadataColumns( Benchmark<>::MetadataColumns({ + { "precision", getType< Real >() }, + { "rows", convertToString( rows ) }, + { "columns", convertToString( columns ) } + } )); + benchmarkGemv< Real >( benchmark, rows, columns ); + } } - } void @@ -132,7 +124,6 @@ setupConfig( Config::ConfigDescription & config ) config.addEntry< int >( "max-size", "Minimum size of arrays/vectors used in the benchmark.", 10000000 ); config.addEntry< int >( "size-step-factor", "Factor determining the size of arrays/vectors used in the benchmark. 
First size is min-size and each following size is stepFactor*previousSize, up to max-size.", 2 ); config.addEntry< int >( "loops", "Number of iterations for every computation.", 10 ); - config.addEntry< int >( "elements-per-row", "Number of elements per row of the sparse matrix used in the matrix-vector multiplication benchmark.", 5 ); config.addEntry< int >( "verbose", "Verbose mode.", 1 ); config.addDelimiter( "Device settings:" ); @@ -167,7 +158,6 @@ main( int argc, char* argv[] ) const std::size_t maxSize = parameters.getParameter< int >( "max-size" ); const int sizeStepFactor = parameters.getParameter< int >( "size-step-factor" ); const int loops = parameters.getParameter< int >( "loops" ); - const int elementsPerRow = parameters.getParameter< int >( "elements-per-row" ); const int verbose = parameters.getParameter< int >( "verbose" ); if( sizeStepFactor <= 1 ) { @@ -179,23 +169,19 @@ main( int argc, char* argv[] ) auto mode = std::ios::out; if( outputMode == "append" ) mode |= std::ios::app; - std::ofstream logFile( logFileName.getString(), mode ); + std::ofstream logFile( logFileName, mode ); - // init benchmark and common metadata - Benchmark<> benchmark( loops, verbose ); + // init benchmark and set parameters + Benchmark<> benchmark( logFile, loops, verbose ); - // prepare global metadata - Benchmark<>::MetadataMap metadata = getHardwareMetadata< Logging >(); + // write global metadata into a separate file + std::map< std::string, std::string > metadata = getHardwareMetadata(); + writeMapAsJson( metadata, logFileName, ".metadata.json" ); if( precision == "all" || precision == "float" ) - runBlasBenchmarks< float >( benchmark, metadata, minSize, maxSize, sizeStepFactor, elementsPerRow ); + runBlasBenchmarks< float >( benchmark, minSize, maxSize, sizeStepFactor ); if( precision == "all" || precision == "double" ) - runBlasBenchmarks< double >( benchmark, metadata, minSize, maxSize, sizeStepFactor, elementsPerRow ); - - if( ! 
benchmark.save( logFile ) ) { - std::cerr << "Failed to write the benchmark results to file '" << logFileName << "'." << std::endl; - return EXIT_FAILURE; - } + runBlasBenchmarks< double >( benchmark, minSize, maxSize, sizeStepFactor ); return EXIT_SUCCESS; } diff --git a/src/Benchmarks/BLAS/triad.h b/src/Benchmarks/BLAS/triad.h index d2bdf12cf684268c4652171db2e53a518dbb7a11..b466e5652cd0018f4da8d79077cf75ecc59f81ff 100644 --- a/src/Benchmarks/BLAS/triad.h +++ b/src/Benchmarks/BLAS/triad.h @@ -12,7 +12,7 @@ #pragma once -#include "../Benchmarks.h" +#include #include #include diff --git a/src/Benchmarks/BLAS/vector-operations.h b/src/Benchmarks/BLAS/vector-operations.h index c2a3ceab321b879eec052d8df24f7091cf778d05..1d9937de80a6ba00d3c862e12b666b0ae2d1a78d 100644 --- a/src/Benchmarks/BLAS/vector-operations.h +++ b/src/Benchmarks/BLAS/vector-operations.h @@ -15,7 +15,7 @@ #include // srand48 #include // std::partial_sum -#include "../Benchmarks.h" +#include #include #include diff --git a/src/Benchmarks/Benchmark.hpp b/src/Benchmarks/Benchmark.hpp deleted file mode 100644 index e2357990a8285bdef417f738b317fde4e3942735..0000000000000000000000000000000000000000 --- a/src/Benchmarks/Benchmark.hpp +++ /dev/null @@ -1,312 +0,0 @@ -/*************************************************************************** - Benchmarks.hpp - description - ------------------- - begin : Jun 7, 2021 - copyright : (C) 2021 by Tomas Oberhuber et al. 
- email : tomas.oberhuber@fjfi.cvut.cz - ***************************************************************************/ - -/* See Copyright Notice in tnl/Copyright */ - -// Implemented by: Jakub Klinkovsky, -// Tomas Oberhuber - -#pragma once - -#include "FunctionTimer.h" -#include "Logging.h" - -#include -#include -#include - -#include - -#include -#include -#include -#include -#include - -namespace TNL { -namespace Benchmarks { - - -template< typename Logger > -Benchmark< Logger >:: -Benchmark( int loops, - bool verbose, - String outputMode, - bool logFileAppend ) -: Logger(verbose, outputMode, logFileAppend), loops(loops) -{} - -template< typename Logger > -void -Benchmark< Logger >:: -configSetup( Config::ConfigDescription& config ) -{ - config.addEntry< int >( "loops", "Number of iterations for every computation.", 10 ); - config.addEntry< bool >( "reset", "Call reset function between loops.", true ); - config.addEntry< double >( "min-time", "Minimal real time in seconds for every computation.", 0.0 ); - config.addEntry< int >( "verbose", "Verbose mode, the higher number the more verbosity.", 1 ); -} - -template< typename Logger > -void -Benchmark< Logger >:: -setup( const Config::ParameterContainer& parameters ) -{ - this->loops = parameters.getParameter< int >( "loops" ); - this->reset = parameters.getParameter< bool >( "reset" ); - this->minTime = parameters.getParameter< double >( "min-time" ); - const int verbose = parameters.getParameter< int >( "verbose" ); - Logger::setVerbose( verbose ); -} - -template< typename Logger > -void -Benchmark< Logger >:: -setLoops( int loops ) -{ - this->loops = loops; -} - -template< typename Logger > -void -Benchmark< Logger >:: -setMinTime( const double& minTime ) -{ - this->minTime = minTime; -} - -template< typename Logger > -void -Benchmark< Logger >:: -newBenchmark( const String & title ) -{ - Logger::closeTable(); - Logger::writeTitle( title ); -} - -template< typename Logger > -void -Benchmark< Logger >:: 
-newBenchmark( const String & title, - MetadataMap metadata ) -{ - Logger::closeTable(); - Logger::writeTitle( title ); - // add loops and reset flag to metadata - metadata["loops"] = convertToString(loops); - metadata["reset"] = convertToString( reset ); - metadata["minimal test time"] = convertToString( minTime ); - Logger::writeMetadata( metadata ); -} - -template< typename Logger > -void -Benchmark< Logger >:: -setMetadataColumns( const MetadataColumns & metadata ) -{ - if( Logger::metadataColumns != metadata ) - Logger::header_changed = true; - Logger::metadataColumns = metadata; -} - -template< typename Logger > -void -Benchmark< Logger >:: -setOperation( const String & operation, - const double datasetSize, - const double baseTime ) -{ - monitor.setStage( operation.getString() ); - if( Logger::metadataColumns.size() > 0 && String(Logger::metadataColumns[ 0 ].first) == "operation" ) { - Logger::metadataColumns[ 0 ].second = operation; - } - else { - Logger::metadataColumns.insert( Logger::metadataColumns.begin(), {"operation", operation} ); - } - setOperation( datasetSize, baseTime ); - Logger::header_changed = true; -} - -template< typename Logger > -void -Benchmark< Logger >:: -setOperation( const double datasetSize, - const double baseTime ) -{ - this->datasetSize = datasetSize; - this->baseTime = baseTime; -} - -template< typename Logger > -void -Benchmark< Logger >:: -createHorizontalGroup( const String & name, - int subcolumns ) -{ - if( Logger::horizontalGroups.size() == 0 ) { - Logger::horizontalGroups.push_back( {name, subcolumns} ); - } - else { - auto & last = Logger::horizontalGroups.back(); - if( last.first != name && last.second > 0 ) { - Logger::horizontalGroups.push_back( {name, subcolumns} ); - } - else { - last.first = name; - last.second = subcolumns; - } - } -} - -template< typename Logger > - template< typename Device, - typename ResetFunction, - typename ComputeFunction > -double -Benchmark< Logger >:: -time( ResetFunction reset, - const 
String & performer, - ComputeFunction & compute, - BenchmarkResult< Logger > & result ) -{ - result.time = std::numeric_limits::quiet_NaN(); - result.stddev = std::numeric_limits::quiet_NaN(); - FunctionTimer< Device > functionTimer; - try { - if( Logger::verbose > 1 ) { - // run the monitor main loop - Solvers::SolverMonitorThread monitor_thread( monitor ); - if( this->reset ) - std::tie( result.time, result.stddev ) = functionTimer.timeFunction( compute, reset, loops, minTime, Logger::verbose, monitor ); - else - std::tie( result.time, result.stddev ) = functionTimer.timeFunction( compute, loops, minTime, Logger::verbose, monitor ); - } - else { - if( this->reset ) - std::tie( result.time, result.stddev ) = functionTimer.timeFunction( compute, reset, loops, minTime, Logger::verbose, monitor ); - else - std::tie( result.time, result.stddev ) = functionTimer.timeFunction( compute, loops, minTime, Logger::verbose, monitor ); - } - this->performedLoops = functionTimer.getPerformedLoops(); - } - catch ( const std::exception& e ) { - std::cerr << "timeFunction failed due to a C++ exception with description: " << e.what() << std::endl; - } - - result.bandwidth = datasetSize / result.time; - result.speedup = this->baseTime / result.time; - if( this->baseTime == 0.0 ) - this->baseTime = result.time; - - Logger::writeTableHeader( performer, result.getTableHeader() ); - Logger::writeTableRow( performer, result.getRowElements() ); - - return this->baseTime; -} - -template< typename Logger > - template< typename Device, - typename ResetFunction, - typename ComputeFunction > -inline double -Benchmark< Logger >:: -time( ResetFunction reset, - const String& performer, - ComputeFunction& compute ) -{ - BenchmarkResult< Logger > result; - return time< Device, ResetFunction, ComputeFunction >( reset, performer, compute, result ); -} - -template< typename Logger > - template< typename Device, - typename ComputeFunction > -double -Benchmark< Logger >:: -time( const String & 
performer, - ComputeFunction & compute, - BenchmarkResult< Logger > & result ) -{ - result.time = std::numeric_limits::quiet_NaN(); - result.stddev = std::numeric_limits::quiet_NaN(); - FunctionTimer< Device > functionTimer; - try { - if( Logger::verbose > 1 ) { - // run the monitor main loop - Solvers::SolverMonitorThread monitor_thread( monitor ); - std::tie( result.time, result.stddev ) = functionTimer.timeFunction( compute, loops, minTime, Logger::verbose, monitor ); - } - else { - std::tie( result.time, result.stddev ) = functionTimer.timeFunction( compute, loops, minTime, Logger::verbose, monitor ); - } - } - catch ( const std::exception& e ) { - std::cerr << "Function timer failed due to a C++ exception with description: " << e.what() << std::endl; - } - - result.bandwidth = datasetSize / result.time; - result.speedup = this->baseTime / result.time; - if( this->baseTime == 0.0 ) - this->baseTime = result.time; - - Logger::writeTableHeader( performer, result.getTableHeader() ); - Logger::writeTableRow( performer, result.getRowElements() ); - - return this->baseTime; -} - -template< typename Logger > - template< typename Device, - typename ComputeFunction > -inline double -Benchmark< Logger >:: -time( const String & performer, - ComputeFunction & compute ) -{ - BenchmarkResult< Logger > result; - return time< Device, ComputeFunction >( performer, compute, result ); -} - -template< typename Logger > -void -Benchmark< Logger >:: -addErrorMessage( const char* msg, - int numberOfComputations ) -{ - // each computation has 3 subcolumns - const int colspan = 3 * numberOfComputations; - Logger::writeErrorMessage( msg, colspan ); - std::cerr << msg << std::endl; -} - -template< typename Logger > -auto -Benchmark< Logger >:: -getMonitor() -> SolverMonitorType& -{ - return monitor; -} - -template< typename Logger > -int -Benchmark< Logger >:: -getPerformedLoops() const -{ - return this->performedLoops; -} - -template< typename Logger > -bool -Benchmark< Logger >:: 
-isResetingOn() const -{ - return reset; -} - -} // namespace Benchmarks -} // namespace TNL diff --git a/src/Benchmarks/Benchmarks.h b/src/Benchmarks/Benchmarks.h deleted file mode 100644 index 77fa9e47c897617b6ddee3d5ab94e1510bd5e777..0000000000000000000000000000000000000000 --- a/src/Benchmarks/Benchmarks.h +++ /dev/null @@ -1,256 +0,0 @@ -/*************************************************************************** - Benchmarks.h - description - ------------------- - begin : Dec 30, 2015 - copyright : (C) 2015 by Tomas Oberhuber et al. - email : tomas.oberhuber@fjfi.cvut.cz - ***************************************************************************/ - -/* See Copyright Notice in tnl/Copyright */ - -// Implemented by: Jakub Klinkovsky, -// Tomas Oberhuber - -#pragma once - -#include "FunctionTimer.h" -#include "Logging.h" - -#include -#include -#include - -#include - -#include -#include -#include -#include -#include - -namespace TNL { -namespace Benchmarks { - -const double oneGB = 1024.0 * 1024.0 * 1024.0; - - -template< typename Logger = Logging > -struct BenchmarkResult -{ - using HeaderElements = typename Logger::HeaderElements; - using RowElements = typename Logger::RowElements; - - double time = std::numeric_limits::quiet_NaN(); - double stddev = std::numeric_limits::quiet_NaN(); - double bandwidth = std::numeric_limits::quiet_NaN(); - double speedup = std::numeric_limits::quiet_NaN(); - - virtual HeaderElements getTableHeader() const - { - return HeaderElements( { - std::pair< String, int >( "time", 8 ), - std::pair< String, int >( "stddev", 8 ), - std::pair< String, int >( "stddev/time", 8 ), - std::pair< String, int >( "bandwidth", 8 ), - std::pair< String, int >( "speedup", 8 ) } ); - } - - virtual RowElements getRowElements() const - { - RowElements elements; - elements << time << stddev << stddev / time << bandwidth; - if( speedup != 0 ) - elements << speedup; - else - elements << "N/A"; - return elements; - } -}; - -template< typename Logger = 
Logging > -class Benchmark -: protected Logger -{ - public: - using typename Logger::MetadataElement; - using typename Logger::MetadataMap; - using typename Logger::MetadataColumns; - using SolverMonitorType = Solvers::IterativeSolverMonitor< double, int >; - - using typename Logger::CommonLogs; - using Logger::addCommonLogs; - using Logger::addLogsMetadata; - using Logger::writeHeader; - - Benchmark( int loops = 10, - bool verbose = true, - String outputMode = "", - bool logFileAppend = false ); - - static void configSetup( Config::ConfigDescription& config ); - - void setup( const Config::ParameterContainer& parameters ); - - // TODO: ensure that this is not called in the middle of the benchmark - // (or just remove it completely?) - void setLoops( int loops ); - - void setMinTime( const double& minTime ); - - // Marks the start of a new benchmark - void newBenchmark( const String & title ); - - // Marks the start of a new benchmark (with custom metadata) - void newBenchmark( const String & title, - MetadataMap metadata ); - - // Sets metadata columns -- values used for all subsequent rows until - // the next call to this function. - void setMetadataColumns( const MetadataColumns & metadata ); - - // TODO: maybe should be renamed to createVerticalGroup and ensured that vertical and horizontal groups are not used within the same "Benchmark" - // Sets current operation -- operations expand the table vertically - // - baseTime should be reset to 0.0 for most operations, but sometimes - // it is useful to override it - // - Order of operations inside a "Benchmark" does not matter, rows can be - // easily sorted while converting to HTML.) 
- void - setOperation( const String & operation, - const double datasetSize = 0.0, // in GB - const double baseTime = 0.0 ); - - void setOperation( const double datasetSize = 0.0, - const double baseTime = 0.0 ); - - // Creates new horizontal groups inside a benchmark -- increases the number - // of columns in the "Benchmark", implies column spanning. - // (Useful e.g. for SpMV formats, different configurations etc.) - void - createHorizontalGroup( const String & name, - int subcolumns ); - - // Times a single ComputeFunction. Subsequent calls implicitly split - // the current "horizontal group" into sub-columns identified by - // "performer", which are further split into "bandwidth", "time" and - // "speedup" columns. - // TODO: allow custom columns bound to lambda functions (e.g. for Gflops calculation) - // Also terminates the recursion of the following variadic template. - template< typename Device, - typename ResetFunction, - typename ComputeFunction > - double time( ResetFunction reset, - const String & performer, - ComputeFunction & compute, - BenchmarkResult< Logger > & result ); - - template< typename Device, - typename ResetFunction, - typename ComputeFunction > - inline double time( ResetFunction reset, - const String & performer, - ComputeFunction & compute ); - /*{ - BenchmarkResult< Logger > result; - return time< Device, ResetFunction, ComputeFunction >( reset, performer, compute, result ); - }*/ - - /**** - * The same methods as above but without reset function - */ - template< typename Device, - typename ComputeFunction > - double time( const String & performer, - ComputeFunction & compute, - BenchmarkResult< Logger > & result ); - - template< typename Device, - typename ComputeFunction > - inline double time( const String & performer, - ComputeFunction & compute ); - - // Adds an error message to the log. Should be called in places where the - // "time" method could not be called (e.g. due to failed allocation). 
- void addErrorMessage( const char* msg, - int numberOfComputations = 1 ); - - using Logger::save; - - SolverMonitorType& getMonitor(); - - int getPerformedLoops() const; - - bool isResetingOn() const; - - protected: - - int loops = 1, performedLoops = 0; - - double minTime = 0.0; - - double datasetSize = 0.0; - - double baseTime = 0.0; - - bool reset = true; - - SolverMonitorType monitor; -}; - - -template< typename Logger > -inline typename Benchmark< Logger >::MetadataMap getHardwareMetadata() -{ - const int cpu_id = 0; - const CacheSizes cacheSizes = SystemInfo::getCPUCacheSizes( cpu_id ); - String cacheInfo = convertToString( cacheSizes.L1data ) + ", " - + convertToString( cacheSizes.L1instruction ) + ", " - + convertToString( cacheSizes.L2 ) + ", " - + convertToString( cacheSizes.L3 ); -#ifdef HAVE_CUDA - const int activeGPU = Cuda::DeviceInfo::getActiveDevice(); - const String deviceArch = convertToString( Cuda::DeviceInfo::getArchitectureMajor( activeGPU ) ) + "." + - convertToString( Cuda::DeviceInfo::getArchitectureMinor( activeGPU ) ); -#endif - -#ifdef HAVE_MPI - int nproc = 1; - // check if MPI was initialized (some benchmarks do not initialize MPI even when - // they are built with HAVE_MPI and thus MPI::GetSize() cannot be used blindly) - if( TNL::MPI::Initialized() ) - nproc = TNL::MPI::GetSize(); -#endif - - typename Benchmark< Logger >::MetadataMap metadata { - { "host name", SystemInfo::getHostname() }, - { "architecture", SystemInfo::getArchitecture() }, - { "system", SystemInfo::getSystemName() }, - { "system release", SystemInfo::getSystemRelease() }, - { "start time", SystemInfo::getCurrentTime() }, -#ifdef HAVE_MPI - { "number of MPI processes", convertToString( nproc ) }, -#endif - { "OpenMP enabled", convertToString( Devices::Host::isOMPEnabled() ) }, - { "OpenMP threads", convertToString( Devices::Host::getMaxThreadsCount() ) }, - { "CPU model name", SystemInfo::getCPUModelName( cpu_id ) }, - { "CPU cores", convertToString( 
SystemInfo::getNumberOfCores( cpu_id ) ) }, - { "CPU threads per core", convertToString( SystemInfo::getNumberOfThreads( cpu_id ) / SystemInfo::getNumberOfCores( cpu_id ) ) }, - { "CPU max frequency (MHz)", convertToString( SystemInfo::getCPUMaxFrequency( cpu_id ) / 1e3 ) }, - { "CPU cache sizes (L1d, L1i, L2, L3) (kiB)", cacheInfo }, -#ifdef HAVE_CUDA - { "GPU name", Cuda::DeviceInfo::getDeviceName( activeGPU ) }, - { "GPU architecture", deviceArch }, - { "GPU CUDA cores", convertToString( Cuda::DeviceInfo::getCudaCores( activeGPU ) ) }, - { "GPU clock rate (MHz)", convertToString( (double) Cuda::DeviceInfo::getClockRate( activeGPU ) / 1e3 ) }, - { "GPU global memory (GB)", convertToString( (double) Cuda::DeviceInfo::getGlobalMemory( activeGPU ) / 1e9 ) }, - { "GPU memory clock rate (MHz)", convertToString( (double) Cuda::DeviceInfo::getMemoryClockRate( activeGPU ) / 1e3 ) }, - { "GPU memory ECC enabled", convertToString( Cuda::DeviceInfo::getECCEnabled( activeGPU ) ) }, -#endif - }; - - return metadata; -} - -} // namespace Benchmarks -} // namespace TNL - -#include diff --git a/src/Benchmarks/CMakeLists.txt b/src/Benchmarks/CMakeLists.txt index 4e1961b3c1b39fad7cb794fd030cfe33e2f8dd0a..0fc8e0f023d8e4a08ee652cc0474643f8a32e1dc 100644 --- a/src/Benchmarks/CMakeLists.txt +++ b/src/Benchmarks/CMakeLists.txt @@ -7,11 +7,3 @@ add_subdirectory( LinearSolvers ) add_subdirectory( ODESolvers ) add_subdirectory( Sorting ) add_subdirectory( Traversers ) - -set( headers - Benchmarks.h - FunctionTimer.h - Logging.h -) - -install( FILES ${headers} DESTINATION ${TNL_TARGET_INCLUDE_DIRECTORY}/Benchmarks ) diff --git a/src/Benchmarks/DistSpMV/tnl-benchmark-distributed-spmv.h b/src/Benchmarks/DistSpMV/tnl-benchmark-distributed-spmv.h index b79d80ebf1e5bb9730357a690fe8824e7cc3864b..5329540cbb61ca1e3233f498d918c55284b1558c 100644 --- a/src/Benchmarks/DistSpMV/tnl-benchmark-distributed-spmv.h +++ b/src/Benchmarks/DistSpMV/tnl-benchmark-distributed-spmv.h @@ -27,7 +27,7 @@ #include 
"Legacy/DistributedMatrix.h" #include -#include "../Benchmarks.h" +#include #include "ordering.h" #include @@ -157,7 +157,6 @@ struct SpmvBenchmark static bool run( Benchmark<>& benchmark, - Benchmark<>::MetadataMap metadata, const Config::ParameterContainer& parameters ) { MatrixType matrix; @@ -169,10 +168,11 @@ struct SpmvBenchmark matrix.getCompressedRowLengths( rowLengths ); const IndexType maxRowLength = max( rowLengths ); - const String name = String( (TNL::MPI::GetSize() > 1) ? "DistSpMV" : "SpMV" ) - + " (" + parameters.getParameter< String >( "name" ) + "): "; - benchmark.newBenchmark( name, metadata ); + const String title = (TNL::MPI::GetSize() > 1) ? "DistSpMV" : "SpMV"; + std::cout << "\n== " << title << " ==\n" << std::endl; + benchmark.setMetadataColumns( Benchmark<>::MetadataColumns({ + { "matrix name", parameters.getParameter< String >( "name" ) }, // TODO: strip the device // { "matrix type", matrix.getType() }, { "rows", convertToString( matrix.getRows() ) }, @@ -190,15 +190,15 @@ struct SpmvBenchmark MatrixType matrix_perm; Matrices::reorderSparseMatrix( matrix, matrix_perm, perm, iperm ); if( TNL::MPI::GetSize() > 1 ) - runDistributed( benchmark, metadata, parameters, matrix_perm, vector ); + runDistributed( benchmark, parameters, matrix_perm, vector ); else - runNonDistributed( benchmark, metadata, parameters, matrix_perm, vector ); + runNonDistributed( benchmark, parameters, matrix_perm, vector ); } else { if( TNL::MPI::GetSize() > 1 ) - runDistributed( benchmark, metadata, parameters, matrix, vector ); + runDistributed( benchmark, parameters, matrix, vector ); else - runNonDistributed( benchmark, metadata, parameters, matrix, vector ); + runNonDistributed( benchmark, parameters, matrix, vector ); } return true; @@ -206,7 +206,6 @@ struct SpmvBenchmark static void runNonDistributed( Benchmark<>& benchmark, - Benchmark<>::MetadataMap metadata, const Config::ParameterContainer& parameters, MatrixType& matrix, VectorType& vector ) @@ -219,7 
+218,6 @@ struct SpmvBenchmark static void runDistributed( Benchmark<>& benchmark, - Benchmark<>::MetadataMap metadata, const Config::ParameterContainer& parameters, MatrixType& matrix, VectorType& vector ) @@ -331,31 +329,24 @@ main( int argc, char* argv[] ) mode |= std::ios::app; std::ofstream logFile; if( rank == 0 ) - logFile.open( logFileName.getString(), mode ); + logFile.open( logFileName, mode ); - // init benchmark and common metadata - Benchmark<> benchmark( loops, verbose ); + // init benchmark and set parameters + Benchmark<> benchmark( logFile, loops, verbose ); - // prepare global metadata - Benchmark<>::MetadataMap metadata = getHardwareMetadata< Logging >(); + // write global metadata into a separate file + std::map< std::string, std::string > metadata = getHardwareMetadata(); + writeMapAsJson( metadata, logFileName, ".metadata.json" ); // TODO: implement resolveMatrixType // return ! Matrices::resolveMatrixType< MainConfig, // Devices::Host, -// SpmvBenchmark >( benchmark, metadata, parameters ); +// SpmvBenchmark >( benchmark, parameters ); using MatrixType = TNL::Matrices::SparseMatrix< double, Devices::Host, int, TNL::Matrices::GeneralMatrix, SegmentsType >; - const bool status = SpmvBenchmark< MatrixType >::run( benchmark, metadata, parameters ); - - if( rank == 0 ) - if( ! benchmark.save( logFile ) ) { - std::cerr << "Failed to write the benchmark results to file '" << parameters.getParameter< String >( "log-file" ) << "'." << std::endl; - return EXIT_FAILURE; - } - - return ! status; + return ! 
SpmvBenchmark< MatrixType >::run( benchmark, parameters ); } diff --git a/src/Benchmarks/FunctionTimer.h b/src/Benchmarks/FunctionTimer.h deleted file mode 100644 index 1edd6120476f5f50bf3eb714ca3ea1bd8a8ca4aa..0000000000000000000000000000000000000000 --- a/src/Benchmarks/FunctionTimer.h +++ /dev/null @@ -1,120 +0,0 @@ -/*************************************************************************** - FunctionTimer.h - description - ------------------- - begin : Dec 25, 2018 - copyright : (C) 2018 by Tomas Oberhuber et al. - email : tomas.oberhuber@fjfi.cvut.cz - ***************************************************************************/ - -/* See Copyright Notice in tnl/Copyright */ - -// Implemented by: Jakub Klinkovsky, -// Tomas Oberhuber - -#pragma once - -#include - -#include -#include -#include -#include - -namespace TNL { -namespace Benchmarks { - -template< typename Device > -class FunctionTimer -{ -public: - // returns a pair of (mean, stddev) where mean is the arithmetic mean of the - // computation times and stddev is the sample standard deviation - template< typename ComputeFunction, - typename ResetFunction, - typename Monitor = TNL::Solvers::IterativeSolverMonitor< double, int > > - std::pair< double, double > - timeFunction( ComputeFunction compute, - ResetFunction reset, - int maxLoops, - const double& minTime, - int verbose = 1, - Monitor && monitor = Monitor() ) - { - // the timer is constructed zero-initialized and stopped - Timer timer; - - // set timer to the monitor - if( verbose > 1 ) - monitor.setTimer( timer ); - - // warm up - reset(); - compute(); - - Containers::Vector< double > results( maxLoops ); - results.setValue( 0.0 ); - - for( loops = 0; - loops < maxLoops || sum( results ) < minTime; - loops++ ) - { - // abuse the monitor's "time" for loops - monitor.setTime( loops + 1 ); - reset(); - - // Explicit synchronization of the CUDA device -#ifdef HAVE_CUDA - if( std::is_same< Device, Devices::Cuda >::value ) - cudaDeviceSynchronize(); 
-#endif - - // reset timer before each computation - timer.reset(); - timer.start(); - compute(); -#ifdef HAVE_CUDA - if( std::is_same< Device, Devices::Cuda >::value ) - cudaDeviceSynchronize(); -#endif - timer.stop(); - - results[ loops ] = timer.getRealTime(); - } - - const double mean = sum( results ) / (double) loops; - if( loops > 1 ) { - const double stddev = 1.0 / std::sqrt( loops - 1 ) * l2Norm( results - mean ); - return std::make_pair( mean, stddev ); - } - else { - const double stddev = std::numeric_limits::quiet_NaN(); - return std::make_pair( mean, stddev ); - } - } - - // returns a pair of (mean, stddev) where mean is the arithmetic mean of the - // computation times and stddev is the sample standard deviation - template< typename ComputeFunction, - typename Monitor = TNL::Solvers::IterativeSolverMonitor< double, int > > - std::pair< double, double > - timeFunction( ComputeFunction compute, - int maxLoops, - const double& minTime, - int verbose = 1, - Monitor && monitor = Monitor() ) - { - auto noReset = [] () {}; - return timeFunction( compute, noReset, maxLoops, minTime, verbose, monitor ); - } - - int getPerformedLoops() const - { - return this->loops; - } - -protected: - int loops; -}; - -} // namespace Benchmarks -} // namespace TNL diff --git a/src/Benchmarks/JsonLogging.h b/src/Benchmarks/JsonLogging.h deleted file mode 100644 index 7d9817c654a36f937063b3f27a18bcf0dfed7cc7..0000000000000000000000000000000000000000 --- a/src/Benchmarks/JsonLogging.h +++ /dev/null @@ -1,267 +0,0 @@ -/*************************************************************************** - JsonLogging.h - description - ------------------- - begin : May 11, 2021 - copyright : (C) 2021 by Tomas Oberhuber et al. 
- email : tomas.oberhuber@fjfi.cvut.cz - ***************************************************************************/ - -/* See Copyright Notice in tnl/Copyright */ - -// Implemented by: Jakub Klinkovsky, -// Tomas Oberhuber - -#pragma once - -#include -#include -#include -#include -#include -#include - -#include - -namespace TNL { -namespace Benchmarks { - -class JsonLoggingRowElements -{ - public: - - JsonLoggingRowElements() - { - stream << std::setprecision( 6 ) << std::fixed; - } - - template< typename T > - JsonLoggingRowElements& operator << ( const T& b ) - { - stream << b; - elements.push_back( stream.str() ); - stream.str( std::string() ); - return *this; - } - - JsonLoggingRowElements& operator << ( decltype( std::setprecision( 2 ) )& setprec ) - { - stream << setprec; - return *this; - } - - JsonLoggingRowElements& operator << ( decltype( std::fixed )& setfixed ) // the same works also for std::scientific - { - stream << setfixed; - return *this; - } - - // iterators - auto begin() noexcept { return elements.begin(); } - - auto begin() const noexcept { return elements.begin(); } - - auto cbegin() const noexcept { return elements.cbegin(); } - - auto end() noexcept { return elements.end(); } - - auto end() const noexcept { return elements.end(); } - - auto cend() const noexcept { return elements.cend(); } - - size_t size() const noexcept { return this->elements.size(); }; - protected: - std::list< String > elements; - - std::stringstream stream; -}; - -class JsonLogging -{ -public: - using MetadataElement = std::pair< const char*, String >; - using MetadataMap = std::map< const char*, String >; - using MetadataColumns = std::vector; - - using CommonLogs = std::vector< std::pair< const char*, String > >; - using LogsMetadata = std::vector< std::pair< String, int > >; - - using HeaderElements = std::vector< std::pair< String, int > >; - using RowElements = JsonLoggingRowElements; - - JsonLogging( int verbose = true, - String outputMode = "", - bool 
logFileAppend = false ) - : verbose(verbose), outputMode( outputMode ), logFileAppend( logFileAppend ) - {} - - void - setVerbose( int verbose) - { - this->verbose = verbose; - } - - void addCommonLogs( const CommonLogs& logs ) - { - this->commonLogs = logs; - if( verbose ) - { - std::cout << std::endl << "Benchmark setup:" << std::endl; - for( auto lg : logs ) - std::cout << " " << lg.first << " = " << lg.second << std::endl; - std::cout << std::endl; - } - }; - - void resetLogsMetada() { this->logsMetadata.clear(); }; - - void addLogsMetadata( const std::vector< std::pair< String, int > >& md ) - { - this->logsMetadata.insert( this->logsMetadata.end(), md.begin(), md.end() ); - } - - void writeHeader() - { - if( verbose ) - { - for( auto md : this->logsMetadata ) - std::cout << std::setw( md.second ) << md.first; - std::cout << std::endl; - } - } - - void writeRow( const RowElements& rowEls ) - { - TNL_ASSERT_EQ( rowEls.size(), this->logsMetadata.size(), "" ); - if( this->lineStarted ) - log << "," << std::endl; - - log << " {" << std::endl; - - // write common logs - int idx( 0 ); - for( auto lg : this->commonLogs ) - { - if( idx++ > 0 ) - log << "," << std::endl; - log << " \"" << lg.first << "\" : \"" << lg.second << "\""; - } - - auto md = this->logsMetadata.begin(); - for( auto el : rowEls ) - { - if( verbose ) - std::cout << std::setw( md->second ) << el; - if( idx++ > 0 ) - log << "," << std::endl; - log << " \"" << md++->first << "\" : \"" << el << "\""; - } - log << std::endl << " }"; - this->lineStarted = true; - if( verbose ) - std::cout << std::endl; - } - - void - writeTitle( const String & title ) - { - if( outputMode == "append" ) - return; - - if( verbose ) - std::cout << std::endl << "== " << title << " ==" << std::endl << std::endl; - } - - void - writeMetadata( const MetadataMap & metadata ) - { - if( outputMode == "append" ) - return; - - if( verbose ) - std::cout << "properties:" << std::endl; - - for( auto & it : metadata ) { - if( verbose ) 
- std::cout << " " << it.first << " = " << it.second << std::endl; - } - - if( verbose ) - std::cout << std::endl; - } - - void - writeTableHeader( const String & spanningElement, - const HeaderElements & subElements ) - { - } - - void - writeTableRow( const String & spanningElement, - const RowElements & subElements ) - { - writeRow( subElements ); - } - - void - writeErrorMessage( const char* msg, - int colspan = 1 ) - { - log << "\"error\" : \"" << msg << "\"" << std::endl; - } - - void - closeTable() - { - } - - bool save( std::ostream & logFile ) - { - if( ! this->logFileAppend ) - { - logFile << "{" << std::endl; - logFile << " \"results\" : [ " << std::endl; - } - else - logFile << log.str(); - if( logFile.good() ) { - log.str() = ""; - return true; - } - return false; - } - -protected: - // manual double -> String conversion with fixed precision - static String - _to_string( double num, int precision = 0, bool fixed = false ) - { - std::stringstream str; - if( fixed ) - str << std::fixed; - if( precision ) - str << std::setprecision( precision ); - str << num; - return String( str.str().data() ); - } - - std::stringstream log; - std::string header_indent; - std::string body_indent; - - int verbose; - MetadataColumns metadataColumns; - bool header_changed = true; - std::vector< std::pair< String, int > > horizontalGroups; - - // new JSON implementation - LogsMetadata logsMetadata; - CommonLogs commonLogs; - String outputMode; - - bool lineStarted = false; - bool resultsStarted = false; - bool logFileAppend = false; -}; - -} // namespace Benchmarks -} // namespace TNL diff --git a/src/Benchmarks/LinearSolvers/benchmarks.h b/src/Benchmarks/LinearSolvers/benchmarks.h index 59d2ab3de327ced0beb8a986f44e938af4b4e5d0..33395b04d45a9143d374a451a8632ed4427f6f40 100644 --- a/src/Benchmarks/LinearSolvers/benchmarks.h +++ b/src/Benchmarks/LinearSolvers/benchmarks.h @@ -5,7 +5,7 @@ #include #include -#include "../Benchmarks.h" +#include #ifdef HAVE_ARMADILLO #include @@ 
-126,7 +126,7 @@ benchmarkSolver( Benchmark<>& benchmark, // subclass BenchmarkResult to add extra columns to the benchmark // (iterations, preconditioned residue, true residue) - struct MyBenchmarkResult : public BenchmarkResult<> + struct MyBenchmarkResult : public BenchmarkResult { using HeaderElements = BenchmarkResult::HeaderElements; using RowElements = BenchmarkResult::RowElements; @@ -145,15 +145,7 @@ benchmarkSolver( Benchmark<>& benchmark, virtual HeaderElements getTableHeader() const override { - return HeaderElements( { - std::pair< String, int >( "time", 8 ), - std::pair< String, int >( "stddev", 8 ), - std::pair< String, int >( "stddev/time", 8 ), - std::pair< String, int >( "speedup", 8 ), - std::pair< String, int >( "converged", 8 ), - std::pair< String, int >( "iterations", 8 ), - std::pair< String, int >( "residue_precond", 8 ), - std::pair< String, int >( "residue_true", 8 ) } ); + return HeaderElements({ "time", "stddev", "stddev/time", "speedup", "converged", "iterations", "residue_precond", "residue_true" }); } virtual RowElements getRowElements() const override diff --git a/src/Benchmarks/LinearSolvers/tnl-benchmark-linear-solvers.h b/src/Benchmarks/LinearSolvers/tnl-benchmark-linear-solvers.h index 0c16513203dff54edf82564434b58fa07d2493f9..acb02a434c3e409c3a7198dab90c20c5e6e8e46c 100644 --- a/src/Benchmarks/LinearSolvers/tnl-benchmark-linear-solvers.h +++ b/src/Benchmarks/LinearSolvers/tnl-benchmark-linear-solvers.h @@ -40,7 +40,7 @@ #include #include -#include "../Benchmarks.h" +#include #include "../DistSpMV/ordering.h" #include "benchmarks.h" @@ -338,7 +338,6 @@ struct LinearSolversBenchmark static bool run( Benchmark<>& benchmark, - Benchmark<>::MetadataMap metadata, const Config::ParameterContainer& parameters ) { const String file_matrix = parameters.getParameter< String >( "input-matrix" ); @@ -381,10 +380,11 @@ struct LinearSolversBenchmark matrixPointer->getCompressedRowLengths( rowLengths ); const IndexType maxRowLength = max( 
rowLengths ); - const String name = String( (TNL::MPI::GetSize() > 1) ? "Distributed linear solvers" : "Linear solvers" ) - + " (" + parameters.getParameter< String >( "name" ) + "): "; - benchmark.newBenchmark( name, metadata ); + const String title = (TNL::MPI::GetSize() > 1) ? "Distributed linear solvers" : "Linear solvers"; + std::cout << "\n== " << title << " ==\n" << std::endl; + benchmark.setMetadataColumns( Benchmark<>::MetadataColumns({ + { "matrix name", parameters.getParameter< String >( "name" ) }, // TODO: strip the device // { "matrix type", matrixPointer->getType() }, { "rows", convertToString( matrixPointer->getRows() ) }, @@ -407,15 +407,15 @@ struct LinearSolversBenchmark Matrices::reorderArray( x0, x0_perm, perm ); Matrices::reorderArray( b, b_perm, perm ); if( TNL::MPI::GetSize() > 1 ) - runDistributed( benchmark, metadata, parameters, matrix_perm, x0_perm, b_perm ); + runDistributed( benchmark, parameters, matrix_perm, x0_perm, b_perm ); else - runNonDistributed( benchmark, metadata, parameters, matrix_perm, x0_perm, b_perm ); + runNonDistributed( benchmark, parameters, matrix_perm, x0_perm, b_perm ); } else { if( TNL::MPI::GetSize() > 1 ) - runDistributed( benchmark, metadata, parameters, matrixPointer, x0, b ); + runDistributed( benchmark, parameters, matrixPointer, x0, b ); else - runNonDistributed( benchmark, metadata, parameters, matrixPointer, x0, b ); + runNonDistributed( benchmark, parameters, matrixPointer, x0, b ); } return true; @@ -423,7 +423,6 @@ struct LinearSolversBenchmark static void runDistributed( Benchmark<>& benchmark, - Benchmark<>::MetadataMap metadata, const Config::ParameterContainer& parameters, const SharedPointer< MatrixType >& matrixPointer, const VectorType& x0, @@ -467,7 +466,6 @@ struct LinearSolversBenchmark static void runNonDistributed( Benchmark<>& benchmark, - Benchmark<>::MetadataMap metadata, const Config::ParameterContainer& parameters, const SharedPointer< MatrixType >& matrixPointer, const VectorType& 
x0, @@ -611,31 +609,24 @@ main( int argc, char* argv[] ) mode |= std::ios::app; std::ofstream logFile; if( rank == 0 ) - logFile.open( logFileName.getString(), mode ); + logFile.open( logFileName, mode ); - // init benchmark and common metadata - Benchmark<> benchmark( loops, verbose ); + // init benchmark and set parameters + Benchmark<> benchmark( logFile, loops, verbose ); - // prepare global metadata - Benchmark<>::MetadataMap metadata = getHardwareMetadata< Logging >(); + // write global metadata into a separate file + std::map< std::string, std::string > metadata = getHardwareMetadata(); + writeMapAsJson( metadata, logFileName, ".metadata.json" ); // TODO: implement resolveMatrixType // return ! Matrices::resolveMatrixType< MainConfig, // Devices::Host, -// LinearSolversBenchmark >( benchmark, metadata, parameters ); +// LinearSolversBenchmark >( benchmark, parameters ); using MatrixType = TNL::Matrices::SparseMatrix< double, Devices::Host, int, TNL::Matrices::GeneralMatrix, SegmentsType >; - const bool status = LinearSolversBenchmark< MatrixType >::run( benchmark, metadata, parameters ); - - if( rank == 0 ) - if( ! benchmark.save( logFile ) ) { - std::cerr << "Failed to write the benchmark results to file '" << parameters.getParameter< String >( "log-file" ) << "'." << std::endl; - return EXIT_FAILURE; - } - - return ! status; + return ! LinearSolversBenchmark< MatrixType >::run( benchmark, parameters ); } diff --git a/src/Benchmarks/Logging.h b/src/Benchmarks/Logging.h deleted file mode 100644 index 2c8262d21018473c2013bec8d8f1f13f9f9d4e77..0000000000000000000000000000000000000000 --- a/src/Benchmarks/Logging.h +++ /dev/null @@ -1,304 +0,0 @@ -/*************************************************************************** - Logging.h - description - ------------------- - begin : Dec 25, 2018 - copyright : (C) 2018 by Tomas Oberhuber et al. 
- email : tomas.oberhuber@fjfi.cvut.cz - ***************************************************************************/ - -/* See Copyright Notice in tnl/Copyright */ - -// Implemented by: Jakub Klinkovsky, -// Tomas Oberhuber - -#pragma once - -#include -#include -#include -#include -#include -#include - -#include - -namespace TNL { -namespace Benchmarks { - -class LoggingRowElements -{ - public: - - LoggingRowElements() - { - stream << std::setprecision( 6 ) << std::fixed; - } - - template< typename T > - LoggingRowElements& operator << ( const T& b ) - { - stream << b; - elements.push_back( stream.str() ); - stream.str( std::string() ); - return *this; - } - - LoggingRowElements& operator << ( decltype( std::setprecision( 2 ) )& setprec ) - { - stream << setprec; - return *this; - } - - LoggingRowElements& operator << ( decltype( std::fixed )& setfixed ) // the same works also for std::scientific - { - stream << setfixed; - return *this; - } - - // iterators - auto begin() noexcept { return elements.begin(); } - - auto begin() const noexcept { return elements.begin(); } - - auto cbegin() const noexcept { return elements.cbegin(); } - - auto end() noexcept { return elements.end(); } - - auto end() const noexcept { return elements.end(); } - - auto cend() const noexcept { return elements.cend(); } - - protected: - std::list< String > elements; - - std::stringstream stream; -}; - -class Logging -{ -public: - using MetadataElement = std::pair< const char*, String >; - using MetadataMap = std::map< const char*, String >; - using MetadataColumns = std::vector; - - using CommonLogs = std::vector< std::pair< const char*, String > >; - - using HeaderElements = std::vector< std::pair< String, int > >; - using RowElements = LoggingRowElements; - - Logging( int verbose = true, - String outputMode = "", - bool logFileAppend = false ) - : verbose(verbose), outputMode( outputMode ) - {} - - void - setVerbose( int verbose) - { - this->verbose = verbose; - } - - void - writeTitle( 
const String & title ) - { - if( verbose ) - std::cout << std::endl << "== " << title << " ==" << std::endl << std::endl; - log << ": title = " << title << std::endl; - } - - void addCommonLogs( const CommonLogs& logs ) - { - for( auto log : logs ) - { - if( verbose ) - std::cout << log.first << " = " << log.second << std::endl; - } - }; - - void addLogsMetadata( const std::vector< String >& md ){}; - - void writeHeader(){}; - - void - writeMetadata( const MetadataMap & metadata ) - { - if( verbose ) - std::cout << "properties:" << std::endl; - - for( auto & it : metadata ) { - if( verbose ) - std::cout << " " << it.first << " = " << it.second << std::endl; - log << ": " << it.first << " = " << it.second << std::endl; - } - if( verbose ) - std::cout << std::endl; - } - - void - writeTableHeader( const String & spanningElement, - const HeaderElements & subElements ) - { - if( verbose && header_changed ) { - for( auto & it : metadataColumns ) { - std::cout << std::setw( 20 ) << it.first; - } - - // spanning element is printed as usual column to stdout, - // but is excluded from header - std::cout << std::setw( 15 ) << ""; - - for( auto & it : subElements ) { - std::cout << std::setw( 15 ) << it.first; - } - std::cout << std::endl; - - header_changed = false; - } - - // initial indent string - header_indent = "!"; - log << std::endl; - for( auto & it : metadataColumns ) { - log << header_indent << " " << it.first << std::endl; - } - - // dump stacked spanning columns - if( horizontalGroups.size() > 0 ) - while( horizontalGroups.back().second <= 0 ) { - horizontalGroups.pop_back(); - header_indent.pop_back(); - } - for( size_t i = 0; i < horizontalGroups.size(); i++ ) { - if( horizontalGroups[ i ].second > 0 ) { - log << header_indent << " " << horizontalGroups[ i ].first << std::endl; - header_indent += "!"; - } - } - - log << header_indent << " " << spanningElement << std::endl; - for( auto & it : subElements ) { - log << header_indent << "! 
" << it.first << std::endl; - } - - if( horizontalGroups.size() > 0 ) { - horizontalGroups.back().second--; - header_indent.pop_back(); - } - } - - void - writeTableRow( const String & spanningElement, - const RowElements & subElements ) - { - if( verbose ) { - for( auto & it : metadataColumns ) { - std::cout << std::setw( 20 ) << it.second; - } - // spanning element is printed as usual column to stdout - std::cout << std::setw( 15 ) << spanningElement; - for( auto & it : subElements ) { - std::cout << std::setw( 15 ) << it; - } - std::cout << std::endl; - } - - // only when changed (the header has been already adjusted) - // print each element on separate line - for( auto & it : metadataColumns ) { - log << it.second << std::endl; - } - - // benchmark data are indented - const String indent = " "; - for( auto & it : subElements ) { - log << indent << it << std::endl; - } - } - - void - writeErrorMessage( const char* msg, - int colspan = 1 ) - { - // initial indent string - header_indent = "!"; - log << std::endl; - for( auto & it : metadataColumns ) { - log << header_indent << " " << it.first << std::endl; - } - - // make sure there is a header column for the message - if( horizontalGroups.size() == 0 ) - horizontalGroups.push_back( {"", 1} ); - - // dump stacked spanning columns - while( horizontalGroups.back().second <= 0 ) { - horizontalGroups.pop_back(); - header_indent.pop_back(); - } - for( size_t i = 0; i < horizontalGroups.size(); i++ ) { - if( horizontalGroups[ i ].second > 0 ) { - log << header_indent << " " << horizontalGroups[ i ].first << std::endl; - header_indent += "!"; - } - } - if( horizontalGroups.size() > 0 ) { - horizontalGroups.back().second -= colspan; - header_indent.pop_back(); - } - - // only when changed (the header has been already adjusted) - // print each element on separate line - for( auto & it : metadataColumns ) { - log << it.second << std::endl; - } - log << msg << std::endl; - } - - void - closeTable() - { - log << std::endl; - 
header_indent = body_indent = ""; - header_changed = true; - horizontalGroups.clear(); - } - - bool save( std::ostream & logFile ) - { - closeTable(); - logFile << log.str(); - if( logFile.good() ) { - log.str() = ""; - return true; - } - return false; - } - -protected: - // manual double -> String conversion with fixed precision - static String - _to_string( double num, int precision = 0, bool fixed = false ) - { - std::stringstream str; - if( fixed ) - str << std::fixed; - if( precision ) - str << std::setprecision( precision ); - str << num; - return String( str.str().data() ); - } - - std::stringstream log; - std::string header_indent; - std::string body_indent; - - int verbose; - MetadataColumns metadataColumns; - bool header_changed = true; - std::vector< std::pair< String, int > > horizontalGroups; - - String outputMode; -}; - -} // namespace Benchmarks -} // namespace TNL diff --git a/src/Benchmarks/NDArray/tnl-benchmark-ndarray-boundary.h b/src/Benchmarks/NDArray/tnl-benchmark-ndarray-boundary.h index f7a485aa13f955ea22b97c830b0f43f37f10f529..6e0e807e2dc264b537c79ec316a1a933bdc92dae 100644 --- a/src/Benchmarks/NDArray/tnl-benchmark-ndarray-boundary.h +++ b/src/Benchmarks/NDArray/tnl-benchmark-ndarray-boundary.h @@ -18,7 +18,7 @@ #include -#include "../Benchmarks.h" +#include using namespace TNL; using namespace TNL::Benchmarks; @@ -440,13 +440,14 @@ int main( int argc, char* argv[] ) auto mode = std::ios::out; if( outputMode == "append" ) mode |= std::ios::app; - std::ofstream logFile( logFileName.getString(), mode ); + std::ofstream logFile( logFileName, mode ); - // init benchmark and common metadata - Benchmark<> benchmark( loops, verbose ); + // init benchmark and set parameters + Benchmark<> benchmark( logFile, loops, verbose ); - // prepare global metadata - Benchmark<>::MetadataMap metadata = getHardwareMetadata< Logging >(); + // write global metadata into a separate file + std::map< std::string, std::string > metadata = getHardwareMetadata(); + 
writeMapAsJson( metadata, logFileName, ".metadata.json" ); const String devices = parameters.getParameter< String >( "devices" ); if( devices == "all" || devices == "host" ) @@ -456,10 +457,5 @@ int main( int argc, char* argv[] ) run_benchmarks< Devices::Cuda >( benchmark ); #endif - if( ! benchmark.save( logFile ) ) { - std::cerr << "Failed to write the benchmark results to file '" << parameters.getParameter< String >( "log-file" ) << "'." << std::endl; - return EXIT_FAILURE; - } - return EXIT_SUCCESS; } diff --git a/src/Benchmarks/NDArray/tnl-benchmark-ndarray.h b/src/Benchmarks/NDArray/tnl-benchmark-ndarray.h index 8d4ac8e7ace879496e9c18cc25469e83efde3a1b..f0d2e1a9e0c4e4135a127a95908dc94d9905001d 100644 --- a/src/Benchmarks/NDArray/tnl-benchmark-ndarray.h +++ b/src/Benchmarks/NDArray/tnl-benchmark-ndarray.h @@ -20,7 +20,7 @@ #include #include -#include "../Benchmarks.h" +#include using namespace TNL; using namespace TNL::Benchmarks; @@ -428,13 +428,14 @@ int main( int argc, char* argv[] ) auto mode = std::ios::out; if( outputMode == "append" ) mode |= std::ios::app; - std::ofstream logFile( logFileName.getString(), mode ); + std::ofstream logFile( logFileName, mode ); - // init benchmark and common metadata - Benchmark<> benchmark( loops, verbose ); + // init benchmark and set parameters + Benchmark<> benchmark( logFile, loops, verbose ); - // prepare global metadata - Benchmark<>::MetadataMap metadata = getHardwareMetadata< Logging >(); + // write global metadata into a separate file + std::map< std::string, std::string > metadata = getHardwareMetadata(); + writeMapAsJson( metadata, logFileName, ".metadata.json" ); const String devices = parameters.getParameter< String >( "devices" ); if( devices == "all" || devices == "host" ) @@ -444,10 +445,5 @@ int main( int argc, char* argv[] ) run_benchmarks< Devices::Cuda >( benchmark ); #endif - if( ! 
benchmark.save( logFile ) ) { - std::cerr << "Failed to write the benchmark results to file '" << parameters.getParameter< String >( "log-file" ) << "'." << std::endl; - return EXIT_FAILURE; - } - return EXIT_SUCCESS; } diff --git a/src/Benchmarks/ODESolvers/benchmarks.h b/src/Benchmarks/ODESolvers/benchmarks.h index f27d6962e33cb0cf5cd19373d734a65683f5e1c2..c901b669456712a394a86ccd619c932109f0980c 100644 --- a/src/Benchmarks/ODESolvers/benchmarks.h +++ b/src/Benchmarks/ODESolvers/benchmarks.h @@ -15,7 +15,7 @@ #include #include -#include "../Benchmarks.h" +#include #include // std::runtime_error diff --git a/src/Benchmarks/ODESolvers/tnl-benchmark-ode-solvers.h b/src/Benchmarks/ODESolvers/tnl-benchmark-ode-solvers.h index afdf33d3a4ade89f9444ddbc0e7b45542278ece3..01b112c2cdfdf9ee27041fd750b88a0f6b233246 100644 --- a/src/Benchmarks/ODESolvers/tnl-benchmark-ode-solvers.h +++ b/src/Benchmarks/ODESolvers/tnl-benchmark-ode-solvers.h @@ -28,7 +28,7 @@ #include #include -#include "../Benchmarks.h" +#include #include "benchmarks.h" #include "SimpleProblem.h" #include "Euler.h" @@ -108,67 +108,41 @@ struct ODESolversBenchmark static bool run( Benchmark<>& benchmark, - Benchmark<>::MetadataMap metadata, const Config::ParameterContainer& parameters ) { - const String name = String( (TNL::MPI::GetSize() > 1) ? "Distributed ODE solvers" : "ODE solvers" ); - //+ " (" + parameters.getParameter< String >( "name" ) + "): "; - benchmark.newBenchmark( name, metadata ); + const String title = (TNL::MPI::GetSize() > 1) ? 
"Distributed ODE solvers" : "ODE solvers"; + std::cout << "\n== " << title << " ==\n" << std::endl; + for( size_t dofs = 25; dofs <= 10000000; dofs *= 2 ) { benchmark.setMetadataColumns( Benchmark<>::MetadataColumns({ - // TODO: strip the device + { "precision", getType< Real >() }, { "DOFs", convertToString( dofs ) }, } )); - if( TNL::MPI::GetSize() > 1 ) - runDistributed( benchmark, metadata, parameters, dofs ); - else - runNonDistributed( benchmark, metadata, parameters, dofs ); + benchmarkODESolvers< Real, Index >( benchmark, parameters, dofs ); } return true; } - - static void - runDistributed( Benchmark<>& benchmark, - Benchmark<>::MetadataMap metadata, - const Config::ParameterContainer& parameters, - size_t dofs ) - { - std::cout << "Iterative solvers:" << std::endl; - benchmarkODESolvers< Real, Index >( benchmark, parameters, dofs ); - } - - static void - runNonDistributed( Benchmark<>& benchmark, - Benchmark<>::MetadataMap metadata, - const Config::ParameterContainer& parameters, - size_t dofs ) - { - std::cout << "Iterative solvers:" << std::endl; - benchmarkODESolvers< Real, Index >( benchmark, parameters, dofs ); - } }; template< typename Real > bool resolveIndexType( Benchmark<>& benchmark, - Benchmark<>::MetadataMap& metadata, - Config::ParameterContainer& parameters ) + Config::ParameterContainer& parameters ) { const String& index = parameters.getParameter< String >( "index-type" ); - if( index == "int" ) return ODESolversBenchmark< Real, int >::run( benchmark, metadata, parameters ); - return ODESolversBenchmark< Real, long int >::run( benchmark, metadata, parameters ); + if( index == "int" ) return ODESolversBenchmark< Real, int >::run( benchmark, parameters ); + return ODESolversBenchmark< Real, long int >::run( benchmark, parameters ); } bool resolveRealTypes( Benchmark<>& benchmark, - Benchmark<>::MetadataMap& metadata, - Config::ParameterContainer& parameters ) + Config::ParameterContainer& parameters ) { const String& realType = 
parameters.getParameter< String >( "real-type" ); if( ( realType == "float" || realType == "all" ) && - ! resolveIndexType< float >( benchmark, metadata, parameters ) ) + ! resolveIndexType< float >( benchmark, parameters ) ) return false; if( ( realType == "double" || realType == "all" ) && - ! resolveIndexType< double >( benchmark, metadata, parameters ) ) + ! resolveIndexType< double >( benchmark, parameters ) ) return false; return true; } @@ -177,7 +151,7 @@ void configSetup( Config::ConfigDescription& config ) { config.addDelimiter( "Benchmark settings:" ); - config.addEntry< String >( "log-file", "Log file name.", "tnl-benchmark-linear-solvers.log"); + config.addEntry< String >( "log-file", "Log file name.", "tnl-benchmark-ode-solvers.log"); config.addEntry< String >( "output-mode", "Mode for opening the log file.", "overwrite" ); config.addEntryEnum( "append" ); config.addEntryEnum( "overwrite" ); @@ -242,21 +216,14 @@ main( int argc, char* argv[] ) mode |= std::ios::app; std::ofstream logFile; if( rank == 0 ) - logFile.open( logFileName.getString(), mode ); - - // init benchmark and common metadata - Benchmark<> benchmark( loops, verbose ); - - // prepare global metadata - Benchmark<>::MetadataMap metadata = getHardwareMetadata< Logging >(); + logFile.open( logFileName, mode ); - const bool status = resolveRealTypes( benchmark, metadata, parameters ); + // init benchmark and set parameters + Benchmark<> benchmark( logFile, loops, verbose ); - if( rank == 0 ) - if( ! benchmark.save( logFile ) ) { - std::cerr << "Failed to write the benchmark results to file '" << parameters.getParameter< String >( "log-file" ) << "'." << std::endl; - return EXIT_FAILURE; - } + // write global metadata into a separate file + std::map< std::string, std::string > metadata = getHardwareMetadata(); + writeMapAsJson( metadata, logFileName, ".metadata.json" ); - return ! status; + return ! 
resolveRealTypes( benchmark, parameters ); } diff --git a/src/Benchmarks/SpMV/CMakeLists.txt b/src/Benchmarks/SpMV/CMakeLists.txt index 93dccab0dc793ea2f3218eb6ba0cb9f326450425..4998088530017c8d4673d5cc97a05da4248f86ff 100644 --- a/src/Benchmarks/SpMV/CMakeLists.txt +++ b/src/Benchmarks/SpMV/CMakeLists.txt @@ -9,15 +9,17 @@ #include( cmake/BuildCSR5.cmake ) if( BUILD_CUDA ) - cuda_include_directories( ${CXX_BENCHMARKS_INCLUDE_DIRS} ) - message( STATUS ${CXX_BENCHMARKS_FLAGS} ) - CUDA_ADD_EXECUTABLE( tnl-benchmark-spmv tnl-benchmark-spmv.cu OPTIONS ${CXX_BENCHMARKS_FLAGS} ${PETSC_CXX_FLAGS} ) - TARGET_LINK_LIBRARIES( tnl-benchmark-spmv ${CUDA_cusparse_LIBRARY} ${CUDA_cudadevrt_LIBRARY} ${PETSC_LINKER_FLAGS}) + file( GLOB EXPLICIT_TEMPLATES spmv.templates/*.cu ) + cuda_include_directories( ${CXX_BENCHMARKS_INCLUDE_DIRS} ) + cuda_add_executable( tnl-benchmark-spmv tnl-benchmark-spmv.cu ${EXPLICIT_TEMPLATES} ReferenceFormats/LightSpMV-1.0/SpMV.cu ReferenceFormats/LightSpMV-1.0/SpMVCSR.cu + OPTIONS ${CXX_BENCHMARKS_FLAGS} ${PETSC_CXX_FLAGS} ) + target_link_libraries( tnl-benchmark-spmv ${CUDA_cusparse_LIBRARY} ${CUDA_cudadevrt_LIBRARY} ${PETSC_LINKER_FLAGS}) else() - ADD_EXECUTABLE( tnl-benchmark-spmv tnl-benchmark-spmv.cpp ) - target_compile_options( tnl-benchmark-spmv PRIVATE ${CXX_BENCHMARKS_FLAGS} ${PETSC_CXX_FLAGS} ) - target_include_directories( tnl-benchmark-spmv PRIVATE ${CXX_BENCHMARKS_INCLUDE_DIRS} ) - TARGET_LINK_LIBRARIES( tnl-benchmark-spmv ${PETSC_LINKER_FLAGS} ) + file( GLOB EXPLICIT_TEMPLATES spmv.templates/*.cpp ) + add_executable( tnl-benchmark-spmv tnl-benchmark-spmv.cpp ${EXPLICIT_TEMPLATES} ) + target_compile_options( tnl-benchmark-spmv PRIVATE ${CXX_BENCHMARKS_FLAGS} ${PETSC_CXX_FLAGS} ) + target_include_directories( tnl-benchmark-spmv PRIVATE ${CXX_BENCHMARKS_INCLUDE_DIRS} ) + target_link_libraries( tnl-benchmark-spmv ${PETSC_LINKER_FLAGS} ) endif() install( TARGETS tnl-benchmark-spmv RUNTIME DESTINATION bin ) diff --git 
a/src/Benchmarks/SpMV/ReferenceFormats/LightSpMVBenchmark.h b/src/Benchmarks/SpMV/ReferenceFormats/LightSpMVBenchmark.h index 7d6ffde49960a1e53b563222af8ab4861accba0f..1d6e343201e8897a62532a7cd502b8f623ef7476 100644 --- a/src/Benchmarks/SpMV/ReferenceFormats/LightSpMVBenchmark.h +++ b/src/Benchmarks/SpMV/ReferenceFormats/LightSpMVBenchmark.h @@ -17,8 +17,8 @@ #pragma push #pragma diag_suppress = 1444 #include "LightSpMV-1.0/SpMV.h" -#include "LightSpMV-1.0/SpMV.cu" -#include "LightSpMV-1.0/SpMVCSR.cu" +//#include "LightSpMV-1.0/SpMV.cu" +//#include "LightSpMV-1.0/SpMVCSR.cu" #pragma pop #endif #include diff --git a/src/Benchmarks/SpMV/SpmvBenchmarkResult.h b/src/Benchmarks/SpMV/SpmvBenchmarkResult.h index 61fae4f609601f6a35a08160c9132f471cd4d72c..128b0033485990229cc90502492a3c71cef52daf 100644 --- a/src/Benchmarks/SpMV/SpmvBenchmarkResult.h +++ b/src/Benchmarks/SpMV/SpmvBenchmarkResult.h @@ -10,7 +10,7 @@ #pragma once -#include "../Benchmarks.h" +#include namespace TNL { namespace Benchmarks { @@ -21,7 +21,7 @@ template< typename Real, typename ResultReal = Real, typename Logger = JsonLogging > struct SpmvBenchmarkResult -: public BenchmarkResult< Logger > +: public BenchmarkResult { using RealType = Real; using DeviceType = Device; @@ -29,36 +29,28 @@ struct SpmvBenchmarkResult using HostVector = Containers::Vector< Real, Devices::Host, Index >; using BenchmarkVector = Containers::Vector< ResultReal, Device, Index >; - using typename BenchmarkResult< Logger >::HeaderElements; - using typename BenchmarkResult< Logger >::RowElements; - using BenchmarkResult< Logger >::stddev; - using BenchmarkResult< Logger >::bandwidth; - using BenchmarkResult< Logger >::speedup; - using BenchmarkResult< Logger >::time; + using typename BenchmarkResult::HeaderElements; + using typename BenchmarkResult::RowElements; + using BenchmarkResult::stddev; + using BenchmarkResult::bandwidth; + using BenchmarkResult::speedup; + using BenchmarkResult::time; - SpmvBenchmarkResult( const 
String& format, - const HostVector& csrResult, - const BenchmarkVector& benchmarkResult, - const IndexType nonzeros ) - : format( format ), csrResult( csrResult ), benchmarkResult( benchmarkResult ), nonzeros( nonzeros ){}; + SpmvBenchmarkResult( const HostVector& csrResult, + const BenchmarkVector& benchmarkResult ) + : csrResult( csrResult ), benchmarkResult( benchmarkResult ) + {} virtual HeaderElements getTableHeader() const override { - return HeaderElements( { - std::pair< String, int >( "format", 35 ), - std::pair< String, int >( "device", 12 ), - std::pair< String, int >( "non-zeros", 12 ), - std::pair< String, int >( "time", 12 ), - std::pair< String, int >( "stddev", 12 ), - std::pair< String, int >( "stddev/time", 14 ), - std::pair< String, int >( "bandwidth", 12 ), - std::pair< String, int >( "speedup", 12 ), - std::pair< String, int >( "CSR Diff.Max", 14 ), - std::pair< String, int >( "CSR Diff.L2", 14 ) } ); + return HeaderElements({ "time", "stddev", "stddev/time", "loops", "bandwidth", "speedup", "CSR Diff.Max", "CSR Diff.L2" }); } - void setFormat( const String& format ) { this->format = format; }; + virtual std::vector< int > getColumnWidthHints() const override + { + return std::vector< int >({ 14, 14, 14, 6, 14, 14, 14, 14 }); + } virtual RowElements getRowElements() const override { @@ -66,20 +58,18 @@ struct SpmvBenchmarkResult benchmarkResultCopy = benchmarkResult; auto diff = csrResult - benchmarkResultCopy; RowElements elements; - elements << format - << ( std::is_same< Device, Devices::Host >::value ? 
"CPU" : "GPU" ) - << nonzeros << time << stddev << stddev/time << bandwidth; + // write in scientific format to avoid precision loss + elements << std::scientific << time << stddev << stddev/time << loops << bandwidth; if( speedup != 0.0 ) elements << speedup; - else elements << "N/A"; + else + elements << "N/A"; elements << max( abs( diff ) ) << lpNorm( diff, 2.0 ); return elements; } - String format; const HostVector& csrResult; const BenchmarkVector& benchmarkResult; - const IndexType nonzeros; }; } //namespace Benchmarks diff --git a/src/Benchmarks/SpMV/eti.py b/src/Benchmarks/SpMV/eti.py new file mode 120000 index 0000000000000000000000000000000000000000..c59a420e1efa07b7a627cc80511a14ad9e3d0111 --- /dev/null +++ b/src/Benchmarks/SpMV/eti.py @@ -0,0 +1 @@ +../../../scripts/eti.py \ No newline at end of file diff --git a/src/Benchmarks/SpMV/spmv.h b/src/Benchmarks/SpMV/spmv.h index 559adadfff6078cd09d98daa006a21e8f52add8c..ff7fecfbb6ef59e6efcbf18cf6eda2b1cba53b38 100644 --- a/src/Benchmarks/SpMV/spmv.h +++ b/src/Benchmarks/SpMV/spmv.h @@ -16,8 +16,8 @@ #include -#include "../Benchmarks.h" -#include "../JsonLogging.h" +#include +#include #include "SpmvBenchmarkResult.h" #include @@ -43,12 +43,6 @@ #include #endif -// Comment the following to turn off some groups of SpMV benchmarks and speed-up the compilation -#define WITH_TNL_BENCHMARK_SPMV_GENERAL_MATRICES -#define WITH_TNL_BENCHMARK_SPMV_SYMMETRIC_MATRICES -#define WITH_TNL_BENCHMARK_SPMV_BINARY_MATRICES -#define WITH_TNL_BENCHMARK_SPMV_LEGACY_FORMATS - // Uncomment the following line to enable benchmarking the sandbox sparse matrix. 
//#define WITH_TNL_BENCHMARK_SPMV_SANDBOX_MATRIX #ifdef WITH_TNL_BENCHMARK_SPMV_SANDBOX_MATRIX @@ -197,8 +191,7 @@ template< typename Real, typename Device, typename Index > using SlicedEllpackAlias = Benchmarks::SpMV::ReferenceFormats::Legacy::SlicedEllpack< Real, Device, Index >; template< typename Real, - template< typename, typename, typename > class Matrix, - template< typename, typename, typename, typename > class Vector = Containers::Vector > + template< typename, typename, typename > class Matrix > void benchmarkSpMVLegacy( BenchmarkType& benchmark, const TNL::Containers::Vector< Real, Devices::Host, int >& csrResultVector, @@ -211,6 +204,8 @@ benchmarkSpMVLegacy( BenchmarkType& benchmark, using HostVector = Containers::Vector< Real, Devices::Host, int >; using CudaVector = Containers::Vector< Real, Devices::Cuda, int >; + benchmark.setMetadataElement({ "format", MatrixInfo< HostMatrix >::getFormat() }); + HostMatrix hostMatrix; CudaMatrix cudaMatrix; @@ -220,13 +215,13 @@ benchmarkSpMVLegacy( BenchmarkType& benchmark, } catch(const std::exception& e) { - std::cerr << "Unable to read the matrix: " << e.what() << std::endl; + benchmark.addErrorMessage( "Unable to read the matrix:" + String(e.what()) ); return; } - const int elements = hostMatrix.getNonzeroElementsCount(); - const double datasetSize = (double) elements * ( 2 * sizeof( Real ) + sizeof( int ) ) / oneGB; - benchmark.setOperation( datasetSize ); + const int nonzeros = hostMatrix.getNonzeroElementsCount(); + const double datasetSize = (double) nonzeros * ( 2 * sizeof( Real ) + sizeof( int ) ) / oneGB; + benchmark.setDatasetSize( datasetSize ); ///// // Benchmark SpMV on host @@ -244,7 +239,7 @@ benchmarkSpMVLegacy( BenchmarkType& benchmark, hostMatrix.vectorProduct( hostInVector, hostOutVector ); }; - SpmvBenchmarkResult< Real, Devices::Host, int > hostBenchmarkResults( MatrixInfo< HostMatrix >::getFormat(), csrResultVector, hostOutVector, hostMatrix.getNonzeroElementsCount() ); + 
SpmvBenchmarkResult< Real, Devices::Host, int > hostBenchmarkResults( csrResultVector, hostOutVector ); benchmark.time< Devices::Host >( resetHostVectors, "CPU", spmvHost, hostBenchmarkResults ); } @@ -258,7 +253,7 @@ benchmarkSpMVLegacy( BenchmarkType& benchmark, } catch(const std::exception& e) { - std::cerr << "Unable to copy the matrix on GPU: " << e.what() << std::endl; + benchmark.addErrorMessage( "Unable to copy the matrix on GPU: " + String(e.what()) ); return; } @@ -272,15 +267,14 @@ benchmarkSpMVLegacy( BenchmarkType& benchmark, auto spmvCuda = [&]() { cudaMatrix.vectorProduct( cudaInVector, cudaOutVector ); }; - SpmvBenchmarkResult< Real, Devices::Cuda, int > cudaBenchmarkResults( MatrixInfo< HostMatrix >::getFormat(), csrResultVector, cudaOutVector, cudaMatrix.getNonzeroElementsCount() ); + SpmvBenchmarkResult< Real, Devices::Cuda, int > cudaBenchmarkResults( csrResultVector, cudaOutVector ); benchmark.time< Devices::Cuda >( resetCudaVectors, "GPU", spmvCuda, cudaBenchmarkResults ); #endif } template< typename Real, typename InputMatrix, - template< typename, typename, typename > class Matrix, - template< typename, typename, typename, typename > class Vector = Containers::Vector > + template< typename, typename, typename > class Matrix > void benchmarkSpMV( BenchmarkType& benchmark, const InputMatrix& inputMatrix, @@ -294,6 +288,8 @@ benchmarkSpMV( BenchmarkType& benchmark, using HostVector = Containers::Vector< Real, Devices::Host, int >; using CudaVector = Containers::Vector< Real, Devices::Cuda, int >; + benchmark.setMetadataElement({ "format", MatrixInfo< HostMatrix >::getFormat() }); + HostMatrix hostMatrix; try { @@ -301,13 +297,13 @@ benchmarkSpMV( BenchmarkType& benchmark, } catch(const std::exception& e) { - std::cerr << "Unable to convert the matrix to the target format:" << e.what() << std::endl; + benchmark.addErrorMessage( "Unable to convert the matrix to the target format:" + String(e.what()) ); return; } - const int elements = 
hostMatrix.getNonzeroElementsCount(); - const double datasetSize = (double) elements * ( 2 * sizeof( Real ) + sizeof( int ) ) / oneGB; - benchmark.setOperation( datasetSize ); + const int nonzeros = hostMatrix.getNonzeroElementsCount(); + const double datasetSize = (double) nonzeros * ( 2 * sizeof( Real ) + sizeof( int ) ) / oneGB; + benchmark.setDatasetSize( datasetSize ); ///// // Benchmark SpMV on host @@ -325,7 +321,7 @@ benchmarkSpMV( BenchmarkType& benchmark, hostMatrix.vectorProduct( hostInVector, hostOutVector ); }; - SpmvBenchmarkResult< Real, Devices::Host, int > hostBenchmarkResults( MatrixInfo< HostMatrix >::getFormat(), csrResultVector, hostOutVector, hostMatrix.getNonzeroElementsCount() ); + SpmvBenchmarkResult< Real, Devices::Host, int > hostBenchmarkResults( csrResultVector, hostOutVector ); benchmark.time< Devices::Host >( resetHostVectors, "CPU", spmvHost, hostBenchmarkResults ); } @@ -340,7 +336,7 @@ benchmarkSpMV( BenchmarkType& benchmark, } catch(const std::exception& e) { - std::cerr << "Unable to copy the matrix on GPU:" << e.what() << std::endl; + benchmark.addErrorMessage( "Unable to copy the matrix on GPU: " + String(e.what()) ); return; } @@ -354,7 +350,7 @@ benchmarkSpMV( BenchmarkType& benchmark, auto spmvCuda = [&]() { cudaMatrix.vectorProduct( cudaInVector, cudaOutVector ); }; - SpmvBenchmarkResult< Real, Devices::Cuda, int > cudaBenchmarkResults( MatrixInfo< HostMatrix >::getFormat(), csrResultVector, cudaOutVector, cudaMatrix.getNonzeroElementsCount() ); + SpmvBenchmarkResult< Real, Devices::Cuda, int > cudaBenchmarkResults( csrResultVector, cudaOutVector ); benchmark.time< Devices::Cuda >( resetCudaVectors, "GPU", spmvCuda, cudaBenchmarkResults ); #endif } @@ -362,8 +358,7 @@ benchmarkSpMV( BenchmarkType& benchmark, template< typename Real, typename InputMatrix, template< typename, typename, typename > class Matrix, - typename TestReal = Real, - template< typename, typename, typename, typename > class Vector = Containers::Vector > 
+ typename TestReal = Real > void benchmarkSpMVCSRLight( BenchmarkType& benchmark, const InputMatrix& inputMatrix, @@ -377,6 +372,8 @@ benchmarkSpMVCSRLight( BenchmarkType& benchmark, using HostVector = Containers::Vector< Real, Devices::Host, int >; using CudaVector = Containers::Vector< Real, Devices::Cuda, int >; + benchmark.setMetadataElement({ "format", MatrixInfo< HostMatrix >::getFormat() }); + HostMatrix hostMatrix; try { @@ -384,13 +381,13 @@ benchmarkSpMVCSRLight( BenchmarkType& benchmark, } catch(const std::exception& e) { - std::cerr << "Unable to convert the matrix to the target format:" << e.what() << std::endl; + benchmark.addErrorMessage( "Unable to convert the matrix to the target format:" + String(e.what()) ); return; } - const int elements = hostMatrix.getNonzeroElementsCount(); - const double datasetSize = (double) elements * ( 2 * sizeof( Real ) + sizeof( int ) ) / oneGB; - benchmark.setOperation( datasetSize ); + const int nonzeros = hostMatrix.getNonzeroElementsCount(); + const double datasetSize = (double) nonzeros * ( 2 * sizeof( Real ) + sizeof( int ) ) / oneGB; + benchmark.setDatasetSize( datasetSize ); ///// // Benchmark SpMV on host @@ -408,7 +405,7 @@ benchmarkSpMVCSRLight( BenchmarkType& benchmark, hostMatrix.vectorProduct( hostInVector, hostOutVector ); }; - SpmvBenchmarkResult< Real, Devices::Host, int > hostBenchmarkResults( MatrixInfo< HostMatrix >::getFormat(), csrResultVector, hostOutVector, hostMatrix.getNonzeroElementsCount() ); + SpmvBenchmarkResult< Real, Devices::Host, int > hostBenchmarkResults( csrResultVector, hostOutVector ); benchmark.time< Devices::Host >( resetHostVectors, "CPU", spmvHost, hostBenchmarkResults ); } @@ -423,7 +420,7 @@ benchmarkSpMVCSRLight( BenchmarkType& benchmark, } catch(const std::exception& e) { - std::cerr << "Unable to copy the matrix on GPU:" << e.what() << std::endl; + benchmark.addErrorMessage( "Unable to copy the matrix on GPU: " + String(e.what()) ); return; } @@ -441,14 +438,18 @@ 
benchmarkSpMVCSRLight( BenchmarkType& benchmark, { cudaMatrix.getSegments().getKernel().setThreadsMapping( Algorithms::Segments::CSRLightAutomaticThreads ); String format = MatrixInfo< HostMatrix >::getFormat() + " Automatic"; - SpmvBenchmarkResult< Real, Devices::Cuda, int > cudaBenchmarkResults( format, csrResultVector, cudaOutVector, cudaMatrix.getNonzeroElementsCount() ); + benchmark.setMetadataElement({ "format", format }); + + SpmvBenchmarkResult< Real, Devices::Cuda, int > cudaBenchmarkResults( csrResultVector, cudaOutVector ); benchmark.time< Devices::Cuda >( resetCudaVectors, "GPU", spmvCuda, cudaBenchmarkResults ); }; { cudaMatrix.getSegments().getKernel().setThreadsMapping( Algorithms::Segments::CSRLightAutomaticThreadsLightSpMV ); String format = MatrixInfo< HostMatrix >::getFormat() + " Automatic Light"; - SpmvBenchmarkResult< Real, Devices::Cuda, int > cudaBenchmarkResults( format, csrResultVector, cudaOutVector, cudaMatrix.getNonzeroElementsCount() ); + benchmark.setMetadataElement({ "format", format }); + + SpmvBenchmarkResult< Real, Devices::Cuda, int > cudaBenchmarkResults( csrResultVector, cudaOutVector ); benchmark.time< Devices::Cuda >( resetCudaVectors, "GPU", spmvCuda, cudaBenchmarkResults ); }; @@ -456,7 +457,9 @@ benchmarkSpMVCSRLight( BenchmarkType& benchmark, { cudaMatrix.getSegments().getKernel().setThreadsPerSegment( threadsPerRow ); String format = MatrixInfo< HostMatrix >::getFormat() + " " + convertToString( threadsPerRow ); - SpmvBenchmarkResult< Real, Devices::Cuda, int > cudaBenchmarkResults( format, csrResultVector, cudaOutVector, cudaMatrix.getNonzeroElementsCount() ); + benchmark.setMetadataElement({ "format", format }); + + SpmvBenchmarkResult< Real, Devices::Cuda, int > cudaBenchmarkResults( csrResultVector, cudaOutVector ); benchmark.time< Devices::Cuda >( resetCudaVectors, "GPU", spmvCuda, cudaBenchmarkResults ); }*/ #endif @@ -465,8 +468,7 @@ benchmarkSpMVCSRLight( BenchmarkType& benchmark, template< typename Real, 
typename InputMatrix, - template< typename, typename, typename > class Matrix, - template< typename, typename, typename, typename > class Vector = Containers::Vector > + template< typename, typename, typename > class Matrix > void benchmarkBinarySpMV( BenchmarkType& benchmark, const InputMatrix& inputMatrix, @@ -480,6 +482,8 @@ benchmarkBinarySpMV( BenchmarkType& benchmark, using HostVector = Containers::Vector< Real, Devices::Host, int >; using CudaVector = Containers::Vector< Real, Devices::Cuda, int >; + benchmark.setMetadataElement({ "format", MatrixInfo< HostMatrix >::getFormat() }); + HostMatrix hostMatrix; try { @@ -487,13 +491,13 @@ benchmarkBinarySpMV( BenchmarkType& benchmark, } catch(const std::exception& e) { - std::cerr << "Unable to convert the matrix to the target format:" << e.what() << std::endl; + benchmark.addErrorMessage( "Unable to convert the matrix to the target format:" + String(e.what()) ); return; } - const int elements = hostMatrix.getNonzeroElementsCount(); - const double datasetSize = (double) elements * ( 2 * sizeof( Real ) + sizeof( int ) ) / oneGB; - benchmark.setOperation( datasetSize ); + const int nonzeros = hostMatrix.getNonzeroElementsCount(); + const double datasetSize = (double) nonzeros * ( 2 * sizeof( Real ) + sizeof( int ) ) / oneGB; + benchmark.setDatasetSize( datasetSize ); ///// // Benchmark SpMV on host @@ -511,7 +515,7 @@ benchmarkBinarySpMV( BenchmarkType& benchmark, hostMatrix.vectorProduct( hostInVector, hostOutVector ); }; - SpmvBenchmarkResult< Real, Devices::Host, int > hostBenchmarkResults( MatrixInfo< HostMatrix >::getFormat(), csrResultVector, hostOutVector, hostMatrix.getNonzeroElementsCount() ); + SpmvBenchmarkResult< Real, Devices::Host, int > hostBenchmarkResults( csrResultVector, hostOutVector ); benchmark.time< Devices::Host >( resetHostVectors, "CPU", spmvHost, hostBenchmarkResults ); } @@ -526,7 +530,7 @@ benchmarkBinarySpMV( BenchmarkType& benchmark, } catch(const std::exception& e) { - std::cerr << 
"Unable to copy the matrix on GPU:" << e.what() << std::endl; + benchmark.addErrorMessage( "Unable to copy the matrix on GPU: " + String(e.what()) ); return; } @@ -540,11 +544,143 @@ benchmarkBinarySpMV( BenchmarkType& benchmark, auto spmvCuda = [&]() { cudaMatrix.vectorProduct( cudaInVector, cudaOutVector ); }; - SpmvBenchmarkResult< Real, Devices::Cuda, int > cudaBenchmarkResults( MatrixInfo< HostMatrix >::getFormat(), csrResultVector, cudaOutVector, cudaMatrix.getNonzeroElementsCount() ); + SpmvBenchmarkResult< Real, Devices::Cuda, int > cudaBenchmarkResults( csrResultVector, cudaOutVector ); benchmark.time< Devices::Cuda >( resetCudaVectors, "GPU", spmvCuda, cudaBenchmarkResults ); #endif } +template< typename Real > +void +dispatchLegacy( BenchmarkType& benchmark, + const TNL::Containers::Vector< Real, Devices::Host, int >& hostOutVector, + const String& inputFileName, + bool allCpuTests, + bool verboseMR ) +{ + using namespace Benchmarks::SpMV::ReferenceFormats; + benchmarkSpMVLegacy< Real, SparseMatrixLegacy_CSR_Scalar >( benchmark, hostOutVector, inputFileName, allCpuTests, verboseMR ); + benchmarkSpMVLegacy< Real, SparseMatrixLegacy_CSR_Vector >( benchmark, hostOutVector, inputFileName, allCpuTests, verboseMR ); + //benchmarkSpMVLegacy< Real, SparseMatrixLegacy_CSR_Light >( benchmark, hostOutVector, inputFileName, verboseMR ); + //benchmarkSpMVLegacy< Real, SparseMatrixLegacy_CSR_Light2 >( benchmark, hostOutVector, inputFileName, verboseMR ); + //benchmarkSpMVLegacy< Real, SparseMatrixLegacy_CSR_Light3 >( benchmark, hostOutVector, inputFileName, verboseMR ); + //benchmarkSpMVLegacy< Real, SparseMatrixLegacy_CSR_Light4 >( benchmark, hostOutVector, inputFileName, verboseMR ); + //benchmarkSpMVLegacy< Real, SparseMatrixLegacy_CSR_Light5 >( benchmark, hostOutVector, inputFileName, verboseMR ); + //benchmarkSpMVLegacy< Real, SparseMatrixLegacy_CSR_Light6 >( benchmark, hostOutVector, inputFileName, verboseMR ); + benchmarkSpMVLegacy< Real, 
SparseMatrixLegacy_CSR_Adaptive >( benchmark, hostOutVector, inputFileName, allCpuTests, verboseMR ); + benchmarkSpMVLegacy< Real, SparseMatrixLegacy_CSR_MultiVector >( benchmark, hostOutVector, inputFileName, allCpuTests, verboseMR ); + benchmarkSpMVLegacy< Real, SparseMatrixLegacy_CSR_LightWithoutAtomic >( benchmark, hostOutVector, inputFileName, allCpuTests, verboseMR ); + benchmarkSpMVLegacy< Real, Legacy::Ellpack >( benchmark, hostOutVector, inputFileName, allCpuTests, verboseMR ); + benchmarkSpMVLegacy< Real, SlicedEllpackAlias >( benchmark, hostOutVector, inputFileName, allCpuTests, verboseMR ); + benchmarkSpMVLegacy< Real, Legacy::ChunkedEllpack >( benchmark, hostOutVector, inputFileName, allCpuTests, verboseMR ); + benchmarkSpMVLegacy< Real, Legacy::BiEllpack >( benchmark, hostOutVector, inputFileName, allCpuTests, verboseMR ); + // AdEllpack is broken + //benchmarkSpMV< Real, Matrices::AdEllpack >( benchmark, hostOutVector, inputFileName, verboseMR ); +} + +template< typename Real, typename HostMatrix > +void +dispatchBinary( BenchmarkType& benchmark, + const HostMatrix& hostMatrix, + const TNL::Containers::Vector< Real, Devices::Host, int >& hostOutVector, + const String& inputFileName, + bool allCpuTests, + bool verboseMR ) +{ + benchmarkBinarySpMV< Real, HostMatrix, SparseMatrix_CSR_Scalar >( benchmark, hostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR ); + benchmarkBinarySpMV< Real, HostMatrix, SparseMatrix_CSR_Vector >( benchmark, hostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR ); + benchmarkSpMVCSRLight< Real, HostMatrix, SparseMatrix_CSR_Light, bool >( benchmark, hostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR ); + benchmarkBinarySpMV< Real, HostMatrix, SparseMatrix_CSR_Adaptive >( benchmark, hostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR ); + benchmarkBinarySpMV< Real, HostMatrix, SparseMatrix_Ellpack >( benchmark, hostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR ); 
+ benchmarkBinarySpMV< Real, HostMatrix, SparseMatrix_SlicedEllpack >( benchmark, hostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR ); + benchmarkBinarySpMV< Real, HostMatrix, SparseMatrix_ChunkedEllpack >( benchmark, hostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR ); + benchmarkBinarySpMV< Real, HostMatrix, SparseMatrix_BiEllpack >( benchmark, hostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR ); +} + +template< typename Real > +void +dispatchSpMV( BenchmarkType& benchmark, + const TNL::Containers::Vector< Real, Devices::Host, int >& hostOutVector, + const String& inputFileName, + bool allCpuTests, + bool verboseMR ) +{ + using HostMatrixType = TNL::Matrices::SparseMatrix< Real, TNL::Devices::Host >; + HostMatrixType hostMatrix; + TNL::Matrices::MatrixReader< HostMatrixType >::readMtx( inputFileName, hostMatrix, verboseMR ); + benchmarkSpMV< Real, HostMatrixType, SparseMatrix_CSR_Scalar >( benchmark, hostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR ); + benchmarkSpMV< Real, HostMatrixType, SparseMatrix_CSR_Vector >( benchmark, hostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR ); + //benchmarkSpMV< Real, HostMatrixType, SparseMatrix_CSR_Hybrid >( benchmark, hostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR ); + benchmarkSpMVCSRLight< Real, HostMatrixType, SparseMatrix_CSR_Light >( benchmark, hostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR ); + benchmarkSpMV< Real, HostMatrixType, SparseMatrix_CSR_Adaptive >( benchmark, hostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR ); + benchmarkSpMV< Real, HostMatrixType, SparseMatrix_Ellpack >( benchmark, hostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR ); + benchmarkSpMV< Real, HostMatrixType, SparseMatrix_SlicedEllpack >( benchmark, hostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR ); + benchmarkSpMV< Real, HostMatrixType, SparseMatrix_ChunkedEllpack >( benchmark, 
hostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR ); + benchmarkSpMV< Real, HostMatrixType, SparseMatrix_BiEllpack >( benchmark, hostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR ); + dispatchBinary< Real >( benchmark, hostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR ); +#ifdef WITH_TNL_BENCHMARK_SPMV_SANDBOX_MATRIX + benchmarkSpMV< Real, HostMatrixType, SparseSandboxMatrix >( benchmark, hostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR ); +#endif +} + +template< typename Real, typename SymmetricInputMatrix > +void +dispatchSymmetricBinary( BenchmarkType& benchmark, + const SymmetricInputMatrix& symmetricHostMatrix, + const TNL::Containers::Vector< Real, Devices::Host, int >& hostOutVector, + const String& inputFileName, + bool allCpuTests, + bool verboseMR ) +{ + benchmarkBinarySpMV< Real, SymmetricInputMatrix, SymmetricSparseMatrix_CSR_Scalar >( benchmark, symmetricHostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR ); + benchmarkBinarySpMV< Real, SymmetricInputMatrix, SymmetricSparseMatrix_CSR_Vector >( benchmark, symmetricHostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR ); + //benchmarkBinarySpMV< Real, SymmetricInputMatrix, SymmetricSparseMatrix_CSR_Hybrid >( benchmark, symmetricHostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR ); + benchmarkSpMVCSRLight< Real, SymmetricInputMatrix, SymmetricSparseMatrix_CSR_Light, bool >( benchmark, symmetricHostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR ); + benchmarkBinarySpMV< Real, SymmetricInputMatrix, SymmetricSparseMatrix_CSR_Adaptive >( benchmark, symmetricHostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR ); + benchmarkBinarySpMV< Real, SymmetricInputMatrix, SymmetricSparseMatrix_Ellpack >( benchmark, symmetricHostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR ); + benchmarkBinarySpMV< Real, SymmetricInputMatrix, SymmetricSparseMatrix_SlicedEllpack >( benchmark, 
symmetricHostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR ); + benchmarkBinarySpMV< Real, SymmetricInputMatrix, SymmetricSparseMatrix_ChunkedEllpack >( benchmark, symmetricHostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR ); + benchmarkBinarySpMV< Real, SymmetricInputMatrix, SymmetricSparseMatrix_BiEllpack >( benchmark, symmetricHostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR ); +} + +template< typename Real > +void +dispatchSymmetric( BenchmarkType& benchmark, + const TNL::Containers::Vector< Real, Devices::Host, int >& hostOutVector, + const String& inputFileName, + bool allCpuTests, + bool verboseMR ) +{ + using SymmetricInputMatrix = TNL::Matrices::SparseMatrix< Real, TNL::Devices::Host, int, TNL::Matrices::SymmetricMatrix >; + using InputMatrix = TNL::Matrices::SparseMatrix< Real, TNL::Devices::Host, int >; + SymmetricInputMatrix symmetricHostMatrix; + try + { + TNL::Matrices::MatrixReader< SymmetricInputMatrix >::readMtx( inputFileName, symmetricHostMatrix, verboseMR ); + } + catch(const std::exception& e) + { + benchmark.addErrorMessage( "Unable to read the symmetric matrix: " + String(e.what()) ); + return; + } + InputMatrix hostMatrix; + TNL::Matrices::MatrixReader< InputMatrix >::readMtx( inputFileName, hostMatrix, verboseMR ); + // TODO: Comparison of symmetric and general matrix does not work yet. + //if( hostMatrix != symmetricHostMatrix ) + //{ + // std::cerr << "ERROR: Symmetric matrices do not match !!!" 
<< std::endl; + //} + benchmarkSpMV< Real, SymmetricInputMatrix, SymmetricSparseMatrix_CSR_Scalar >( benchmark, symmetricHostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR ); + benchmarkSpMV< Real, SymmetricInputMatrix, SymmetricSparseMatrix_CSR_Vector >( benchmark, symmetricHostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR ); + //benchmarkSpMV< Real, SymmetricInputMatrix, SymmetricSparseMatrix_CSR_Hybrid >( benchmark, symmetricHostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR ); + benchmarkSpMVCSRLight< Real, SymmetricInputMatrix, SymmetricSparseMatrix_CSR_Light >( benchmark, symmetricHostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR ); + benchmarkSpMV< Real, SymmetricInputMatrix, SymmetricSparseMatrix_CSR_Adaptive >( benchmark, symmetricHostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR ); + benchmarkSpMV< Real, SymmetricInputMatrix, SymmetricSparseMatrix_Ellpack >( benchmark, symmetricHostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR ); + benchmarkSpMV< Real, SymmetricInputMatrix, SymmetricSparseMatrix_SlicedEllpack >( benchmark, symmetricHostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR ); + benchmarkSpMV< Real, SymmetricInputMatrix, SymmetricSparseMatrix_ChunkedEllpack >( benchmark, symmetricHostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR ); + benchmarkSpMV< Real, SymmetricInputMatrix, SymmetricSparseMatrix_BiEllpack >( benchmark, symmetricHostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR ); + dispatchSymmetricBinary< Real >( benchmark, symmetricHostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR ); +} + template< typename Real = double, typename Index = int > void @@ -584,20 +720,26 @@ benchmarkSpmv( BenchmarkType& benchmark, // Set-up benchmark datasize // MatrixReader< CSRHostMatrix >::readMtx( inputFileName, csrHostMatrix, verboseMR ); - const int elements = csrHostMatrix.getNonzeroElementsCount(); - const 
double datasetSize = (double) elements * ( 2 * sizeof( Real ) + sizeof( int ) ) / oneGB; - benchmark.setOperation( datasetSize ); + const int nonzeros = csrHostMatrix.getNonzeroElementsCount(); + const double datasetSize = (double) nonzeros * ( 2 * sizeof( Real ) + sizeof( int ) ) / oneGB; + benchmark.setDatasetSize( datasetSize ); //// // Perform benchmark on host with CSR as a reference CPU format // - auto nonzeros = csrHostMatrix.getNonzeroElementsCount(); - benchmark.addCommonLogs( BenchmarkType::CommonLogs( { - { "matrix name", convertToString( inputFileName ) }, + benchmark.setMetadataColumns({ + { "matrix name", inputFileName }, + { "precision", getType< Real >() }, { "rows", convertToString( csrHostMatrix.getRows() ) }, { "columns", convertToString( csrHostMatrix.getColumns() ) }, { "nonzeros", convertToString( nonzeros ) }, - { "nonzeros per row", convertToString( ( double ) nonzeros / ( double ) csrHostMatrix.getRows() ) } } ) ); + // NOTE: this can be easily calculated with Pandas based on the other metadata + //{ "nonzeros per row", convertToString( ( double ) nonzeros / ( double ) csrHostMatrix.getRows() ) }, + }); + benchmark.setMetadataWidths({ + { "matrix name", 32 }, + { "format", 46 }, + }); HostVector hostInVector( csrHostMatrix.getRows() ), hostOutVector( csrHostMatrix.getRows() ); @@ -610,10 +752,9 @@ benchmarkSpmv( BenchmarkType& benchmark, csrHostMatrix.vectorProduct( hostInVector, hostOutVector ); }; - SpmvBenchmarkResult< Real, Devices::Host, int > csrBenchmarkResults( String( "CSR" ), hostOutVector, hostOutVector, csrHostMatrix.getNonzeroElementsCount() ); - benchmark.addLogsMetadata( csrBenchmarkResults.getTableHeader() ); - benchmark.writeHeader(); - benchmark.time< Devices::Host >( resetHostVectors, "", spmvCSRHost, csrBenchmarkResults ); + SpmvBenchmarkResult< Real, Devices::Host, int > csrBenchmarkResults( hostOutVector, hostOutVector ); + benchmark.setMetadataElement({ "format", "CSR" }); + benchmark.time< Devices::Host >( 
resetHostVectors, "CPU", spmvCSRHost, csrBenchmarkResults ); #ifdef HAVE_PETSC Mat petscMatrix; @@ -640,10 +781,9 @@ benchmarkSpmv( BenchmarkType& benchmark, MatMult( petscMatrix, inVector, outVector ); }; - SpmvBenchmarkResult< Real, Devices::Host, int > petscBenchmarkResults( String( "Petsc" ), hostOutVector, hostOutVector, csrHostMatrix.getNonzeroElementsCount() ); - //benchmark.addLogsMetadata( petscBenchmarkResults.getTableHeader() ); - //benchmark.writeHeader(); - benchmark.time< Devices::Host >( resetPetscVectors, "", petscSpmvCSRHost, petscBenchmarkResults ); + SpmvBenchmarkResult< Real, Devices::Host, int > petscBenchmarkResults( hostOutVector, hostOutVector ); + benchmark.setMetadataElement({ "format", "Petsc" }); + benchmark.time< Devices::Host >( resetPetscVectors, "CPU", petscSpmvCSRHost, petscBenchmarkResults ); #endif @@ -671,15 +811,14 @@ benchmarkSpmv( BenchmarkType& benchmark, cusparseMatrix.vectorProduct( cudaInVector, cudaOutVector ); }; - SpmvBenchmarkResult< Real, Devices::Cuda, int > cudaBenchmarkResults( String( "cusparse" ), hostOutVector, cudaOutVector, csrHostMatrix.getNonzeroElementsCount() ); + SpmvBenchmarkResult< Real, Devices::Cuda, int > cudaBenchmarkResults( hostOutVector, cudaOutVector ); + benchmark.setMetadataElement({ "format", "cusparse" }); benchmark.time< Devices::Cuda >( resetCusparseVectors, "GPU", spmvCusparse, cudaBenchmarkResults ); #ifdef HAVE_CSR5 //// // Perform benchmark on CUDA device with CSR5 as a reference GPU format // - cudaBenchmarkResults.setFormat( String( "CSR5" ) ); - CudaVector cudaOutVector2( cudaOutVector ); CSR5Benchmark::CSR5Benchmark< CSRCudaMatrix > csr5Benchmark( csrCudaMatrix, cudaInVector, cudaOutVector ); @@ -687,6 +826,7 @@ benchmarkSpmv( BenchmarkType& benchmark, csr5Benchmark.vectorProduct(); }; + benchmark.setMetadataElement({ "format", "CSR5" }); benchmark.time< Devices::Cuda >( resetCusparseVectors, "GPU", csr5SpMV, cudaBenchmarkResults ); std::cerr << "CSR5 error = " << max( abs( 
cudaOutVector - cudaOutVector2 ) ) << std::endl; csrCudaMatrix.reset(); @@ -695,8 +835,6 @@ benchmarkSpmv( BenchmarkType& benchmark, //// // Perform benchmark on CUDA device with LightSpMV as a reference GPU format // - cudaBenchmarkResults.setFormat( String( "LightSpMV Vector" ) ); - LightSpMVCSRHostMatrix lightSpMVCSRHostMatrix; lightSpMVCSRHostMatrix = csrHostMatrix; LightSpMVBenchmark< Real > lightSpMVBenchmark( lightSpMVCSRHostMatrix, LightSpMVBenchmarkKernelVector ); @@ -707,123 +845,70 @@ benchmarkSpmv( BenchmarkType& benchmark, auto spmvLightSpMV = [&]() { lightSpMVBenchmark.vectorProduct(); }; + benchmark.setMetadataElement({ "format", "LightSpMV Vector" }); benchmark.time< Devices::Cuda >( resetLightSpMVVectors, "GPU", spmvLightSpMV, cudaBenchmarkResults ); - cudaBenchmarkResults.setFormat( String( "LightSpMV Warp" ) ); lightSpMVBenchmark.setKernelType( LightSpMVBenchmarkKernelWarp ); + benchmark.setMetadataElement({ "format", "LightSpMV Warp" }); benchmark.time< Devices::Cuda >( resetLightSpMVVectors, "GPU", spmvLightSpMV, cudaBenchmarkResults ); #endif csrHostMatrix.reset(); bool allCpuTests = parameters.getParameter< bool >( "with-all-cpu-tests" ); -#ifdef WITH_TNL_BENCHMARK_SPMV_LEGACY_FORMATS ///// // Benchmarking of TNL legacy formats // if( parameters.getParameter< bool >("with-legacy-matrices") ) - { - using namespace Benchmarks::SpMV::ReferenceFormats; - benchmarkSpMVLegacy< Real, SparseMatrixLegacy_CSR_Scalar >( benchmark, hostOutVector, inputFileName, allCpuTests, verboseMR ); - benchmarkSpMVLegacy< Real, SparseMatrixLegacy_CSR_Vector >( benchmark, hostOutVector, inputFileName, allCpuTests, verboseMR ); - //benchmarkSpMVLegacy< Real, SparseMatrixLegacy_CSR_Light >( benchmark, hostOutVector, inputFileName, verboseMR ); - //benchmarkSpMVLegacy< Real, SparseMatrixLegacy_CSR_Light2 >( benchmark, hostOutVector, inputFileName, verboseMR ); - //benchmarkSpMVLegacy< Real, SparseMatrixLegacy_CSR_Light3 >( benchmark, hostOutVector, inputFileName, 
verboseMR ); - //benchmarkSpMVLegacy< Real, SparseMatrixLegacy_CSR_Light4 >( benchmark, hostOutVector, inputFileName, verboseMR ); - //benchmarkSpMVLegacy< Real, SparseMatrixLegacy_CSR_Light5 >( benchmark, hostOutVector, inputFileName, verboseMR ); - //benchmarkSpMVLegacy< Real, SparseMatrixLegacy_CSR_Light6 >( benchmark, hostOutVector, inputFileName, verboseMR ); - benchmarkSpMVLegacy< Real, SparseMatrixLegacy_CSR_Adaptive >( benchmark, hostOutVector, inputFileName, allCpuTests, verboseMR ); - benchmarkSpMVLegacy< Real, SparseMatrixLegacy_CSR_MultiVector >( benchmark, hostOutVector, inputFileName, allCpuTests, verboseMR ); - benchmarkSpMVLegacy< Real, SparseMatrixLegacy_CSR_LightWithoutAtomic >( benchmark, hostOutVector, inputFileName, allCpuTests, verboseMR ); - benchmarkSpMVLegacy< Real, Legacy::Ellpack >( benchmark, hostOutVector, inputFileName, allCpuTests, verboseMR ); - benchmarkSpMVLegacy< Real, SlicedEllpackAlias >( benchmark, hostOutVector, inputFileName, allCpuTests, verboseMR ); - benchmarkSpMVLegacy< Real, Legacy::ChunkedEllpack >( benchmark, hostOutVector, inputFileName, allCpuTests, verboseMR ); - benchmarkSpMVLegacy< Real, Legacy::BiEllpack >( benchmark, hostOutVector, inputFileName, allCpuTests, verboseMR ); - } - // AdEllpack is broken - //benchmarkSpMV< Real, Matrices::AdEllpack >( benchmark, hostOutVector, inputFileName, verboseMR ); -#endif + dispatchLegacy< Real >( benchmark, hostOutVector, inputFileName, allCpuTests, verboseMR ); -#ifdef WITH_TNL_BENCHMARK_SPMV_GENERAL_MATRICES ///// // Benchmarking TNL formats // - using HostMatrixType = TNL::Matrices::SparseMatrix< Real, TNL::Devices::Host >; - HostMatrixType hostMatrix; - TNL::Matrices::MatrixReader< HostMatrixType >::readMtx( inputFileName, hostMatrix, verboseMR ); - benchmarkSpMV< Real, HostMatrixType, SparseMatrix_CSR_Scalar >( benchmark, hostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR ); - benchmarkSpMV< Real, HostMatrixType, SparseMatrix_CSR_Vector >( benchmark, 
hostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR ); - //benchmarkSpMV< Real, HostMatrixType, SparseMatrix_CSR_Hybrid >( benchmark, hostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR ); - benchmarkSpMVCSRLight< Real, HostMatrixType, SparseMatrix_CSR_Light >( benchmark, hostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR ); - benchmarkSpMV< Real, HostMatrixType, SparseMatrix_CSR_Adaptive >( benchmark, hostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR ); - benchmarkSpMV< Real, HostMatrixType, SparseMatrix_Ellpack >( benchmark, hostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR ); - benchmarkSpMV< Real, HostMatrixType, SparseMatrix_SlicedEllpack >( benchmark, hostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR ); - benchmarkSpMV< Real, HostMatrixType, SparseMatrix_ChunkedEllpack >( benchmark, hostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR ); - benchmarkSpMV< Real, HostMatrixType, SparseMatrix_BiEllpack >( benchmark, hostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR ); -#ifdef WITH_TNL_BENCHMARK_SPMV_BINARY_MATRICES - benchmarkBinarySpMV< Real, HostMatrixType, SparseMatrix_CSR_Scalar >( benchmark, hostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR ); - benchmarkBinarySpMV< Real, HostMatrixType, SparseMatrix_CSR_Vector >( benchmark, hostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR ); - benchmarkSpMVCSRLight< Real, HostMatrixType, SparseMatrix_CSR_Light, bool >( benchmark, hostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR ); - benchmarkBinarySpMV< Real, HostMatrixType, SparseMatrix_CSR_Adaptive >( benchmark, hostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR ); - benchmarkBinarySpMV< Real, HostMatrixType, SparseMatrix_Ellpack >( benchmark, hostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR ); - benchmarkBinarySpMV< Real, HostMatrixType, SparseMatrix_SlicedEllpack >( benchmark, 
hostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR ); - benchmarkBinarySpMV< Real, HostMatrixType, SparseMatrix_ChunkedEllpack >( benchmark, hostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR ); - benchmarkBinarySpMV< Real, HostMatrixType, SparseMatrix_BiEllpack >( benchmark, hostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR ); -#endif -#ifdef WITH_TNL_BENCHMARK_SPMV_SANDBOX_MATRIX - benchmarkSpMV< Real, HostMatrixType, SparseSandboxMatrix >( benchmark, hostMatrix, hostOutVector, inputFileName, verboseMR ); -#endif - hostMatrix.reset(); -#endif + dispatchSpMV< Real >( benchmark, hostOutVector, inputFileName, allCpuTests, verboseMR ); -#ifdef WITH_TNL_BENCHMARK_SPMV_SYMMETRIC_MATRICES ///// // Benchmarking symmetric sparse matrices // if( parameters.getParameter< bool >("with-symmetric-matrices") ) - { - using SymmetricInputMatrix = TNL::Matrices::SparseMatrix< Real, TNL::Devices::Host, int, TNL::Matrices::SymmetricMatrix >; - using InputMatrix = TNL::Matrices::SparseMatrix< Real, TNL::Devices::Host, int >; - SymmetricInputMatrix symmetricHostMatrix; - try - { - TNL::Matrices::MatrixReader< SymmetricInputMatrix >::readMtx( inputFileName, symmetricHostMatrix, verboseMR ); - } - catch(const std::exception& e) - { - std::cerr << e.what() << " ... SKIPPING " << std::endl; - return; - } - InputMatrix hostMatrix; - TNL::Matrices::MatrixReader< InputMatrix >::readMtx( inputFileName, hostMatrix, verboseMR ); - // TODO: Comparison of symmetric and general matrix does not work yet. - //if( hostMatrix != symmetricHostMatrix ) - //{ - // std::cerr << "ERROR: Symmetric matrices do not match !!!" 
<< std::endl; - //} - benchmarkSpMV< Real, SymmetricInputMatrix, SymmetricSparseMatrix_CSR_Scalar >( benchmark, symmetricHostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR ); - benchmarkSpMV< Real, SymmetricInputMatrix, SymmetricSparseMatrix_CSR_Vector >( benchmark, symmetricHostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR ); - //benchmarkSpMV< Real, SymmetricInputMatrix, SymmetricSparseMatrix_CSR_Hybrid >( benchmark, symmetricHostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR ); - benchmarkSpMVCSRLight< Real, SymmetricInputMatrix, SymmetricSparseMatrix_CSR_Light >( benchmark, symmetricHostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR ); - benchmarkSpMV< Real, SymmetricInputMatrix, SymmetricSparseMatrix_CSR_Adaptive >( benchmark, symmetricHostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR ); - benchmarkSpMV< Real, SymmetricInputMatrix, SymmetricSparseMatrix_Ellpack >( benchmark, symmetricHostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR ); - benchmarkSpMV< Real, SymmetricInputMatrix, SymmetricSparseMatrix_SlicedEllpack >( benchmark, symmetricHostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR ); - benchmarkSpMV< Real, SymmetricInputMatrix, SymmetricSparseMatrix_ChunkedEllpack >( benchmark, symmetricHostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR ); - benchmarkSpMV< Real, SymmetricInputMatrix, SymmetricSparseMatrix_BiEllpack >( benchmark, symmetricHostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR ); -#ifdef WITH_TNL_BENCHMARK_SPMV_BINARY_MATRICES - benchmarkBinarySpMV< Real, SymmetricInputMatrix, SymmetricSparseMatrix_CSR_Scalar >( benchmark, symmetricHostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR ); - benchmarkBinarySpMV< Real, SymmetricInputMatrix, SymmetricSparseMatrix_CSR_Vector >( benchmark, symmetricHostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR ); - //benchmarkBinarySpMV< Real, 
SymmetricInputMatrix, SymmetricSparseMatrix_CSR_Hybrid >( benchmark, symmetricHostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR ); - benchmarkSpMVCSRLight< Real, SymmetricInputMatrix, SymmetricSparseMatrix_CSR_Light, bool >( benchmark, symmetricHostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR ); - benchmarkBinarySpMV< Real, SymmetricInputMatrix, SymmetricSparseMatrix_CSR_Adaptive >( benchmark, symmetricHostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR ); - benchmarkBinarySpMV< Real, SymmetricInputMatrix, SymmetricSparseMatrix_Ellpack >( benchmark, symmetricHostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR ); - benchmarkBinarySpMV< Real, SymmetricInputMatrix, SymmetricSparseMatrix_SlicedEllpack >( benchmark, symmetricHostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR ); - benchmarkBinarySpMV< Real, SymmetricInputMatrix, SymmetricSparseMatrix_ChunkedEllpack >( benchmark, symmetricHostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR ); - benchmarkBinarySpMV< Real, SymmetricInputMatrix, SymmetricSparseMatrix_BiEllpack >( benchmark, symmetricHostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR ); -#endif - } -#endif + dispatchSymmetric< Real >( benchmark, hostOutVector, inputFileName, allCpuTests, verboseMR ); } - } // namespace SpMVLegacy +// =============== EXPLICIT TEMPLATE INSTANTIATIONS =============== +// The explicit template declarations (extern ...) are converted to definitions +// in separate source files using the eti.py script. The developer should call +// this script whenever the declarations are changed and commit the generated +// definitions in the git repository. +// +// IMPORTANT: +// - Each template instantiation must be written on exactly one line (the code +// generator script (eti.py) does not support parsing multiple lines). +// - Make sure that all "dispatch*" functions that are called above are +// instantiated below.
+// - Also make sure that all functions that are explicitly instantiated below +// are actually used. +// - Explicit template instantiations cannot be guarded by #ifdef (the code +// generator script (eti.py) does not support parsing macros). +// - For optimum compilation performance, the explicitly instantiated functions +// should be as independent as possible. The compilation of each explicit +// instantiation should take about the same time so that the work load in a +// parallel build is balanced. Functions that are not instantiated explicitly +// will be compiled in the main unit that is compiled serially. + +extern template void dispatchLegacy< float >( BenchmarkType&, const Containers::Vector< float, Devices::Host, int >&, const String&, bool, bool ); +extern template void dispatchLegacy< double >( BenchmarkType&, const Containers::Vector< double, Devices::Host, int >&, const String&, bool, bool ); + +// NOTE(review): dispatchBinary< double > below is declared with SparseMatrix< float, ... >, while dispatchSpMV< double > passes SparseMatrix< double, ... > - looks like a copy-paste slip; confirm, and if so fix the declaration and regenerate the definitions with eti.py. +extern template void dispatchBinary< float >( BenchmarkType&, const Matrices::SparseMatrix< float, Devices::Host >&, const Containers::Vector< float, Devices::Host, int >&, const String&, bool, bool ); +extern template void dispatchBinary< double >( BenchmarkType&, const Matrices::SparseMatrix< float, Devices::Host >&, const Containers::Vector< double, Devices::Host, int >&, const String&, bool, bool ); + +extern template void dispatchSpMV< float >( BenchmarkType&, const Containers::Vector< float, Devices::Host, int >&, const String&, bool, bool ); +extern template void dispatchSpMV< double >( BenchmarkType&, const Containers::Vector< double, Devices::Host, int >&, const String&, bool, bool ); + +extern template void dispatchSymmetric< float >( BenchmarkType&, const Containers::Vector< float, Devices::Host, int >&, const String&, bool, bool ); +extern template void dispatchSymmetric< double >( BenchmarkType&, const Containers::Vector< double, Devices::Host, int >&, const String&, bool, bool ); + +extern template void dispatchSymmetricBinary< float >( 
BenchmarkType&, const Matrices::SparseMatrix< float, Devices::Host >&, const Containers::Vector< float, Devices::Host, int >&, const String&, bool, bool ); +extern template void dispatchSymmetricBinary< double >( BenchmarkType&, const Matrices::SparseMatrix< float, Devices::Host >&, const Containers::Vector< double, Devices::Host, int >&, const String&, bool, bool ); + + } // namespace SpMV } // namespace Benchmarks } // namespace TNL diff --git a/src/Benchmarks/SpMV/spmv.templates/spmv.t0.cpp b/src/Benchmarks/SpMV/spmv.templates/spmv.t0.cpp new file mode 100644 index 0000000000000000000000000000000000000000..fafd4b2b01ce4094a897dabf8261b210b8fe088e --- /dev/null +++ b/src/Benchmarks/SpMV/spmv.templates/spmv.t0.cpp @@ -0,0 +1,8 @@ +#include "../spmv.h" +namespace TNL { +namespace Benchmarks { +namespace SpMV { +template void dispatchLegacy< float >( BenchmarkType&, const Containers::Vector< float, Devices::Host, int >&, const String&, bool, bool ); +} // namespace TNL +} // namespace Benchmarks +} // namespace SpMV diff --git a/src/Benchmarks/SpMV/spmv.templates/spmv.t0.cu b/src/Benchmarks/SpMV/spmv.templates/spmv.t0.cu new file mode 100644 index 0000000000000000000000000000000000000000..fafd4b2b01ce4094a897dabf8261b210b8fe088e --- /dev/null +++ b/src/Benchmarks/SpMV/spmv.templates/spmv.t0.cu @@ -0,0 +1,8 @@ +#include "../spmv.h" +namespace TNL { +namespace Benchmarks { +namespace SpMV { +template void dispatchLegacy< float >( BenchmarkType&, const Containers::Vector< float, Devices::Host, int >&, const String&, bool, bool ); +} // namespace TNL +} // namespace Benchmarks +} // namespace SpMV diff --git a/src/Benchmarks/SpMV/spmv.templates/spmv.t1.cpp b/src/Benchmarks/SpMV/spmv.templates/spmv.t1.cpp new file mode 100644 index 0000000000000000000000000000000000000000..fbd1d792ab8af6c1a5083cfa6127ecdf072b666f --- /dev/null +++ b/src/Benchmarks/SpMV/spmv.templates/spmv.t1.cpp @@ -0,0 +1,8 @@ +#include "../spmv.h" +namespace TNL { +namespace Benchmarks { +namespace 
SpMV { +template void dispatchLegacy< double >( BenchmarkType&, const Containers::Vector< double, Devices::Host, int >&, const String&, bool, bool ); +} // namespace TNL +} // namespace Benchmarks +} // namespace SpMV diff --git a/src/Benchmarks/SpMV/spmv.templates/spmv.t1.cu b/src/Benchmarks/SpMV/spmv.templates/spmv.t1.cu new file mode 100644 index 0000000000000000000000000000000000000000..fbd1d792ab8af6c1a5083cfa6127ecdf072b666f --- /dev/null +++ b/src/Benchmarks/SpMV/spmv.templates/spmv.t1.cu @@ -0,0 +1,8 @@ +#include "../spmv.h" +namespace TNL { +namespace Benchmarks { +namespace SpMV { +template void dispatchLegacy< double >( BenchmarkType&, const Containers::Vector< double, Devices::Host, int >&, const String&, bool, bool ); +} // namespace TNL +} // namespace Benchmarks +} // namespace SpMV diff --git a/src/Benchmarks/SpMV/spmv.templates/spmv.t2.cpp b/src/Benchmarks/SpMV/spmv.templates/spmv.t2.cpp new file mode 100644 index 0000000000000000000000000000000000000000..bf67f96e16ed354300913caff8478f3050405484 --- /dev/null +++ b/src/Benchmarks/SpMV/spmv.templates/spmv.t2.cpp @@ -0,0 +1,8 @@ +#include "../spmv.h" +namespace TNL { +namespace Benchmarks { +namespace SpMV { +template void dispatchBinary< float >( BenchmarkType&, const Matrices::SparseMatrix< float, Devices::Host >&, const Containers::Vector< float, Devices::Host, int >&, const String&, bool, bool ); +} // namespace TNL +} // namespace Benchmarks +} // namespace SpMV diff --git a/src/Benchmarks/SpMV/spmv.templates/spmv.t2.cu b/src/Benchmarks/SpMV/spmv.templates/spmv.t2.cu new file mode 100644 index 0000000000000000000000000000000000000000..bf67f96e16ed354300913caff8478f3050405484 --- /dev/null +++ b/src/Benchmarks/SpMV/spmv.templates/spmv.t2.cu @@ -0,0 +1,8 @@ +#include "../spmv.h" +namespace TNL { +namespace Benchmarks { +namespace SpMV { +template void dispatchBinary< float >( BenchmarkType&, const Matrices::SparseMatrix< float, Devices::Host >&, const Containers::Vector< float, Devices::Host, int 
>&, const String&, bool, bool ); +} // namespace TNL +} // namespace Benchmarks +} // namespace SpMV diff --git a/src/Benchmarks/SpMV/spmv.templates/spmv.t3.cpp b/src/Benchmarks/SpMV/spmv.templates/spmv.t3.cpp new file mode 100644 index 0000000000000000000000000000000000000000..3d11252c95e93b1d2a183c12190bd08eb8ba9b0e --- /dev/null +++ b/src/Benchmarks/SpMV/spmv.templates/spmv.t3.cpp @@ -0,0 +1,8 @@ +#include "../spmv.h" +namespace TNL { +namespace Benchmarks { +namespace SpMV { +template void dispatchBinary< double >( BenchmarkType&, const Matrices::SparseMatrix< float, Devices::Host >&, const Containers::Vector< double, Devices::Host, int >&, const String&, bool, bool ); +} // namespace TNL +} // namespace Benchmarks +} // namespace SpMV diff --git a/src/Benchmarks/SpMV/spmv.templates/spmv.t3.cu b/src/Benchmarks/SpMV/spmv.templates/spmv.t3.cu new file mode 100644 index 0000000000000000000000000000000000000000..3d11252c95e93b1d2a183c12190bd08eb8ba9b0e --- /dev/null +++ b/src/Benchmarks/SpMV/spmv.templates/spmv.t3.cu @@ -0,0 +1,8 @@ +#include "../spmv.h" +namespace TNL { +namespace Benchmarks { +namespace SpMV { +template void dispatchBinary< double >( BenchmarkType&, const Matrices::SparseMatrix< float, Devices::Host >&, const Containers::Vector< double, Devices::Host, int >&, const String&, bool, bool ); +} // namespace TNL +} // namespace Benchmarks +} // namespace SpMV diff --git a/src/Benchmarks/SpMV/spmv.templates/spmv.t4.cpp b/src/Benchmarks/SpMV/spmv.templates/spmv.t4.cpp new file mode 100644 index 0000000000000000000000000000000000000000..735f853e79dbca17406e69899e191471c75a009a --- /dev/null +++ b/src/Benchmarks/SpMV/spmv.templates/spmv.t4.cpp @@ -0,0 +1,8 @@ +#include "../spmv.h" +namespace TNL { +namespace Benchmarks { +namespace SpMV { +template void dispatchSpMV< float >( BenchmarkType&, const Containers::Vector< float, Devices::Host, int >&, const String&, bool, bool ); +} // namespace TNL +} // namespace Benchmarks +} // namespace SpMV diff --git 
a/src/Benchmarks/SpMV/spmv.templates/spmv.t4.cu b/src/Benchmarks/SpMV/spmv.templates/spmv.t4.cu new file mode 100644 index 0000000000000000000000000000000000000000..735f853e79dbca17406e69899e191471c75a009a --- /dev/null +++ b/src/Benchmarks/SpMV/spmv.templates/spmv.t4.cu @@ -0,0 +1,8 @@ +#include "../spmv.h" +namespace TNL { +namespace Benchmarks { +namespace SpMV { +template void dispatchSpMV< float >( BenchmarkType&, const Containers::Vector< float, Devices::Host, int >&, const String&, bool, bool ); +} // namespace TNL +} // namespace Benchmarks +} // namespace SpMV diff --git a/src/Benchmarks/SpMV/spmv.templates/spmv.t5.cpp b/src/Benchmarks/SpMV/spmv.templates/spmv.t5.cpp new file mode 100644 index 0000000000000000000000000000000000000000..e880c7d586b1cc380b6aa012f6f9e79d21ab82f3 --- /dev/null +++ b/src/Benchmarks/SpMV/spmv.templates/spmv.t5.cpp @@ -0,0 +1,8 @@ +#include "../spmv.h" +namespace TNL { +namespace Benchmarks { +namespace SpMV { +template void dispatchSpMV< double >( BenchmarkType&, const Containers::Vector< double, Devices::Host, int >&, const String&, bool, bool ); +} // namespace TNL +} // namespace Benchmarks +} // namespace SpMV diff --git a/src/Benchmarks/SpMV/spmv.templates/spmv.t5.cu b/src/Benchmarks/SpMV/spmv.templates/spmv.t5.cu new file mode 100644 index 0000000000000000000000000000000000000000..e880c7d586b1cc380b6aa012f6f9e79d21ab82f3 --- /dev/null +++ b/src/Benchmarks/SpMV/spmv.templates/spmv.t5.cu @@ -0,0 +1,8 @@ +#include "../spmv.h" +namespace TNL { +namespace Benchmarks { +namespace SpMV { +template void dispatchSpMV< double >( BenchmarkType&, const Containers::Vector< double, Devices::Host, int >&, const String&, bool, bool ); +} // namespace TNL +} // namespace Benchmarks +} // namespace SpMV diff --git a/src/Benchmarks/SpMV/spmv.templates/spmv.t6.cpp b/src/Benchmarks/SpMV/spmv.templates/spmv.t6.cpp new file mode 100644 index 0000000000000000000000000000000000000000..082fe7967c41b3ec9a1a90dc70e48556becd334c --- /dev/null +++ 
b/src/Benchmarks/SpMV/spmv.templates/spmv.t6.cpp @@ -0,0 +1,8 @@ +#include "../spmv.h" +namespace TNL { +namespace Benchmarks { +namespace SpMV { +template void dispatchSymmetric< float >( BenchmarkType&, const Containers::Vector< float, Devices::Host, int >&, const String&, bool, bool ); +} // namespace TNL +} // namespace Benchmarks +} // namespace SpMV diff --git a/src/Benchmarks/SpMV/spmv.templates/spmv.t6.cu b/src/Benchmarks/SpMV/spmv.templates/spmv.t6.cu new file mode 100644 index 0000000000000000000000000000000000000000..082fe7967c41b3ec9a1a90dc70e48556becd334c --- /dev/null +++ b/src/Benchmarks/SpMV/spmv.templates/spmv.t6.cu @@ -0,0 +1,8 @@ +#include "../spmv.h" +namespace TNL { +namespace Benchmarks { +namespace SpMV { +template void dispatchSymmetric< float >( BenchmarkType&, const Containers::Vector< float, Devices::Host, int >&, const String&, bool, bool ); +} // namespace TNL +} // namespace Benchmarks +} // namespace SpMV diff --git a/src/Benchmarks/SpMV/spmv.templates/spmv.t7.cpp b/src/Benchmarks/SpMV/spmv.templates/spmv.t7.cpp new file mode 100644 index 0000000000000000000000000000000000000000..a1191d34dc9e0f61710abfede52d1e0f72ebf6b0 --- /dev/null +++ b/src/Benchmarks/SpMV/spmv.templates/spmv.t7.cpp @@ -0,0 +1,8 @@ +#include "../spmv.h" +namespace TNL { +namespace Benchmarks { +namespace SpMV { +template void dispatchSymmetric< double >( BenchmarkType&, const Containers::Vector< double, Devices::Host, int >&, const String&, bool, bool ); +} // namespace TNL +} // namespace Benchmarks +} // namespace SpMV diff --git a/src/Benchmarks/SpMV/spmv.templates/spmv.t7.cu b/src/Benchmarks/SpMV/spmv.templates/spmv.t7.cu new file mode 100644 index 0000000000000000000000000000000000000000..a1191d34dc9e0f61710abfede52d1e0f72ebf6b0 --- /dev/null +++ b/src/Benchmarks/SpMV/spmv.templates/spmv.t7.cu @@ -0,0 +1,8 @@ +#include "../spmv.h" +namespace TNL { +namespace Benchmarks { +namespace SpMV { +template void dispatchSymmetric< double >( BenchmarkType&, const 
Containers::Vector< double, Devices::Host, int >&, const String&, bool, bool ); +} // namespace TNL +} // namespace Benchmarks +} // namespace SpMV diff --git a/src/Benchmarks/SpMV/spmv.templates/spmv.t8.cpp b/src/Benchmarks/SpMV/spmv.templates/spmv.t8.cpp new file mode 100644 index 0000000000000000000000000000000000000000..fea28e004019fc4aa767798e6030fed5cca41160 --- /dev/null +++ b/src/Benchmarks/SpMV/spmv.templates/spmv.t8.cpp @@ -0,0 +1,8 @@ +#include "../spmv.h" +namespace TNL { +namespace Benchmarks { +namespace SpMV { +template void dispatchSymmetricBinary< float >( BenchmarkType&, const Matrices::SparseMatrix< float, Devices::Host >&, const Containers::Vector< float, Devices::Host, int >&, const String&, bool, bool ); +} // namespace TNL +} // namespace Benchmarks +} // namespace SpMV diff --git a/src/Benchmarks/SpMV/spmv.templates/spmv.t8.cu b/src/Benchmarks/SpMV/spmv.templates/spmv.t8.cu new file mode 100644 index 0000000000000000000000000000000000000000..fea28e004019fc4aa767798e6030fed5cca41160 --- /dev/null +++ b/src/Benchmarks/SpMV/spmv.templates/spmv.t8.cu @@ -0,0 +1,8 @@ +#include "../spmv.h" +namespace TNL { +namespace Benchmarks { +namespace SpMV { +template void dispatchSymmetricBinary< float >( BenchmarkType&, const Matrices::SparseMatrix< float, Devices::Host >&, const Containers::Vector< float, Devices::Host, int >&, const String&, bool, bool ); +} // namespace TNL +} // namespace Benchmarks +} // namespace SpMV diff --git a/src/Benchmarks/SpMV/spmv.templates/spmv.t9.cpp b/src/Benchmarks/SpMV/spmv.templates/spmv.t9.cpp new file mode 100644 index 0000000000000000000000000000000000000000..16c242a796e7a09012a746bf2a3eca9599fe372d --- /dev/null +++ b/src/Benchmarks/SpMV/spmv.templates/spmv.t9.cpp @@ -0,0 +1,8 @@ +#include "../spmv.h" +namespace TNL { +namespace Benchmarks { +namespace SpMV { +template void dispatchSymmetricBinary< double >( BenchmarkType&, const Matrices::SparseMatrix< float, Devices::Host >&, const Containers::Vector< double, 
Devices::Host, int >&, const String&, bool, bool ); +} // namespace TNL +} // namespace Benchmarks +} // namespace SpMV diff --git a/src/Benchmarks/SpMV/spmv.templates/spmv.t9.cu b/src/Benchmarks/SpMV/spmv.templates/spmv.t9.cu new file mode 100644 index 0000000000000000000000000000000000000000..16c242a796e7a09012a746bf2a3eca9599fe372d --- /dev/null +++ b/src/Benchmarks/SpMV/spmv.templates/spmv.t9.cu @@ -0,0 +1,8 @@ +#include "../spmv.h" +namespace TNL { +namespace Benchmarks { +namespace SpMV { +template void dispatchSymmetricBinary< double >( BenchmarkType&, const Matrices::SparseMatrix< float, Devices::Host >&, const Containers::Vector< double, Devices::Host, int >&, const String&, bool, bool ); +} // namespace TNL +} // namespace Benchmarks +} // namespace SpMV diff --git a/src/Benchmarks/SpMV/tnl-benchmark-spmv.h b/src/Benchmarks/SpMV/tnl-benchmark-spmv.h index c5ff2bb3fa90dd9cf88207d7b087767db8377825..dd2617d3bba16f1be9351e73719c7efe117bbdde 100644 --- a/src/Benchmarks/SpMV/tnl-benchmark-spmv.h +++ b/src/Benchmarks/SpMV/tnl-benchmark-spmv.h @@ -38,17 +38,10 @@ using namespace TNL::Benchmarks; template< typename Real > void runSpMVBenchmarks( TNL::Benchmarks::SpMV::BenchmarkType & benchmark, - TNL::Benchmarks::SpMV::BenchmarkType::MetadataMap metadata, const String & inputFileName, const Config::ParameterContainer& parameters, bool verboseMR = false ) { - const String precision = getType< Real >(); - metadata["precision"] = precision; - - // Sparse matrix-vector multiplication - benchmark.newBenchmark( String("Sparse matrix-vector multiplication (") + precision + ")", - metadata ); // Start the actual benchmark in spmv.h try { TNL::Benchmarks::SpMV::benchmarkSpmv< Real >( benchmark, inputFileName, parameters, verboseMR ); @@ -66,7 +59,7 @@ std::string getCurrDateTime() char buffer[ 80 ]; time( &rawtime ); timeinfo = localtime( &rawtime ); - strftime( buffer, sizeof( buffer ), "%d-%m-%Y--%H:%M:%S", timeinfo ); + strftime( buffer, sizeof( buffer ), 
"%Y-%m-%d--%H:%M:%S", timeinfo ); std::string curr_date_time( buffer ); return curr_date_time; } @@ -75,15 +68,14 @@ void setupConfig( Config::ConfigDescription & config ) { config.addDelimiter( "Benchmark settings:" ); - config.addEntry< String >( "input-file", "Input file name.", "" ); + config.addRequiredEntry< String >( "input-file", "Input file name." ); config.addEntry< bool >( "with-symmetric-matrices", "Perform benchmark even for symmetric matrix formats.", true ); config.addEntry< bool >( "with-legacy-matrices", "Perform benchmark even for legacy TNL matrix formats.", true ); config.addEntry< bool >( "with-all-cpu-tests", "All matrix formats are tested on both CPU and GPU. ", false ); config.addEntry< String >( "log-file", "Log file name.", "tnl-benchmark-spmv::" + getCurrDateTime() + ".log"); - config.addEntry< String >( "output-mode", "Mode for opening the log file - 'close' will only finalize the log file.", "append" ); + config.addEntry< String >( "output-mode", "Mode for opening the log file.", "append" ); config.addEntryEnum( "append" ); config.addEntryEnum( "overwrite" ); - config.addEntryEnum( "close" ); config.addEntry< String >( "precision", "Precision of the arithmetics.", "double" ); config.addEntryEnum( "float" ); config.addEntryEnum( "double" ); @@ -134,22 +126,13 @@ main( int argc, char* argv[] ) const int verboseMR = parameters.getParameter< int >( "verbose-MReader" ); // open log file - if( outputMode == "close" ) - { - std::fstream file; - file.open( logFileName.getString(), std::ios::out | std::ios::app ); - file << std::endl << " ]" << std::endl << "}"; - return EXIT_SUCCESS; - } if( inputFileName == "" ) { std::cerr << "ERROR: Input file name is required." 
<< std::endl; return EXIT_FAILURE; } - bool logFileAppend( false ); if( std::experimental::filesystem::exists(logFileName.getString()) ) { - logFileAppend = true; std::cout << "Log file " << logFileName << " exists and "; if( outputMode == "append" ) std::cout << "new logs will be appended." << std::endl; @@ -160,26 +143,22 @@ main( int argc, char* argv[] ) auto mode = std::ios::out; if( outputMode == "append" ) mode |= std::ios::app; - std::ofstream logFile( logFileName.getString(), mode ); + std::ofstream logFile( logFileName, mode ); - // init benchmark and common metadata - TNL::Benchmarks::SpMV::BenchmarkType benchmark( loops, verbose, outputMode, logFileAppend ); + // init benchmark and set parameters + TNL::Benchmarks::SpMV::BenchmarkType benchmark( logFile, loops, verbose ); - // prepare global metadata - TNL::Benchmarks::SpMV::BenchmarkType::MetadataMap metadata = getHardwareMetadata< Logging >(); + // write global metadata into a separate file + std::map< std::string, std::string > metadata = getHardwareMetadata(); + writeMapAsJson( metadata, logFileName, ".metadata.json" ); // Initiate setup of benchmarks if( precision == "all" || precision == "float" ) - runSpMVBenchmarks< float >( benchmark, metadata, inputFileName, parameters, verboseMR ); + runSpMVBenchmarks< float >( benchmark, inputFileName, parameters, verboseMR ); if( precision == "all" || precision == "double" ) - runSpMVBenchmarks< double >( benchmark, metadata, inputFileName, parameters, verboseMR ); - - if( ! benchmark.save( logFile ) ) { - std::cerr << "Failed to write the benchmark results to file '" << parameters.getParameter< String >( "log-file" ) << "'." 
<< std::endl; - return EXIT_FAILURE; - } + runSpMVBenchmarks< double >( benchmark, inputFileName, parameters, verboseMR ); // Confirm that the benchmark has finished - std::cout << "\n== BENCHMARK FINISHED ==" << std::endl; + std::cout << "\n==> BENCHMARK FINISHED" << std::endl; return EXIT_SUCCESS; } diff --git a/src/Benchmarks/Traversers/BenchmarkTraverserUserData.h b/src/Benchmarks/Traversers/BenchmarkTraverserUserData.h index 2ae00ec697452aad99163752bbd83fee4c5af1f1..35b08d993e2240dfb11ca8414ae1e4691975ce32 100644 --- a/src/Benchmarks/Traversers/BenchmarkTraverserUserData.h +++ b/src/Benchmarks/Traversers/BenchmarkTraverserUserData.h @@ -12,6 +12,8 @@ #pragma once +#include + namespace TNL { namespace Benchmarks { namespace Traversers { @@ -25,7 +27,7 @@ class BenchmarkTraverserUserData using RealType = typename MeshType::RealType; using DeviceType = typename MeshType::DeviceType; using MeshFunctionPointer = Pointers::SharedPointer< MeshFunction >; - + BenchmarkTraverserUserData( MeshFunctionPointer& f ) : u( &f.template modifyData< DeviceType >() ), data( f->getData().getData() ){} diff --git a/src/Benchmarks/Traversers/CMakeLists.txt b/src/Benchmarks/Traversers/CMakeLists.txt index 5932d2606db852736f1ce665f4d52f53d0fa5d09..6b7712d2b3da3845d29a06b6ee2ba7d48be6a4b6 100644 --- a/src/Benchmarks/Traversers/CMakeLists.txt +++ b/src/Benchmarks/Traversers/CMakeLists.txt @@ -1,10 +1,9 @@ # TODO: Split the benchmark into several files for faster build -#if( BUILD_CUDA ) -# CUDA_ADD_EXECUTABLE( tnl-benchmark-traversers tnl-benchmark-traversers.cu ) -#else() -# ADD_EXECUTABLE( tnl-benchmark-traversers tnl-benchmark-traversers.cpp ) -#endif() - -#install( TARGETS tnl-benchmark-traversers RUNTIME DESTINATION bin ) +if( BUILD_CUDA ) + CUDA_ADD_EXECUTABLE( tnl-benchmark-traversers tnl-benchmark-traversers.cu ) +else() + ADD_EXECUTABLE( tnl-benchmark-traversers tnl-benchmark-traversers.cpp ) +endif() +install( TARGETS tnl-benchmark-traversers RUNTIME DESTINATION bin ) diff 
--git a/src/Benchmarks/Traversers/GridTraverserBenchmarkHelper.h b/src/Benchmarks/Traversers/GridTraverserBenchmarkHelper.h index 6da7ec09bb05d1591689688bf9fb85420fe5e820..13022458a5f0a153ae6bab14295c421cd5b8c183 100644 --- a/src/Benchmarks/Traversers/GridTraverserBenchmarkHelper.h +++ b/src/Benchmarks/Traversers/GridTraverserBenchmarkHelper.h @@ -12,10 +12,6 @@ #pragma once -#include "AddOneEntitiesProcessor.h" -#include "BenchmarkTraverserUserData.h" -#include "SimpleCell.h" - namespace TNL { namespace Benchmarks { namespace Traversers { @@ -23,7 +19,6 @@ namespace TNL { template< typename Grid > class GridTraverserBenchmarkHelper{}; - } // namespace Traversers } // namespace Benchmarks } // namespace TNL diff --git a/src/Benchmarks/Traversers/GridTraverserBenchmarkHelper_1D.h b/src/Benchmarks/Traversers/GridTraverserBenchmarkHelper_1D.h index e460a8bca4ac8edb77dcab823576349335de6a73..b2fc9bf9c86f0bfc0af43954d5b7489a54f72151 100644 --- a/src/Benchmarks/Traversers/GridTraverserBenchmarkHelper_1D.h +++ b/src/Benchmarks/Traversers/GridTraverserBenchmarkHelper_1D.h @@ -12,12 +12,15 @@ #pragma once +#include +#include +#include + #include "GridTraverserBenchmarkHelper.h" #include "AddOneEntitiesProcessor.h" #include "BenchmarkTraverserUserData.h" #include "SimpleCell.h" - namespace TNL { namespace Benchmarks { namespace Traversers { @@ -40,10 +43,10 @@ _GridTraverser1D( typedef Index IndexType; typedef Meshes::Grid< 1, Real, Devices::Cuda, Index > GridType; //typename GridType::CoordinatesType coordinates; - + GridEntity entity( *grid ); - entity.getCoordinates().x() = begin.x() + ( gridIdx * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x; - //coordinates.x() = begin.x() + ( gridIdx * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x; + entity.getCoordinates().x() = begin.x() + ( gridIdx * Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x; + //coordinates.x() = begin.x() + ( gridIdx * 
Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x; if( entity.getCoordinates() <= end ) { entity.refresh(); @@ -69,7 +72,7 @@ class GridTraverserBenchmarkHelper< Meshes::Grid< 1, Real, Devices::Host, Index using RealType = typename GridType::RealType; using IndexType = typename GridType::IndexType; using CoordinatesType = typename GridType::CoordinatesType; - using MeshFunction = Functions::MeshFunction< GridType >; + using MeshFunction = Functions::MeshFunctionView< GridType >; using MeshFunctionPointer = Pointers::SharedPointer< MeshFunction >; using CellType = typename GridType::template EntityType< Dimension, Meshes::GridEntityNoStencilStorage >; using SimpleCellType = SimpleCell< GridType >; @@ -108,7 +111,7 @@ class GridTraverserBenchmarkHelper< Meshes::Grid< 1, Real, Devices::Cuda, Index using RealType = typename GridType::RealType; using IndexType = typename GridType::IndexType; using CoordinatesType = typename GridType::CoordinatesType; - using MeshFunction = Functions::MeshFunction< GridType >; + using MeshFunction = Functions::MeshFunctionView< GridType >; using MeshFunctionPointer = Pointers::SharedPointer< MeshFunction >; using CellType = typename GridType::template EntityType< Dimension, Meshes::GridEntityNoStencilStorage >; using SimpleCellType = SimpleCell< GridType >; @@ -122,7 +125,7 @@ class GridTraverserBenchmarkHelper< Meshes::Grid< 1, Real, Devices::Cuda, Index { #ifdef HAVE_CUDA dim3 blockSize( 256 ), blocksCount, gridsCount; - Devices::Cuda::setupThreads( + Cuda::setupThreads( blockSize, blocksCount, gridsCount, @@ -131,7 +134,7 @@ class GridTraverserBenchmarkHelper< Meshes::Grid< 1, Real, Devices::Cuda, Index for( gridIdx.x = 0; gridIdx.x < gridsCount.x; gridIdx.x++ ) { dim3 gridSize; - Devices::Cuda::setupGrid( + Cuda::setupGrid( blocksCount, gridsCount, gridIdx, @@ -148,7 +151,7 @@ class GridTraverserBenchmarkHelper< Meshes::Grid< 1, Real, Devices::Cuda, Index #endif } }; - + } // namespace Traversers } // namespace 
Benchmarks } // namespace TNL diff --git a/src/Benchmarks/Traversers/GridTraverserBenchmarkHelper_2D.h b/src/Benchmarks/Traversers/GridTraverserBenchmarkHelper_2D.h index eca6c7fee0057e2689fa5f473a214d97266471e6..a3d1a04364f87311969e727413052dfde92e2d40 100644 --- a/src/Benchmarks/Traversers/GridTraverserBenchmarkHelper_2D.h +++ b/src/Benchmarks/Traversers/GridTraverserBenchmarkHelper_2D.h @@ -12,6 +12,10 @@ #pragma once +#include +#include +#include + #include "GridTraverserBenchmarkHelper.h" #include "AddOneEntitiesProcessor.h" #include "BenchmarkTraverserUserData.h" @@ -40,8 +44,8 @@ _GridTraverser2D( typedef Meshes::Grid< 2, Real, Devices::Cuda, Index > GridType; GridEntity entity( *grid ); - entity.getCoordinates().x() = begin.x() + ( gridIdx.x * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x; - entity.getCoordinates().y() = begin.y() + ( gridIdx.y * Devices::Cuda::getMaxGridSize() + blockIdx.y ) * blockDim.y + threadIdx.y; + entity.getCoordinates().x() = begin.x() + ( gridIdx.x * Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x; + entity.getCoordinates().y() = begin.y() + ( gridIdx.y * Cuda::getMaxGridSize() + blockIdx.y ) * blockDim.y + threadIdx.y; if( entity.getCoordinates() <= end ) { entity.refresh(); @@ -62,7 +66,7 @@ class GridTraverserBenchmarkHelper< Meshes::Grid< 2, Real, Devices::Host, Index using RealType = typename GridType::RealType; using IndexType = typename GridType::IndexType; using CoordinatesType = typename GridType::CoordinatesType; - using MeshFunction = Functions::MeshFunction< GridType >; + using MeshFunction = Functions::MeshFunctionView< GridType >; using MeshFunctionPointer = Pointers::SharedPointer< MeshFunction >; using CellType = typename GridType::template EntityType< Dimension, Meshes::GridEntityNoStencilStorage >; using SimpleCellType = SimpleCell< GridType >; @@ -104,7 +108,7 @@ class GridTraverserBenchmarkHelper< Meshes::Grid< 2, Real, Devices::Cuda, Index using RealType = typename 
GridType::RealType; using IndexType = typename GridType::IndexType; using CoordinatesType = typename GridType::CoordinatesType; - using MeshFunction = Functions::MeshFunction< GridType >; + using MeshFunction = Functions::MeshFunctionView< GridType >; using MeshFunctionPointer = Pointers::SharedPointer< MeshFunction >; using CellType = typename GridType::template EntityType< Dimension, Meshes::GridEntityNoStencilStorage >; using SimpleCellType = SimpleCell< GridType >; @@ -118,7 +122,7 @@ class GridTraverserBenchmarkHelper< Meshes::Grid< 2, Real, Devices::Cuda, Index { #ifdef HAVE_CUDA dim3 blockSize( 16, 16 ), blocksCount, gridsCount; - Devices::Cuda::setupThreads( + Cuda::setupThreads( blockSize, blocksCount, gridsCount, @@ -129,7 +133,7 @@ class GridTraverserBenchmarkHelper< Meshes::Grid< 2, Real, Devices::Cuda, Index for( gridIdx.x = 0; gridIdx.x < gridsCount.x; gridIdx.x++ ) { dim3 gridSize; - Devices::Cuda::setupGrid( + Cuda::setupGrid( blocksCount, gridsCount, gridIdx, diff --git a/src/Benchmarks/Traversers/GridTraverserBenchmarkHelper_3D.h b/src/Benchmarks/Traversers/GridTraverserBenchmarkHelper_3D.h index 4a5da6fd4b89c1cebc716ccfdff31d6ecaf96470..939d8a6815f4f8b4ef0439d8f61c3cc84cc01a19 100644 --- a/src/Benchmarks/Traversers/GridTraverserBenchmarkHelper_3D.h +++ b/src/Benchmarks/Traversers/GridTraverserBenchmarkHelper_3D.h @@ -12,6 +12,10 @@ #pragma once +#include +#include +#include + #include "GridTraverserBenchmarkHelper.h" #include "AddOneEntitiesProcessor.h" #include "BenchmarkTraverserUserData.h" @@ -38,12 +42,12 @@ _GridTraverser3D( typedef Real RealType; typedef Index IndexType; typedef Meshes::Grid< 3, Real, Devices::Cuda, Index > GridType; - + GridEntity entity( *grid ); - entity.getCoordinates().x() = begin.x() + ( gridIdx.x * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x; - entity.getCoordinates().y() = begin.y() + ( gridIdx.y * Devices::Cuda::getMaxGridSize() + blockIdx.y ) * blockDim.y + threadIdx.y; - 
entity.getCoordinates().z() = begin.z() + ( gridIdx.z * Devices::Cuda::getMaxGridSize() + blockIdx.z ) * blockDim.z + threadIdx.z; - + entity.getCoordinates().x() = begin.x() + ( gridIdx.x * Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x; + entity.getCoordinates().y() = begin.y() + ( gridIdx.y * Cuda::getMaxGridSize() + blockIdx.y ) * blockDim.y + threadIdx.y; + entity.getCoordinates().z() = begin.z() + ( gridIdx.z * Cuda::getMaxGridSize() + blockIdx.z ) * blockDim.z + threadIdx.z; + if( entity.getCoordinates() <= end ) { entity.refresh(); @@ -64,7 +68,7 @@ class GridTraverserBenchmarkHelper< Meshes::Grid< 3, Real, Devices::Host, Index using RealType = typename GridType::RealType; using IndexType = typename GridType::IndexType; using CoordinatesType = typename GridType::CoordinatesType; - using MeshFunction = Functions::MeshFunction< GridType >; + using MeshFunction = Functions::MeshFunctionView< GridType >; using MeshFunctionPointer = Pointers::SharedPointer< MeshFunction >; using CellType = typename GridType::template EntityType< Dimension, Meshes::GridEntityNoStencilStorage >; using SimpleCellType = SimpleCell< GridType >; @@ -107,7 +111,7 @@ class GridTraverserBenchmarkHelper< Meshes::Grid< 3, Real, Devices::Cuda, Index using RealType = typename GridType::RealType; using IndexType = typename GridType::IndexType; using CoordinatesType = typename GridType::CoordinatesType; - using MeshFunction = Functions::MeshFunction< GridType >; + using MeshFunction = Functions::MeshFunctionView< GridType >; using MeshFunctionPointer = Pointers::SharedPointer< MeshFunction >; using CellType = typename GridType::template EntityType< Dimension, Meshes::GridEntityNoStencilStorage >; using SimpleCellType = SimpleCell< GridType >; @@ -121,7 +125,7 @@ class GridTraverserBenchmarkHelper< Meshes::Grid< 3, Real, Devices::Cuda, Index { #ifdef HAVE_CUDA dim3 blockSize( 32, 4, 2 ), blocksCount, gridsCount; - Devices::Cuda::setupThreads( + Cuda::setupThreads( blockSize, 
blocksCount, gridsCount, @@ -134,7 +138,7 @@ class GridTraverserBenchmarkHelper< Meshes::Grid< 3, Real, Devices::Cuda, Index for( gridIdx.x = 0; gridIdx.x < gridsCount.x; gridIdx.x++ ) { dim3 gridSize; - Devices::Cuda::setupGrid( + Cuda::setupGrid( blocksCount, gridsCount, gridIdx, diff --git a/src/Benchmarks/Traversers/GridTraversersBenchmark.h b/src/Benchmarks/Traversers/GridTraversersBenchmark.h index 01590f1221f7a451270234044e180a16ff589e02..5ea87b6dcdc8a8917304826c1cef71185a65c915 100644 --- a/src/Benchmarks/Traversers/GridTraversersBenchmark.h +++ b/src/Benchmarks/Traversers/GridTraversersBenchmark.h @@ -12,28 +12,10 @@ #pragma once -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "AddOneEntitiesProcessor.h" -#include "AddTwoEntitiesProcessor.h" -#include "GridTraverserBenchmarkHelper.h" -#include "BenchmarkTraverserUserData.h" -#include "cuda-kernels.h" - namespace TNL { namespace Benchmarks { namespace Traversers { - - template< int Dimension, typename Device, typename Real, diff --git a/src/Benchmarks/Traversers/GridTraversersBenchmark_1D.h b/src/Benchmarks/Traversers/GridTraversersBenchmark_1D.h index 9820af39274cd5c8db310b583c0595d2b64bf252..6e8e0c37fc6452df57c76995af3e7cf6fe64cd63 100644 --- a/src/Benchmarks/Traversers/GridTraversersBenchmark_1D.h +++ b/src/Benchmarks/Traversers/GridTraversersBenchmark_1D.h @@ -13,23 +13,27 @@ #pragma once #include +#include #include #include #include #include #include #include -#include +#include #include #include "cuda-kernels.h" +#include "AddOneEntitiesProcessor.h" +#include "AddTwoEntitiesProcessor.h" +#include "BenchmarkTraverserUserData.h" #include "GridTraversersBenchmark.h" +#include "GridTraverserBenchmarkHelper.h" #include "SimpleCell.h" namespace TNL { namespace Benchmarks { namespace Traversers { - template< typename Device, typename Real, typename Index > @@ -41,7 +45,7 @@ class GridTraversersBenchmark< 1, Device, Real, Index > using GridType = 
Meshes::Grid< 1, Real, Device, Index >; using GridPointer = Pointers::SharedPointer< GridType >; using Coordinates = typename GridType::CoordinatesType; - using MeshFunction = Functions::MeshFunction< GridType >; + using MeshFunction = Functions::MeshFunctionView< GridType >; using MeshFunctionPointer = Pointers::SharedPointer< MeshFunction >; using CellType = typename GridType::template EntityType< 1, Meshes::GridEntityNoStencilStorage >; using SimpleCellType = SimpleCell< GridType >; @@ -51,11 +55,13 @@ class GridTraversersBenchmark< 1, Device, Real, Index > using AddTwoEntitiesProcessorType = AddTwoEntitiesProcessor< UserDataType >; GridTraversersBenchmark( Index size ) - :size( size ), v( size ), grid( size ), u( grid ), + :size( size ), + v( size ), + grid( size ), userData( this->u ) { v_data = v.getData(); - u->getData().bind( v ); + u->bind( grid, v ); } void reset() @@ -74,7 +80,7 @@ class GridTraversersBenchmark< 1, Device, Real, Index > { #ifdef HAVE_CUDA dim3 blockSize( 256 ), blocksCount, gridsCount; - Devices::Cuda::setupThreads( + Cuda::setupThreads( blockSize, blocksCount, gridsCount, @@ -83,7 +89,7 @@ class GridTraversersBenchmark< 1, Device, Real, Index > for( gridIdx.x = 0; gridIdx.x < gridsCount.x; gridIdx.x++ ) { dim3 gridSize; - Devices::Cuda::setupGrid( + Cuda::setupGrid( blocksCount, gridsCount, gridIdx, @@ -100,7 +106,7 @@ class GridTraversersBenchmark< 1, Device, Real, Index > { data[ i ] += (Real) 1.0; }; - ParallelFor< Device, AsynchronousMode >::exec( ( Index ) 0, size, f, v.getData() ); + Algorithms::ParallelFor< Device, Algorithms::AsynchronousMode >::exec( ( Index ) 0, size, f, v.getData() ); } void addOneUsingSimpleCell() @@ -113,7 +119,7 @@ class GridTraversersBenchmark< 1, Device, Real, Index > entity.refresh(); data[ entity.getIndex() ] += (Real) 1.0; }; - ParallelFor< Device, AsynchronousMode >::exec( ( Index ) 0, size, f, v.getData() );*/ + Algorithms::ParallelFor< Device, Algorithms::AsynchronousMode >::exec( ( Index ) 0, 
size, f, v.getData() );*/ GridTraverserBenchmarkHelper< GridType >::simpleCellTest( grid, userData, @@ -132,15 +138,15 @@ class GridTraversersBenchmark< 1, Device, Real, Index > _u->getData().getData()[ entity.getIndex() ] += (Real) 1.0; // ( *_u )( entity ) += (Real) 1.0; }; - ParallelFor< Device, AsynchronousMode >::exec( ( Index ) 0, size, f ); + Algorithms::ParallelFor< Device, Algorithms::AsynchronousMode >::exec( ( Index ) 0, size, f ); } void addOneUsingTraverser() { using CoordinatesType = typename GridType::CoordinatesType; - traverser.template processAllEntities< UserDataType, AddOneEntitiesProcessorType > + traverser.template processAllEntities< AddOneEntitiesProcessorType > ( grid, userData ); - + /*GridTraverserBenchmarkHelper< GridType >::noBCTraverserTest( grid, userData, @@ -151,8 +157,8 @@ class GridTraversersBenchmark< 1, Device, Real, Index > { std::cout << loops << " -> " << v << std::endl; if( reseting ) - return v.containsOnlyValue( 1.0 ); - return v.containsOnlyValue( ( Real ) loops ); + return Algorithms::containsOnlyValue( v, 1.0 ); + return Algorithms::containsOnlyValue( v, ( Real ) loops ); } void traverseUsingPureC() @@ -168,7 +174,7 @@ class GridTraversersBenchmark< 1, Device, Real, Index > { #ifdef HAVE_CUDA dim3 blockSize( 256 ), blocksCount, gridsCount; - Devices::Cuda::setupThreads( + Cuda::setupThreads( blockSize, blocksCount, gridsCount, @@ -177,7 +183,7 @@ class GridTraversersBenchmark< 1, Device, Real, Index > for( gridIdx.x = 0; gridIdx.x < gridsCount.x; gridIdx.x++ ) { dim3 gridSize; - Devices::Cuda::setupGrid( + Cuda::setupGrid( blocksCount, gridsCount, gridIdx, @@ -187,7 +193,7 @@ class GridTraversersBenchmark< 1, Device, Real, Index > for( gridIdx.x = 0; gridIdx.x < gridsCount.x; gridIdx.x++ ) { dim3 gridSize; - Devices::Cuda::setupGrid( + Cuda::setupGrid( blocksCount, gridsCount, gridIdx, @@ -201,11 +207,11 @@ class GridTraversersBenchmark< 1, Device, Real, Index > void traverseUsingTraverser() { // TODO 
!!!!!!!!!!!!!!!!!!!!!! - //traverser.template processAllEntities< UserDataType, AddOneEntitiesProcessorType > + //traverser.template processAllEntities< AddOneEntitiesProcessorType > - traverser.template processBoundaryEntities< UserDataType, AddTwoEntitiesProcessorType > + traverser.template processBoundaryEntities< AddTwoEntitiesProcessorType > ( grid, userData ); - traverser.template processInteriorEntities< UserDataType, AddOneEntitiesProcessorType > + traverser.template processInteriorEntities< AddOneEntitiesProcessorType > ( grid, userData ); } diff --git a/src/Benchmarks/Traversers/GridTraversersBenchmark_2D.h b/src/Benchmarks/Traversers/GridTraversersBenchmark_2D.h index 0e9ae7f2f36a9d19a913a4437d988d5172592e5b..d310ec451751bf7b13c1b984ee5a2e0e2faa52b1 100644 --- a/src/Benchmarks/Traversers/GridTraversersBenchmark_2D.h +++ b/src/Benchmarks/Traversers/GridTraversersBenchmark_2D.h @@ -13,16 +13,21 @@ #pragma once #include +#include #include #include #include #include #include #include -#include +#include #include #include "cuda-kernels.h" +#include "AddOneEntitiesProcessor.h" +#include "AddTwoEntitiesProcessor.h" +#include "BenchmarkTraverserUserData.h" #include "GridTraversersBenchmark.h" +#include "GridTraverserBenchmarkHelper.h" #include "SimpleCell.h" namespace TNL { @@ -35,12 +40,12 @@ template< typename Device, class GridTraversersBenchmark< 2, Device, Real, Index > { public: - + using Vector = Containers::Vector< Real, Device, Index >; using GridType = Meshes::Grid< 2, Real, Device, Index >; using GridPointer = Pointers::SharedPointer< GridType >; using Coordinates = typename GridType::CoordinatesType; - using MeshFunction = Functions::MeshFunction< GridType >; + using MeshFunction = Functions::MeshFunctionView< GridType >; using MeshFunctionPointer = Pointers::SharedPointer< MeshFunction >; using CellType = typename GridType::template EntityType< 2, Meshes::GridEntityNoStencilStorage >; using SimpleCellType = SimpleCell< GridType >; @@ -50,11 +55,13 
@@ class GridTraversersBenchmark< 2, Device, Real, Index > using AddTwoEntitiesProcessorType = AddTwoEntitiesProcessor< UserDataType >; GridTraversersBenchmark( Index size ) - :size( size ), v( size * size ), grid( size, size ), u( grid ), + :size( size ), + v( size * size ), + grid( size, size ), userData( u ) { v_data = v.getData(); - u->getData().bind( v ); + u->bind( grid, v ); } void reset() @@ -74,7 +81,7 @@ class GridTraversersBenchmark< 2, Device, Real, Index > { #ifdef HAVE_CUDA dim3 blockSize( 16, 16 ), blocksCount, gridsCount; - Devices::Cuda::setupThreads( + Cuda::setupThreads( blockSize, blocksCount, gridsCount, @@ -85,7 +92,7 @@ class GridTraversersBenchmark< 2, Device, Real, Index > for( gridIdx.x = 0; gridIdx.x < gridsCount.x; gridIdx.x++ ) { dim3 gridSize; - Devices::Cuda::setupGrid( + Cuda::setupGrid( blocksCount, gridsCount, gridIdx, @@ -103,8 +110,8 @@ class GridTraversersBenchmark< 2, Device, Real, Index > { data[ j * _size + i ] += (Real) 1.0; }; - - ParallelFor2D< Device, AsynchronousMode >::exec( + + Algorithms::ParallelFor2D< Device, Algorithms::AsynchronousMode >::exec( ( Index ) 0, ( Index ) 0, this->size, @@ -123,8 +130,8 @@ class GridTraversersBenchmark< 2, Device, Real, Index > entity.refresh(); data[ entity.getIndex() ] += (Real) 1.0; }; - - ParallelFor2D< Device, AsynchronousMode >::exec( + + Algorithms::ParallelFor2D< Device, Algorithms::AsynchronousMode >::exec( ( Index ) 0, ( Index ) 0, this->size, @@ -134,7 +141,7 @@ class GridTraversersBenchmark< 2, Device, Real, Index > grid, userData, size ); - + } void addOneUsingParallelForAndMeshFunction() @@ -150,8 +157,8 @@ class GridTraversersBenchmark< 2, Device, Real, Index > //( *_u )( entity ) += (Real) 1.0; _u->getData().getData()[ entity.getIndex() ] += (Real) 1.0; }; - - ParallelFor2D< Device, AsynchronousMode >::exec( + + Algorithms::ParallelFor2D< Device, Algorithms::AsynchronousMode >::exec( ( Index ) 0, ( Index ) 0, this->size, @@ -163,9 +170,9 @@ class 
GridTraversersBenchmark< 2, Device, Real, Index > void addOneUsingTraverser() { using CoordinatesType = typename GridType::CoordinatesType; - traverser.template processAllEntities< UserDataType, AddOneEntitiesProcessorType > + traverser.template processAllEntities< AddOneEntitiesProcessorType > ( grid, userData ); - + /*Meshes::GridTraverser< Grid >::template processEntities< Cell, WriteOneEntitiesProcessorType, WriteOneTraverserUserDataType, false >( grid, CoordinatesType( 0 ), @@ -188,8 +195,8 @@ class GridTraversersBenchmark< 2, Device, Real, Index > bool checkAddOne( int loops, bool reseting ) { if( reseting ) - return v.containsOnlyValue( 1.0 ); - return v.containsOnlyValue( ( Real ) loops ); + return Algorithms::containsOnlyValue( v, 1.0 ); + return Algorithms::containsOnlyValue( v, ( Real ) loops ); } void traverseUsingPureC() @@ -215,7 +222,7 @@ class GridTraversersBenchmark< 2, Device, Real, Index > { #ifdef HAVE_CUDA dim3 blockSize( 32, 8 ), blocksCount, gridsCount; - Devices::Cuda::setupThreads( + Cuda::setupThreads( blockSize, blocksCount, gridsCount, @@ -226,7 +233,7 @@ class GridTraversersBenchmark< 2, Device, Real, Index > for( gridIdx.x = 0; gridIdx.x < gridsCount.x; gridIdx.x++ ) { dim3 gridSize; - Devices::Cuda::setupGrid( + Cuda::setupGrid( blocksCount, gridsCount, gridIdx, @@ -237,7 +244,7 @@ class GridTraversersBenchmark< 2, Device, Real, Index > for( gridIdx.x = 0; gridIdx.x < gridsCount.x; gridIdx.x++ ) { dim3 gridSize; - Devices::Cuda::setupGrid( + Cuda::setupGrid( blocksCount, gridsCount, gridIdx, @@ -250,15 +257,15 @@ class GridTraversersBenchmark< 2, Device, Real, Index > void traverseUsingTraverser() { - //traverser.template processAllEntities< UserDataType, AddOneEntitiesProcessorType > - traverser.template processBoundaryEntities< UserDataType, AddTwoEntitiesProcessorType > + //traverser.template processAllEntities< AddOneEntitiesProcessorType > + traverser.template processBoundaryEntities< AddTwoEntitiesProcessorType > ( grid, 
userData ); - traverser.template processInteriorEntities< UserDataType, AddOneEntitiesProcessorType > + traverser.template processInteriorEntities< AddOneEntitiesProcessorType > ( grid, userData ); } protected: - + Index size; Vector v; Real* v_data; diff --git a/src/Benchmarks/Traversers/GridTraversersBenchmark_3D.h b/src/Benchmarks/Traversers/GridTraversersBenchmark_3D.h index 26b6413e43edbc86a3a02490580afc834bb0bde6..8e352536665968e4832b7986ba3bfa3842c9f1a6 100644 --- a/src/Benchmarks/Traversers/GridTraversersBenchmark_3D.h +++ b/src/Benchmarks/Traversers/GridTraversersBenchmark_3D.h @@ -13,18 +13,21 @@ #pragma once #include +#include #include #include #include #include #include #include -#include +#include #include #include "cuda-kernels.h" #include "AddOneEntitiesProcessor.h" +#include "AddTwoEntitiesProcessor.h" #include "BenchmarkTraverserUserData.h" #include "GridTraversersBenchmark.h" +#include "GridTraverserBenchmarkHelper.h" #include "SimpleCell.h" namespace TNL { @@ -42,7 +45,7 @@ class GridTraversersBenchmark< 3, Device, Real, Index > using GridType = Meshes::Grid< 3, Real, Device, Index >; using GridPointer = Pointers::SharedPointer< GridType >; using Coordinates = typename GridType::CoordinatesType; - using MeshFunction = Functions::MeshFunction< GridType >; + using MeshFunction = Functions::MeshFunctionView< GridType >; using MeshFunctionPointer = Pointers::SharedPointer< MeshFunction >; using CellType = typename GridType::template EntityType< 3, Meshes::GridEntityNoStencilStorage >; using SimpleCellType = SimpleCell< GridType >; @@ -55,11 +58,10 @@ class GridTraversersBenchmark< 3, Device, Real, Index > : size( size ), v( size * size * size ), grid( size, size, size ), - u( grid ), userData( u ) { v_data = v.getData(); - u->getData().bind( v ); + u->bind( grid, v ); } void reset() @@ -80,7 +82,7 @@ class GridTraversersBenchmark< 3, Device, Real, Index > { #ifdef HAVE_CUDA dim3 blockSize( 32, 4, 2 ), blocksCount, gridsCount; - 
Devices::Cuda::setupThreads( + Cuda::setupThreads( blockSize, blocksCount, gridsCount, @@ -93,7 +95,7 @@ class GridTraversersBenchmark< 3, Device, Real, Index > for( gridIdx.x = 0; gridIdx.x < gridsCount.x; gridIdx.x++ ) { dim3 gridSize; - Devices::Cuda::setupGrid( + Cuda::setupGrid( blocksCount, gridsCount, gridIdx, @@ -111,8 +113,8 @@ class GridTraversersBenchmark< 3, Device, Real, Index > { data[ ( k * _size + j ) * _size + i ] += (Real) 1.0; }; - - ParallelFor3D< Device, AsynchronousMode >::exec( + + Algorithms::ParallelFor3D< Device, Algorithms::AsynchronousMode >::exec( ( Index ) 0, ( Index ) 0, ( Index ) 0, @@ -135,7 +137,7 @@ class GridTraversersBenchmark< 3, Device, Real, Index > data[ entity.getIndex() ] += (Real) 1.0; }; - ParallelFor3D< Device, AsynchronousMode >::exec( + Algorithms::ParallelFor3D< Device, Algorithms::AsynchronousMode >::exec( ( Index ) 0, ( Index ) 0, ( Index ) 0, @@ -165,7 +167,7 @@ class GridTraversersBenchmark< 3, Device, Real, Index > _u->getData().getData()[ entity.getIndex() ] += (Real) 1.0; }; - ParallelFor3D< Device, AsynchronousMode >::exec( + Algorithms::ParallelFor3D< Device, Algorithms::AsynchronousMode >::exec( ( Index ) 0, ( Index ) 0, ( Index ) 0, @@ -177,15 +179,15 @@ class GridTraversersBenchmark< 3, Device, Real, Index > void addOneUsingTraverser() { - traverser.template processAllEntities< UserDataType, AddOneEntitiesProcessorType > + traverser.template processAllEntities< AddOneEntitiesProcessorType > ( grid, userData ); } bool checkAddOne( int loops, bool reseting ) { if( reseting ) - return v.containsOnlyValue( 1.0 ); - return v.containsOnlyValue( ( Real ) loops ); + return Algorithms::containsOnlyValue( v, 1.0 ); + return Algorithms::containsOnlyValue( v, ( Real ) loops ); } void traverseUsingPureC() @@ -221,7 +223,7 @@ class GridTraversersBenchmark< 3, Device, Real, Index > { #ifdef HAVE_CUDA dim3 blockSize( 32, 4, 2 ), blocksCount, gridsCount; - Devices::Cuda::setupThreads( + Cuda::setupThreads( blockSize, 
blocksCount, gridsCount, @@ -234,7 +236,7 @@ class GridTraversersBenchmark< 3, Device, Real, Index > for( gridIdx.x = 0; gridIdx.x < gridsCount.x; gridIdx.x++ ) { dim3 gridSize; - Devices::Cuda::setupGrid( + Cuda::setupGrid( blocksCount, gridsCount, gridIdx, @@ -246,7 +248,7 @@ class GridTraversersBenchmark< 3, Device, Real, Index > for( gridIdx.x = 0; gridIdx.x < gridsCount.x; gridIdx.x++ ) { dim3 gridSize; - Devices::Cuda::setupGrid( + Cuda::setupGrid( blocksCount, gridsCount, gridIdx, @@ -260,16 +262,16 @@ class GridTraversersBenchmark< 3, Device, Real, Index > void traverseUsingTraverser() { // TODO !!!!!!!!!!!!!!!!!!!!!! - //traverser.template processAllEntities< UserDataType, AddOneEntitiesProcessorType > + //traverser.template processAllEntities< AddOneEntitiesProcessorType > - traverser.template processBoundaryEntities< UserDataType, AddTwoEntitiesProcessorType > + traverser.template processBoundaryEntities< AddTwoEntitiesProcessorType > ( grid, userData ); - traverser.template processInteriorEntities< UserDataType, AddOneEntitiesProcessorType > + traverser.template processInteriorEntities< AddOneEntitiesProcessorType > ( grid, userData ); } protected: - + Index size; Vector v; Real* v_data; diff --git a/src/Benchmarks/Traversers/cuda-kernels.h b/src/Benchmarks/Traversers/cuda-kernels.h index a90baf5b02075d9c80d572bd51b93ed5cd97b391..d092925bf0bfec6192a770809e44cdecafa8abff 100644 --- a/src/Benchmarks/Traversers/cuda-kernels.h +++ b/src/Benchmarks/Traversers/cuda-kernels.h @@ -25,7 +25,7 @@ template< typename Real, typename Index > __global__ void fullGridTraverseKernel1D( const Index size, const dim3 gridIdx, Real* v_data ) { - const Index threadIdx_x = ( gridIdx.x * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x; + const Index threadIdx_x = ( gridIdx.x * Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x; if( threadIdx_x < size ) v_data[ threadIdx_x ] += (Real) 1.0; } @@ -34,8 +34,8 @@ template< typename Real, 
typename Index > __global__ void fullGridTraverseKernel2D( const Index size, const dim3 gridIdx, Real* v_data ) { - const Index threadIdx_x = ( gridIdx.x * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x; - const Index threadIdx_y = ( gridIdx.y * Devices::Cuda::getMaxGridSize() + blockIdx.y ) * blockDim.y + threadIdx.y; + const Index threadIdx_x = ( gridIdx.x * Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x; + const Index threadIdx_y = ( gridIdx.y * Cuda::getMaxGridSize() + blockIdx.y ) * blockDim.y + threadIdx.y; if( threadIdx_x < size && threadIdx_y < size ) v_data[ threadIdx_y * size + threadIdx_x ] += (Real) 1.0; } @@ -44,21 +44,21 @@ template< typename Real, typename Index > __global__ void fullGridTraverseKernel3D( const Index size, const dim3 gridIdx, Real* v_data ) { - const Index threadIdx_x = ( gridIdx.x * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x; - const Index threadIdx_y = ( gridIdx.y * Devices::Cuda::getMaxGridSize() + blockIdx.y ) * blockDim.y + threadIdx.y; - const Index threadIdx_z = ( gridIdx.z * Devices::Cuda::getMaxGridSize() + blockIdx.z ) * blockDim.z + threadIdx.z; + const Index threadIdx_x = ( gridIdx.x * Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x; + const Index threadIdx_y = ( gridIdx.y * Cuda::getMaxGridSize() + blockIdx.y ) * blockDim.y + threadIdx.y; + const Index threadIdx_z = ( gridIdx.z * Cuda::getMaxGridSize() + blockIdx.z ) * blockDim.z + threadIdx.z; if( threadIdx_x < size && threadIdx_y < size && threadIdx_z < size ) v_data[ ( threadIdx_z * size + threadIdx_y ) * size + threadIdx_x ] += (Real) 1.0; } /**** - * Traversing interior cells + * Traversing interior cells */ template< typename Real, typename Index > __global__ void interiorTraverseKernel1D( const Index size, const dim3 gridIdx, Real* v_data ) { - const Index threadIdx_x = ( gridIdx.x * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x; + const Index 
threadIdx_x = ( gridIdx.x * Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x; if( threadIdx_x > 0 && threadIdx_x < size - 1 ) v_data[ threadIdx_x ] += (Real) 1.0; } @@ -67,9 +67,9 @@ template< typename Real, typename Index > __global__ void interiorTraverseKernel2D( const Index size, const dim3 gridIdx, Real* v_data ) { - const Index threadIdx_x = ( gridIdx.x * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x; - const Index threadIdx_y = ( gridIdx.y * Devices::Cuda::getMaxGridSize() + blockIdx.y ) * blockDim.y + threadIdx.y; - if( threadIdx_x > 0 && threadIdx_y > 0 && + const Index threadIdx_x = ( gridIdx.x * Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x; + const Index threadIdx_y = ( gridIdx.y * Cuda::getMaxGridSize() + blockIdx.y ) * blockDim.y + threadIdx.y; + if( threadIdx_x > 0 && threadIdx_y > 0 && threadIdx_x < size - 1 && threadIdx_y < size - 1 ) v_data[ threadIdx_y * size + threadIdx_x ] += (Real) 1.0; } @@ -78,9 +78,9 @@ template< typename Real, typename Index > __global__ void interiorTraverseKernel3D( const Index size, const dim3 gridIdx, Real* v_data ) { - const Index threadIdx_x = ( gridIdx.x * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x; - const Index threadIdx_y = ( gridIdx.y * Devices::Cuda::getMaxGridSize() + blockIdx.y ) * blockDim.y + threadIdx.y; - const Index threadIdx_z = ( gridIdx.z * Devices::Cuda::getMaxGridSize() + blockIdx.z ) * blockDim.z + threadIdx.z; + const Index threadIdx_x = ( gridIdx.x * Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x; + const Index threadIdx_y = ( gridIdx.y * Cuda::getMaxGridSize() + blockIdx.y ) * blockDim.y + threadIdx.y; + const Index threadIdx_z = ( gridIdx.z * Cuda::getMaxGridSize() + blockIdx.z ) * blockDim.z + threadIdx.z; if( threadIdx_x > 0 && threadIdx_y > 0 && threadIdx_z > 0 && threadIdx_x < size - 1 && threadIdx_y < size - 1 && threadIdx_z < size - 1 ) v_data[ ( threadIdx_z * size + threadIdx_y 
) * size + threadIdx_x ] += (Real) 1.0; @@ -93,7 +93,7 @@ template< typename Real, typename Index > __global__ void boundariesTraverseKernel1D( const Index size, const dim3 gridIdx, Real* v_data ) { - const Index threadIdx_x = ( gridIdx.x * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x; + const Index threadIdx_x = ( gridIdx.x * Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x; if( threadIdx_x == 0 || threadIdx_x == size - 1 ) v_data[ threadIdx_x ] += (Real) 2.0; } @@ -102,9 +102,9 @@ template< typename Real, typename Index > __global__ void boundariesTraverseKernel2D( const Index size, const dim3 gridIdx, Real* v_data ) { - const Index threadIdx_x = ( gridIdx.x * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x; - const Index threadIdx_y = ( gridIdx.y * Devices::Cuda::getMaxGridSize() + blockIdx.y ) * blockDim.y + threadIdx.y; - if( threadIdx_x > 0 && threadIdx_y > 0 && + const Index threadIdx_x = ( gridIdx.x * Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x; + const Index threadIdx_y = ( gridIdx.y * Cuda::getMaxGridSize() + blockIdx.y ) * blockDim.y + threadIdx.y; + if( threadIdx_x > 0 && threadIdx_y > 0 && threadIdx_x < size - 1 && threadIdx_y < size - 1 ) v_data[ threadIdx_y * size + threadIdx_x ] += (Real) 2.0; } @@ -113,9 +113,9 @@ template< typename Real, typename Index > __global__ void boundariesTraverseKernel3D( const Index size, const dim3 gridIdx, Real* v_data ) { - const Index threadIdx_x = ( gridIdx.x * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x; - const Index threadIdx_y = ( gridIdx.y * Devices::Cuda::getMaxGridSize() + blockIdx.y ) * blockDim.y + threadIdx.y; - const Index threadIdx_z = ( gridIdx.z * Devices::Cuda::getMaxGridSize() + blockIdx.z ) * blockDim.z + threadIdx.z; + const Index threadIdx_x = ( gridIdx.x * Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x; + const Index threadIdx_y = ( gridIdx.y * 
Cuda::getMaxGridSize() + blockIdx.y ) * blockDim.y + threadIdx.y; + const Index threadIdx_z = ( gridIdx.z * Cuda::getMaxGridSize() + blockIdx.z ) * blockDim.z + threadIdx.z; if( threadIdx_x == 0 || threadIdx_y == 0 || threadIdx_z == 0 || threadIdx_x == size - 1 || threadIdx_y == size - 1 || threadIdx_z == size - 1 ) v_data[ ( threadIdx_z * size + threadIdx_y ) * size + threadIdx_x ] += (Real) 2.0; diff --git a/src/Benchmarks/Traversers/tnl-benchmark-traversers.h b/src/Benchmarks/Traversers/tnl-benchmark-traversers.h index 30c364ac37f2bc32753a14c723ff2276332550c9..592098b9553d20f290ed111637820630c028ccbd 100644 --- a/src/Benchmarks/Traversers/tnl-benchmark-traversers.h +++ b/src/Benchmarks/Traversers/tnl-benchmark-traversers.h @@ -12,7 +12,7 @@ #pragma once -#include "../Benchmarks.h" +#include //#include "grid-traversing.h" #include "GridTraversersBenchmark.h" @@ -20,25 +20,25 @@ #include #include #include -#include using namespace TNL; using namespace TNL::Benchmarks; using namespace TNL::Benchmarks::Traversers; +template< typename T, typename S > +bool containsValue( const std::vector< T >& container, const S& value ) +{ + return std::find( container.begin(), container.end(), value ) != container.end(); +} + template< int Dimension, typename Real = float, typename Index = int > bool runBenchmark( const Config::ParameterContainer& parameters, - Benchmark& benchmark, - Benchmark::MetadataMap& metadata ) + Benchmark<>& benchmark ) { - const Containers::List< String >& tests = parameters.getParameter< Containers::List< String > >( "tests" ); - // FIXME: the --tests is just a string because list does not work with enums -// const Containers::List< String >& tests = parameters.getParameter< Containers::List< String > >( "tests" ); - //Containers::List< String > tests; - //tests.Append( parameters.getParameter< String >( "tests" ) ); + const std::vector< String >& tests = parameters.getParameter< std::vector< String > >( "tests" ); // FIXME: getParameter< std::size_t 
>() does not work with parameters added with addEntry< int >(), // which have a default value. The workaround below works for int values, but it is not possible // to pass 64-bit integer values @@ -49,15 +49,14 @@ bool runBenchmark( const Config::ParameterContainer& parameters, const bool withHost = parameters.getParameter< bool >( "with-host" ); #ifdef HAVE_CUDA const bool withCuda = parameters.getParameter< bool >( "with-cuda" ); -#else - const bool withCuda = false; +//#else +// const bool withCuda = false; #endif const bool check = parameters.getParameter< bool >( "check" ); /**** * Full grid traversing with no boundary conditions */ - benchmark.newBenchmark( String("Traversing without boundary conditions" + convertToString( Dimension ) + "D" ), metadata ); for( std::size_t size = minSize; size <= maxSize; size *= 2 ) { GridTraversersBenchmark< Dimension, Devices::Host, Real, Index > hostTraverserBenchmark( size ); @@ -76,14 +75,16 @@ bool runBenchmark( const Config::ParameterContainer& parameters, cudaTraverserBenchmark.reset(); }; #endif - benchmark.setMetadataColumns( - Benchmark::MetadataColumns( - { {"size", convertToString( size ) }, } ) ); + benchmark.setMetadataColumns({ + { "dimension", convertToString( Dimension ) }, + { "traverser", "without BC" }, + { "size", convertToString( size ) }, + }); /**** * Add one using pure C code */ - if( tests.containsValue( "all" ) || tests.containsValue( "add-one-pure-c" ) ) + if( containsValue( tests, "all" ) || containsValue( tests, "add-one-pure-c" ) ) { benchmark.setOperation( "Pure C", 2 * pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB ); @@ -93,9 +94,9 @@ bool runBenchmark( const Config::ParameterContainer& parameters, }; if( withHost ) { - benchmark.time< Devices::Host >( hostReset, "CPU", hostWriteOneUsingPureC ); + const BenchmarkResult result = benchmark.time< Devices::Host >( hostReset, "CPU", hostWriteOneUsingPureC ); if( check && ! 
hostTraverserBenchmark.checkAddOne( - benchmark.getPerformedLoops(), + result.loops, benchmark.isResetingOn() ) ) benchmark.addErrorMessage( "Test results are not correct." ); } @@ -106,9 +107,9 @@ bool runBenchmark( const Config::ParameterContainer& parameters, }; if( withCuda ) { - benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaWriteOneUsingPureC ); + const BenchmarkResult result = benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaWriteOneUsingPureC ); if( check && ! cudaTraverserBenchmark.checkAddOne( - benchmark.getPerformedLoops(), + result.loops, benchmark.isResetingOn() ) ) benchmark.addErrorMessage( "Test results are not correct." ); } @@ -118,7 +119,7 @@ bool runBenchmark( const Config::ParameterContainer& parameters, /**** * Add one using parallel for */ - if( tests.containsValue( "all" ) || tests.containsValue( "add-one-parallel-for" ) ) + if( containsValue( tests, "all" ) || containsValue( tests, "add-one-parallel-for" ) ) { benchmark.setOperation( "parallel for", 2 * pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB ); @@ -128,9 +129,9 @@ bool runBenchmark( const Config::ParameterContainer& parameters, }; if( withHost ) { - benchmark.time< Devices::Host >( hostReset, "CPU", hostWriteOneUsingParallelFor ); - if( check && ! hostTraverserBenchmark.checkAddOne( - benchmark.getPerformedLoops(), + const BenchmarkResult result = benchmark.time< Devices::Host >( hostReset, "CPU", hostWriteOneUsingParallelFor ); + if( check && ! hostTraverserBenchmark.checkAddOne( + result.loops, benchmark.isResetingOn() ) ) benchmark.addErrorMessage( "Test results are not correct." ); } @@ -142,9 +143,9 @@ bool runBenchmark( const Config::ParameterContainer& parameters, }; if( withCuda ) { - benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaWriteOneUsingParallelFor ); - if( check && ! 
cudaTraverserBenchmark.checkAddOne( - benchmark.getPerformedLoops(), + const BenchmarkResult result = benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaWriteOneUsingParallelFor ); + if( check && ! cudaTraverserBenchmark.checkAddOne( + result.loops, benchmark.isResetingOn() ) ) benchmark.addErrorMessage( "Test results are not correct." ); } @@ -154,7 +155,7 @@ bool runBenchmark( const Config::ParameterContainer& parameters, /**** * Add one using parallel for with grid entity */ - if( tests.containsValue( "all" ) || tests.containsValue( "add-one-simple-cell" ) ) + if( containsValue( tests, "all" ) || containsValue( tests, "add-one-simple-cell" ) ) { auto hostAddOneUsingSimpleCell = [&] () { @@ -163,9 +164,9 @@ bool runBenchmark( const Config::ParameterContainer& parameters, benchmark.setOperation( "simple cell", 2 * pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB ); if( withHost ) { - benchmark.time< Devices::Host >( hostReset, "CPU", hostAddOneUsingSimpleCell ); - if( check && ! hostTraverserBenchmark.checkAddOne( - benchmark.getPerformedLoops(), + const BenchmarkResult result = benchmark.time< Devices::Host >( hostReset, "CPU", hostAddOneUsingSimpleCell ); + if( check && ! hostTraverserBenchmark.checkAddOne( + result.loops, benchmark.isResetingOn() ) ) benchmark.addErrorMessage( "Test results are not correct." ); } @@ -176,9 +177,9 @@ bool runBenchmark( const Config::ParameterContainer& parameters, }; if( withCuda ) { - benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaAddOneUsingSimpleCell ); - if( check && ! cudaTraverserBenchmark.checkAddOne( - benchmark.getPerformedLoops(), + const BenchmarkResult result = benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaAddOneUsingSimpleCell ); + if( check && ! cudaTraverserBenchmark.checkAddOne( + result.loops, benchmark.isResetingOn() ) ) benchmark.addErrorMessage( "Test results are not correct." 
); } @@ -188,7 +189,7 @@ bool runBenchmark( const Config::ParameterContainer& parameters, /**** * Add one using parallel for with mesh function */ - if( tests.containsValue( "all" ) || tests.containsValue( "add-one-parallel-for-and-mesh-function" ) ) + if( containsValue( tests, "all" ) || containsValue( tests, "add-one-parallel-for-and-mesh-function" ) ) { auto hostAddOneUsingParallelForAndMeshFunction = [&] () { @@ -197,9 +198,9 @@ bool runBenchmark( const Config::ParameterContainer& parameters, benchmark.setOperation( "par.for+mesh fc.", 2 * pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB ); if( withHost ) { - benchmark.time< Devices::Host >( hostReset, "CPU", hostAddOneUsingParallelForAndMeshFunction ); - if( check && ! hostTraverserBenchmark.checkAddOne( - benchmark.getPerformedLoops(), + const BenchmarkResult result = benchmark.time< Devices::Host >( hostReset, "CPU", hostAddOneUsingParallelForAndMeshFunction ); + if( check && ! hostTraverserBenchmark.checkAddOne( + result.loops, benchmark.isResetingOn() ) ) benchmark.addErrorMessage( "Test results are not correct." ); } @@ -210,9 +211,9 @@ bool runBenchmark( const Config::ParameterContainer& parameters, }; if( withCuda ) { - benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaAddOneUsingParallelForAndMeshFunction ); - if( check && ! cudaTraverserBenchmark.checkAddOne( - benchmark.getPerformedLoops(), + const BenchmarkResult result = benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaAddOneUsingParallelForAndMeshFunction ); + if( check && ! cudaTraverserBenchmark.checkAddOne( + result.loops, benchmark.isResetingOn() ) ) benchmark.addErrorMessage( "Test results are not correct." 
); } @@ -222,7 +223,7 @@ bool runBenchmark( const Config::ParameterContainer& parameters, /**** * Add one using traverser */ - if( tests.containsValue( "all" ) || tests.containsValue( "add-one-traverser" ) ) + if( containsValue( tests, "all" ) || containsValue( tests, "add-one-traverser" ) ) { benchmark.setOperation( "traverser", 2 * pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB ); auto hostWriteOneUsingTraverser = [&] () @@ -231,9 +232,9 @@ bool runBenchmark( const Config::ParameterContainer& parameters, }; if( withHost ) { - benchmark.time< Devices::Host >( hostReset, "CPU", hostWriteOneUsingTraverser ); - if( check && ! hostTraverserBenchmark.checkAddOne( - benchmark.getPerformedLoops(), + const BenchmarkResult result = benchmark.time< Devices::Host >( hostReset, "CPU", hostWriteOneUsingTraverser ); + if( check && ! hostTraverserBenchmark.checkAddOne( + result.loops, benchmark.isResetingOn() ) ) benchmark.addErrorMessage( "Test results are not correct." ); } @@ -245,9 +246,9 @@ bool runBenchmark( const Config::ParameterContainer& parameters, }; if( withCuda ) { - benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaWriteOneUsingTraverser ); - if( check && ! cudaTraverserBenchmark.checkAddOne( - benchmark.getPerformedLoops(), + const BenchmarkResult result = benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaWriteOneUsingTraverser ); + if( check && ! cudaTraverserBenchmark.checkAddOne( + result.loops, benchmark.isResetingOn() ) ) benchmark.addErrorMessage( "Test results are not correct." 
); } @@ -255,15 +256,10 @@ bool runBenchmark( const Config::ParameterContainer& parameters, } std::cout << "--------------------------------------------------------------------------------------------------------" << std::endl; } - return true; - } - std::cout << "--------------------------------------------------------------------------------------------------------" << std::endl; - } /**** * Full grid traversing including boundary conditions */ - benchmark.newBenchmark( String("Traversing with boundary conditions" + convertToString( Dimension ) + "D" ), metadata ); for( std::size_t size = minSize; size <= maxSize; size *= 2 ) { GridTraversersBenchmark< Dimension, Devices::Host, Real, Index > hostTraverserBenchmark( size ); @@ -281,9 +277,11 @@ bool runBenchmark( const Config::ParameterContainer& parameters, }; #endif - benchmark.setMetadataColumns( - Benchmark::MetadataColumns( - { {"size", convertToString( size ) }, } ) ); + benchmark.setMetadataColumns({ + { "dimension", convertToString( Dimension ) }, + { "traverser", "with BC" }, + { "size", convertToString( size ) }, + }); /**** * Write one and two (as BC) using C for @@ -300,7 +298,7 @@ bool runBenchmark( const Config::ParameterContainer& parameters, }; #endif - if( tests.containsValue( "all" ) || tests.containsValue( "bc-pure-c" ) ) + if( containsValue( tests, "all" ) || containsValue( tests, "bc-pure-c" ) ) { benchmark.setOperation( "Pure C", 2 * pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB ); if( withHost ) @@ -335,7 +333,7 @@ bool runBenchmark( const Config::ParameterContainer& parameters, }; #endif - if( tests.containsValue( "all" ) || tests.containsValue( "bc-parallel-for" ) ) + if( containsValue( tests, "all" ) || containsValue( tests, "bc-parallel-for" ) ) { benchmark.setOperation( "parallel for", 2 * pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB ); if( withHost ) @@ -364,7 +362,7 @@ bool runBenchmark( const Config::ParameterContainer& parameters, // 
cudaTraverserBenchmark.addOneUsingParallelFor(); // }; // -// if( tests.containsValue( "all" ) || tests.containsValue( "bc-parallel-for" ) ) +// if( containsValue( tests, "all" ) || containsValue( tests, "bc-parallel-for" ) ) // { // benchmark.setOperation( "parallel for", 2 * pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB ); // if( withHost ) @@ -393,8 +391,8 @@ bool runBenchmark( const Config::ParameterContainer& parameters, cudaTraverserBenchmark.addOneUsingTraverser(); }; #endif - - if( tests.containsValue( "all" ) || tests.containsValue( "bc-traverser" ) ) + + if( containsValue( tests, "all" ) || containsValue( tests, "bc-traverser" ) ) { benchmark.setOperation( "traverser", 2 * pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB ); if( withHost ) @@ -418,11 +416,9 @@ bool runBenchmark( const Config::ParameterContainer& parameters, return true; } -void setupConfig( Config::ConfigDescription& config ) +void configSetup( Config::ConfigDescription& config ) { - config.addList< String >( "tests", "Tests to be performed.", "all" ); - // FIXME: addList does not work with addEntryEnum - ConfigDescription::addEntryEnum throws std::bad_cast - // config.addList< String >( "tests", "Tests to be performed.", "all" ); + config.addList< String >( "tests", "Tests to be performed.", {"all"} ); config.addEntryEnum( "all" ); config.addEntryEnum( "add-one-pure-c" ); config.addEntryEnum( "add-one-parallel-for" ); @@ -443,15 +439,15 @@ void setupConfig( Config::ConfigDescription& config ) config.addEntryEnum( "append" ); config.addEntryEnum( "overwrite" ); - config.addEntry< String >( "precision", "Precision of the arithmetics.", "double" ); - config.addEntryEnum( "float" ); - config.addEntryEnum( "double" ); - config.addEntryEnum( "all" ); +// config.addEntry< String >( "precision", "Precision of the arithmetics.", "double" ); +// config.addEntryEnum( "float" ); +// config.addEntryEnum( "double" ); +// config.addEntryEnum( "all" ); 
config.addEntry< int >( "dimension", "Set the problem dimension. 0 means all dimensions 1,2 and 3.", 0 ); config.addEntry< int >( "min-size", "Minimum size of arrays/vectors used in the benchmark.", 10 ); config.addEntry< int >( "max-size", "Minimum size of arrays/vectors used in the benchmark.", 1000 ); - config.addEntry< int >( "size-step-factor", "Factor determining the size of arrays/vectors used in the benchmark. First size is min-size and each following size is stepFactor*previousSize, up to max-size.", 2 ); - Benchmark::configSetup( config ); +// config.addEntry< int >( "size-step-factor", "Factor determining the size of arrays/vectors used in the benchmark. First size is min-size and each following size is stepFactor*previousSize, up to max-size.", 2 ); + Benchmark<>::configSetup( config ); config.addDelimiter( "Device settings:" ); Devices::Host::configSetup( config ); @@ -463,24 +459,27 @@ bool setupBenchmark( const Config::ParameterContainer& parameters ) { const String & logFileName = parameters.getParameter< String >( "log-file" ); const String & outputMode = parameters.getParameter< String >( "output-mode" ); - const String & precision = parameters.getParameter< String >( "precision" ); - const unsigned sizeStepFactor = parameters.getParameter< unsigned >( "size-step-factor" ); - - Benchmark benchmark; //( loops, verbose ); - benchmark.setup( parameters ); - Benchmark::MetadataMap metadata = getHardwareMetadata(); - runBenchmark< Dimension >( parameters, benchmark, metadata ); +// const String & precision = parameters.getParameter< String >( "precision" ); +// const unsigned sizeStepFactor = parameters.getParameter< unsigned >( "size-step-factor" ); auto mode = std::ios::out; if( outputMode == "append" ) mode |= std::ios::app; - std::ofstream logFile( logFileName.getString(), mode ); + std::ofstream logFile( logFileName, mode ); + + // init benchmark and set parameters + Benchmark<> benchmark( logFile ); //( loops, verbose ); + benchmark.setup( 
parameters ); + + // write global metadata into a separate file + std::map< std::string, std::string > metadata = getHardwareMetadata(); + metadata["loops"] = convertToString( parameters.getParameter< int >( "loops" ) ); + metadata["reset"] = convertToString( parameters.getParameter< bool >( "reset" ) ); + metadata["minimal test time"] = convertToString( parameters.getParameter< double >( "min-time" ) ); + writeMapAsJson( metadata, logFileName, ".metadata.json" ); + + runBenchmark< Dimension >( parameters, benchmark ); - if( ! benchmark.save( logFile ) ) - { - std::cerr << "Failed to write the benchmark results to file '" << parameters.getParameter< String >( "log-file" ) << "'." << std::endl; - return false; - } return true; } @@ -488,15 +487,15 @@ int main( int argc, char* argv[] ) { Config::ConfigDescription config; Config::ParameterContainer parameters; - - setupConfig( config ); + + configSetup( config ); if( ! parseCommandLine( argc, argv, config, parameters ) ) return EXIT_FAILURE; if( ! Devices::Host::setup( parameters ) || ! Devices::Cuda::setup( parameters ) ) return EXIT_FAILURE; - + const int dimension = parameters.getParameter< int >( "dimension" ); bool status( false ); if( ! dimension ) @@ -520,7 +519,5 @@ int main( int argc, char* argv[] ) break; } } - if( status == false ) - return EXIT_FAILURE; - return EXIT_SUCCESS; + return ! status; } diff --git a/src/Python/BenchmarkLogs.py b/src/Python/BenchmarkLogs.py new file mode 100644 index 0000000000000000000000000000000000000000..8c92040988565d67015a5dd92f2ec958b7849b92 --- /dev/null +++ b/src/Python/BenchmarkLogs.py @@ -0,0 +1,116 @@ +#!/usr/bin/python3 + +__all__ = [ + "dict_to_html_table", + "get_benchmark_metadata", + "get_benchmark_dataframes", +] + +import os.path +import json +import pandas + +def dict_to_html_table(data): + html = "\n" + html += "\n" + for key in sorted(data.keys()): + html += f"\t\n" + html += "\n" + html += "
{key}{data[key]}
def get_benchmark_metadata(filename):
    """
    Reads metadata of the benchmark in the given file.

    :param str filename: path of the file with metadata or benchmark results.
        - If it ends with ".metadata.json", metadata is read from that file.
        - Otherwise, the extension is first replaced with ".metadata.json".
    :returns: dict as returned by json.load, or None if the file does not exist.
    """
    if not filename.endswith(".metadata.json"):
        filename = os.path.splitext(filename)[0] + ".metadata.json"
    if not os.path.isfile(filename):
        print(f"Metadata file {filename} does not exist")
        return None
    print(f"Parsing metadata from file {filename}")
    # the context manager closes the handle even when json.load raises
    with open(filename, "r") as metadata_file:
        return json.load(metadata_file)


def get_benchmark_dataframe(logFile):
    """
    Get pandas dataframe with benchmark results stored in the given log file.

    The log file is expected to contain one JSON object per line.

    :param logFile: path to the log file
    :returns: pandas.DataFrame instance
    """
    print(f"Parsing input file {logFile}")
    # read through a context manager so the file handle is not leaked
    with open(logFile, "r") as log:
        df = pandas.read_json(log, orient="records", lines=True)

    # convert "N/A" in the speedup column to nan
    if "speedup" in df.columns:
        df["speedup"] = pandas.to_numeric(df["speedup"], errors="coerce")

    return df


def gen_dataframes_per_operation(logFile, header_elements=None):
    """
    Reads benchmark results stored in the given log file and splits them into
    multiple dataframes according to the "operation" column.

    Various post-processing steps are done on each partial dataframe:
    - columns with only NaN values are removed
    - the operation column is removed
    - the "index" and "columns" of the dataframe are set:
        - if header_elements are given, they are set as "columns" and everything
          else is used for the index
        - otherwise, all columns in the dataframe before "time" are used for
          the index, and the remaining columns (starting with "time") stay as
          "columns"
        - the "performer" column is set as the last column of the index
    - note that the index is not explicitly sorted, so data is ordered as in the
      input file

    If the log has no "operation" column, a single pair
    ("Dummy operation", <unprocessed dataframe>) is emitted.

    :param logFile: path to the log file
    :param header_elements: optional list of column names holding the benchmark
        results (i.e. the columns that must not become part of the index)
    :yields: pairs of (str, pandas.DataFrame) objects, where the str denotes the
             particular operation name
    :raises ValueError: if header_elements is not given and the log has no
        "time" column to derive it from
    """
    main_df = get_benchmark_dataframe(logFile)

    # check if there is at least one operation
    if "operation" not in main_df.columns:
        yield "Dummy operation", main_df
        return

    # extract all benchmark operations, preserve their order as found in the dataframe
    operations = list(dict.fromkeys(main_df["operation"]))

    # set operation as index
    main_df = main_df.set_index("operation")

    # if header_elements was not provided, we assume that "time" and all following columns
    # are benchmark results, and all preceding columns are metadata columns that will be
    # set as index of the dataframe
    if header_elements is None:
        all_columns = list(main_df.columns)
        if "time" not in all_columns:
            raise ValueError(
                f"cannot infer header elements: log file {logFile} has no 'time' column"
            )
        # FIXME: the "rows" and "columns" (in the gemv operation) are parsed after the correct header elements, because the preceding operations don't have these metadata columns
        # TODO: each benchmark should record the header elements in the metadata file
        header_elements = [
            c for c in all_columns[all_columns.index("time"):]
            if c not in ("rows", "columns")
        ]

    # emit one df per operation
    for op in operations:
        # select with a list so that an operation with a single row still
        # gives a DataFrame (a scalar label would give a Series and break
        # the column-wise dropna below)
        df = main_df.loc[[op]]
        # remove columns with only NaNs
        df = df.dropna(axis=1, how="all")
        # remove the operation column (index)
        df = df.reset_index(drop=True)
        # prepare index_columns and make sure that performer is the last
        index_columns = [c for c in df.columns if c not in header_elements and c != "performer"]
        index_columns.append("performer")
        # set new index for the df: all columns except header_elements
        df = df.set_index(index_columns)
        # emit a pair (op, df)
        yield op, df
def heatmaps_bandwidth(df, x_name="columns", y_name="rows", *, cbar_kw=None, **kwargs):
    """
    Creates heatmaps of two-dimensional bandwidth data. The "size" data (i.e.
    x_name and y_name) are expected in the index of the dataframe, all other
    columns of the index are used to label the heatmaps. Heatmaps are generated
    using the Python generator interface for each unique tuple of dataframe
    index values. If the index has no levels other than x_name and y_name, a
    single unlabeled heatmap is generated from the whole dataframe.

    :param df: a pandas.DataFrame instance
    :param x_name: name of the column in the index to map along the x-axis
    :param y_name: name of the column in the index to map along the y-axis
    :param cbar_kw:
        optional dict of arguments passed to matplotlib's colorbar function
    :param kwargs:
        optional keyword arguments passed to matplotlib's imshow function
    :yields: a tuple (fig, ax) as returned by plt.subplots(), one per heatmap
    """
    # prepare the dataframe
    assert "bandwidth" in df.columns
    assert x_name in df.index.names
    assert y_name in df.index.names
    # remember whether any index level is left to tell the heatmaps apart;
    # after the reset below an exhausted index becomes a plain RangeIndex
    has_labels = any(name not in (x_name, y_name) for name in df.index.names)
    df = df.reset_index(level=[x_name, y_name]).sort_index()

    if cbar_kw is None:
        cbar_kw = {}

    keys = df.index.unique() if has_labels else [None]

    for idx in keys:
        if has_labels:
            # select with a list so a single-row group is still a DataFrame
            part = df.loc[[idx]]
            # a single remaining level gives scalar labels: wrap them so a
            # string is not joined character by character, and stringify so
            # numeric labels do not break the join
            parts = idx if isinstance(idx, tuple) else (idx,)
            label = ", ".join(str(p) for p in parts)
        else:
            part = df
            label = ""

        # drop the index and get just the data we need
        part = part.reset_index(drop=True)
        part = part[[x_name, y_name, "bandwidth"]].set_index([y_name, x_name])
        # convert to a 2D table: rows are y_name values, columns are x_name values
        bandwidth = part["bandwidth"].unstack(level=x_name)

        # figure setup
        fig, ax = plt.subplots()
        ax.set_xlabel(x_name)
        ax.set_ylabel(y_name)
        if label:
            ax.set_title(f"{label} bandwidth [GiB/s]")
        else:
            ax.set_title("bandwidth [GiB/s]")

        # plot the heatmap and colorbar
        im = ax.imshow(bandwidth, interpolation=None, **kwargs)
        cbar = ax.figure.colorbar(im, ax=ax, **cbar_kw)
        cbar.ax.set_ylabel("bandwidth", rotation=-90, va="bottom")

        # set ticks and their labels (lists, not generators: matplotlib
        # expects a sequence of labels)
        ax.set_xticks(numpy.arange(len(bandwidth.columns)))
        ax.set_yticks(numpy.arange(len(bandwidth.index)))
        ax.set_xticklabels([int(n) for n in bandwidth.columns])
        ax.set_yticklabels([int(n) for n in bandwidth.index])

        # rotate xtick labels and set their alignment
        plt.setp(ax.get_xticklabels(), rotation=45, ha="right", rotation_mode="anchor")

        yield fig, ax
+ + :param fig: a matplotlib figure instance + :param format: output image format (passed to fig.savefig) + """ + stream = io.BytesIO() + # bbox_inches: expand the canvas to include the legend that was put outside the plot + # see https://stackoverflow.com/a/43439132 + fig.savefig(stream, format=format, bbox_inches="tight") + data = stream.getvalue() + + if format == "svg": + return data.decode("utf-8") + data = base64.b64encode(data).decode("utf-8") + return f"" diff --git a/src/Python/CMakeLists.txt b/src/Python/CMakeLists.txt index 505e5f19429d3b9ecf5c1e6010a3293ca347e3a4..87f2c9cc12f87d337c739aafda5c72c28bf58af8 100644 --- a/src/Python/CMakeLists.txt +++ b/src/Python/CMakeLists.txt @@ -6,6 +6,8 @@ set( PYTHON_SITE_PACKAGES_DIR lib/python${PYTHON_VERSION_MAJOR}.${PYTHON_VERSION if( PYTHONINTERP_FOUND ) CONFIGURE_FILE( "__init__.py.in" "__init__.py" ) INSTALL( FILES ${CMAKE_CURRENT_BINARY_DIR}/__init__.py + BenchmarkLogs.py + BenchmarkPlots.py LogParser.py DESTINATION ${PYTHON_SITE_PACKAGES_DIR}/TNL ) endif() diff --git a/src/Python/LogParser.py b/src/Python/LogParser.py index 0c327d71c89fa242cd869076beb0f19e531b8839..e3c9c672a82f8cbc0fc6afc0f10ee357436c6148 100644 --- a/src/Python/LogParser.py +++ b/src/Python/LogParser.py @@ -1,5 +1,10 @@ #!/usr/bin/env python3 +import warnings +warnings.warn("The CustomLogging format for TNL benchmarks is deprecated. 
Please switch your benchmark " + "to JsonLogging and use the tnl-benchmark-to-html.py script for post-processing.", + DeprecationWarning) + import collections try: diff --git a/src/TNL/Benchmarks/Benchmarks.h b/src/TNL/Benchmarks/Benchmarks.h new file mode 100644 index 0000000000000000000000000000000000000000..4036fa6d46dc7a58fa7dfc6395a428e157532f2d --- /dev/null +++ b/src/TNL/Benchmarks/Benchmarks.h @@ -0,0 +1,165 @@ +/*************************************************************************** + Benchmarks.h - description + ------------------- + begin : Dec 30, 2015 + copyright : (C) 2015 by Tomas Oberhuber et al. + email : tomas.oberhuber@fjfi.cvut.cz + ***************************************************************************/ + +/* See Copyright Notice in tnl/Copyright */ + +// Implemented by: Jakub Klinkovsky, +// Tomas Oberhuber + +#pragma once + +#include "JsonLogging.h" + +#include + +#include +#include + +namespace TNL { +namespace Benchmarks { + +const double oneGB = 1024.0 * 1024.0 * 1024.0; + +struct BenchmarkResult +{ + using HeaderElements = typename Logging::HeaderElements; + using RowElements = typename Logging::RowElements; + + int loops = 0; + double time = std::numeric_limits::quiet_NaN(); + double stddev = std::numeric_limits::quiet_NaN(); + double bandwidth = std::numeric_limits::quiet_NaN(); + double speedup = std::numeric_limits::quiet_NaN(); + + virtual HeaderElements getTableHeader() const + { + return HeaderElements({ "time", "stddev", "stddev/time", "loops", "bandwidth", "speedup" }); + } + + virtual std::vector< int > getColumnWidthHints() const + { + return std::vector< int >({ 14, 14, 14, 6, 14, 14 }); + } + + virtual RowElements getRowElements() const + { + RowElements elements; + // write in scientific format to avoid precision loss + elements << std::scientific << time << stddev << stddev / time << loops << bandwidth; + if( speedup != 0 ) + elements << speedup; + else + elements << "N/A"; + return elements; + } +}; + +template< 
typename Logger = JsonLogging > +class Benchmark +{ + public: + using MetadataElement = typename Logger::MetadataElement; + using MetadataColumns = typename Logger::MetadataColumns; + using SolverMonitorType = Solvers::IterativeSolverMonitor< double, int >; + + Benchmark( std::ostream& output, int loops = 10, bool verbose = true ); + + static void configSetup( Config::ConfigDescription& config ); + + void setup( const Config::ParameterContainer& parameters ); + + void setLoops( int loops ); + + void setMinTime( double minTime ); + + bool isResetingOn() const; + + // Sets metadata columns -- values used for all subsequent rows until + // the next call to this function. + void setMetadataColumns( const MetadataColumns & metadata ); + + // Sets the value of one metadata column -- useful for iteratively + // changing MetadataColumns that were set using the previous method. + void setMetadataElement( const typename MetadataColumns::value_type & element ); + + // Sets the width of metadata columns when printed to the terminal. + void setMetadataWidths( const std::map< std::string, int > & widths ); + + // Sets the dataset size and base time for the calculations of bandwidth + // and speedup in the benchmarks result. + void setDatasetSize( const double datasetSize = 0.0, // in GB + const double baseTime = 0.0 ); + + // Sets current operation -- operations expand the table vertically + // - baseTime should be reset to 0.0 for most operations, but sometimes + // it is useful to override it + // - Order of operations inside a "Benchmark" does not matter, rows can be + // easily sorted while converting to HTML.) + void + setOperation( const String & operation, + const double datasetSize = 0.0, // in GB + const double baseTime = 0.0 ); + + // Times a single ComputeFunction. Subsequent calls implicitly split + // the current operation into sub-columns identified by "performer", + // which are further split into "bandwidth", "time" and "speedup" columns. 
+ template< typename Device, + typename ResetFunction, + typename ComputeFunction > + void time( ResetFunction reset, + const String & performer, + ComputeFunction & compute, + BenchmarkResult & result ); + + template< typename Device, + typename ResetFunction, + typename ComputeFunction > + BenchmarkResult time( ResetFunction reset, + const String & performer, + ComputeFunction & compute ); + + // The same methods as above but without the reset function + template< typename Device, + typename ComputeFunction > + void time( const String & performer, + ComputeFunction & compute, + BenchmarkResult & result ); + + template< typename Device, + typename ComputeFunction > + BenchmarkResult time( const String & performer, + ComputeFunction & compute ); + + // Adds an error message to the log. Should be called in places where the + // "time" method could not be called (e.g. due to failed allocation). + void addErrorMessage( const std::string& message ); + + SolverMonitorType& getMonitor(); + + double getBaseTime() const; + + protected: + Logger logger; + + int loops = 1; + + double minTime = 0.0; + + double datasetSize = 0.0; + + double baseTime = 0.0; + + bool reset = true; + + SolverMonitorType monitor; +}; + +} // namespace Benchmarks +} // namespace TNL + +#include "Benchmarks.hpp" diff --git a/src/TNL/Benchmarks/Benchmarks.hpp b/src/TNL/Benchmarks/Benchmarks.hpp new file mode 100644 index 0000000000000000000000000000000000000000..8aa3ae9c17f17493ba1ac1570e82aef8f4705128 --- /dev/null +++ b/src/TNL/Benchmarks/Benchmarks.hpp @@ -0,0 +1,234 @@ +/*************************************************************************** + Benchmarks.hpp - description + ------------------- + begin : Jun 7, 2021 + copyright : (C) 2021 by Tomas Oberhuber et al. 
+ email : tomas.oberhuber@fjfi.cvut.cz + ***************************************************************************/ + +/* See Copyright Notice in tnl/Copyright */ + +// Implemented by: Jakub Klinkovsky, +// Tomas Oberhuber + +#pragma once + +#include "Benchmarks.h" +#include "Utils.h" + +#include +#include + +namespace TNL { +namespace Benchmarks { + + +template< typename Logger > +Benchmark< Logger >:: +Benchmark( std::ostream& output, int loops, bool verbose ) +: logger(output, verbose), loops(loops) +{} + +template< typename Logger > +void +Benchmark< Logger >:: +configSetup( Config::ConfigDescription& config ) +{ + config.addEntry< int >( "loops", "Number of iterations for every computation.", 10 ); + config.addEntry< bool >( "reset", "Call reset function between loops.", true ); + config.addEntry< double >( "min-time", "Minimal real time in seconds for every computation.", 0.0 ); + config.addEntry< int >( "verbose", "Verbose mode, the higher number the more verbosity.", 1 ); +} + +template< typename Logger > +void +Benchmark< Logger >:: +setup( const Config::ParameterContainer& parameters ) +{ + this->loops = parameters.getParameter< int >( "loops" ); + this->reset = parameters.getParameter< bool >( "reset" ); + this->minTime = parameters.getParameter< double >( "min-time" ); + const int verbose = parameters.getParameter< int >( "verbose" ); + logger.setVerbose( verbose ); +} + +template< typename Logger > +void +Benchmark< Logger >:: +setLoops( int loops ) +{ + this->loops = loops; +} + +template< typename Logger > +void +Benchmark< Logger >:: +setMinTime( double minTime ) +{ + this->minTime = minTime; +} + +template< typename Logger > +bool +Benchmark< Logger >:: +isResetingOn() const +{ + return reset; +} + +template< typename Logger > +void +Benchmark< Logger >:: +setMetadataColumns( const MetadataColumns & metadata ) +{ + logger.setMetadataColumns( metadata ); +} + +template< typename Logger > +void +Benchmark< Logger >:: +setMetadataElement( const 
typename MetadataColumns::value_type & element ) +{ + logger.setMetadataElement( element ); +} + +template< typename Logger > +void +Benchmark< Logger >:: +setMetadataWidths( const std::map< std::string, int > & widths ) +{ + logger.setMetadataWidths( widths ); +} + +template< typename Logger > +void +Benchmark< Logger >:: +setDatasetSize( const double datasetSize, + const double baseTime ) +{ + this->datasetSize = datasetSize; + this->baseTime = baseTime; +} + +template< typename Logger > +void +Benchmark< Logger >:: +setOperation( const String & operation, + const double datasetSize, + const double baseTime ) +{ + monitor.setStage( operation.getString() ); + logger.setMetadataElement( {"operation", operation}, 0 ); + setDatasetSize( datasetSize, baseTime ); +} + +template< typename Logger > + template< typename Device, + typename ResetFunction, + typename ComputeFunction > +void +Benchmark< Logger >:: +time( ResetFunction reset, + const String & performer, + ComputeFunction & compute, + BenchmarkResult & result ) +{ + result.time = std::numeric_limits::quiet_NaN(); + result.stddev = std::numeric_limits::quiet_NaN(); + + // run the monitor main loop + Solvers::SolverMonitorThread monitor_thread( monitor ); + if( logger.getVerbose() <= 1 ) + // stop the main loop when not verbose + monitor.stopMainLoop(); + + std::string errorMessage; + try { + if( this->reset ) + std::tie( result.loops, result.time, result.stddev ) = timeFunction< Device >( compute, reset, loops, minTime, monitor ); + else { + auto noReset = [] () {}; + std::tie( result.loops, result.time, result.stddev ) = timeFunction< Device >( compute, noReset, loops, minTime, monitor ); + } + } + catch ( const std::exception& e ) { + errorMessage = "timeFunction failed due to a C++ exception with description: " + std::string(e.what()); + std::cerr << errorMessage << std::endl; + } + + result.bandwidth = datasetSize / result.time; + result.speedup = this->baseTime / result.time; + if( this->baseTime == 0.0 ) + 
this->baseTime = result.time; + + logger.logResult( performer, result.getTableHeader(), result.getRowElements(), result.getColumnWidthHints(), errorMessage ); +} + +template< typename Logger > + template< typename Device, + typename ResetFunction, + typename ComputeFunction > +BenchmarkResult +Benchmark< Logger >:: +time( ResetFunction reset, + const String& performer, + ComputeFunction& compute ) +{ + BenchmarkResult result; + time< Device >( reset, performer, compute, result ); + return result; +} + +template< typename Logger > + template< typename Device, + typename ComputeFunction > +void +Benchmark< Logger >:: +time( const String & performer, + ComputeFunction & compute, + BenchmarkResult & result ) +{ + auto noReset = [] () {}; + time< Device >( noReset, performer, compute, result ); +} + +template< typename Logger > + template< typename Device, + typename ComputeFunction > +BenchmarkResult +Benchmark< Logger >:: +time( const String & performer, + ComputeFunction & compute ) +{ + BenchmarkResult result; + time< Device >( performer, compute, result ); + return result; +} + +template< typename Logger > +void +Benchmark< Logger >:: +addErrorMessage( const std::string& message ) +{ + logger.writeErrorMessage( message ); + std::cerr << message << std::endl; +} + +template< typename Logger > +auto +Benchmark< Logger >:: +getMonitor() -> SolverMonitorType& +{ + return monitor; +} + +template< typename Logger > +double +Benchmark< Logger >:: +getBaseTime() const +{ + return baseTime; +} + +} // namespace Benchmarks +} // namespace TNL diff --git a/src/TNL/Benchmarks/CustomLogging.h b/src/TNL/Benchmarks/CustomLogging.h new file mode 100644 index 0000000000000000000000000000000000000000..d734ecbe15a66c69d632081a3d55fd465c20ba24 --- /dev/null +++ b/src/TNL/Benchmarks/CustomLogging.h @@ -0,0 +1,148 @@ +/*************************************************************************** + CustomLogging.h - description + ------------------- + begin : May 11, 2021 + copyright : (C) 
2021 by Tomas Oberhuber et al. + email : tomas.oberhuber@fjfi.cvut.cz + ***************************************************************************/ + +/* See Copyright Notice in tnl/Copyright */ + +// Implemented by: Jakub Klinkovsky, +// Tomas Oberhuber + +#pragma once + +#include "Logging.h" +#include + +namespace TNL { +namespace Benchmarks { + +class CustomLogging +: public Logging +{ +public: + // inherit constructors + using Logging::Logging; + + void + writeTableHeader( const std::string & spanningElement, + const HeaderElements & subElements ) + { + if( verbose && header_changed ) { + for( auto & it : metadataColumns ) { + const int width = (metadataWidths.count( it.first )) ? metadataWidths[ it.first ] : 15; + std::cout << std::setw( width ) << it.first; + } + + // spanning element is printed as usual column to stdout, + // but is excluded from header + std::cout << std::setw( 15 ) << ""; + + for( auto & it : subElements ) { + std::cout << std::setw( 15 ) << it; + } + std::cout << std::endl; + + header_changed = false; + } + + // initial indent string + log << std::endl; + for( auto & it : metadataColumns ) { + log << "! " << it.first << std::endl; + } + + log << "! " << spanningElement << std::endl; + for( auto & it : subElements ) { + log << "!! " << it << std::endl; + } + } + + void + writeTableRow( const std::string & spanningElement, + const RowElements & subElements, + const std::string & errorMessage ) + { + if( verbose ) { + for( auto & it : metadataColumns ) { + const int width = (metadataWidths.count( it.first )) ? 
metadataWidths[ it.first ] : 15; + std::cout << std::setw( width ) << it.second; + } + // spanning element is printed as usual column to stdout + std::cout << std::setw( 15 ) << spanningElement; + for( auto & it : subElements ) { + std::cout << std::setw( 15 ) << it; + } + std::cout << std::endl; + } + + // only when changed (the header has been already adjusted) + // print each element on separate line + for( auto & it : metadataColumns ) { + log << it.second << std::endl; + } + + if( errorMessage.empty() ) { + // benchmark data are indented + const std::string indent = " "; + for( auto & it : subElements ) { + log << indent << it << std::endl; + } + } + else { + // write the message + log << errorMessage << std::endl; + } + } + + virtual void + logResult( const std::string& performer, + const HeaderElements& headerElements, + const RowElements& rowElements, + const WidthHints& columnWidthHints, + const std::string& errorMessage = "" ) override + { + TNL_ASSERT_EQ( headerElements.size(), rowElements.size(), "elements must have equal sizes" ); + TNL_ASSERT_EQ( headerElements.size(), columnWidthHints.size(), "elements must have equal sizes" ); + writeTableHeader( performer, headerElements ); + writeTableRow( performer, rowElements, errorMessage ); + } + + virtual void + writeErrorMessage( const std::string& message ) override + { + // initial indent string + log << std::endl; + for( auto & it : metadataColumns ) { + log << "! 
" << it.first << std::endl; + } + + // only when changed (the header has been already adjusted) + // print each element on separate line + for( auto & it : metadataColumns ) { + log << it.second << std::endl; + } + + // write the message + log << message << std::endl; + } + +protected: + // manual double -> string conversion with fixed precision + static std::string + _to_string( double num, int precision = 0, bool fixed = false ) + { + std::stringstream str; + if( fixed ) + str << std::fixed; + if( precision ) + str << std::setprecision( precision ); + str << num; + return std::string( str.str().data() ); + } +}; + +} // namespace Benchmarks +} // namespace TNL diff --git a/src/TNL/Benchmarks/JsonLogging.h b/src/TNL/Benchmarks/JsonLogging.h new file mode 100644 index 0000000000000000000000000000000000000000..d74ecbdf23660a0aa907b9e94307cfcf1e9c918f --- /dev/null +++ b/src/TNL/Benchmarks/JsonLogging.h @@ -0,0 +1,137 @@ +/*************************************************************************** + JsonLogging.h - description + ------------------- + begin : May 11, 2021 + copyright : (C) 2021 by Tomas Oberhuber et al. + email : tomas.oberhuber@fjfi.cvut.cz + ***************************************************************************/ + +/* See Copyright Notice in tnl/Copyright */ + +// Implemented by: Jakub Klinkovsky, +// Tomas Oberhuber + +#pragma once + +#include "Logging.h" +#include + +namespace TNL { +namespace Benchmarks { + +class JsonLogging +: public Logging +{ +public: + // inherit constructors + using Logging::Logging; + + void writeHeader( const HeaderElements& headerElements, const WidthHints& widths ) + { + TNL_ASSERT_EQ( headerElements.size(), widths.size(), "elements must have equal sizes" ); + if( verbose && header_changed ) + { + for( auto & lg : metadataColumns ) { + const int width = (metadataWidths.count( lg.first )) ? 
metadataWidths[ lg.first ] : 14; + std::cout << std::setw( width ) << lg.first; + } + for( std::size_t i = 0; i < headerElements.size(); i++ ) + std::cout << std::setw( widths[ i ] ) << headerElements[ i ]; + std::cout << std::endl; + header_changed = false; + } + } + + void writeRow( const HeaderElements& headerElements, + const RowElements& rowElements, + const WidthHints& widths, + const std::string& errorMessage ) + { + TNL_ASSERT_EQ( headerElements.size(), rowElements.size(), "elements must have equal sizes" ); + TNL_ASSERT_EQ( headerElements.size(), widths.size(), "elements must have equal sizes" ); + + log << "{"; + + // write common logs + int idx( 0 ); + for( auto lg : this->metadataColumns ) + { + if( verbose ) { + const int width = (metadataWidths.count( lg.first )) ? metadataWidths[ lg.first ] : 14; + std::cout << std::setw( width ) << lg.second; + } + if( idx++ > 0 ) + log << ", "; + log << "\"" << lg.first << "\": \"" << lg.second << "\""; + } + + std::size_t i = 0; + for( auto el : rowElements ) + { + if( verbose ) + std::cout << std::setw( widths[ i ] ) << el; + if( idx++ > 0 ) + log << ", "; + log << "\"" << headerElements[ i ] << "\": \"" << el << "\""; + i++; + } + if( ! 
errorMessage.empty() ) { + if( idx++ > 0 ) + log << ", "; + log << "\"error\": \"" << errorMessage << "\""; + } + log << "}" << std::endl; + if( verbose ) + std::cout << std::endl; + } + + virtual void + logResult( const std::string& performer, + const HeaderElements& headerElements, + const RowElements& rowElements, + const WidthHints& columnWidthHints, + const std::string& errorMessage = "" ) override + { + setMetadataElement({ "performer", performer }); + writeHeader( headerElements, columnWidthHints ); + writeRow( headerElements, rowElements, columnWidthHints, errorMessage ); + } + + virtual void + writeErrorMessage( const std::string& message ) override + { + log << "{"; + + // write common logs + int idx( 0 ); + for( auto lg : this->metadataColumns ) + { + if( idx++ > 0 ) + log << ", "; + log << "\"" << lg.first << "\": \"" << lg.second << "\""; + } + + if( idx++ > 0 ) + log << ", "; + log << "\"error\": \"" << message << "\""; + + log << "}" << std::endl; + } + +protected: + // manual double -> string conversion with fixed precision + static std::string + _to_string( double num, int precision = 0, bool fixed = false ) + { + std::stringstream str; + if( fixed ) + str << std::fixed; + if( precision ) + str << std::setprecision( precision ); + str << num; + return std::string( str.str().data() ); + } +}; + +} // namespace Benchmarks +} // namespace TNL diff --git a/src/TNL/Benchmarks/Logging.h b/src/TNL/Benchmarks/Logging.h new file mode 100644 index 0000000000000000000000000000000000000000..693d02ee796556f4c852bfcaf19f92f921074939 --- /dev/null +++ b/src/TNL/Benchmarks/Logging.h @@ -0,0 +1,181 @@ +/*************************************************************************** + Logging.h - description + ------------------- + begin : Dec 25, 2018 + copyright : (C) 2018 by Tomas Oberhuber et al. 
// Collects the values of one log row as strings. Every streamed value is
// formatted with the current stream settings and appended as a new element;
// stream manipulators only change the formatting of the following values.
class LoggingRowElements
{
   public:

      // Values are formatted in fixed notation with 6 digits by default.
      LoggingRowElements()
      {
         stream << std::setprecision( 6 ) << std::fixed;
      }

      // Formats the value and appends it as a new element.
      template< typename T >
      LoggingRowElements& operator << ( const T& b )
      {
         stream << b;
         elements.push_back( stream.str() );
         stream.str( std::string() );
         return *this;
      }

      // Applies std::setprecision without adding an element. The parameter
      // must be a const reference: std::setprecision( n ) is a temporary,
      // which cannot bind to a non-const reference and would otherwise be
      // picked up by the generic overload above as an (empty) element.
      LoggingRowElements& operator << ( const decltype( std::setprecision( 2 ) )& setprec )
      {
         stream << setprec;
         return *this;
      }

      // Applies std::fixed without adding an element.
      LoggingRowElements& operator << ( decltype( std::fixed )& setfixed ) // the same works also for std::scientific
      {
         stream << setfixed;
         return *this;
      }

      // Number of collected elements.
      std::size_t size() const noexcept { return elements.size(); }

      // iterators
      auto begin() noexcept { return elements.begin(); }

      auto begin() const noexcept { return elements.begin(); }

      auto cbegin() const noexcept { return elements.cbegin(); }

      auto end() noexcept { return elements.end(); }

      auto end() const noexcept { return elements.end(); }

      auto cend() const noexcept { return elements.cend(); }

   protected:
      // formatted values in insertion order
      std::list< std::string > elements;

      // formatting buffer; it is emptied after each value, but keeps the
      // formatting flags and precision
      std::stringstream stream;
};
but only if we got an open file + // (under MPI, only the master rank typically opens the log file and thus + // logs from other ranks are ignored here) + file.exceptions( std::ostream::failbit | std::ostream::badbit | std::ostream::eofbit ); + } + catch( std::bad_cast& ) { + // also enable exceptions if we did not get a file + log.exceptions( std::ostream::failbit | std::ostream::badbit | std::ostream::eofbit ); + } + } + + void + setVerbose( int verbose ) + { + this->verbose = verbose; + } + + int getVerbose() const + { + return verbose; + } + + virtual void setMetadataColumns( const MetadataColumns& elements ) + { + // check if a header element changed (i.e. a first item of the pairs) + if( metadataColumns.size() != elements.size() ) + header_changed = true; + else + for( std::size_t i = 0; i < metadataColumns.size(); i++ ) + if( metadataColumns[ i ].first != elements[ i ].first ) { + header_changed = true; + break; + } + metadataColumns = elements; + } + + virtual void + setMetadataElement( const typename MetadataColumns::value_type & element, + int insertPosition = -1 /* negative values insert from the end */ ) + { + bool found = false; + for( auto & it : metadataColumns ) + if( it.first == element.first ) { + if( it.second != element.second ) + it.second = element.second; + found = true; + break; + } + if( ! 
found ) { + if( insertPosition < 0 ) + metadataColumns.insert( metadataColumns.end() + insertPosition + 1, element ); + else + metadataColumns.insert( metadataColumns.begin() + insertPosition, element ); + header_changed = true; + } + } + + virtual void + setMetadataWidths( const std::map< std::string, int > & widths ) + { + for( auto & it : widths ) + if( metadataWidths.count( it.first ) ) + metadataWidths[ it.first ] = it.second; + else + metadataWidths.insert( it ); + } + + virtual void + logResult( const std::string& performer, + const HeaderElements& headerElements, + const RowElements& rowElements, + const WidthHints& columnWidthHints, + const std::string& errorMessage = "" ) = 0; + + virtual void writeErrorMessage( const std::string& message ) = 0; + +protected: + std::ostream& log; + int verbose = 0; + + MetadataColumns metadataColumns; + std::map< std::string, int > metadataWidths; + bool header_changed = true; +}; + +} // namespace Benchmarks +} // namespace TNL diff --git a/src/TNL/Benchmarks/Utils.h b/src/TNL/Benchmarks/Utils.h new file mode 100644 index 0000000000000000000000000000000000000000..e1e243437d70f435db9ca4ca66cda350595bf6fa --- /dev/null +++ b/src/TNL/Benchmarks/Utils.h @@ -0,0 +1,186 @@ +/*************************************************************************** + Utils.h - description + ------------------- + begin : Dec 25, 2018 + copyright : (C) 2018 by Tomas Oberhuber et al. + email : tomas.oberhuber@fjfi.cvut.cz + ***************************************************************************/ + +/* See Copyright Notice in tnl/Copyright */ + +// Implemented by: Jakub Klinkovsky, +// Tomas Oberhuber + +#pragma once + +#include +#include +#include +#include + +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +namespace TNL { +namespace Benchmarks { + +// returns a tuple of (loops, mean, stddev) where loops is the number of +// performed loops (i.e. 
timing samples), mean is the arithmetic mean of the +// computation times and stddev is the sample standard deviation +template< typename Device, + typename ComputeFunction, + typename ResetFunction, + typename Monitor = TNL::Solvers::IterativeSolverMonitor< double, int > > +std::tuple< int, double, double > +timeFunction( ComputeFunction compute, + ResetFunction reset, + int maxLoops, + const double& minTime, + Monitor && monitor = Monitor() ) +{ + // the timer is constructed zero-initialized and stopped + Timer timer; + + // set timer to the monitor + monitor.setTimer( timer ); + + // warm up + reset(); + compute(); + + Containers::Vector< double > results( maxLoops ); + results.setValue( 0.0 ); + + int loops; + for( loops = 0; + loops < maxLoops || sum( results ) < minTime; + loops++ ) + { + // abuse the monitor's "time" for loops + monitor.setTime( loops + 1 ); + reset(); + + // Explicit synchronization of the CUDA device +#ifdef HAVE_CUDA + if( std::is_same< Device, Devices::Cuda >::value ) + cudaDeviceSynchronize(); +#endif + + // reset timer before each computation + timer.reset(); + timer.start(); + compute(); +#ifdef HAVE_CUDA + if( std::is_same< Device, Devices::Cuda >::value ) + cudaDeviceSynchronize(); +#endif + timer.stop(); + + results[ loops ] = timer.getRealTime(); + } + + const double mean = sum( results ) / (double) loops; + double stddev; + if( loops > 1 ) + stddev = 1.0 / std::sqrt( loops - 1 ) * l2Norm( results - mean ); + else + stddev = std::numeric_limits::quiet_NaN(); + return std::make_tuple( loops, mean, stddev ); +} + +inline std::map< std::string, std::string > getHardwareMetadata() +{ + const int cpu_id = 0; + const CacheSizes cacheSizes = SystemInfo::getCPUCacheSizes( cpu_id ); + const std::string cacheInfo = std::to_string( cacheSizes.L1data ) + ", " + + std::to_string( cacheSizes.L1instruction ) + ", " + + std::to_string( cacheSizes.L2 ) + ", " + + std::to_string( cacheSizes.L3 ); +#ifdef HAVE_CUDA + const int activeGPU = 
Cuda::DeviceInfo::getActiveDevice(); + const std::string deviceArch = std::to_string( Cuda::DeviceInfo::getArchitectureMajor( activeGPU ) ) + "." + + std::to_string( Cuda::DeviceInfo::getArchitectureMinor( activeGPU ) ); +#endif + +#ifdef HAVE_MPI + int nproc = 1; + // check if MPI was initialized (some benchmarks do not initialize MPI even when + // they are built with HAVE_MPI and thus MPI::GetSize() cannot be used blindly) + if( TNL::MPI::Initialized() ) + nproc = TNL::MPI::GetSize(); +#endif + + std::map< std::string, std::string > metadata { + { "host name", SystemInfo::getHostname() }, + { "architecture", SystemInfo::getArchitecture() }, + { "system", SystemInfo::getSystemName() }, + { "system release", SystemInfo::getSystemRelease() }, + { "start time", SystemInfo::getCurrentTime() }, +#ifdef HAVE_MPI + { "number of MPI processes", std::to_string( nproc ) }, +#endif + { "OpenMP enabled", std::to_string( Devices::Host::isOMPEnabled() ) }, + { "OpenMP threads", std::to_string( Devices::Host::getMaxThreadsCount() ) }, + { "CPU model name", SystemInfo::getCPUModelName( cpu_id ) }, + { "CPU cores", std::to_string( SystemInfo::getNumberOfCores( cpu_id ) ) }, + { "CPU threads per core", std::to_string( SystemInfo::getNumberOfThreads( cpu_id ) / SystemInfo::getNumberOfCores( cpu_id ) ) }, + { "CPU max frequency (MHz)", std::to_string( SystemInfo::getCPUMaxFrequency( cpu_id ) / 1e3 ) }, + { "CPU cache sizes (L1d, L1i, L2, L3) (kiB)", cacheInfo }, +#ifdef HAVE_CUDA + { "GPU name", Cuda::DeviceInfo::getDeviceName( activeGPU ) }, + { "GPU architecture", deviceArch }, + { "GPU CUDA cores", std::to_string( Cuda::DeviceInfo::getCudaCores( activeGPU ) ) }, + { "GPU clock rate (MHz)", std::to_string( (double) Cuda::DeviceInfo::getClockRate( activeGPU ) / 1e3 ) }, + { "GPU global memory (GB)", std::to_string( (double) Cuda::DeviceInfo::getGlobalMemory( activeGPU ) / 1e9 ) }, + { "GPU memory clock rate (MHz)", std::to_string( (double) Cuda::DeviceInfo::getMemoryClockRate( 
activeGPU ) / 1e3 ) }, + { "GPU memory ECC enabled", std::to_string( Cuda::DeviceInfo::getECCEnabled( activeGPU ) ) }, +#endif + }; + + return metadata; +} + +inline void writeMapAsJson( const std::map< std::string, std::string >& data, + std::ostream& out ) +{ + out << "{\n"; + for( auto it = data.begin(); it != data.end(); ) { + out << "\t\"" << it->first << "\": \"" << it->second << "\""; + // increment the iterator now to peek at the next element + it++; + // write a comma if there are still elements remaining + if( it != data.end() ) + out << ","; + out << "\n"; + } + out << "}\n" << std::flush; +} + +inline void writeMapAsJson( const std::map< std::string, std::string >& data, + std::string filename, + std::string newExtension = "" ) +{ + namespace fs = std::experimental::filesystem; + + if( newExtension != "" ) { + const fs::path oldPath = filename; + const fs::path newPath = oldPath.parent_path() / ( oldPath.stem().string() + newExtension ); + filename = newPath; + } + + std::ofstream file( filename ); + // enable exceptions + file.exceptions( std::ostream::failbit | std::ostream::badbit | std::ostream::eofbit ); + writeMapAsJson( data, file ); +} + +} // namespace Benchmarks +} // namespace TNL diff --git a/src/Tools/CMakeLists.txt b/src/Tools/CMakeLists.txt index deb03b47562d32d4739dd99a95f69bff82becc3f..84a05dd9ca25620402d48d35b5981841025895ad 100644 --- a/src/Tools/CMakeLists.txt +++ b/src/Tools/CMakeLists.txt @@ -79,5 +79,6 @@ INSTALL( TARGETS tnl-init DESTINATION bin ) INSTALL( PROGRAMS tnl-err2eoc + tnl-benchmark-to-html.py tnl-log-to-html.py DESTINATION bin ) diff --git a/src/Tools/tnl-benchmark-to-html.py b/src/Tools/tnl-benchmark-to-html.py new file mode 100755 index 0000000000000000000000000000000000000000..70c79349129a318f7ae2f22893619eab15e2d417 --- /dev/null +++ b/src/Tools/tnl-benchmark-to-html.py @@ -0,0 +1,93 @@ +#!/usr/bin/python3 + +import sys +import os.path +import matplotlib.pyplot as plt + +from TNL.BenchmarkLogs import * +from 
TNL.BenchmarkPlots import *
+
+# Command-line handling: one or two arguments are accepted; with any other
+# count the usage text is printed to stderr and the script exits with status 1.
+if len(sys.argv) < 2 or len(sys.argv) > 3:
+    print(f"""\
+usage: {sys.argv[0]} FILE.log [OUTPUT.html]
+
+where FILE.log contains one JSON record per line,
+and OUTPUT.html is the output file name (by default, OUTPUT=FILE).
+""", file=sys.stderr)
+    sys.exit(1)
+
+# The output name defaults to the log file name with the extension replaced
+# by ".html".
+logFile = sys.argv[1]
+if len(sys.argv) > 2:
+    htmlFile = sys.argv[2]
+else:
+    htmlFile = os.path.splitext(logFile)[0] + ".html"
+
+
+# The page title comes from the "title" metadata entry when present, otherwise
+# from the base name of the log file.
+metadata = get_benchmark_metadata(logFile)
+if metadata is not None and "title" in metadata:
+    title = metadata["title"]
+else:
+    title = os.path.splitext(os.path.basename(logFile))[0]
+# materialize the (operation, dataframe) pairs, because they are iterated
+# twice below (table of contents and the sections)
+dataframes = list(gen_dataframes_per_operation(logFile))
+
+# NOTE(review): in this copy of the patch the string literals passed to print()
+# below appear to have lost their HTML markup (some are empty, some are split
+# across lines); verify them against the original file.
+print(f"Writing output to {htmlFile}")
+with open(htmlFile, 'w') as f:
+    print("", file=f)
+    # add some basic style
+    print("""\
+
+
+
+
+""", file=f)
+
+    print(f"

{title}

", file=f)
+    if metadata is not None:
+        print(dict_to_html_table(metadata), file=f)
+
+    # create a TOC
+    print(f"

Table of contents

", file=f)
+    print("
    ", file=f)
+    for op, df in dataframes:
+        id = op.replace(" ", "_")
+        print(f"
  1. {op}
  2. ", file=f)
+    print("
", file=f)
+
+    # formatters for specific columns of the table
+    formatters = {
+        "stddev": lambda value: f"{value:e}",
+        "bandwidth": lambda value: f"{value:.3f}",
+        "speedup": lambda value: f"{value:.3f}",
+    }
+
+    # one section per operation: heading, table and optional plots
+    for op, df in dataframes:
+        # section heading
+        id = op.replace(" ", "_")
+        print(f"

{op}

", file=f)
+        # table
+        print(df.to_html(classes="benchmark", formatters=formatters), file=f)
+
+        # graphs
+        # (drawn only when the index has a "size" or "DOFs" level; "size" takes
+        # precedence when both are present)
+        size_name = None
+        if "size" in df.index.names:
+            size_name = "size"
+        elif "DOFs" in df.index.names:
+            size_name = "DOFs"
+        if size_name is not None:
+            fig, ax = plot_bandwidth_vs_size(df, size_name)
+            print(get_image_html_tag(fig, format="png"), file=f)
+            # close the figure once it is embedded in the page
+            plt.close(fig)
+
+        # heatmaps
+        # (drawn only when the index has both "rows" and "columns" levels)
+        if "rows" in df.index.names and "columns" in df.index.names:
+            for fig, ax in heatmaps_bandwidth(df):
+                print(get_image_html_tag(fig, format="png"), file=f)
+                plt.close(fig)
+
+    print("", file=f)
+    print("", file=f)
diff --git a/src/Tools/tnl-log-to-html.py b/src/Tools/tnl-log-to-html.py
index bb9577298871c54464d14e111d21180e6738e71d..a42c417c2f15ab3d41af13ac14ddeaec9c6a36f8 100755
--- a/src/Tools/tnl-log-to-html.py
+++ b/src/Tools/tnl-log-to-html.py
@@ -1,5 +1,10 @@
 #!/usr/bin/env python3
 
+import warnings
+warnings.warn("The CustomLogging format for TNL benchmarks is deprecated. Please switch your benchmark "
+              "to JsonLogging and use the tnl-benchmark-to-html.py script for post-processing.",
+              DeprecationWarning)
+
 import sys
 from TNL.LogParser import LogParser