diff --git a/CMakeLists.txt b/CMakeLists.txt index 851e28ec2e6db851e449ece0d037a3d40f41589c..b7540a169070b96faf9ad644d4faaf9f7f646bc5 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -29,6 +29,7 @@ option(WITH_TESTS "Build tests" ON) option(WITH_COVERAGE "Enable code coverage reports from unit tests" OFF) option(WITH_EXAMPLES "Compile the 'examples' directory" ON) option(WITH_TOOLS "Compile the 'src/Tools' directory" ON) +option(WITH_BENCHMARKS "Compile the 'src/Benchmarks' directory" ON) option(WITH_PYTHON "Compile the Python bindings" ON) option(WITH_TEMPLATES_INSTANTIATION "Enable explicit template instantiation" OFF) @@ -473,6 +474,7 @@ message( " WITH_TESTS=${WITH_TESTS}" ) message( " WITH_COVERAGE=${WITH_COVERAGE}" ) message( " WITH_EXAMPLES=${WITH_EXAMPLES}" ) message( " WITH_TOOLS=${WITH_TOOLS}" ) +message( " WITH_BENCHMARKS=${WITH_BENCHMARKS}" ) message( " WITH_PYTHON=${WITH_PYTHON}" ) message( " WITH_TEMPLATES_INSTANTIATION=${WITH_TEMPLATES_INSTANTIATION}" ) # Print compiler options diff --git a/build b/build index 13929b20258bc8c9f5dbf7ad7cd7015ed0e35b19..e0c8dbb993e592c420aa62abd991c189fbff4870 100755 --- a/build +++ b/build @@ -26,6 +26,7 @@ WITH_COVERAGE="no" WITH_EXAMPLES="yes" WITH_PYTHON="yes" WITH_TOOLS="yes" +WITH_BENCHMARKS="yes" WITH_TEMPLATE_INSTANTIATION="no" INSTANTIATE_LONG_INT="no" @@ -60,6 +61,7 @@ do --with-coverage=* ) WITH_COVERAGE="${option#*=}" ;; --with-examples=* ) WITH_EXAMPLES="${option#*=}" ;; --with-tools=* ) WITH_TOOLS="${option#*=}" ;; + --with-benchmarks=* ) WITH_BENCHMARKS="${option#*=}" ;; --with-python=* ) WITH_PYTHON="${option#*=}" ;; --with-templates-instantiation=* ) WITH_TEMPLATE_INSTANTIATION="${option#*=}" ;; --instantiate-long-int=* ) INSTANTIATE_LONG_INT="${option#*=}" ;; @@ -164,6 +166,7 @@ cmake_command=( -DWITH_COVERAGE=${WITH_COVERAGE} -DWITH_EXAMPLES=${WITH_EXAMPLES} -DWITH_TOOLS=${WITH_TOOLS} + -DWITH_BENCHMARKS=${WITH_BENCHMARKS} -DWITH_PYTHON=${WITH_PYTHON} -DDCMTK_DIR=${DCMTK_DIR} -DWITH_TEMPLATE_INSTANTIATION=${WITH_TEMPLATE_INSTANTIATION} diff --git a/src/Benchmarks/BLAS/CMakeLists.txt b/src/Benchmarks/BLAS/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..dc396c46e60512b5cab5f5f7455756f609b76c47 --- /dev/null +++ b/src/Benchmarks/BLAS/CMakeLists.txt @@ -0,0 +1,10 @@ +if( BUILD_CUDA ) + CUDA_ADD_EXECUTABLE( tnl-benchmark-blas tnl-benchmark-blas.cu ) + CUDA_ADD_CUBLAS_TO_TARGET( tnl-benchmark-blas ) + TARGET_LINK_LIBRARIES( tnl-benchmark-blas tnl ) +else() + ADD_EXECUTABLE( tnl-benchmark-blas tnl-benchmark-blas.cpp ) + TARGET_LINK_LIBRARIES( tnl-benchmark-blas tnl ) +endif() + +install( TARGETS tnl-benchmark-blas RUNTIME DESTINATION bin ) diff --git a/src/Benchmarks/BLAS/array-operations.h b/src/Benchmarks/BLAS/array-operations.h new file mode 100644 index 0000000000000000000000000000000000000000..aacdb9cc65315af377ddec49c167a9a197483bd5 --- /dev/null +++ b/src/Benchmarks/BLAS/array-operations.h @@ -0,0 +1,165 @@ +/*************************************************************************** + array-operations.h - description + ------------------- + begin : Dec 30, 2015 + copyright : (C) 2015 by Tomas Oberhuber et al. + email : tomas.oberhuber@fjfi.cvut.cz + ***************************************************************************/ + +/* See Copyright Notice in tnl/Copyright */ + +// Implemented by: Jakub Klinkovsky + +#pragma once + +#include "../Benchmarks.h" + +#include <TNL/Containers/Array.h> + +namespace TNL { +namespace Benchmarks { + +template< typename Real = double, + typename Index = int > +bool +benchmarkArrayOperations( Benchmark & benchmark, + const int & loops, + const long & size ) +{ + typedef Containers::Array< Real, Devices::Host, Index > HostArray; + typedef Containers::Array< Real, Devices::Cuda, Index > CudaArray; + using namespace std; + + double datasetSize = ( double ) ( loops * size ) * sizeof( Real ) / oneGB; + + HostArray hostArray, hostArray2; + CudaArray deviceArray, deviceArray2; + hostArray.setSize( size ); + hostArray2.setSize( size ); +#ifdef HAVE_CUDA + deviceArray.setSize( size ); + deviceArray2.setSize( size ); +#endif + + Real resultHost, resultDevice; + + + // reset functions + auto reset1 = [&]() { + hostArray.setValue( 1.0 ); +#ifdef HAVE_CUDA + deviceArray.setValue( 1.0 ); +#endif + }; + auto reset2 = [&]() { + hostArray2.setValue( 1.0 ); +#ifdef HAVE_CUDA + deviceArray2.setValue( 1.0 ); +#endif + }; + auto reset12 = [&]() { + reset1(); + reset2(); + }; + + + reset12(); + + + auto compareHost = [&]() { + resultHost = (int) hostArray == hostArray2; + }; + auto compareCuda = [&]() { + resultDevice = (int) deviceArray == deviceArray2; + }; + benchmark.setOperation( "comparison (operator==)", 2 * datasetSize ); + benchmark.time( reset1, "CPU", compareHost ); +#ifdef HAVE_CUDA + benchmark.time( reset1, "GPU", compareCuda ); +#endif + + + auto copyAssignHostHost = [&]() { + hostArray = hostArray2; + }; + auto copyAssignCudaCuda = [&]() { + deviceArray = deviceArray2; + }; + benchmark.setOperation( "copy (operator=)", 2 * datasetSize ); + // copyBasetime is used later inside HAVE_CUDA guard, so the compiler will + // complain when compiling without CUDA + const double copyBasetime = benchmark.time( reset1, "CPU", copyAssignHostHost ); +#ifdef HAVE_CUDA + benchmark.time( reset1, "GPU", copyAssignCudaCuda ); +#endif + + + auto copyAssignHostCuda = [&]() { + deviceArray = hostArray; + }; + auto copyAssignCudaHost = [&]() { + hostArray = deviceArray; + }; +#ifdef HAVE_CUDA + benchmark.setOperation( "copy (operator=)", datasetSize, copyBasetime ); + benchmark.time( reset1, + "CPU->GPU", copyAssignHostCuda, + "GPU->CPU", copyAssignCudaHost ); +#endif + + + auto setValueHost = [&]() { + hostArray.setValue( 3.0 ); + }; + auto setValueCuda = [&]() { + deviceArray.setValue( 3.0 ); + }; + benchmark.setOperation( "setValue", datasetSize ); + benchmark.time( reset1, "CPU", setValueHost ); +#ifdef HAVE_CUDA + benchmark.time( reset1, "GPU", setValueCuda ); +#endif + + + auto setSizeHost = [&]() { + hostArray.setSize( size ); + }; + auto setSizeCuda = [&]() { + deviceArray.setSize( size ); + }; + auto resetSize1 = [&]() { + hostArray.reset(); +#ifdef HAVE_CUDA + deviceArray.reset(); +#endif + }; + benchmark.setOperation( "allocation (setSize)", datasetSize ); + benchmark.time( resetSize1, "CPU", setSizeHost ); +#ifdef HAVE_CUDA + benchmark.time( resetSize1, "GPU", setSizeCuda ); +#endif + + + auto resetSizeHost = [&]() { + hostArray.reset(); + }; + auto resetSizeCuda = [&]() { + deviceArray.reset(); + }; + auto setSize1 = [&]() { + hostArray.setSize( size ); +#ifdef HAVE_CUDA + deviceArray.setSize( size ); +#endif + }; + benchmark.setOperation( "deallocation (reset)", datasetSize ); + benchmark.time( setSize1, "CPU", resetSizeHost ); +#ifdef HAVE_CUDA + benchmark.time( setSize1, "GPU", resetSizeCuda ); +#endif + + return true; +} + +} // namespace Benchmarks +} // namespace TNL diff --git a/tests/benchmarks/cublasWrappers.h b/src/Benchmarks/BLAS/cublasWrappers.h similarity index 74% rename from tests/benchmarks/cublasWrappers.h rename to src/Benchmarks/BLAS/cublasWrappers.h index 6b71a4ed7befc12664440cf5425f5975e6feec0c..1e63e139d6faa513706ed1b18a207e87ea1a079d 100644 --- a/tests/benchmarks/cublasWrappers.h +++ b/src/Benchmarks/BLAS/cublasWrappers.h @@ -8,14 +8,14 @@ inline cublasStatus_t cublasIgamax( cublasHandle_t handle, int n, const float *x, int incx, int *result ) { - return cublasIsamax( handle, n, x, incx, result ); + return cublasIsamax( handle, n, x, incx, result ); } inline cublasStatus_t cublasIgamax( cublasHandle_t handle, int n, const double *x, int incx, int *result ) { - return cublasIdamax( handle, n, x, incx, result ); + return cublasIdamax( handle, n, x, incx, result ); } @@ -23,14 +23,14 @@ inline cublasStatus_t cublasIgamin( cublasHandle_t handle, int n, const float *x, int incx, int *result ) { - return cublasIsamin( handle, n, x, incx, result ); + return cublasIsamin( handle, n, x, incx, result ); } inline cublasStatus_t cublasIgamin( cublasHandle_t handle, int n, const double *x, int incx, int *result ) { - return cublasIdamin( handle, n, x, incx, result ); + return cublasIdamin( handle, n, x, incx, result ); } @@ -38,14 +38,14 @@ inline cublasStatus_t cublasGasum( cublasHandle_t handle, int n, const float *x, int incx, float *result ) { - return cublasSasum( handle, n, x, incx, result ); + return cublasSasum( handle, n, x, incx, result ); } inline cublasStatus_t cublasGasum( cublasHandle_t handle, int n, const double *x, int incx, double *result ) { - return cublasDasum( handle, n, x, incx, result ); + return cublasDasum( handle, n, x, incx, result ); } @@ -55,7 +55,7 @@ cublasGaxpy( cublasHandle_t handle, int n, const float *x, int incx, float *y, int incy ) { - return cublasSaxpy( handle, n, alpha, x, incx, y, incy ); + return cublasSaxpy( handle, n, alpha, x, incx, y, incy ); } inline cublasStatus_t @@ -64,7 +64,7 @@ cublasGaxpy( cublasHandle_t handle, int n, const double *x, int incx, double *y, int incy ) { - return cublasDaxpy( handle, n, alpha, x, incx, y, incy ); + return cublasDaxpy( handle, n, alpha, x, incx, y, incy ); } @@ -74,7 +74,7 @@ cublasGdot( cublasHandle_t handle, int n, const float *y, int incy, float *result ) { - return cublasSdot( handle, n, x, incx, y, incy, result ); + return cublasSdot( handle, n, x, incx, y, incy, result ); } inline cublasStatus_t @@ -83,7 +83,7 @@ cublasGdot( cublasHandle_t handle, int n, const double *y, int incy, double *result ) { - return cublasDdot( handle, n, x, incx, y, incy, result ); + return cublasDdot( handle, n, x, incx, y, incy, result ); } @@ -91,14 +91,14 @@ inline cublasStatus_t cublasGnrm2( cublasHandle_t handle, int n, const float *x, int incx, float *result ) { - return cublasSnrm2( handle, n, x, incx, result ); + return cublasSnrm2( handle, n, x, incx, result ); } inline cublasStatus_t cublasGnrm2( cublasHandle_t handle, int n, const double *x, int incx, double *result ) { - return cublasDnrm2( handle, n, x, incx, result ); + return cublasDnrm2( handle, n, x, incx, result ); } @@ -107,7 +107,7 @@ cublasGscal( cublasHandle_t handle, int n, const float *alpha, float *x, int incx ) { - return cublasSscal( handle, n, alpha, x, incx ); + return cublasSscal( handle, n, alpha, x, incx ); } inline cublasStatus_t @@ -115,7 +115,7 @@ cublasGscal( cublasHandle_t handle, int n, const double *alpha, double *x, int incx ) { - return cublasDscal( handle, n, alpha, x, incx ); + return cublasDscal( handle, n, alpha, x, incx ); } #endif diff --git a/src/Benchmarks/BLAS/spmv.h b/src/Benchmarks/BLAS/spmv.h new file mode 100644 index 0000000000000000000000000000000000000000..a6840af9f8c1f1dedeee876fd2bca258a55c64ba --- /dev/null +++ b/src/Benchmarks/BLAS/spmv.h @@ -0,0 +1,189 @@ +/*************************************************************************** + spmv.h - description + ------------------- + begin : Dec 30, 2015 + copyright : (C) 2015 by Tomas Oberhuber et al. + email : tomas.oberhuber@fjfi.cvut.cz + ***************************************************************************/ + +/* See Copyright Notice in tnl/Copyright */ + +// Implemented by: Jakub Klinkovsky + +#pragma once + +#include "../Benchmarks.h" + +#include <TNL/Containers/List.h> +#include <TNL/Pointers/DevicePointer.h> +#include <TNL/Matrices/CSR.h> +#include <TNL/Matrices/Ellpack.h> +#include <TNL/Matrices/SlicedEllpack.h> +#include <TNL/Matrices/ChunkedEllpack.h> + +namespace TNL { +namespace Benchmarks { + +// silly alias to match the number of template parameters with other formats +template< typename Real, typename Device, typename Index > +using SlicedEllpack = Matrices::SlicedEllpack< Real, Device, Index >; + +template< typename Matrix > +int setHostTestMatrix( Matrix& matrix, + const int elementsPerRow ) +{ + const int size = matrix.getRows(); + int elements( 0 ); + for( int row = 0; row < size; row++ ) { + int col = row - elementsPerRow / 2; + for( int element = 0; element < elementsPerRow; element++ ) { + if( col + element >= 0 && + col + element < size ) + { + matrix.setElement( row, col + element, element + 1 ); + elements++; + } + } + } + return elements; +} + +#ifdef HAVE_CUDA +template< typename Matrix > +__global__ void setCudaTestMatrixKernel( Matrix* matrix, + const int elementsPerRow, + const int gridIdx ) +{ + const int rowIdx = ( gridIdx * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x; + if( rowIdx >= matrix->getRows() ) + return; + int col = rowIdx - elementsPerRow / 2; + for( int element = 0; element < elementsPerRow; element++ ) { + if( col + element >= 0 && + col + element < matrix->getColumns() ) + matrix->setElementFast( rowIdx, col + element, element + 1 ); + } +} +#endif + +template< typename Matrix > +void setCudaTestMatrix( Matrix& matrix, + const int elementsPerRow ) +{ +#ifdef HAVE_CUDA + typedef typename Matrix::IndexType IndexType; + typedef typename Matrix::RealType RealType; + Pointers::DevicePointer< Matrix > kernel_matrix( matrix ); + dim3 cudaBlockSize( 256 ), cudaGridSize( Devices::Cuda::getMaxGridSize() ); + const IndexType cudaBlocks = roundUpDivision( matrix.getRows(), cudaBlockSize.x ); + const IndexType cudaGrids = roundUpDivision( cudaBlocks, Devices::Cuda::getMaxGridSize() ); + for( IndexType gridIdx = 0; gridIdx < cudaGrids; gridIdx++ ) { + if( gridIdx == cudaGrids - 1 ) + cudaGridSize.x = cudaBlocks % Devices::Cuda::getMaxGridSize(); + setCudaTestMatrixKernel< Matrix > + <<< cudaGridSize, cudaBlockSize >>> + ( &kernel_matrix.template modifyData< Devices::Cuda >(), elementsPerRow, gridIdx ); + TNL_CHECK_CUDA_DEVICE; + } +#endif +} + + +// TODO: rename as benchmark_SpMV_synthetic and move to spmv-synthetic.h +template< typename Real, + template< typename, typename, typename > class Matrix, + template< typename, typename, typename > class Vector = Containers::Vector > +bool +benchmarkSpMV( Benchmark & benchmark, + const int & loops, + const int & size, + const int elementsPerRow = 5 ) +{ + typedef Matrix< Real, Devices::Host, int > HostMatrix; + typedef Matrix< Real, Devices::Cuda, int > DeviceMatrix; + typedef Containers::Vector< Real, Devices::Host, int > HostVector; + typedef Containers::Vector< Real, Devices::Cuda, int > CudaVector; + + HostMatrix hostMatrix; + DeviceMatrix deviceMatrix; + Containers::Vector< int, Devices::Host, int > hostRowLengths; + Containers::Vector< int, Devices::Cuda, int > deviceRowLengths; + HostVector hostVector, hostVector2; + CudaVector deviceVector, deviceVector2; + + // create benchmark group + Containers::List< String > parsedType; + parseObjectType( HostMatrix::getType(), parsedType ); + benchmark.createHorizontalGroup( parsedType[ 0 ], 2 ); + + hostRowLengths.setSize( size ); + hostMatrix.setDimensions( size, size ); + hostVector.setSize( size ); + hostVector2.setSize( size ); +#ifdef HAVE_CUDA + deviceRowLengths.setSize( size ); + deviceMatrix.setDimensions( size, size ); + deviceVector.setSize( size ); + deviceVector2.setSize( size ); +#endif + + hostRowLengths.setValue( elementsPerRow ); +#ifdef HAVE_CUDA + deviceRowLengths.setValue( elementsPerRow ); +#endif + + hostMatrix.setCompressedRowLengths( hostRowLengths ); +#ifdef HAVE_CUDA + deviceMatrix.setCompressedRowLengths( deviceRowLengths ); +#endif + + const int elements = setHostTestMatrix< HostMatrix >( hostMatrix, elementsPerRow ); + setCudaTestMatrix< DeviceMatrix >( deviceMatrix, elementsPerRow ); + const double datasetSize = ( double ) loops * elements * ( 2 * sizeof( Real ) + sizeof( int ) ) / oneGB; + + // reset function + auto reset = [&]() { + hostVector.setValue( 1.0 ); + hostVector2.setValue( 0.0 ); +#ifdef HAVE_CUDA + deviceVector.setValue( 1.0 ); + deviceVector2.setValue( 0.0 ); +#endif + }; + + // compute functions + auto spmvHost = [&]() { + hostMatrix.vectorProduct( hostVector, hostVector2 ); + }; + auto spmvCuda = [&]() { + deviceMatrix.vectorProduct( deviceVector, deviceVector2 ); + }; + + benchmark.setOperation( datasetSize ); + benchmark.time( reset, "CPU", spmvHost ); +#ifdef HAVE_CUDA + benchmark.time( reset, "GPU", spmvCuda ); +#endif + + return true; +} + +template< typename Real = double, + typename Index = int > +bool +benchmarkSpmvSynthetic( Benchmark & benchmark, + const int & loops, + const int & size, + const int & elementsPerRow ) +{ + bool result = true; + // TODO: benchmark all formats from tnl-benchmark-spmv (different parameters of the base formats) + result |= benchmarkSpMV< Real, Matrices::CSR >( benchmark, loops, size, elementsPerRow ); + result |= benchmarkSpMV< Real, Matrices::Ellpack >( benchmark, loops, size, elementsPerRow ); + result |= benchmarkSpMV< Real, SlicedEllpack >( benchmark, loops, size, elementsPerRow ); + result |= benchmarkSpMV< Real, Matrices::ChunkedEllpack >( benchmark, loops, size, elementsPerRow ); + return result; +} + +} // namespace Benchmarks +} // namespace TNL diff --git a/tests/benchmarks/tnl-benchmark-blas.cpp b/src/Benchmarks/BLAS/tnl-benchmark-blas.cpp similarity index 100% rename from tests/benchmarks/tnl-benchmark-blas.cpp rename to src/Benchmarks/BLAS/tnl-benchmark-blas.cpp diff --git a/tests/benchmarks/tnl-benchmark-blas.cu b/src/Benchmarks/BLAS/tnl-benchmark-blas.cu similarity index 100% rename from tests/benchmarks/tnl-benchmark-blas.cu rename to src/Benchmarks/BLAS/tnl-benchmark-blas.cu diff --git a/src/Benchmarks/BLAS/tnl-benchmark-blas.h b/src/Benchmarks/BLAS/tnl-benchmark-blas.h new file mode 100644 index 0000000000000000000000000000000000000000..73ea0b375a23440105ea6dfbae599c63fb108adb --- /dev/null +++ b/src/Benchmarks/BLAS/tnl-benchmark-blas.h @@ -0,0 +1,192 @@ +/*************************************************************************** + tnl-benchmark-blas.h - description + ------------------- + begin : Jan 27, 2010 + copyright : (C) 2010 by Tomas Oberhuber et al. + email : tomas.oberhuber@fjfi.cvut.cz + ***************************************************************************/ + +/* See Copyright Notice in tnl/Copyright */ + +// Implemented by: Jakub Klinkovsky + +#pragma once + +#include <TNL/Devices/Host.h> +#include <TNL/Devices/CudaDeviceInfo.h> +#include <TNL/Devices/SystemInfo.h> +#include <TNL/Config/ConfigDescription.h> +#include <TNL/Config/ParameterContainer.h> + +#include "array-operations.h" +#include "vector-operations.h" +#include "spmv.h" + +using namespace TNL; +using namespace TNL::Benchmarks; + + +// TODO: should benchmarks check the result of the computation? + + +template< typename Real > +void +runBlasBenchmarks( Benchmark & benchmark, + Benchmark::MetadataMap metadata, + const std::size_t & minSize, + const std::size_t & maxSize, + const double & sizeStepFactor, + const unsigned & loops, + const unsigned & elementsPerRow ) +{ + const String precision = getType< Real >(); + metadata["precision"] = precision; + + // Array operations + benchmark.newBenchmark( String("Array operations (") + precision + ")", + metadata ); + for( std::size_t size = minSize; size <= maxSize; size *= 2 ) { + benchmark.setMetadataColumns( Benchmark::MetadataColumns({ + {"size", size}, + } )); + benchmarkArrayOperations< Real >( benchmark, loops, size ); + } + + // Vector operations + benchmark.newBenchmark( String("Vector operations (") + precision + ")", + metadata ); + for( std::size_t size = minSize; size <= maxSize; size *= sizeStepFactor ) { + benchmark.setMetadataColumns( Benchmark::MetadataColumns({ + {"size", size}, + } )); + benchmarkVectorOperations< Real >( benchmark, loops, size ); + } + + // Sparse matrix-vector multiplication + benchmark.newBenchmark( String("Sparse matrix-vector multiplication (") + precision + ")", + metadata ); + for( std::size_t size = minSize; size <= maxSize; size *= 2 ) { + benchmark.setMetadataColumns( Benchmark::MetadataColumns({ + {"rows", size}, + {"columns", size}, + {"elements per row", elementsPerRow}, + } )); + benchmarkSpmvSynthetic< Real >( benchmark, loops, size, elementsPerRow ); + } +} + +void +setupConfig( Config::ConfigDescription & config ) +{ + config.addDelimiter( "Benchmark settings:" ); + config.addEntry< String >( "log-file", "Log file name.", "tnl-benchmark-blas.log"); + config.addEntry< String >( "output-mode", "Mode for opening the log file.", "overwrite" ); + config.addEntryEnum( "append" ); + config.addEntryEnum( "overwrite" ); + config.addEntry< String >( "precision", "Precision of the arithmetics.", "double" ); + config.addEntryEnum( "float" ); + config.addEntryEnum( "double" ); + config.addEntryEnum( "all" ); + config.addEntry< int >( "min-size", "Minimum size of arrays/vectors used in the benchmark.", 100000 ); + config.addEntry< int >( "max-size", "Minimum size of arrays/vectors used in the benchmark.", 10000000 ); + config.addEntry< int >( "size-step-factor", "Factor determining the size of arrays/vectors used in the benchmark. First size is min-size and each following size is stepFactor*previousSize, up to max-size.", 2 ); + config.addEntry< int >( "loops", "Number of iterations for every computation.", 10 ); + config.addEntry< int >( "elements-per-row", "Number of elements per row of the sparse matrix used in the matrix-vector multiplication benchmark.", 5 ); + config.addEntry< int >( "verbose", "Verbose mode.", 1 ); + + config.addDelimiter( "Device settings:" ); + Devices::Host::configSetup( config ); + Devices::Cuda::configSetup( config ); +} + +int +main( int argc, char* argv[] ) +{ + Config::ParameterContainer parameters; + Config::ConfigDescription conf_desc; + + setupConfig( conf_desc ); + + if( ! parseCommandLine( argc, argv, conf_desc, parameters ) ) { + conf_desc.printUsage( argv[ 0 ] ); + return 1; + } + + Devices::Host::setup( parameters ); + Devices::Cuda::setup( parameters ); + + const String & logFileName = parameters.getParameter< String >( "log-file" ); + const String & outputMode = parameters.getParameter< String >( "output-mode" ); + const String & precision = parameters.getParameter< String >( "precision" ); + // FIXME: getParameter< std::size_t >() does not work with parameters added with addEntry< int >(), + // which have a default value. The workaround below works for int values, but it is not possible + // to pass 64-bit integer values +// const std::size_t minSize = parameters.getParameter< std::size_t >( "min-size" ); +// const std::size_t maxSize = parameters.getParameter< std::size_t >( "max-size" ); + const std::size_t minSize = parameters.getParameter< int >( "min-size" ); + const std::size_t maxSize = parameters.getParameter< int >( "max-size" ); + const unsigned sizeStepFactor = parameters.getParameter< unsigned >( "size-step-factor" ); + const unsigned loops = parameters.getParameter< unsigned >( "loops" ); + const unsigned elementsPerRow = parameters.getParameter< unsigned >( "elements-per-row" ); + const unsigned verbose = parameters.getParameter< unsigned >( "verbose" ); + + if( sizeStepFactor <= 1 ) { + std::cerr << "The value of --size-step-factor must be greater than 1." << std::endl; + return EXIT_FAILURE; + } + + // open log file + auto mode = std::ios::out; + if( outputMode == "append" ) + mode |= std::ios::app; + std::ofstream logFile( logFileName.getString(), mode ); + + // init benchmark and common metadata + Benchmark benchmark( loops, verbose ); + + // prepare global metadata + const int cpu_id = 0; + Devices::CacheSizes cacheSizes = Devices::SystemInfo::getCPUCacheSizes( cpu_id ); + String cacheInfo = String( cacheSizes.L1data ) + ", " + + String( cacheSizes.L1instruction ) + ", " + + String( cacheSizes.L2 ) + ", " + + String( cacheSizes.L3 ); +#ifdef HAVE_CUDA + const int activeGPU = Devices::CudaDeviceInfo::getActiveDevice(); + const String deviceArch = String( Devices::CudaDeviceInfo::getArchitectureMajor( activeGPU ) ) + "." + + String( Devices::CudaDeviceInfo::getArchitectureMinor( activeGPU ) ); +#endif + Benchmark::MetadataMap metadata { + { "host name", Devices::SystemInfo::getHostname() }, + { "architecture", Devices::SystemInfo::getArchitecture() }, + { "system", Devices::SystemInfo::getSystemName() }, + { "system release", Devices::SystemInfo::getSystemRelease() }, + { "start time", Devices::SystemInfo::getCurrentTime() }, + { "CPU model name", Devices::SystemInfo::getCPUModelName( cpu_id ) }, + { "CPU cores", Devices::SystemInfo::getNumberOfCores( cpu_id ) }, + { "CPU threads per core", Devices::SystemInfo::getNumberOfThreads( cpu_id ) / Devices::SystemInfo::getNumberOfCores( cpu_id ) }, + { "CPU max frequency (MHz)", Devices::SystemInfo::getCPUMaxFrequency( cpu_id ) / 1e3 }, + { "CPU cache sizes (L1d, L1i, L2, L3) (kiB)", cacheInfo }, +#ifdef HAVE_CUDA + { "GPU name", Devices::CudaDeviceInfo::getDeviceName( activeGPU ) }, + { "GPU architecture", deviceArch }, + { "GPU CUDA cores", Devices::CudaDeviceInfo::getCudaCores( activeGPU ) }, + { "GPU clock rate (MHz)", (double) Devices::CudaDeviceInfo::getClockRate( activeGPU ) / 1e3 }, + { "GPU global memory (GB)", (double) Devices::CudaDeviceInfo::getGlobalMemory( activeGPU ) / 1e9 }, + { "GPU memory clock rate (MHz)", (double) Devices::CudaDeviceInfo::getMemoryClockRate( activeGPU ) / 1e3 }, + { "GPU memory ECC enabled", Devices::CudaDeviceInfo::getECCEnabled( activeGPU ) }, +#endif + }; + + if( precision == "all" || precision == "float" ) + runBlasBenchmarks< float >( benchmark, metadata, minSize, maxSize, sizeStepFactor, loops, elementsPerRow ); + if( precision == "all" || precision == "double" ) + runBlasBenchmarks< double >( benchmark, metadata, minSize, maxSize, sizeStepFactor, loops, elementsPerRow ); + + if( ! benchmark.save( logFile ) ) { + std::cerr << "Failed to write the benchmark results to file '" << parameters.getParameter< String >( "log-file" ) << "'." << std::endl; + return EXIT_FAILURE; + } + + return EXIT_SUCCESS; +} diff --git a/src/Benchmarks/BLAS/vector-operations.h b/src/Benchmarks/BLAS/vector-operations.h new file mode 100644 index 0000000000000000000000000000000000000000..e65f8980b1066e042206e328d15b50e32c81432f --- /dev/null +++ b/src/Benchmarks/BLAS/vector-operations.h @@ -0,0 +1,440 @@ +/*************************************************************************** + vector-operations.h - description + ------------------- + begin : Dec 30, 2015 + copyright : (C) 2015 by Tomas Oberhuber et al. + email : tomas.oberhuber@fjfi.cvut.cz + ***************************************************************************/ + +/* See Copyright Notice in tnl/Copyright */ + +// Implemented by: Jakub Klinkovsky + +#pragma once + +#include <stdlib.h> // srand48 + +#include "../Benchmarks.h" + +#include <TNL/Containers/Vector.h> + +#ifdef HAVE_CUDA +#include "cublasWrappers.h" +#endif + +namespace TNL { +namespace Benchmarks { + +template< typename Real = double, + typename Index = int > +bool +benchmarkVectorOperations( Benchmark & benchmark, + const int & loops, + const long & size ) +{ + typedef Containers::Vector< Real, Devices::Host, Index > HostVector; + typedef Containers::Vector< Real, Devices::Cuda, Index > CudaVector; + using namespace std; + + double datasetSize = ( double ) ( loops * size ) * sizeof( Real ) / oneGB; + + HostVector hostVector, hostVector2; + CudaVector deviceVector, deviceVector2; + hostVector.setSize( size ); + hostVector2.setSize( size ); +#ifdef HAVE_CUDA + deviceVector.setSize( size ); + deviceVector2.setSize( size ); +#endif + + Real resultHost, resultDevice; + +#ifdef HAVE_CUDA + cublasHandle_t cublasHandle; + cublasCreate( &cublasHandle ); +#endif + + + // reset functions + // (Make sure to always use some in benchmarks, even if it's not necessary + // to assure correct result - it helps to clear cache and avoid optimizations + // of the benchmark loop.) + auto reset1 = [&]() { + hostVector.setValue( 1.0 ); +#ifdef HAVE_CUDA + deviceVector.setValue( 1.0 ); +#endif + // A relatively harmless call to keep the compiler from realizing we + // don't actually do any useful work with the result of the reduciton. + srand48(resultHost); + resultHost = resultDevice = 0.0; + }; + auto reset2 = [&]() { + hostVector2.setValue( 1.0 ); +#ifdef HAVE_CUDA + deviceVector2.setValue( 1.0 ); +#endif + }; + auto reset12 = [&]() { + reset1(); + reset2(); + }; + + + reset12(); + + + auto maxHost = [&]() { + resultHost = hostVector.max(); + }; + auto maxHostGeneral = [&]() { + Real result( 0 ); + Containers::Algorithms::ParallelReductionMax< Real > operation; + Containers::Algorithms::Reduction< Devices::Host >::reduce( + operation, + hostVector.getSize(), + hostVector.getData(), + ( Real* ) 0, + result ); + return result; + }; + auto maxCuda = [&]() { + resultDevice = deviceVector.max(); + }; + benchmark.setOperation( "max", datasetSize ); + benchmark.time( reset1, "CPU", maxHost ); + benchmark.time( reset1, "CPU (general)", maxHostGeneral ); +#ifdef HAVE_CUDA + benchmark.time( reset1, "GPU", maxCuda ); +#endif + + + auto minHost = [&]() { + resultHost = hostVector.min(); + }; + auto minHostGeneral = [&]() { + Real result( 0 ); + Containers::Algorithms::ParallelReductionMin< Real > operation; + Containers::Algorithms::Reduction< Devices::Host >::reduce( + operation, + hostVector.getSize(), + hostVector.getData(), + ( Real* ) 0, + result ); + return result; + }; + auto minCuda = [&]() { + resultDevice = deviceVector.min(); + }; + benchmark.setOperation( "min", datasetSize ); + benchmark.time( reset1, "CPU", minHost ); + benchmark.time( reset1, "CPU (general)", minHostGeneral ); +#ifdef HAVE_CUDA + benchmark.time( reset1, "GPU", minCuda ); +#endif + + + auto absMaxHost = [&]() { + resultHost = hostVector.absMax(); + }; + auto absMaxHostGeneral = [&]() { + Real result( 0 ); + Containers::Algorithms::ParallelReductionAbsMax< Real > operation; + Containers::Algorithms::Reduction< Devices::Host >::reduce( + operation, + hostVector.getSize(), + hostVector.getData(), + ( Real* ) 0, + result ); + return result; + }; + auto absMaxCuda = [&]() { + resultDevice = deviceVector.absMax(); + }; +#ifdef HAVE_CUDA + auto absMaxCublas = [&]() { + int index = 0; + cublasIgamax( cublasHandle, size, + deviceVector.getData(), 1, + &index ); + resultDevice = deviceVector.getElement( index ); + }; +#endif + benchmark.setOperation( "absMax", datasetSize ); + benchmark.time( reset1, "CPU", absMaxHost ); + benchmark.time( reset1, "CPU (general)", absMaxHostGeneral ); +#ifdef HAVE_CUDA + benchmark.time( reset1, "GPU", absMaxCuda ); + benchmark.time( reset1, "cuBLAS", absMaxCublas ); +#endif + + + auto absMinHost = [&]() { + resultHost = hostVector.absMin(); + }; + auto absMinHostGeneral = [&]() { + Real result( 0 ); + Containers::Algorithms::ParallelReductionAbsMin< Real > operation; + Containers::Algorithms::Reduction< Devices::Host >::reduce( + operation, + hostVector.getSize(), + hostVector.getData(), + ( Real* ) 0, + result ); + return result; + }; + auto absMinCuda = [&]() { + resultDevice = deviceVector.absMin(); + }; +#ifdef HAVE_CUDA + auto absMinCublas = [&]() { + int index = 0; + cublasIgamin( cublasHandle, size, + deviceVector.getData(), 1, + &index ); + resultDevice = deviceVector.getElement( index ); + }; +#endif + benchmark.setOperation( "absMin", datasetSize ); + benchmark.time( reset1, "CPU", absMinHost ); + benchmark.time( reset1, "CPU (general)", absMinHostGeneral ); +#ifdef HAVE_CUDA + benchmark.time( reset1, "GPU", absMinCuda ); + benchmark.time( reset1, "cuBLAS", absMinCublas ); +#endif + + + auto sumHost = [&]() { + resultHost = hostVector.sum(); + }; + auto sumHostGeneral = [&]() { + Real result( 0 ); + Containers::Algorithms::ParallelReductionSum< Real > operation; + Containers::Algorithms::Reduction< Devices::Host >::reduce( + operation, + hostVector.getSize(), + hostVector.getData(), + ( Real* ) 0, + result ); + return result; + }; + auto sumCuda = [&]() { + resultDevice = deviceVector.sum(); + }; + benchmark.setOperation( "sum", datasetSize ); + benchmark.time( reset1, "CPU", sumHost ); + benchmark.time( reset1, "CPU (general)", sumHostGeneral ); +#ifdef HAVE_CUDA + benchmark.time( reset1, "GPU", sumCuda ); +#endif + + + auto l1normHost = [&]() { + resultHost = hostVector.lpNorm( 1.0 ); + }; + auto l1normHostGeneral = [&]() { + Real result( 0 ); + Containers::Algorithms::ParallelReductionAbsSum< Real > operation; + Containers::Algorithms::Reduction< Devices::Host >::reduce( + operation, + hostVector.getSize(), + hostVector.getData(), + ( Real* ) 0, + result ); + return result; + }; + auto l1normCuda = [&]() { + resultDevice = deviceVector.lpNorm( 1.0 ); + }; +#ifdef HAVE_CUDA + auto l1normCublas = [&]() { + cublasGasum( cublasHandle, size, + deviceVector.getData(), 1, + &resultDevice ); + }; +#endif + benchmark.setOperation( "l1 norm", datasetSize ); + benchmark.time( reset1, "CPU", l1normHost ); + benchmark.time( reset1, "CPU (general)", l1normHostGeneral ); +#ifdef HAVE_CUDA + benchmark.time( reset1, "GPU", l1normCuda ); + benchmark.time( reset1, "cuBLAS", l1normCublas ); +#endif + + + auto l2normHost = [&]() { + resultHost = hostVector.lpNorm( 2.0 ); + }; + auto l2normHostGeneral = [&]() { + Real result( 0 ); + Containers::Algorithms::ParallelReductionL2Norm< Real > operation; + Containers::Algorithms::Reduction< Devices::Host >::reduce( + operation, + hostVector.getSize(), + hostVector.getData(), + ( Real* ) 0, + result ); + return result; + }; + auto l2normCuda = [&]() { + resultDevice = deviceVector.lpNorm( 2.0 ); + }; +#ifdef HAVE_CUDA + auto l2normCublas = [&]() { + cublasGnrm2( cublasHandle, size, + deviceVector.getData(), 1, + &resultDevice ); + }; +#endif + benchmark.setOperation( "l2 norm", datasetSize ); + benchmark.time( reset1, "CPU", l2normHost ); + benchmark.time( reset1, "CPU (general)", l2normHostGeneral ); +#ifdef HAVE_CUDA + benchmark.time( reset1, "GPU", l2normCuda ); + benchmark.time( reset1, "cuBLAS", l2normCublas ); +#endif + + + auto l3normHost = [&]() { + resultHost = hostVector.lpNorm( 3.0 ); + }; + auto l3normHostGeneral = [&]() { + Real result( 0 ); + Containers::Algorithms::ParallelReductionLpNorm< Real > operation; + operation.setPower( 3.0 ); + Containers::Algorithms::Reduction< Devices::Host >::reduce( + operation, + hostVector.getSize(), + hostVector.getData(), + ( Real* ) 0, + result ); + return result; + }; + auto l3normCuda = [&]() { + resultDevice = deviceVector.lpNorm( 3.0 ); + }; + benchmark.setOperation( "l3 norm", datasetSize ); + benchmark.time( reset1, "CPU", l3normHost ); + benchmark.time( reset1, "CPU (general)", l3normHostGeneral ); +#ifdef HAVE_CUDA + benchmark.time( reset1, "GPU", l3normCuda ); +#endif + + + auto scalarProductHost = [&]() { + resultHost = hostVector.scalarProduct( hostVector2 ); + }; + auto scalarProductHostGeneral = [&]() { + Real result( 0 ); + Containers::Algorithms::ParallelReductionScalarProduct< Real, Real > operation; + Containers::Algorithms::Reduction< Devices::Host >::reduce( + operation, + hostVector.getSize(), + hostVector.getData(), + hostVector2.getData(), + result ); + return result; + }; + auto scalarProductCuda = [&]() { + resultDevice = deviceVector.scalarProduct( deviceVector2 ); + }; +#ifdef HAVE_CUDA + auto scalarProductCublas = [&]() { + cublasGdot( cublasHandle, size, + deviceVector.getData(), 1, + deviceVector2.getData(), 1, + &resultDevice ); + }; +#endif + benchmark.setOperation( "scalar product", 2 * datasetSize ); + benchmark.time( reset1, "CPU", scalarProductHost ); + benchmark.time( reset1, "CPU (general)", scalarProductHostGeneral ); +#ifdef HAVE_CUDA + benchmark.time( reset1, "GPU", scalarProductCuda ); + benchmark.time( reset1, "cuBLAS", scalarProductCublas ); +#endif + + /* + std::cout << "Benchmarking prefix-sum:" << std::endl; + timer.reset(); + timer.start(); + hostVector.computePrefixSum(); + timer.stop(); + timeHost = timer.getTime(); + bandwidth = 2 * datasetSize / loops / timer.getTime(); + std::cout << " CPU: bandwidth: " << bandwidth << " GB/sec, time: " << timer.getTime() << " sec." << std::endl; + + timer.reset(); + timer.start(); + deviceVector.computePrefixSum(); + timer.stop(); + timeDevice = timer.getTime(); + bandwidth = 2 * datasetSize / loops / timer.getTime(); + std::cout << " GPU: bandwidth: " << bandwidth << " GB/sec, time: " << timer.getTime() << " sec." << std::endl; + std::cout << " CPU/GPU speedup: " << timeHost / timeDevice << std::endl; + + HostVector auxHostVector; + auxHostVector.setLike( deviceVector ); + auxHostVector = deviceVector; + for( int i = 0; i < size; i++ ) + if( hostVector.getElement( i ) != auxHostVector.getElement( i ) ) + { + std::cerr << "Error in prefix sum at position " << i << ": " << hostVector.getElement( i ) << " != " << auxHostVector.getElement( i ) << std::endl; + } + */ + + + auto multiplyHost = [&]() { + hostVector *= 0.5; + }; + auto multiplyCuda = [&]() { + deviceVector *= 0.5; + }; +#ifdef HAVE_CUDA + auto multiplyCublas = [&]() { + const Real alpha = 0.5; + cublasGscal( cublasHandle, size, + &alpha, + deviceVector.getData(), 1 ); + }; +#endif + benchmark.setOperation( "scalar multiplication", 2 * datasetSize ); + benchmark.time( reset1, "CPU", multiplyHost ); +#ifdef HAVE_CUDA + benchmark.time( reset1, "GPU", multiplyCuda ); + benchmark.time( reset1, "cuBLAS", multiplyCublas ); +#endif + + + auto addVectorHost = [&]() { + hostVector.addVector( hostVector2 ); + }; + auto addVectorCuda = [&]() { + deviceVector.addVector( deviceVector2 ); + }; +#ifdef HAVE_CUDA + auto addVectorCublas = [&]() { + const Real alpha = 1.0; + cublasGaxpy( cublasHandle, size, + &alpha, + deviceVector2.getData(), 1, + deviceVector.getData(), 1 ); + }; +#endif + benchmark.setOperation( "vector addition", 3 * datasetSize ); + benchmark.time( reset1, "CPU", addVectorHost ); +#ifdef HAVE_CUDA + benchmark.time( reset1, "GPU", addVectorCuda ); + benchmark.time( reset1, "cuBLAS", addVectorCublas ); +#endif + + +#ifdef HAVE_CUDA + cublasDestroy( cublasHandle ); +#endif + + return true; +} + +} // namespace Benchmarks +} // namespace TNL diff --git a/src/Benchmarks/Benchmarks.h b/src/Benchmarks/Benchmarks.h new file mode 100644 index 0000000000000000000000000000000000000000..60decddc8c04c9422451b1581f8db044655a93ba --- /dev/null +++ b/src/Benchmarks/Benchmarks.h @@ -0,0 +1,456 @@ +/*************************************************************************** + benchmarks.h - description + ------------------- + begin : Dec 30, 2015 + copyright : (C) 2015 by Tomas Oberhuber et al. + email : tomas.oberhuber@fjfi.cvut.cz + ***************************************************************************/ + +/* See Copyright Notice in tnl/Copyright */ + +// Implemented by: Jakub Klinkovsky + +#pragma once + +#include <iostream> +#include <iomanip> +#include <map> +#include <vector> + +#include <TNL/Timer.h> +#include <TNL/String.h> +#include <TNL/Solvers/IterativeSolverMonitor.h> + +namespace TNL { +namespace Benchmarks { + +const double oneGB = 1024.0 * 1024.0 * 1024.0; + +template< typename ComputeFunction, + typename ResetFunction, + typename Monitor = TNL::Solvers::IterativeSolverMonitor< double, int > > +double +timeFunction( ComputeFunction compute, + ResetFunction reset, + int loops, + Monitor && monitor = Monitor() ) +{ + // the timer is constructed zero-initialized and stopped + Timer timer; + + // set timer to the monitor + monitor.setTimer( timer ); + + // warm up + reset(); + compute(); + + for(int i = 0; i < loops; ++i) { + // abuse the monitor's "time" for loops + monitor.setTime( i + 1 ); + + reset(); + + // Explicit synchronization of the CUDA device + // TODO: not necessary for host computations +#ifdef HAVE_CUDA + cudaDeviceSynchronize(); +#endif + timer.start(); + compute(); +#ifdef HAVE_CUDA + cudaDeviceSynchronize(); +#endif + timer.stop(); + } + + return timer.getRealTime(); +} + + +class Logging +{ +public: + using MetadataElement = std::pair< const char*, String >; + using MetadataMap = std::map< const char*, String >; + using MetadataColumns = std::vector<MetadataElement>; + + using HeaderElements = std::initializer_list< String >; + using RowElements = std::initializer_list< double >; + + Logging( bool verbose = true ) + : verbose(verbose) + {} + + void + writeTitle( const String & title ) + { + if( verbose ) + std::cout << std::endl << "== " << title << " ==" << std::endl << std::endl; + log << ": title = " << title << std::endl; + } + + void + writeMetadata( const MetadataMap & metadata ) + { + if( verbose ) + std::cout << "properties:" << std::endl; + + for( auto & it : metadata ) { + if( verbose ) + std::cout << " " << it.first << " = " << it.second << std::endl; + log << ": " << it.first << " = " << it.second << std::endl; + } + if( verbose ) + std::cout << std::endl; + } + + void + writeTableHeader( const String & spanningElement, + const HeaderElements & subElements ) + { + using namespace std; + + if( verbose && header_changed ) { + for( auto & it : metadataColumns ) { + std::cout << std::setw( 20 ) << it.first; + } + + // spanning element is printed as usual column to stdout, + // but is excluded from header + std::cout << std::setw( 15 ) << ""; + + for( auto & it : subElements ) { + std::cout << std::setw( 15 ) << it; + } + std::cout << std::endl; + + header_changed = false; + } + + // initial indent string + header_indent = "!"; + log << std::endl; + for( auto & it : metadataColumns ) { + log << header_indent << " " << it.first << std::endl; + } + + // dump stacked spanning columns + if( horizontalGroups.size() > 0 ) + while( horizontalGroups.back().second <= 0 ) { + horizontalGroups.pop_back(); + header_indent.pop_back(); + } + for( size_t i = 0; i < horizontalGroups.size(); i++ ) { + if( horizontalGroups[ i ].second > 0 ) { + log << header_indent << " " << horizontalGroups[ i ].first << std::endl; + header_indent += "!"; + } + } + + log << header_indent << " " << spanningElement << std::endl; + for( auto & it : subElements ) { + log << header_indent << "! " << it << std::endl; + } + + if( horizontalGroups.size() > 0 ) { + horizontalGroups.back().second--; + header_indent.pop_back(); + } + } + + void + writeTableRow( const String & spanningElement, + const RowElements & subElements ) + { + using namespace std; + + if( verbose ) { + for( auto & it : metadataColumns ) { + std::cout << std::setw( 20 ) << it.second; + } + // spanning element is printed as usual column to stdout + std::cout << std::setw( 15 ) << spanningElement; + for( auto & it : subElements ) { + std::cout << std::setw( 15 ); + if( it != 0.0 )std::cout << it; + else std::cout << "N/A"; + } + std::cout << std::endl; + } + + // only when changed (the header has been already adjusted) + // print each element on separate line + for( auto & it : metadataColumns ) { + log << it.second << std::endl; + } + + // benchmark data are indented + const String indent = " "; + for( auto & it : subElements ) { + if( it != 0.0 ) log << indent << it << std::endl; + else log << indent << "N/A" << std::endl; + } + } + + void + writeErrorMessage( const char* msg, + int colspan = 1 ) + { + // initial indent string + header_indent = "!"; + log << std::endl; + for( auto & it : metadataColumns ) { + log << header_indent << " " << it.first << std::endl; + } + + // make sure there is a header column for the message + if( horizontalGroups.size() == 0 ) + horizontalGroups.push_back( {"", 1} ); + + // dump stacked spanning columns + while( horizontalGroups.back().second <= 0 ) { + horizontalGroups.pop_back(); + header_indent.pop_back(); + } + for( size_t i = 0; i < horizontalGroups.size(); i++ ) { + if( horizontalGroups[ i ].second > 0 ) { + log << header_indent << " " << horizontalGroups[ i ].first << std::endl; + header_indent += "!"; + } + } + if( horizontalGroups.size() > 0 ) { + horizontalGroups.back().second -= colspan; + header_indent.pop_back(); + } + + // only when changed (the header has been already adjusted) + // print each element on separate line + for( auto & it : metadataColumns ) { + log << it.second << std::endl; + } + log << msg << std::endl; + } + + void + closeTable() + { + log << std::endl; + header_indent = body_indent = ""; + header_changed = true; + horizontalGroups.clear(); + } + + bool save( std::ostream & logFile ) + { + closeTable(); + logFile << log.str(); + if( logFile.good() ) { + log.str() = ""; + return true; + } + return false; + } + +protected: + + // manual double -> String conversion with fixed precision + static String + _to_string( double num, int precision = 0, bool fixed = false ) + { + std::stringstream str; + if( fixed ) + str << std::fixed; + if( precision ) + str << std::setprecision( precision ); + str << num; + return String( str.str().data() ); + } + + std::stringstream log; + std::string header_indent; + std::string body_indent; + + bool verbose; + MetadataColumns metadataColumns; + bool header_changed = true; + std::vector< std::pair< String, int > > horizontalGroups; +}; + + +class Benchmark +: protected Logging +{ +public: + using Logging::MetadataElement; + using Logging::MetadataMap; + using Logging::MetadataColumns; + + Benchmark( int loops = 10, + bool verbose = true ) + : Logging(verbose), loops(loops) + {} + + // TODO: ensure that this is not called in the middle of the benchmark + // (or just remove it completely?) + void + setLoops( int loops ) + { + this->loops = loops; + } + + // Marks the start of a new benchmark + void + newBenchmark( const String & title ) + { + closeTable(); + writeTitle( title ); + monitor.setStage( title.getString() ); + } + + // Marks the start of a new benchmark (with custom metadata) + void + newBenchmark( const String & title, + MetadataMap metadata ) + { + closeTable(); + writeTitle( title ); + monitor.setStage( title.getString() ); + // add loops to metadata + metadata["loops"] = String(loops); + writeMetadata( metadata ); + } + + // Sets metadata columns -- values used for all subsequent rows until + // the next call to this function. + void + setMetadataColumns( const MetadataColumns & metadata ) + { + if( metadataColumns != metadata ) + header_changed = true; + metadataColumns = metadata; + } + + // TODO: maybe should be renamed to createVerticalGroup and ensured that vertical and horizontal groups are not used within the same "Benchmark" + // Sets current operation -- operations expand the table vertically + // - baseTime should be reset to 0.0 for most operations, but sometimes + // it is useful to override it + // - Order of operations inside a "Benchmark" does not matter, rows can be + // easily sorted while converting to HTML.) + void + setOperation( const String & operation, + const double datasetSize = 0.0, // in GB + const double baseTime = 0.0 ) + { + if( metadataColumns.size() > 0 && String(metadataColumns[ 0 ].first) == "operation" ) { + metadataColumns[ 0 ].second = operation; + } + else { + metadataColumns.insert( metadataColumns.begin(), {"operation", operation} ); + } + setOperation( datasetSize, baseTime ); + header_changed = true; + } + + void + setOperation( const double datasetSize = 0.0, + const double baseTime = 0.0 ) + { + this->datasetSize = datasetSize; + this->baseTime = baseTime; + } + + // Creates new horizontal groups inside a benchmark -- increases the number + // of columns in the "Benchmark", implies column spanning. + // (Useful e.g. for SpMV formats, different configurations etc.) + void + createHorizontalGroup( const String & name, + int subcolumns ) + { + if( horizontalGroups.size() == 0 ) { + horizontalGroups.push_back( {name, subcolumns} ); + } + else { + auto & last = horizontalGroups.back(); + if( last.first != name && last.second > 0 ) { + horizontalGroups.push_back( {name, subcolumns} ); + } + else { + last.first = name; + last.second = subcolumns; + } + } + } + + // Times a single ComputeFunction. Subsequent calls implicitly split + // the current "horizontal group" into sub-columns identified by + // "performer", which are further split into "bandwidth", "time" and + // "speedup" columns. + // TODO: allow custom columns bound to lambda functions (e.g. for Gflops calculation) + // Also terminates the recursion of the following variadic template. + template< typename ResetFunction, + typename ComputeFunction > + double + time( ResetFunction reset, + const String & performer, + ComputeFunction & compute ) + { + double time; + if( verbose ) { + // run the monitor main loop + Solvers::SolverMonitorThread monitor_thread( monitor ); + time = timeFunction( compute, reset, loops, monitor ); + } + else { + time = timeFunction( compute, reset, loops, monitor ); + } + + const double bandwidth = datasetSize / time; + const double speedup = this->baseTime / time; + if( this->baseTime == 0.0 ) + this->baseTime = time; + + writeTableHeader( performer, HeaderElements({"bandwidth", "time", "speedup"}) ); + writeTableRow( performer, RowElements({ bandwidth, time, speedup }) ); + + return this->baseTime; + } + + // Recursive template function to deal with multiple computations with the + // same reset function. + template< typename ResetFunction, + typename ComputeFunction, + typename... NextComputations > + inline double + time( ResetFunction reset, + const String & performer, + ComputeFunction & compute, + NextComputations & ... nextComputations ) + { + time( reset, performer, compute ); + time( reset, nextComputations... ); + return this->baseTime; + } + + // Adds an error message to the log. Should be called in places where the + // "time" method could not be called (e.g. due to failed allocation). + void + addErrorMessage( const char* msg, + int numberOfComputations = 1 ) + { + // each computation has 3 subcolumns + const int colspan = 3 * numberOfComputations; + writeErrorMessage( msg, colspan ); + } + + using Logging::save; + +protected: + int loops; + double datasetSize = 0.0; + double baseTime = 0.0; + Solvers::IterativeSolverMonitor< double, int > monitor; +}; + +} // namespace Benchmarks +} // namespace TNL diff --git a/src/Benchmarks/CMakeLists.txt b/src/Benchmarks/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..e34ade5be305bdee495749a38db1b9af004a6a92 --- /dev/null +++ b/src/Benchmarks/CMakeLists.txt @@ -0,0 +1,10 @@ +add_subdirectory( HeatEquation ) +add_subdirectory( BLAS ) +add_subdirectory( SpMV ) +add_subdirectory( LinearSolvers ) + +set( headers + Benchmarks.h +) + +install( FILES ${headers} DESTINATION ${TNL_TARGET_INCLUDE_DIRECTORY}/Benchmarks ) diff --git a/tests/benchmarks/heat-equation-benchmark/BenchmarkLaplace.h b/src/Benchmarks/HeatEquation/BenchmarkLaplace.h similarity index 100% rename from tests/benchmarks/heat-equation-benchmark/BenchmarkLaplace.h rename to src/Benchmarks/HeatEquation/BenchmarkLaplace.h diff --git a/tests/benchmarks/heat-equation-benchmark/BenchmarkLaplace_impl.h b/src/Benchmarks/HeatEquation/BenchmarkLaplace_impl.h similarity index 100% rename from tests/benchmarks/heat-equation-benchmark/BenchmarkLaplace_impl.h rename to src/Benchmarks/HeatEquation/BenchmarkLaplace_impl.h diff --git a/tests/benchmarks/heat-equation-benchmark/CMakeLists.txt b/src/Benchmarks/HeatEquation/CMakeLists.txt similarity index 100% rename from tests/benchmarks/heat-equation-benchmark/CMakeLists.txt rename to src/Benchmarks/HeatEquation/CMakeLists.txt diff --git a/tests/benchmarks/heat-equation-benchmark/DirichletBoundaryConditions.h b/src/Benchmarks/HeatEquation/DirichletBoundaryConditions.h similarity index 100% rename from tests/benchmarks/heat-equation-benchmark/DirichletBoundaryConditions.h rename to src/Benchmarks/HeatEquation/DirichletBoundaryConditions.h diff --git a/tests/benchmarks/heat-equation-benchmark/HeatEquationBenchmarkBuildConfigTag.h b/src/Benchmarks/HeatEquation/HeatEquationBenchmarkBuildConfigTag.h similarity index 100% rename from tests/benchmarks/heat-equation-benchmark/HeatEquationBenchmarkBuildConfigTag.h rename to src/Benchmarks/HeatEquation/HeatEquationBenchmarkBuildConfigTag.h diff --git a/tests/benchmarks/heat-equation-benchmark/HeatEquationBenchmarkProblem.h b/src/Benchmarks/HeatEquation/HeatEquationBenchmarkProblem.h similarity index 100% rename from tests/benchmarks/heat-equation-benchmark/HeatEquationBenchmarkProblem.h rename to src/Benchmarks/HeatEquation/HeatEquationBenchmarkProblem.h diff --git a/tests/benchmarks/heat-equation-benchmark/HeatEquationBenchmarkProblem_impl.h b/src/Benchmarks/HeatEquation/HeatEquationBenchmarkProblem_impl.h similarity index 100% rename from tests/benchmarks/heat-equation-benchmark/HeatEquationBenchmarkProblem_impl.h rename to src/Benchmarks/HeatEquation/HeatEquationBenchmarkProblem_impl.h diff --git a/tests/benchmarks/heat-equation-benchmark/HeatEquationBenchmarkRhs.h b/src/Benchmarks/HeatEquation/HeatEquationBenchmarkRhs.h similarity index 100% rename from tests/benchmarks/heat-equation-benchmark/HeatEquationBenchmarkRhs.h rename to src/Benchmarks/HeatEquation/HeatEquationBenchmarkRhs.h diff --git a/tests/benchmarks/heat-equation-benchmark/TestGridEntity.h b/src/Benchmarks/HeatEquation/TestGridEntity.h similarity index 100% rename from tests/benchmarks/heat-equation-benchmark/TestGridEntity.h rename to src/Benchmarks/HeatEquation/TestGridEntity.h diff --git a/tests/benchmarks/heat-equation-benchmark/Tuning/ExplicitUpdater.h b/src/Benchmarks/HeatEquation/Tuning/ExplicitUpdater.h similarity index 100% rename from tests/benchmarks/heat-equation-benchmark/Tuning/ExplicitUpdater.h rename to src/Benchmarks/HeatEquation/Tuning/ExplicitUpdater.h diff --git a/tests/benchmarks/heat-equation-benchmark/Tuning/GridTraverser.h b/src/Benchmarks/HeatEquation/Tuning/GridTraverser.h similarity index 100% rename from tests/benchmarks/heat-equation-benchmark/Tuning/GridTraverser.h rename to src/Benchmarks/HeatEquation/Tuning/GridTraverser.h diff --git a/tests/benchmarks/heat-equation-benchmark/Tuning/GridTraverser_impl.h b/src/Benchmarks/HeatEquation/Tuning/GridTraverser_impl.h similarity index 100% rename from tests/benchmarks/heat-equation-benchmark/Tuning/GridTraverser_impl.h rename to src/Benchmarks/HeatEquation/Tuning/GridTraverser_impl.h diff --git a/tests/benchmarks/heat-equation-benchmark/Tuning/SimpleCell.h b/src/Benchmarks/HeatEquation/Tuning/SimpleCell.h similarity index 100% rename from tests/benchmarks/heat-equation-benchmark/Tuning/SimpleCell.h rename to src/Benchmarks/HeatEquation/Tuning/SimpleCell.h diff --git a/tests/benchmarks/heat-equation-benchmark/Tuning/Traverser_Grid2D.h b/src/Benchmarks/HeatEquation/Tuning/Traverser_Grid2D.h similarity index 100% rename from tests/benchmarks/heat-equation-benchmark/Tuning/Traverser_Grid2D.h rename to src/Benchmarks/HeatEquation/Tuning/Traverser_Grid2D.h diff --git a/tests/benchmarks/heat-equation-benchmark/Tuning/Traverser_Grid2D_impl.h b/src/Benchmarks/HeatEquation/Tuning/Traverser_Grid2D_impl.h similarity index 100% rename from tests/benchmarks/heat-equation-benchmark/Tuning/Traverser_Grid2D_impl.h rename to src/Benchmarks/HeatEquation/Tuning/Traverser_Grid2D_impl.h diff --git a/tests/benchmarks/heat-equation-benchmark/Tuning/tunning.h b/src/Benchmarks/HeatEquation/Tuning/tunning.h similarity index 100% rename from tests/benchmarks/heat-equation-benchmark/Tuning/tunning.h rename to src/Benchmarks/HeatEquation/Tuning/tunning.h diff --git a/tests/benchmarks/heat-equation-benchmark/pure-c-rhs.h b/src/Benchmarks/HeatEquation/pure-c-rhs.h similarity index 100% rename from tests/benchmarks/heat-equation-benchmark/pure-c-rhs.h rename to src/Benchmarks/HeatEquation/pure-c-rhs.h diff --git a/tests/benchmarks/heat-equation-benchmark/run-HeatEquationBenchmark b/src/Benchmarks/HeatEquation/run-HeatEquationBenchmark similarity index 100% rename from tests/benchmarks/heat-equation-benchmark/run-HeatEquationBenchmark rename to src/Benchmarks/HeatEquation/run-HeatEquationBenchmark diff --git a/tests/benchmarks/heat-equation-benchmark/tnl-benchmark-heat-equation.cpp b/src/Benchmarks/HeatEquation/tnl-benchmark-heat-equation.cpp similarity index 100% rename from tests/benchmarks/heat-equation-benchmark/tnl-benchmark-heat-equation.cpp rename to src/Benchmarks/HeatEquation/tnl-benchmark-heat-equation.cpp diff --git a/tests/benchmarks/heat-equation-benchmark/tnl-benchmark-heat-equation.cu b/src/Benchmarks/HeatEquation/tnl-benchmark-heat-equation.cu similarity index 100% rename from tests/benchmarks/heat-equation-benchmark/tnl-benchmark-heat-equation.cu rename to src/Benchmarks/HeatEquation/tnl-benchmark-heat-equation.cu diff --git a/tests/benchmarks/heat-equation-benchmark/tnl-benchmark-heat-equation.h b/src/Benchmarks/HeatEquation/tnl-benchmark-heat-equation.h similarity index 100% rename from tests/benchmarks/heat-equation-benchmark/tnl-benchmark-heat-equation.h rename to src/Benchmarks/HeatEquation/tnl-benchmark-heat-equation.h diff --git a/tests/benchmarks/heat-equation-benchmark/tnl-benchmark-simple-heat-equation-bug.cu b/src/Benchmarks/HeatEquation/tnl-benchmark-simple-heat-equation-bug.cu similarity index 100% rename from tests/benchmarks/heat-equation-benchmark/tnl-benchmark-simple-heat-equation-bug.cu rename to src/Benchmarks/HeatEquation/tnl-benchmark-simple-heat-equation-bug.cu diff --git a/tests/benchmarks/heat-equation-benchmark/tnl-benchmark-simple-heat-equation-bug.h b/src/Benchmarks/HeatEquation/tnl-benchmark-simple-heat-equation-bug.h similarity index 100% rename from tests/benchmarks/heat-equation-benchmark/tnl-benchmark-simple-heat-equation-bug.h rename to src/Benchmarks/HeatEquation/tnl-benchmark-simple-heat-equation-bug.h diff --git a/tests/benchmarks/heat-equation-benchmark/tnl-benchmark-simple-heat-equation.cpp b/src/Benchmarks/HeatEquation/tnl-benchmark-simple-heat-equation.cpp similarity index 100% rename from tests/benchmarks/heat-equation-benchmark/tnl-benchmark-simple-heat-equation.cpp rename to src/Benchmarks/HeatEquation/tnl-benchmark-simple-heat-equation.cpp diff --git a/tests/benchmarks/heat-equation-benchmark/tnl-benchmark-simple-heat-equation.cu b/src/Benchmarks/HeatEquation/tnl-benchmark-simple-heat-equation.cu similarity index 100% rename from tests/benchmarks/heat-equation-benchmark/tnl-benchmark-simple-heat-equation.cu rename to src/Benchmarks/HeatEquation/tnl-benchmark-simple-heat-equation.cu diff --git a/tests/benchmarks/heat-equation-benchmark/tnl-benchmark-simple-heat-equation.h b/src/Benchmarks/HeatEquation/tnl-benchmark-simple-heat-equation.h similarity index 100% rename from tests/benchmarks/heat-equation-benchmark/tnl-benchmark-simple-heat-equation.h rename to src/Benchmarks/HeatEquation/tnl-benchmark-simple-heat-equation.h diff --git a/tests/benchmarks/heat-equation-benchmark/tnlTestGrid2D.h b/src/Benchmarks/HeatEquation/tnlTestGrid2D.h similarity index 100% rename from tests/benchmarks/heat-equation-benchmark/tnlTestGrid2D.h rename to src/Benchmarks/HeatEquation/tnlTestGrid2D.h diff --git a/tests/benchmarks/heat-equation-benchmark/tnlTestGridEntity.h b/src/Benchmarks/HeatEquation/tnlTestGridEntity.h similarity index 100% rename from tests/benchmarks/heat-equation-benchmark/tnlTestGridEntity.h rename to src/Benchmarks/HeatEquation/tnlTestGridEntity.h diff --git a/tests/benchmarks/heat-equation-benchmark/tnlTestNeighbourGridEntitiesStorage.h b/src/Benchmarks/HeatEquation/tnlTestNeighbourGridEntitiesStorage.h similarity index 100% rename from tests/benchmarks/heat-equation-benchmark/tnlTestNeighbourGridEntitiesStorage.h rename to src/Benchmarks/HeatEquation/tnlTestNeighbourGridEntitiesStorage.h diff --git a/tests/benchmarks/heat-equation-benchmark/tnlTestNeighbourGridEntityGetter.h b/src/Benchmarks/HeatEquation/tnlTestNeighbourGridEntityGetter.h similarity index 100% rename from tests/benchmarks/heat-equation-benchmark/tnlTestNeighbourGridEntityGetter.h rename to src/Benchmarks/HeatEquation/tnlTestNeighbourGridEntityGetter.h diff --git a/tests/benchmarks/heat-equation-benchmark/tnlTestNeighbourGridEntityGetter2D_impl.h b/src/Benchmarks/HeatEquation/tnlTestNeighbourGridEntityGetter2D_impl.h similarity index 100% rename from tests/benchmarks/heat-equation-benchmark/tnlTestNeighbourGridEntityGetter2D_impl.h rename to src/Benchmarks/HeatEquation/tnlTestNeighbourGridEntityGetter2D_impl.h diff --git a/src/Benchmarks/LinearSolvers/CMakeLists.txt b/src/Benchmarks/LinearSolvers/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..1a95c92f791c7d7a0176415fd4968d8aa6bb8982 --- /dev/null +++ b/src/Benchmarks/LinearSolvers/CMakeLists.txt @@ -0,0 +1,9 @@ +if( BUILD_CUDA ) + CUDA_ADD_EXECUTABLE( tnl-benchmark-linear-solvers tnl-benchmark-linear-solvers.cu ) + TARGET_LINK_LIBRARIES( tnl-benchmark-linear-solvers tnl ) +else() + ADD_EXECUTABLE( tnl-benchmark-linear-solvers tnl-benchmark-linear-solvers.cpp ) + TARGET_LINK_LIBRARIES( tnl-benchmark-linear-solvers tnl ) +endif() + +install( TARGETS tnl-benchmark-linear-solvers RUNTIME DESTINATION bin ) diff --git a/tests/benchmarks/tnl-benchmark-linear-solvers.cpp b/src/Benchmarks/LinearSolvers/tnl-benchmark-linear-solvers.cpp similarity index 100% rename from tests/benchmarks/tnl-benchmark-linear-solvers.cpp rename to src/Benchmarks/LinearSolvers/tnl-benchmark-linear-solvers.cpp diff --git a/tests/benchmarks/tnl-benchmark-linear-solvers.cu b/src/Benchmarks/LinearSolvers/tnl-benchmark-linear-solvers.cu similarity index 100% rename from tests/benchmarks/tnl-benchmark-linear-solvers.cu rename to src/Benchmarks/LinearSolvers/tnl-benchmark-linear-solvers.cu diff --git a/tests/benchmarks/tnl-benchmark-linear-solvers.h b/src/Benchmarks/LinearSolvers/tnl-benchmark-linear-solvers.h similarity index 100% rename from tests/benchmarks/tnl-benchmark-linear-solvers.h rename to src/Benchmarks/LinearSolvers/tnl-benchmark-linear-solvers.h diff --git a/src/Benchmarks/SpMV/CMakeLists.txt b/src/Benchmarks/SpMV/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..a73e6738cdb8bb4effd889960f3ffcd5b7255b90 --- /dev/null +++ b/src/Benchmarks/SpMV/CMakeLists.txt @@ -0,0 +1,9 @@ +if( BUILD_CUDA ) + CUDA_ADD_EXECUTABLE( tnl-benchmark-spmv tnl-benchmark-spmv.cu ) + TARGET_LINK_LIBRARIES( tnl-benchmark-spmv tnl ${CUDA_cusparse_LIBRARY} ) +else() + ADD_EXECUTABLE( tnl-benchmark-spmv tnl-benchmark-spmv.cpp ) + TARGET_LINK_LIBRARIES( tnl-benchmark-spmv tnl ) +endif() + +install( TARGETS tnl-benchmark-spmv RUNTIME DESTINATION bin ) diff --git a/tests/benchmarks/tnl-benchmark-spmv.cpp b/src/Benchmarks/SpMV/tnl-benchmark-spmv.cpp similarity index 100% rename from tests/benchmarks/tnl-benchmark-spmv.cpp rename to src/Benchmarks/SpMV/tnl-benchmark-spmv.cpp diff --git a/tests/benchmarks/tnl-benchmark-spmv.cu b/src/Benchmarks/SpMV/tnl-benchmark-spmv.cu similarity index 100% rename from tests/benchmarks/tnl-benchmark-spmv.cu rename to src/Benchmarks/SpMV/tnl-benchmark-spmv.cu diff --git a/tests/benchmarks/tnl-benchmark-spmv.h b/src/Benchmarks/SpMV/tnl-benchmark-spmv.h similarity index 100% rename from tests/benchmarks/tnl-benchmark-spmv.h rename to src/Benchmarks/SpMV/tnl-benchmark-spmv.h diff --git a/tests/benchmarks/tnlCusparseCSRMatrix.h b/src/Benchmarks/SpMV/tnlCusparseCSRMatrix.h similarity index 100% rename from tests/benchmarks/tnlCusparseCSRMatrix.h rename to src/Benchmarks/SpMV/tnlCusparseCSRMatrix.h diff --git a/tests/benchmarks/share/CMakeLists.txt b/src/Benchmarks/scripts/CMakeLists.txt similarity index 100% rename from tests/benchmarks/share/CMakeLists.txt rename to src/Benchmarks/scripts/CMakeLists.txt diff --git a/tests/benchmarks/share/convert-matrices b/src/Benchmarks/scripts/convert-matrices similarity index 100% rename from tests/benchmarks/share/convert-matrices rename to src/Benchmarks/scripts/convert-matrices diff --git a/tests/benchmarks/share/cuda-profiler.conf b/src/Benchmarks/scripts/cuda-profiler.conf similarity index 100% rename from tests/benchmarks/share/cuda-profiler.conf rename to src/Benchmarks/scripts/cuda-profiler.conf diff --git a/tests/benchmarks/share/draw-matrices b/src/Benchmarks/scripts/draw-matrices similarity index 100% rename from tests/benchmarks/share/draw-matrices rename to src/Benchmarks/scripts/draw-matrices diff --git a/tests/benchmarks/share/florida-matrix-market b/src/Benchmarks/scripts/florida-matrix-market similarity index 100% rename from tests/benchmarks/share/florida-matrix-market rename to src/Benchmarks/scripts/florida-matrix-market diff --git a/tests/benchmarks/share/get-matrices b/src/Benchmarks/scripts/get-matrices similarity index 100% rename from tests/benchmarks/share/get-matrices rename to src/Benchmarks/scripts/get-matrices diff --git a/tests/benchmarks/share/matrix-market b/src/Benchmarks/scripts/matrix-market similarity index 100% rename from tests/benchmarks/share/matrix-market rename to src/Benchmarks/scripts/matrix-market diff --git a/tests/benchmarks/share/process-cuda-profile.pl b/src/Benchmarks/scripts/process-cuda-profile.pl similarity index 100% rename from tests/benchmarks/share/process-cuda-profile.pl rename to src/Benchmarks/scripts/process-cuda-profile.pl diff --git a/tests/benchmarks/share/run-matrix-solvers-benchmark b/src/Benchmarks/scripts/run-matrix-solvers-benchmark similarity index 100% rename from tests/benchmarks/share/run-matrix-solvers-benchmark rename to src/Benchmarks/scripts/run-matrix-solvers-benchmark diff --git a/tests/benchmarks/share/run-tnl-benchmark-linear-solvers b/src/Benchmarks/scripts/run-tnl-benchmark-linear-solvers similarity index 100% rename from tests/benchmarks/share/run-tnl-benchmark-linear-solvers rename to src/Benchmarks/scripts/run-tnl-benchmark-linear-solvers diff --git a/tests/benchmarks/share/run-tnl-benchmark-spmv b/src/Benchmarks/scripts/run-tnl-benchmark-spmv similarity index 100% rename from tests/benchmarks/share/run-tnl-benchmark-spmv rename to src/Benchmarks/scripts/run-tnl-benchmark-spmv diff --git a/tests/benchmarks/share/tnl-run-heat-equation-benchmark b/src/Benchmarks/scripts/tnl-run-heat-equation-benchmark similarity index 100% rename from tests/benchmarks/share/tnl-run-heat-equation-benchmark rename to src/Benchmarks/scripts/tnl-run-heat-equation-benchmark diff --git a/tests/benchmarks/share/tnl-run-spmv-benchmark b/src/Benchmarks/scripts/tnl-run-spmv-benchmark similarity index 100% rename from tests/benchmarks/share/tnl-run-spmv-benchmark rename to src/Benchmarks/scripts/tnl-run-spmv-benchmark diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index a2086691dc4ccbe4c263fe5f11b40e361d957d73..280d40ad2168b2b99f5db998b6a819b23c1382ff 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -1,5 +1,9 @@ ADD_SUBDIRECTORY( TNL ) +if( ${WITH_BENCHMARKS} ) + ADD_SUBDIRECTORY( Benchmarks ) +endif() + if( ${WITH_TOOLS} ) ADD_SUBDIRECTORY( Tools ) endif() diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 33873402d8074b5fa304e84546e1188b57d9943c..93c15ce593a9db00fc240c6263706f9c10a46ac5 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -1,9 +1,4 @@ -set( ENABLE_CODECOVERAGE ) - ADD_SUBDIRECTORY( data ) -ADD_SUBDIRECTORY( benchmarks ) #ADD_SUBDIRECTORY( unit-tests ) ADD_SUBDIRECTORY( long-time-unit-tests ) ADD_SUBDIRECTORY( mpi ) - -unset( ENABLE_CODECOVERAGE ) diff --git a/tests/benchmarks/CMakeLists.txt b/tests/benchmarks/CMakeLists.txt deleted file mode 100644 index e53ba6878070e6b8f8d3b830ab8f64afcccbcf77..0000000000000000000000000000000000000000 --- a/tests/benchmarks/CMakeLists.txt +++ /dev/null @@ -1,29 +0,0 @@ -ADD_SUBDIRECTORY( share ) -ADD_SUBDIRECTORY( heat-equation-benchmark ) - -IF( BUILD_CUDA ) - CUDA_ADD_EXECUTABLE( tnl-benchmark-blas tnl-benchmark-blas.cu ) - CUDA_ADD_CUBLAS_TO_TARGET( tnl-benchmark-blas ) - TARGET_LINK_LIBRARIES( tnl-benchmark-blas tnl ) - - CUDA_ADD_EXECUTABLE( tnl-benchmark-spmv tnl-benchmark-spmv.cu ) - TARGET_LINK_LIBRARIES( tnl-benchmark-spmv tnl ${CUDA_cusparse_LIBRARY} ) - - CUDA_ADD_EXECUTABLE( tnl-benchmark-linear-solvers tnl-benchmark-linear-solvers.cu ) - TARGET_LINK_LIBRARIES( tnl-benchmark-linear-solvers tnl ) -ELSE() - ADD_EXECUTABLE( tnl-benchmark-blas tnl-benchmark-blas.cpp ) - TARGET_LINK_LIBRARIES( tnl-benchmark-blas tnl ) - - ADD_EXECUTABLE( tnl-benchmark-spmv tnl-benchmark-spmv.cpp ) - TARGET_LINK_LIBRARIES( tnl-benchmark-spmv tnl ) - - ADD_EXECUTABLE( tnl-benchmark-linear-solvers tnl-benchmark-linear-solvers.cpp ) - TARGET_LINK_LIBRARIES( tnl-benchmark-linear-solvers tnl ) -ENDIF() - -INSTALL( TARGETS - tnl-benchmark-blas - tnl-benchmark-spmv - tnl-benchmark-linear-solvers - RUNTIME DESTINATION bin ) diff --git a/tests/benchmarks/array-operations.h b/tests/benchmarks/array-operations.h deleted file mode 100644 index 504dcc1da03a91fa87af913008f9355579e62930..0000000000000000000000000000000000000000 --- a/tests/benchmarks/array-operations.h +++ /dev/null @@ -1,167 +0,0 @@ -/*************************************************************************** - array-operations.h - description - ------------------- - begin : Dec 30, 2015 - copyright : (C) 2015 by Tomas Oberhuber et al. - email : tomas.oberhuber@fjfi.cvut.cz - ***************************************************************************/ - -/* See Copyright Notice in tnl/Copyright */ - -// Implemented by: Jakub Klinkovsky - -#pragma once - -#include "benchmarks.h" - -#include <TNL/Containers/Array.h> - -namespace TNL -{ -namespace benchmarks -{ - -template< typename Real = double, - typename Index = int > -bool -benchmarkArrayOperations( Benchmark & benchmark, - const int & loops, - const long & size ) -{ - typedef Containers::Array< Real, Devices::Host, Index > HostArray; - typedef Containers::Array< Real, Devices::Cuda, Index > CudaArray; - using namespace std; - - double datasetSize = ( double ) ( loops * size ) * sizeof( Real ) / oneGB; - - HostArray hostArray, hostArray2; - CudaArray deviceArray, deviceArray2; - hostArray.setSize( size ); - hostArray2.setSize( size ); -#ifdef HAVE_CUDA - deviceArray.setSize( size ); - deviceArray2.setSize( size ); -#endif - - Real resultHost, resultDevice; - - - // reset functions - auto reset1 = [&]() { - hostArray.setValue( 1.0 ); -#ifdef HAVE_CUDA - deviceArray.setValue( 1.0 ); -#endif - }; - auto reset2 = [&]() { - hostArray2.setValue( 1.0 ); -#ifdef HAVE_CUDA - deviceArray2.setValue( 1.0 ); -#endif - }; - auto reset12 = [&]() { - reset1(); - reset2(); - }; - - - reset12(); - - - auto compareHost = [&]() { - resultHost = (int) hostArray == hostArray2; - }; - auto compareCuda = [&]() { - resultDevice = (int) deviceArray == deviceArray2; - }; - benchmark.setOperation( "comparison (operator==)", 2 * datasetSize ); - benchmark.time( reset1, "CPU", compareHost ); -#ifdef HAVE_CUDA - benchmark.time( reset1, "GPU", compareCuda ); -#endif - - - auto copyAssignHostHost = [&]() { - hostArray = hostArray2; - }; - auto copyAssignCudaCuda = [&]() { - deviceArray = deviceArray2; - }; - benchmark.setOperation( "copy (operator=)", 2 * datasetSize ); - // copyBasetime is used later inside HAVE_CUDA guard, so the compiler will - // complain when compiling without CUDA - const double copyBasetime = benchmark.time( reset1, "CPU", copyAssignHostHost ); -#ifdef HAVE_CUDA - benchmark.time( reset1, "GPU", copyAssignCudaCuda ); -#endif - - - auto copyAssignHostCuda = [&]() { - deviceArray = hostArray; - }; - auto copyAssignCudaHost = [&]() { - hostArray = deviceArray; - }; -#ifdef HAVE_CUDA - benchmark.setOperation( "copy (operator=)", datasetSize, copyBasetime ); - benchmark.time( reset1, - "CPU->GPU", copyAssignHostCuda, - "GPU->CPU", copyAssignCudaHost ); -#endif - - - auto setValueHost = [&]() { - hostArray.setValue( 3.0 ); - }; - auto setValueCuda = [&]() { - deviceArray.setValue( 3.0 ); - }; - benchmark.setOperation( "setValue", datasetSize ); - benchmark.time( reset1, "CPU", setValueHost ); -#ifdef HAVE_CUDA - benchmark.time( reset1, "GPU", setValueCuda ); -#endif - - - auto setSizeHost = [&]() { - hostArray.setSize( size ); - }; - auto setSizeCuda = [&]() { - deviceArray.setSize( size ); - }; - auto resetSize1 = [&]() { - hostArray.reset(); -#ifdef HAVE_CUDA - deviceArray.reset(); -#endif - }; - benchmark.setOperation( "allocation (setSize)", datasetSize ); - benchmark.time( resetSize1, "CPU", setSizeHost ); -#ifdef HAVE_CUDA - benchmark.time( resetSize1, "GPU", setSizeCuda ); -#endif - - - auto resetSizeHost = [&]() { - hostArray.reset(); - }; - auto resetSizeCuda = [&]() { - deviceArray.reset(); - }; - auto setSize1 = [&]() { - hostArray.setSize( size ); -#ifdef HAVE_CUDA - deviceArray.setSize( size ); -#endif - }; - benchmark.setOperation( "deallocation (reset)", datasetSize ); - benchmark.time( setSize1, "CPU", resetSizeHost ); -#ifdef HAVE_CUDA - benchmark.time( setSize1, "GPU", resetSizeCuda ); -#endif - - return true; -} - -} // namespace benchmarks -} // namespace tnl diff --git a/tests/benchmarks/benchmarks.h b/tests/benchmarks/benchmarks.h deleted file mode 100644 index ce5e631a6899170cfeba58911be71e5cc17eb7e6..0000000000000000000000000000000000000000 --- a/tests/benchmarks/benchmarks.h +++ /dev/null @@ -1,437 +0,0 @@ -/*************************************************************************** - benchmarks.h - description - ------------------- - begin : Dec 30, 2015 - copyright : (C) 2015 by Tomas Oberhuber et al. - email : tomas.oberhuber@fjfi.cvut.cz - ***************************************************************************/ - -/* See Copyright Notice in tnl/Copyright */ - -// Implemented by: Jakub Klinkovsky - -#pragma once - -#include <iostream> -#include <iomanip> -#include <map> -#include <vector> - -#include <TNL/Timer.h> -#include <TNL/String.h> - -namespace TNL -{ -namespace benchmarks -{ - -const double oneGB = 1024.0 * 1024.0 * 1024.0; - -template< typename ComputeFunction, - typename ResetFunction > -double -timeFunction( ComputeFunction compute, - ResetFunction reset, - const int & loops ) -{ - // the timer is constructed zero-initialized and stopped - Timer timer; - - reset(); - for(int i = 0; i < loops; ++i) { - // Explicit synchronization of the CUDA device - // TODO: not necessary for host computations -#ifdef HAVE_CUDA - cudaDeviceSynchronize(); -#endif - timer.start(); - compute(); -#ifdef HAVE_CUDA - cudaDeviceSynchronize(); -#endif - timer.stop(); - - reset(); - } - - return timer.getRealTime(); -} - - -struct InternalError {}; - - -class Logging -{ -public: - using MetadataElement = std::pair< const char*, String >; - using MetadataMap = std::map< const char*, String >; - using MetadataColumns = std::vector<MetadataElement>; - - using HeaderElements = std::initializer_list< String >; - using RowElements = std::initializer_list< double >; - - Logging( bool verbose = true ) - : verbose(verbose) - { } - - void - writeTitle( const String & title ) - { - if( verbose ) - std::cout << std::endl << "== " << title << " ==" << std::endl << std::endl; - log << ": title = " << title << std::endl; - } - - void - writeMetadata( const MetadataMap & metadata ) - { - if( verbose ) - std::cout << "properties:" << std::endl; - - for( auto & it : metadata ) { - if( verbose ) - std::cout << " " << it.first << " = " << it.second << std::endl; - log << ": " << it.first << " = " << it.second << std::endl; - } - if( verbose ) - std::cout << std::endl; - } - - void - writeTableHeader( const String & spanningElement, - const HeaderElements & subElements ) - { - using namespace std; - - if( verbose && header_changed ) { - for( auto & it : metadataColumns ) { - std::cout << std::setw( 20 ) << it.first; - } - - // spanning element is printed as usual column to stdout, - // but is excluded from header - std::cout << std::setw( 15 ) << ""; - - for( auto & it : subElements ) { - std::cout << std::setw( 15 ) << it; - } - std::cout << std::endl; - - header_changed = false; - } - - // initial indent string - header_indent = "!"; - log << std::endl; - for( auto & it : metadataColumns ) { - log << header_indent << " " << it.first << std::endl; - } - - // dump stacked spanning columns - if( horizontalGroups.size() > 0 ) - while( horizontalGroups.back().second <= 0 ) { - horizontalGroups.pop_back(); - header_indent.pop_back(); - } - for( size_t i = 0; i < horizontalGroups.size(); i++ ) { - if( horizontalGroups[ i ].second > 0 ) { - log << header_indent << " " << horizontalGroups[ i ].first << std::endl; - header_indent += "!"; - } - } - - log << header_indent << " " << spanningElement << std::endl; - for( auto & it : subElements ) { - log << header_indent << "! " << it << std::endl; - } - - if( horizontalGroups.size() > 0 ) { - horizontalGroups.back().second--; - header_indent.pop_back(); - } - } - - void - writeTableRow( const String & spanningElement, - const RowElements & subElements ) - { - using namespace std; - - if( verbose ) { - for( auto & it : metadataColumns ) { - std::cout << std::setw( 20 ) << it.second; - } - // spanning element is printed as usual column to stdout - std::cout << std::setw( 15 ) << spanningElement; - for( auto & it : subElements ) { - std::cout << std::setw( 15 ); - if( it != 0.0 )std::cout << it; - else std::cout << "N/A"; - } - std::cout << std::endl; - } - - // only when changed (the header has been already adjusted) - // print each element on separate line - for( auto & it : metadataColumns ) { - log << it.second << std::endl; - } - - // benchmark data are indented - const String indent = " "; - for( auto & it : subElements ) { - if( it != 0.0 ) log << indent << it << std::endl; - else log << indent << "N/A" << std::endl; - } - } - - void - writeErrorMessage( const char* msg, - const int & colspan = 1 ) - { - // initial indent string - header_indent = "!"; - log << std::endl; - for( auto & it : metadataColumns ) { - log << header_indent << " " << it.first << std::endl; - } - - // make sure there is a header column for the message - if( horizontalGroups.size() == 0 ) - horizontalGroups.push_back( {"", 1} ); - - // dump stacked spanning columns - while( horizontalGroups.back().second <= 0 ) { - horizontalGroups.pop_back(); - header_indent.pop_back(); - } - for( size_t i = 0; i < horizontalGroups.size(); i++ ) { - if( horizontalGroups[ i ].second > 0 ) { - log << header_indent << " " << horizontalGroups[ i ].first << std::endl; - header_indent += "!"; - } - } - if( horizontalGroups.size() > 0 ) { - horizontalGroups.back().second -= colspan; - header_indent.pop_back(); - } - - // only when changed (the header has been already adjusted) - // print each element on separate line - for( auto & it : metadataColumns ) { - log << it.second << std::endl; - } - log << msg << std::endl; - } - - void - closeTable() - { - log << std::endl; - header_indent = body_indent = ""; - header_changed = true; - horizontalGroups.clear(); - } - - bool save( std::ostream & logFile ) - { - closeTable(); - logFile << log.str(); - if( logFile.good() ) { - log.str() = ""; - return true; - } - return false; - } - -protected: - - // manual double -> String conversion with fixed precision - static String - _to_string( const double & num, const int & precision = 0, bool fixed = false ) - { - std::stringstream str; - if( fixed ) - str << std::fixed; - if( precision ) - str << std::setprecision( precision ); - str << num; - return String( str.str().data() ); - } - - std::stringstream log; - std::string header_indent; - std::string body_indent; - - bool verbose; - MetadataColumns metadataColumns; - bool header_changed = true; - std::vector< std::pair< String, int > > horizontalGroups; -}; - - -class Benchmark - : protected Logging -{ -public: - using Logging::MetadataElement; - using Logging::MetadataMap; - using Logging::MetadataColumns; - - Benchmark( const int & loops = 10, - bool verbose = true ) - : Logging(verbose), loops(loops) - { } - - // TODO: ensure that this is not called in the middle of the benchmark - // (or just remove it completely?) - void - setLoops( const int & loops ) - { - this->loops = loops; - } - - // Marks the start of a new benchmark - void - newBenchmark( const String & title ) - { - closeTable(); - writeTitle( title ); - } - - // Marks the start of a new benchmark (with custom metadata) - void - newBenchmark( const String & title, - MetadataMap metadata ) - { - closeTable(); - writeTitle( title ); - // add loops to metadata - metadata["loops"] = String(loops); - writeMetadata( metadata ); - } - - // Sets metadata columns -- values used for all subsequent rows until - // the next call to this function. - void - setMetadataColumns( const MetadataColumns & metadata ) - { - if( metadataColumns != metadata ) - header_changed = true; - metadataColumns = metadata; - } - - // TODO: maybe should be renamed to createVerticalGroup and ensured that vertical and horizontal groups are not used within the same "Benchmark" - // Sets current operation -- operations expand the table vertically - // - baseTime should be reset to 0.0 for most operations, but sometimes - // it is useful to override it - // - Order of operations inside a "Benchmark" does not matter, rows can be - // easily sorted while converting to HTML.) - void - setOperation( const String & operation, - const double & datasetSize = 0.0, // in GB - const double & baseTime = 0.0 ) - { - if( metadataColumns.size() > 0 && String(metadataColumns[ 0 ].first) == "operation" ) { - metadataColumns[ 0 ].second = operation; - } - else { - metadataColumns.insert( metadataColumns.begin(), {"operation", operation} ); - } - setOperation( datasetSize, baseTime ); - header_changed = true; - } - - void - setOperation( const double & datasetSize = 0.0, - const double & baseTime = 0.0 ) - { - this->datasetSize = datasetSize; - this->baseTime = baseTime; - } - - // Creates new horizontal groups inside a benchmark -- increases the number - // of columns in the "Benchmark", implies column spanning. - // (Useful e.g. for SpMV formats, different configurations etc.) - void - createHorizontalGroup( const String & name, - const int & subcolumns ) - { - if( horizontalGroups.size() == 0 ) { - horizontalGroups.push_back( {name, subcolumns} ); - } - else { - auto & last = horizontalGroups.back(); - if( last.first != name && last.second > 0 ) { - horizontalGroups.push_back( {name, subcolumns} ); - } - else { - last.first = name; - last.second = subcolumns; - } - } - } - - // Times a single ComputeFunction. Subsequent calls implicitly split - // the current "horizontal group" into sub-columns identified by - // "performer", which are further split into "bandwidth", "time" and - // "speedup" columns. - // TODO: allow custom columns bound to lambda functions (e.g. for Gflops calculation) - // Also terminates the recursion of the following variadic template. - template< typename ResetFunction, - typename ComputeFunction > - double - time( ResetFunction reset, - const String & performer, - ComputeFunction & compute ) - { - const double time = timeFunction( compute, reset, loops ); - const double bandwidth = datasetSize / time; - const double speedup = this->baseTime / time; - if( this->baseTime == 0.0 ) - this->baseTime = time; - - writeTableHeader( performer, HeaderElements({"bandwidth", "time", "speedup"}) ); - writeTableRow( performer, RowElements({ bandwidth, time, speedup }) ); - - return this->baseTime; - } - - // Recursive template function to deal with multiple computations with the - // same reset function. - template< typename ResetFunction, - typename ComputeFunction, - typename... NextComputations > - inline double - time( ResetFunction reset, - const String & performer, - ComputeFunction & compute, - NextComputations & ... nextComputations ) - { - time( reset, performer, compute ); - time( reset, nextComputations... ); - return this->baseTime; - } - - // Adds an error message to the log. Should be called in places where the - // "time" method could not be called (e.g. due to failed allocation). - void - addErrorMessage( const char* msg, - const int & numberOfComputations = 1 ) - { - // each computation has 3 subcolumns - const int colspan = 3 * numberOfComputations; - writeErrorMessage( msg, colspan ); - } - - using Logging::save; - -protected: - int loops; - double datasetSize = 0.0; - double baseTime = 0.0; -}; - -} // namespace benchmarks -} // namespace tnl diff --git a/tests/benchmarks/spmv.h b/tests/benchmarks/spmv.h deleted file mode 100644 index cc957c7278ec9b21a418e627af9063e32bec3785..0000000000000000000000000000000000000000 --- a/tests/benchmarks/spmv.h +++ /dev/null @@ -1,191 +0,0 @@ -/*************************************************************************** - spmv.h - description - ------------------- - begin : Dec 30, 2015 - copyright : (C) 2015 by Tomas Oberhuber et al. - email : tomas.oberhuber@fjfi.cvut.cz - ***************************************************************************/ - -/* See Copyright Notice in tnl/Copyright */ - -// Implemented by: Jakub Klinkovsky - -#pragma once - -#include "benchmarks.h" - -#include <TNL/Containers/List.h> -#include <TNL/Pointers/DevicePointer.h> -#include <TNL/Matrices/CSR.h> -#include <TNL/Matrices/Ellpack.h> -#include <TNL/Matrices/SlicedEllpack.h> -#include <TNL/Matrices/ChunkedEllpack.h> - -namespace TNL -{ -namespace benchmarks -{ - -// silly alias to match the number of template parameters with other formats -template< typename Real, typename Device, typename Index > -using SlicedEllpack = Matrices::SlicedEllpack< Real, Device, Index >; - -template< typename Matrix > -int setHostTestMatrix( Matrix& matrix, - const int elementsPerRow ) -{ - const int size = matrix.getRows(); - int elements( 0 ); - for( int row = 0; row < size; row++ ) { - int col = row - elementsPerRow / 2; - for( int element = 0; element < elementsPerRow; element++ ) { - if( col + element >= 0 && - col + element < size ) - { - matrix.setElement( row, col + element, element + 1 ); - elements++; - } - } - } - return elements; -} - -#ifdef HAVE_CUDA -template< typename Matrix > -__global__ void setCudaTestMatrixKernel( Matrix* matrix, - const int elementsPerRow, - const int gridIdx ) -{ - const int rowIdx = ( gridIdx * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x; - if( rowIdx >= matrix->getRows() ) - return; - int col = rowIdx - elementsPerRow / 2; - for( int element = 0; element < elementsPerRow; element++ ) { - if( col + element >= 0 && - col + element < matrix->getColumns() ) - matrix->setElementFast( rowIdx, col + element, element + 1 ); - } -} -#endif - -template< typename Matrix > -void setCudaTestMatrix( Matrix& matrix, - const int elementsPerRow ) -{ -#ifdef HAVE_CUDA - typedef typename Matrix::IndexType IndexType; - typedef typename Matrix::RealType RealType; - Pointers::DevicePointer< Matrix > kernel_matrix( matrix ); - dim3 cudaBlockSize( 256 ), cudaGridSize( Devices::Cuda::getMaxGridSize() ); - const IndexType cudaBlocks = roundUpDivision( matrix.getRows(), cudaBlockSize.x ); - const IndexType cudaGrids = roundUpDivision( cudaBlocks, Devices::Cuda::getMaxGridSize() ); - for( IndexType gridIdx = 0; gridIdx < cudaGrids; gridIdx++ ) { - if( gridIdx == cudaGrids - 1 ) - cudaGridSize.x = cudaBlocks % Devices::Cuda::getMaxGridSize(); - setCudaTestMatrixKernel< Matrix > - <<< cudaGridSize, cudaBlockSize >>> - ( &kernel_matrix.template modifyData< Devices::Cuda >(), elementsPerRow, gridIdx ); - TNL_CHECK_CUDA_DEVICE; - } -#endif -} - - -// TODO: rename as benchmark_SpMV_synthetic and move to spmv-synthetic.h -template< typename Real, - template< typename, typename, typename > class Matrix, - template< typename, typename, typename > class Vector = Containers::Vector > -bool -benchmarkSpMV( Benchmark & benchmark, - const int & loops, - const int & size, - const int elementsPerRow = 5 ) -{ - typedef Matrix< Real, Devices::Host, int > HostMatrix; - typedef Matrix< Real, Devices::Cuda, int > DeviceMatrix; - typedef Containers::Vector< Real, Devices::Host, int > HostVector; - typedef Containers::Vector< Real, Devices::Cuda, int > CudaVector; - - HostMatrix hostMatrix; - DeviceMatrix deviceMatrix; - Containers::Vector< int, Devices::Host, int > hostRowLengths; - Containers::Vector< int, Devices::Cuda, int > deviceRowLengths; - HostVector hostVector, hostVector2; - CudaVector deviceVector, deviceVector2; - - // create benchmark group - Containers::List< String > parsedType; - parseObjectType( HostMatrix::getType(), parsedType ); - benchmark.createHorizontalGroup( parsedType[ 0 ], 2 ); - - hostRowLengths.setSize( size ); - hostMatrix.setDimensions( size, size ); - hostVector.setSize( size ); - hostVector2.setSize( size ); -#ifdef HAVE_CUDA - deviceRowLengths.setSize( size ); - deviceMatrix.setDimensions( size, size ); - deviceVector.setSize( size ); - deviceVector2.setSize( size ); -#endif - - hostRowLengths.setValue( elementsPerRow ); -#ifdef HAVE_CUDA - deviceRowLengths.setValue( elementsPerRow ); -#endif - - hostMatrix.setCompressedRowLengths( hostRowLengths ); -#ifdef HAVE_CUDA - deviceMatrix.setCompressedRowLengths( deviceRowLengths ); -#endif - - const int elements = setHostTestMatrix< HostMatrix >( hostMatrix, elementsPerRow ); - setCudaTestMatrix< DeviceMatrix >( deviceMatrix, elementsPerRow ); - const double datasetSize = ( double ) loops * elements * ( 2 * sizeof( Real ) + sizeof( int ) ) / oneGB; - - // reset function - auto reset = [&]() { - hostVector.setValue( 1.0 ); - hostVector2.setValue( 0.0 ); -#ifdef HAVE_CUDA - deviceVector.setValue( 1.0 ); - deviceVector2.setValue( 0.0 ); -#endif - }; - - // compute functions - auto spmvHost = [&]() { - hostMatrix.vectorProduct( hostVector, hostVector2 ); - }; - auto spmvCuda = [&]() { - deviceMatrix.vectorProduct( deviceVector, deviceVector2 ); - }; - - benchmark.setOperation( datasetSize ); - benchmark.time( reset, "CPU", spmvHost ); -#ifdef HAVE_CUDA - benchmark.time( reset, "GPU", spmvCuda ); -#endif - - return true; -} - -template< typename Real = double, - typename Index = int > -bool -benchmarkSpmvSynthetic( Benchmark & benchmark, - const int & loops, - const int & size, - const int & elementsPerRow ) -{ - bool result = true; - // TODO: benchmark all formats from tnl-benchmark-spmv (different parameters of the base formats) - result |= benchmarkSpMV< Real, Matrices::CSR >( benchmark, loops, size, elementsPerRow ); - result |= benchmarkSpMV< Real, Matrices::Ellpack >( benchmark, loops, size, elementsPerRow ); - result |= benchmarkSpMV< Real, SlicedEllpack >( benchmark, loops, size, elementsPerRow ); - result |= benchmarkSpMV< Real, Matrices::ChunkedEllpack >( benchmark, loops, size, elementsPerRow ); - return result; -} - -} // namespace benchmarks -} // namespace tnl diff --git a/tests/benchmarks/tnl-benchmark-blas.h b/tests/benchmarks/tnl-benchmark-blas.h deleted file mode 100644 index 8c71209534894d10a430bae874035c9d14d7bbc3..0000000000000000000000000000000000000000 --- a/tests/benchmarks/tnl-benchmark-blas.h +++ /dev/null @@ -1,192 +0,0 @@ -/*************************************************************************** - tnl-benchmark-blas.h - description - ------------------- - begin : Jan 27, 2010 - copyright : (C) 2010 by Tomas Oberhuber et al. - email : tomas.oberhuber@fjfi.cvut.cz - ***************************************************************************/ - -/* See Copyright Notice in tnl/Copyright */ - -// Implemented by: Jakub Klinkovsky - -#pragma once - -#include <TNL/Devices/Host.h> -#include <TNL/Devices/CudaDeviceInfo.h> -#include <TNL/Devices/SystemInfo.h> -#include <TNL/Config/ConfigDescription.h> -#include <TNL/Config/ParameterContainer.h> - -#include "array-operations.h" -#include "vector-operations.h" -#include "spmv.h" - -using namespace TNL; -using namespace TNL::benchmarks; - - -// TODO: should benchmarks check the result of the computation? - - -template< typename Real > -void -runBlasBenchmarks( Benchmark & benchmark, - Benchmark::MetadataMap metadata, - const std::size_t & minSize, - const std::size_t & maxSize, - const double & sizeStepFactor, - const unsigned & loops, - const unsigned & elementsPerRow ) -{ - const String precision = getType< Real >(); - metadata["precision"] = precision; - - // Array operations - benchmark.newBenchmark( String("Array operations (") + precision + ")", - metadata ); - for( std::size_t size = minSize; size <= maxSize; size *= 2 ) { - benchmark.setMetadataColumns( Benchmark::MetadataColumns({ - {"size", size}, - } )); - benchmarkArrayOperations< Real >( benchmark, loops, size ); - } - - // Vector operations - benchmark.newBenchmark( String("Vector operations (") + precision + ")", - metadata ); - for( std::size_t size = minSize; size <= maxSize; size *= sizeStepFactor ) { - benchmark.setMetadataColumns( Benchmark::MetadataColumns({ - {"size", size}, - } )); - benchmarkVectorOperations< Real >( benchmark, loops, size ); - } - - // Sparse matrix-vector multiplication - benchmark.newBenchmark( String("Sparse matrix-vector multiplication (") + precision + ")", - metadata ); - for( std::size_t size = minSize; size <= maxSize; size *= 2 ) { - benchmark.setMetadataColumns( Benchmark::MetadataColumns({ - {"rows", size}, - {"columns", size}, - {"elements per row", elementsPerRow}, - } )); - benchmarkSpmvSynthetic< Real >( benchmark, loops, size, elementsPerRow ); - } -} - -void -setupConfig( Config::ConfigDescription & config ) -{ - config.addDelimiter( "Benchmark settings:" ); - config.addEntry< String >( "log-file", "Log file name.", "tnl-benchmark-blas.log"); - config.addEntry< String >( "output-mode", "Mode for opening the log file.", "overwrite" ); - config.addEntryEnum( "append" ); - config.addEntryEnum( "overwrite" ); - config.addEntry< String >( "precision", "Precision of the arithmetics.", "double" ); - config.addEntryEnum( "float" ); - config.addEntryEnum( "double" ); - config.addEntryEnum( "all" ); - config.addEntry< int >( "min-size", "Minimum size of arrays/vectors used in the benchmark.", 100000 ); - config.addEntry< int >( "max-size", "Minimum size of arrays/vectors used in the benchmark.", 10000000 ); - config.addEntry< int >( "size-step-factor", "Factor determining the size of arrays/vectors used in the benchmark. First size is min-size and each following size is stepFactor*previousSize, up to max-size.", 2 ); - config.addEntry< int >( "loops", "Number of iterations for every computation.", 10 ); - config.addEntry< int >( "elements-per-row", "Number of elements per row of the sparse matrix used in the matrix-vector multiplication benchmark.", 5 ); - config.addEntry< int >( "verbose", "Verbose mode.", 1 ); - - config.addDelimiter( "Device settings:" ); - Devices::Host::configSetup( config ); - Devices::Cuda::configSetup( config ); -} - -int -main( int argc, char* argv[] ) -{ - Config::ParameterContainer parameters; - Config::ConfigDescription conf_desc; - - setupConfig( conf_desc ); - - if( ! parseCommandLine( argc, argv, conf_desc, parameters ) ) { - conf_desc.printUsage( argv[ 0 ] ); - return 1; - } - - Devices::Host::setup( parameters ); - Devices::Cuda::setup( parameters ); - - const String & logFileName = parameters.getParameter< String >( "log-file" ); - const String & outputMode = parameters.getParameter< String >( "output-mode" ); - const String & precision = parameters.getParameter< String >( "precision" ); - // FIXME: getParameter< std::size_t >() does not work with parameters added with addEntry< int >(), - // which have a default value. The workaround below works for int values, but it is not possible - // to pass 64-bit integer values -// const std::size_t minSize = parameters.getParameter< std::size_t >( "min-size" ); -// const std::size_t maxSize = parameters.getParameter< std::size_t >( "max-size" ); - const std::size_t minSize = parameters.getParameter< int >( "min-size" ); - const std::size_t maxSize = parameters.getParameter< int >( "max-size" ); - const unsigned sizeStepFactor = parameters.getParameter< unsigned >( "size-step-factor" ); - const unsigned loops = parameters.getParameter< unsigned >( "loops" ); - const unsigned elementsPerRow = parameters.getParameter< unsigned >( "elements-per-row" ); - const unsigned verbose = parameters.getParameter< unsigned >( "verbose" ); - - if( sizeStepFactor <= 1 ) { - std::cerr << "The value of --size-step-factor must be greater than 1." << std::endl; - return EXIT_FAILURE; - } - - // open log file - auto mode = std::ios::out; - if( outputMode == "append" ) - mode |= std::ios::app; - std::ofstream logFile( logFileName.getString(), mode ); - - // init benchmark and common metadata - Benchmark benchmark( loops, verbose ); - - // prepare global metadata - const int cpu_id = 0; - Devices::CacheSizes cacheSizes = Devices::SystemInfo::getCPUCacheSizes( cpu_id ); - String cacheInfo = String( cacheSizes.L1data ) + ", " - + String( cacheSizes.L1instruction ) + ", " - + String( cacheSizes.L2 ) + ", " - + String( cacheSizes.L3 ); -#ifdef HAVE_CUDA - const int activeGPU = Devices::CudaDeviceInfo::getActiveDevice(); - const String deviceArch = String( Devices::CudaDeviceInfo::getArchitectureMajor( activeGPU ) ) + "." + - String( Devices::CudaDeviceInfo::getArchitectureMinor( activeGPU ) ); -#endif - Benchmark::MetadataMap metadata { - { "host name", Devices::SystemInfo::getHostname() }, - { "architecture", Devices::SystemInfo::getArchitecture() }, - { "system", Devices::SystemInfo::getSystemName() }, - { "system release", Devices::SystemInfo::getSystemRelease() }, - { "start time", Devices::SystemInfo::getCurrentTime() }, - { "CPU model name", Devices::SystemInfo::getCPUModelName( cpu_id ) }, - { "CPU cores", Devices::SystemInfo::getNumberOfCores( cpu_id ) }, - { "CPU threads per core", Devices::SystemInfo::getNumberOfThreads( cpu_id ) / Devices::SystemInfo::getNumberOfCores( cpu_id ) }, - { "CPU max frequency (MHz)", Devices::SystemInfo::getCPUMaxFrequency( cpu_id ) / 1e3 }, - { "CPU cache sizes (L1d, L1i, L2, L3) (kiB)", cacheInfo }, -#ifdef HAVE_CUDA - { "GPU name", Devices::CudaDeviceInfo::getDeviceName( activeGPU ) }, - { "GPU architecture", deviceArch }, - { "GPU CUDA cores", Devices::CudaDeviceInfo::getCudaCores( activeGPU ) }, - { "GPU clock rate (MHz)", (double) Devices::CudaDeviceInfo::getClockRate( activeGPU ) / 1e3 }, - { "GPU global memory (GB)", (double) Devices::CudaDeviceInfo::getGlobalMemory( activeGPU ) / 1e9 }, - { "GPU memory clock rate (MHz)", (double) Devices::CudaDeviceInfo::getMemoryClockRate( activeGPU ) / 1e3 }, - { "GPU memory ECC enabled", Devices::CudaDeviceInfo::getECCEnabled( activeGPU ) }, -#endif - }; - - if( precision == "all" || precision == "float" ) - runBlasBenchmarks< float >( benchmark, metadata, minSize, maxSize, sizeStepFactor, loops, elementsPerRow ); - if( precision == "all" || precision == "double" ) - runBlasBenchmarks< double >( benchmark, metadata, minSize, maxSize, sizeStepFactor, loops, elementsPerRow ); - - if( ! benchmark.save( logFile ) ) { - std::cerr << "Failed to write the benchmark results to file '" << parameters.getParameter< String >( "log-file" ) << "'." << std::endl; - return EXIT_FAILURE; - } - - return EXIT_SUCCESS; -} diff --git a/tests/benchmarks/vector-operations.h b/tests/benchmarks/vector-operations.h deleted file mode 100644 index cdf2443964a5bb33354de6e0ec0688018c7128a1..0000000000000000000000000000000000000000 --- a/tests/benchmarks/vector-operations.h +++ /dev/null @@ -1,442 +0,0 @@ -/*************************************************************************** - vector-operations.h - description - ------------------- - begin : Dec 30, 2015 - copyright : (C) 2015 by Tomas Oberhuber et al. - email : tomas.oberhuber@fjfi.cvut.cz - ***************************************************************************/ - -/* See Copyright Notice in tnl/Copyright */ - -// Implemented by: Jakub Klinkovsky - -#pragma once - -#include <stdlib.h> // srand48 - -#include "benchmarks.h" - -#include <TNL/Containers/Vector.h> - -#ifdef HAVE_CUDA -#include "cublasWrappers.h" -#endif - -namespace TNL -{ -namespace benchmarks -{ - -template< typename Real = double, - typename Index = int > -bool -benchmarkVectorOperations( Benchmark & benchmark, - const int & loops, - const long & size ) -{ - typedef Containers::Vector< Real, Devices::Host, Index > HostVector; - typedef Containers::Vector< Real, Devices::Cuda, Index > CudaVector; - using namespace std; - - double datasetSize = ( double ) ( loops * size ) * sizeof( Real ) / oneGB; - - HostVector hostVector, hostVector2; - CudaVector deviceVector, deviceVector2; - hostVector.setSize( size ); - hostVector2.setSize( size ); -#ifdef HAVE_CUDA - deviceVector.setSize( size ); - deviceVector2.setSize( size ); -#endif - - Real resultHost, resultDevice; - -#ifdef HAVE_CUDA - cublasHandle_t cublasHandle; - cublasCreate( &cublasHandle ); -#endif - - - // reset functions - // (Make sure to always use some in benchmarks, even if it's not necessary - // to assure correct result - it helps to clear cache and avoid optimizations - // of the benchmark loop.) - auto reset1 = [&]() { - hostVector.setValue( 1.0 ); -#ifdef HAVE_CUDA - deviceVector.setValue( 1.0 ); -#endif - // A relatively harmless call to keep the compiler from realizing we - // don't actually do any useful work with the result of the reduciton. - srand48(resultHost); - resultHost = resultDevice = 0.0; - }; - auto reset2 = [&]() { - hostVector2.setValue( 1.0 ); -#ifdef HAVE_CUDA - deviceVector2.setValue( 1.0 ); -#endif - }; - auto reset12 = [&]() { - reset1(); - reset2(); - }; - - - reset12(); - - - auto maxHost = [&]() { - resultHost = hostVector.max(); - }; - auto maxHostGeneral = [&]() { - Real result( 0 ); - Containers::Algorithms::ParallelReductionMax< Real > operation; - Containers::Algorithms::Reduction< Devices::Host >::reduce( - operation, - hostVector.getSize(), - hostVector.getData(), - ( Real* ) 0, - result ); - return result; - }; - auto maxCuda = [&]() { - resultDevice = deviceVector.max(); - }; - benchmark.setOperation( "max", datasetSize ); - benchmark.time( reset1, "CPU", maxHost ); - benchmark.time( reset1, "CPU (general)", maxHostGeneral ); -#ifdef HAVE_CUDA - benchmark.time( reset1, "GPU", maxCuda ); -#endif - - - auto minHost = [&]() { - resultHost = hostVector.min(); - }; - auto minHostGeneral = [&]() { - Real result( 0 ); - Containers::Algorithms::ParallelReductionMin< Real > operation; - Containers::Algorithms::Reduction< Devices::Host >::reduce( - operation, - hostVector.getSize(), - hostVector.getData(), - ( Real* ) 0, - result ); - return result; - }; - auto minCuda = [&]() { - resultDevice = deviceVector.min(); - }; - benchmark.setOperation( "min", datasetSize ); - benchmark.time( reset1, "CPU", minHost ); - benchmark.time( reset1, "CPU (general)", minHostGeneral ); -#ifdef HAVE_CUDA - benchmark.time( reset1, "GPU", minCuda ); -#endif - - - auto absMaxHost = [&]() { - resultHost = hostVector.absMax(); - }; - auto absMaxHostGeneral = [&]() { - Real result( 0 ); - Containers::Algorithms::ParallelReductionAbsMax< Real > operation; - Containers::Algorithms::Reduction< Devices::Host >::reduce( - operation, - hostVector.getSize(), - hostVector.getData(), - ( Real* ) 0, - result ); - return result; - }; - auto absMaxCuda = [&]() { - resultDevice = deviceVector.absMax(); - }; -#ifdef HAVE_CUDA - auto absMaxCublas = [&]() { - int index = 0; - cublasIgamax( cublasHandle, size, - deviceVector.getData(), 1, - &index ); - resultDevice = deviceVector.getElement( index ); - }; -#endif - benchmark.setOperation( "absMax", datasetSize ); - benchmark.time( reset1, "CPU", absMaxHost ); - benchmark.time( reset1, "CPU (general)", absMaxHostGeneral ); -#ifdef HAVE_CUDA - benchmark.time( reset1, "GPU", absMaxCuda ); - benchmark.time( reset1, "cuBLAS", absMaxCublas ); -#endif - - - auto absMinHost = [&]() { - resultHost = hostVector.absMin(); - }; - auto absMinHostGeneral = [&]() { - Real result( 0 ); - Containers::Algorithms::ParallelReductionAbsMin< Real > operation; - Containers::Algorithms::Reduction< Devices::Host >::reduce( - operation, - hostVector.getSize(), - hostVector.getData(), - ( Real* ) 0, - result ); - return result; - }; - auto absMinCuda = [&]() { - resultDevice = deviceVector.absMin(); - }; -#ifdef HAVE_CUDA - auto absMinCublas = [&]() { - int index = 0; - cublasIgamin( cublasHandle, size, - deviceVector.getData(), 1, - &index ); - resultDevice = deviceVector.getElement( index ); - }; -#endif - benchmark.setOperation( "absMin", datasetSize ); - benchmark.time( reset1, "CPU", absMinHost ); - benchmark.time( reset1, "CPU (general)", absMinHostGeneral ); -#ifdef HAVE_CUDA - benchmark.time( reset1, "GPU", absMinCuda ); - benchmark.time( reset1, "cuBLAS", absMinCublas ); -#endif - - - auto sumHost = [&]() { - resultHost = hostVector.sum(); - }; - auto sumHostGeneral = [&]() { - Real result( 0 ); - Containers::Algorithms::ParallelReductionSum< Real > operation; - Containers::Algorithms::Reduction< Devices::Host >::reduce( - operation, - hostVector.getSize(), - hostVector.getData(), - ( Real* ) 0, - result ); - return result; - }; - auto sumCuda = [&]() { - resultDevice = deviceVector.sum(); - }; - benchmark.setOperation( "sum", datasetSize ); - benchmark.time( reset1, "CPU", sumHost ); - benchmark.time( reset1, "CPU (general)", sumHostGeneral ); -#ifdef HAVE_CUDA - benchmark.time( reset1, "GPU", sumCuda ); -#endif - - - auto l1normHost = [&]() { - resultHost = hostVector.lpNorm( 1.0 ); - }; - auto l1normHostGeneral = [&]() { - Real result( 0 ); - Containers::Algorithms::ParallelReductionAbsSum< Real > operation; - Containers::Algorithms::Reduction< Devices::Host >::reduce( - operation, - hostVector.getSize(), - hostVector.getData(), - ( Real* ) 0, - result ); - return result; - }; - auto l1normCuda = [&]() { - resultDevice = deviceVector.lpNorm( 1.0 ); - }; -#ifdef HAVE_CUDA - auto l1normCublas = [&]() { - cublasGasum( cublasHandle, size, - deviceVector.getData(), 1, - &resultDevice ); - }; -#endif - benchmark.setOperation( "l1 norm", datasetSize ); - benchmark.time( reset1, "CPU", l1normHost ); - benchmark.time( reset1, "CPU (general)", l1normHostGeneral ); -#ifdef HAVE_CUDA - benchmark.time( reset1, "GPU", l1normCuda ); - benchmark.time( reset1, "cuBLAS", l1normCublas ); -#endif - - - auto l2normHost = [&]() { - resultHost = hostVector.lpNorm( 2.0 ); - }; - auto l2normHostGeneral = [&]() { - Real result( 0 ); - Containers::Algorithms::ParallelReductionL2Norm< Real > operation; - Containers::Algorithms::Reduction< Devices::Host >::reduce( - operation, - hostVector.getSize(), - hostVector.getData(), - ( Real* ) 0, - result ); - return result; - }; - auto l2normCuda = [&]() { - resultDevice = deviceVector.lpNorm( 2.0 ); - }; -#ifdef HAVE_CUDA - auto l2normCublas = [&]() { - cublasGnrm2( cublasHandle, size, - deviceVector.getData(), 1, - &resultDevice ); - }; -#endif - benchmark.setOperation( "l2 norm", datasetSize ); - benchmark.time( reset1, "CPU", l2normHost ); - benchmark.time( reset1, "CPU (general)", l2normHostGeneral ); -#ifdef HAVE_CUDA - benchmark.time( reset1, "GPU", l2normCuda ); - benchmark.time( reset1, "cuBLAS", l2normCublas ); -#endif - - - auto l3normHost = [&]() { - resultHost = hostVector.lpNorm( 3.0 ); - }; - auto l3normHostGeneral = [&]() { - Real result( 0 ); - Containers::Algorithms::ParallelReductionLpNorm< Real > operation; - operation.setPower( 3.0 ); - Containers::Algorithms::Reduction< Devices::Host >::reduce( - operation, - hostVector.getSize(), - hostVector.getData(), - ( Real* ) 0, - result ); - return result; - }; - auto l3normCuda = [&]() { - resultDevice = deviceVector.lpNorm( 3.0 ); - }; - benchmark.setOperation( "l3 norm", datasetSize ); - benchmark.time( reset1, "CPU", l3normHost ); - benchmark.time( reset1, "CPU (general)", l3normHostGeneral ); -#ifdef HAVE_CUDA - benchmark.time( reset1, "GPU", l3normCuda ); -#endif - - - auto scalarProductHost = [&]() { - resultHost = hostVector.scalarProduct( hostVector2 ); - }; - auto scalarProductHostGeneral = [&]() { - Real result( 0 ); - Containers::Algorithms::ParallelReductionScalarProduct< Real, Real > operation; - Containers::Algorithms::Reduction< Devices::Host >::reduce( - operation, - hostVector.getSize(), - hostVector.getData(), - hostVector2.getData(), - result ); - return result; - }; - auto scalarProductCuda = [&]() { - resultDevice = deviceVector.scalarProduct( deviceVector2 ); - }; -#ifdef HAVE_CUDA - auto scalarProductCublas = [&]() { - cublasGdot( cublasHandle, size, - deviceVector.getData(), 1, - deviceVector2.getData(), 1, - &resultDevice ); - }; -#endif - benchmark.setOperation( "scalar product", 2 * datasetSize ); - benchmark.time( reset1, "CPU", scalarProductHost ); - benchmark.time( reset1, "CPU (general)", scalarProductHostGeneral ); -#ifdef HAVE_CUDA - benchmark.time( reset1, "GPU", scalarProductCuda ); - benchmark.time( reset1, "cuBLAS", scalarProductCublas ); -#endif - - /* - std::cout << "Benchmarking prefix-sum:" << std::endl; - timer.reset(); - timer.start(); - hostVector.computePrefixSum(); - timer.stop(); - timeHost = timer.getTime(); - bandwidth = 2 * datasetSize / loops / timer.getTime(); - std::cout << " CPU: bandwidth: " << bandwidth << " GB/sec, time: " << timer.getTime() << " sec." << std::endl; - - timer.reset(); - timer.start(); - deviceVector.computePrefixSum(); - timer.stop(); - timeDevice = timer.getTime(); - bandwidth = 2 * datasetSize / loops / timer.getTime(); - std::cout << " GPU: bandwidth: " << bandwidth << " GB/sec, time: " << timer.getTime() << " sec." << std::endl; - std::cout << " CPU/GPU speedup: " << timeHost / timeDevice << std::endl; - - HostVector auxHostVector; - auxHostVector.setLike( deviceVector ); - auxHostVector = deviceVector; - for( int i = 0; i < size; i++ ) - if( hostVector.getElement( i ) != auxHostVector.getElement( i ) ) - { - std::cerr << "Error in prefix sum at position " << i << ": " << hostVector.getElement( i ) << " != " << auxHostVector.getElement( i ) << std::endl; - } - */ - - - auto multiplyHost = [&]() { - hostVector *= 0.5; - }; - auto multiplyCuda = [&]() { - deviceVector *= 0.5; - }; -#ifdef HAVE_CUDA - auto multiplyCublas = [&]() { - const Real alpha = 0.5; - cublasGscal( cublasHandle, size, - &alpha, - deviceVector.getData(), 1 ); - }; -#endif - benchmark.setOperation( "scalar multiplication", 2 * datasetSize ); - benchmark.time( reset1, "CPU", multiplyHost ); -#ifdef HAVE_CUDA - benchmark.time( reset1, "GPU", multiplyCuda ); - benchmark.time( reset1, "cuBLAS", multiplyCublas ); -#endif - - - auto addVectorHost = [&]() { - hostVector.addVector( hostVector2 ); - }; - auto addVectorCuda = [&]() { - deviceVector.addVector( deviceVector2 ); - }; -#ifdef HAVE_CUDA - auto addVectorCublas = [&]() { - const Real alpha = 1.0; - cublasGaxpy( cublasHandle, size, - &alpha, - deviceVector2.getData(), 1, - deviceVector.getData(), 1 ); - }; -#endif - benchmark.setOperation( "vector addition", 3 * datasetSize ); - benchmark.time( reset1, "CPU", addVectorHost ); -#ifdef HAVE_CUDA - benchmark.time( reset1, "GPU", addVectorCuda ); - benchmark.time( reset1, "cuBLAS", addVectorCublas ); -#endif - - -#ifdef HAVE_CUDA - cublasDestroy( cublasHandle ); -#endif - - return true; -} - -} // namespace benchmarks -} // namespace tnl