Merge branch 'cineca/mpi' into 'develop' (774735aa) · Commits · TNL / tnl-dev

src/Benchmarks/BLAS/array-operations.h

+2 −3

Original line number	Diff line number	Diff line
		@@ -102,9 +102,8 @@ benchmarkArrayOperations( Benchmark & benchmark,
		};
		#ifdef HAVE_CUDA
		benchmark.setOperation( "copy (operator=)", datasetSize, copyBasetime );
		benchmark.time( reset1,
		"CPU->GPU", copyAssignHostCuda,
		"GPU->CPU", copyAssignCudaHost );
		benchmark.time( reset1, "CPU->GPU", copyAssignHostCuda );
		benchmark.time( reset1, "GPU->CPU", copyAssignCudaHost );
		#endif

src/Benchmarks/BLAS/tnl-benchmark-blas.h

+2 −37

Original line number	Diff line number	Diff line
		@@ -13,8 +13,7 @@
		#pragma once

		#include <TNL/Devices/Host.h>
		#include <TNL/Devices/CudaDeviceInfo.h>
		#include <TNL/Devices/SystemInfo.h>
		#include <TNL/Devices/Cuda.h>
		#include <TNL/Config/ConfigDescription.h>
		#include <TNL/Config/ParameterContainer.h>

		@@ -26,9 +25,6 @@ using namespace TNL;
		using namespace TNL::Benchmarks;


		// TODO: should benchmarks check the result of the computation?


		template< typename Real >
		void
		runBlasBenchmarks( Benchmark & benchmark,
		@@ -146,38 +142,7 @@ main( int argc, char* argv[] )
		Benchmark benchmark( loops, verbose );

		// prepare global metadata
		const int cpu_id = 0;
		Devices::CacheSizes cacheSizes = Devices::SystemInfo::getCPUCacheSizes( cpu_id );
		String cacheInfo = String( cacheSizes.L1data ) + ", "
		+ String( cacheSizes.L1instruction ) + ", "
		+ String( cacheSizes.L2 ) + ", "
		+ String( cacheSizes.L3 );
		#ifdef HAVE_CUDA
		const int activeGPU = Devices::CudaDeviceInfo::getActiveDevice();
		const String deviceArch = String( Devices::CudaDeviceInfo::getArchitectureMajor( activeGPU ) ) + "." +
		String( Devices::CudaDeviceInfo::getArchitectureMinor( activeGPU ) );
		#endif
		Benchmark::MetadataMap metadata {
		{ "host name", Devices::SystemInfo::getHostname() },
		{ "architecture", Devices::SystemInfo::getArchitecture() },
		{ "system", Devices::SystemInfo::getSystemName() },
		{ "system release", Devices::SystemInfo::getSystemRelease() },
		{ "start time", Devices::SystemInfo::getCurrentTime() },
		{ "CPU model name", Devices::SystemInfo::getCPUModelName( cpu_id ) },
		{ "CPU cores", Devices::SystemInfo::getNumberOfCores( cpu_id ) },
		{ "CPU threads per core", Devices::SystemInfo::getNumberOfThreads( cpu_id ) / Devices::SystemInfo::getNumberOfCores( cpu_id ) },
		{ "CPU max frequency (MHz)", Devices::SystemInfo::getCPUMaxFrequency( cpu_id ) / 1e3 },
		{ "CPU cache sizes (L1d, L1i, L2, L3) (kiB)", cacheInfo },
		#ifdef HAVE_CUDA
		{ "GPU name", Devices::CudaDeviceInfo::getDeviceName( activeGPU ) },
		{ "GPU architecture", deviceArch },
		{ "GPU CUDA cores", Devices::CudaDeviceInfo::getCudaCores( activeGPU ) },
		{ "GPU clock rate (MHz)", (double) Devices::CudaDeviceInfo::getClockRate( activeGPU ) / 1e3 },
		{ "GPU global memory (GB)", (double) Devices::CudaDeviceInfo::getGlobalMemory( activeGPU ) / 1e9 },
		{ "GPU memory clock rate (MHz)", (double) Devices::CudaDeviceInfo::getMemoryClockRate( activeGPU ) / 1e3 },
		{ "GPU memory ECC enabled", Devices::CudaDeviceInfo::getECCEnabled( activeGPU ) },
		#endif
		};
		Benchmark::MetadataMap metadata = getHardwareMetadata();

		if( precision == "all" \|\| precision == "float" )
		runBlasBenchmarks< float >( benchmark, metadata, minSize, maxSize, sizeStepFactor, loops, elementsPerRow );

src/Benchmarks/Benchmarks.h

+104 −29

Original line number	Diff line number	Diff line
		@@ -16,11 +16,18 @@
		#include <iomanip>
		#include <map>
		#include <vector>
		#include <exception>
		#include <limits>

		#include <TNL/Timer.h>
		#include <TNL/String.h>
		#include <TNL/Solvers/IterativeSolverMonitor.h>

		#include <TNL/Devices/Host.h>
		#include <TNL/Devices/SystemInfo.h>
		#include <TNL/Devices/CudaDeviceInfo.h>
		#include <TNL/Communicators/MpiCommunicator.h>

		namespace TNL {
		namespace Benchmarks {

		@@ -64,7 +71,7 @@ timeFunction( ComputeFunction compute,
		timer.stop();
		}

		return timer.getRealTime();
		return timer.getRealTime() / loops;
		}


		@@ -75,8 +82,8 @@ public:
		using MetadataMap = std::map< const char*, String >;
		using MetadataColumns = std::vector<MetadataElement>;

		using HeaderElements = std::initializer_list< String >;
		using RowElements = std::initializer_list< double >;
		using HeaderElements = std::vector< String >;
		using RowElements = std::vector< double >;

		Logging( bool verbose = true )
		: verbose(verbose)
		@@ -109,8 +116,6 @@ public:
		writeTableHeader( const String & spanningElement,
		const HeaderElements & subElements )
		{
		using namespace std;

		if( verbose && header_changed ) {
		for( auto & it : metadataColumns ) {
		std::cout << std::setw( 20 ) << it.first;
		@@ -163,8 +168,6 @@ public:
		writeTableRow( const String & spanningElement,
		const RowElements & subElements )
		{
		using namespace std;

		if( verbose ) {
		for( auto & it : metadataColumns ) {
		std::cout << std::setw( 20 ) << it.second;
		@@ -278,6 +281,27 @@ protected:
		};


		// Result of a single timed computation, as reported in one row of the
		// benchmark table.
		//
		// All three values default to quiet NaN and stay that way until they
		// are assigned.
		//
		// The struct is a polymorphic base: getTableHeader() and getRowElements()
		// are virtual so that derived result types can report their own columns.
		struct BenchmarkResult
		{
		   using HeaderElements = Logging::HeaderElements;
		   using RowElements = Logging::RowElements;

		   double bandwidth = std::numeric_limits<double>::quiet_NaN();
		   double time = std::numeric_limits<double>::quiet_NaN();
		   double speedup = std::numeric_limits<double>::quiet_NaN();

		   // Virtual destructor: the type has virtual functions, so destroying a
		   // derived result through a BenchmarkResult pointer must be well-defined.
		   virtual ~BenchmarkResult() = default;

		   // Returns the column titles, in the same order as getRowElements().
		   virtual HeaderElements getTableHeader() const
		   {
		      return HeaderElements({"bandwidth", "time", "speedup"});
		   }

		   // Returns the column values, in the same order as getTableHeader().
		   virtual RowElements getRowElements() const
		   {
		      return RowElements({ bandwidth, time, speedup });
		   }
		};


		class Benchmark
		: protected Logging
		{
		@@ -305,7 +329,6 @@ public:
		{
		closeTable();
		writeTitle( title );
		monitor.setStage( title.getString() );
		}

		// Marks the start of a new benchmark (with custom metadata)
		@@ -315,7 +338,6 @@ public:
		{
		closeTable();
		writeTitle( title );
		monitor.setStage( title.getString() );
		// add loops to metadata
		metadata["loops"] = String(loops);
		writeMetadata( metadata );
		@@ -342,6 +364,7 @@ public:
		const double datasetSize = 0.0, // in GB
		const double baseTime = 0.0 )
		{
		monitor.setStage( operation.getString() );
		if( metadataColumns.size() > 0 && String(metadataColumns[ 0 ].first) == "operation" ) {
		metadataColumns[ 0 ].second = operation;
		}
		@@ -393,43 +416,45 @@ public:
		double
		time( ResetFunction reset,
		const String & performer,
		ComputeFunction & compute )
		ComputeFunction & compute,
		BenchmarkResult & result )
		{
		double time;
		result.time = std::numeric_limits<double>::quiet_NaN();
		try {
		if( verbose ) {
		// run the monitor main loop
		Solvers::SolverMonitorThread monitor_thread( monitor );
		time = timeFunction( compute, reset, loops, monitor );
		result.time = timeFunction( compute, reset, loops, monitor );
		}
		else {
		time = timeFunction( compute, reset, loops, monitor );
		result.time = timeFunction( compute, reset, loops, monitor );
		}
		}
		catch ( const std::exception& e ) {
		std::cerr << "timeFunction failed due to a C++ exception with description: " << e.what() << std::endl;
		}

		const double bandwidth = datasetSize / time;
		const double speedup = this->baseTime / time;
		result.bandwidth = datasetSize / result.time;
		result.speedup = this->baseTime / result.time;
		if( this->baseTime == 0.0 )
		this->baseTime = time;
		this->baseTime = result.time;

		writeTableHeader( performer, HeaderElements({"bandwidth", "time", "speedup"}) );
		writeTableRow( performer, RowElements({ bandwidth, time, speedup }) );
		writeTableHeader( performer, result.getTableHeader() );
		writeTableRow( performer, result.getRowElements() );

		return this->baseTime;
		}

		// Recursive template function to deal with multiple computations with the
		// same reset function.
		template< typename ResetFunction,
		typename ComputeFunction,
		typename... NextComputations >
		inline double
		time( ResetFunction reset,
		const String & performer,
		ComputeFunction & compute,
		NextComputations & ... nextComputations )
		ComputeFunction & compute )
		{
		time( reset, performer, compute );
		time( reset, nextComputations... );
		return this->baseTime;
		BenchmarkResult result;
		return time( reset, performer, compute, result );
		}

		// Adds an error message to the log. Should be called in places where the
		@@ -445,6 +470,12 @@ public:

		using Logging::save;

		// Gives callers mutable access to the solver monitor owned by this
		// benchmark object.
		Solvers::IterativeSolverMonitor< double, int >&
		getMonitor()
		{
		   return this->monitor;
		}

		protected:
		int loops;
		double datasetSize = 0.0;
		@@ -452,5 +483,49 @@ protected:
		Solvers::IterativeSolverMonitor< double, int > monitor;
		};


		// Collects a description of the machine the benchmark runs on and
		// returns it as a Benchmark::MetadataMap.
		//
		// The map always contains host/system identification, the start time,
		// OpenMP settings and information about CPU 0. The number of MPI
		// processes is added when HAVE_MPI is defined, and properties of the
		// active GPU are added when HAVE_CUDA is defined.
		//
		// The function is defined in a header, so it is declared inline to
		// avoid multiple-definition errors when the header is included from
		// more than one translation unit.
		inline Benchmark::MetadataMap getHardwareMetadata()
		{
		   // All CPU queries refer to the first CPU.
		   const int cpu_id = 0;
		   Devices::CacheSizes cacheSizes = Devices::SystemInfo::getCPUCacheSizes( cpu_id );
		   // Cache sizes are reported as one comma-separated string.
		   String cacheInfo = String( cacheSizes.L1data ) + ", "
		                    + String( cacheSizes.L1instruction ) + ", "
		                    + String( cacheSizes.L2 ) + ", "
		                    + String( cacheSizes.L3 );
		#ifdef HAVE_CUDA
		   const int activeGPU = Devices::CudaDeviceInfo::getActiveDevice();
		   // Architecture is formatted as "<major>.<minor>".
		   const String deviceArch = String( Devices::CudaDeviceInfo::getArchitectureMajor( activeGPU ) ) + "." +
		                             String( Devices::CudaDeviceInfo::getArchitectureMinor( activeGPU ) );
		#endif
		   Benchmark::MetadataMap metadata {
		      { "host name", Devices::SystemInfo::getHostname() },
		      { "architecture", Devices::SystemInfo::getArchitecture() },
		      { "system", Devices::SystemInfo::getSystemName() },
		      { "system release", Devices::SystemInfo::getSystemRelease() },
		      { "start time", Devices::SystemInfo::getCurrentTime() },
		#ifdef HAVE_MPI
		      { "number of MPI processes", Communicators::MpiCommunicator::GetSize( Communicators::MpiCommunicator::AllGroup ) },
		#endif
		      { "OpenMP enabled", Devices::Host::isOMPEnabled() },
		      { "OpenMP threads", Devices::Host::getMaxThreadsCount() },
		      { "CPU model name", Devices::SystemInfo::getCPUModelName( cpu_id ) },
		      { "CPU cores", Devices::SystemInfo::getNumberOfCores( cpu_id ) },
		      { "CPU threads per core", Devices::SystemInfo::getNumberOfThreads( cpu_id ) / Devices::SystemInfo::getNumberOfCores( cpu_id ) },
		      { "CPU max frequency (MHz)", Devices::SystemInfo::getCPUMaxFrequency( cpu_id ) / 1e3 },
		      { "CPU cache sizes (L1d, L1i, L2, L3) (kiB)", cacheInfo },
		#ifdef HAVE_CUDA
		      { "GPU name", Devices::CudaDeviceInfo::getDeviceName( activeGPU ) },
		      { "GPU architecture", deviceArch },
		      { "GPU CUDA cores", Devices::CudaDeviceInfo::getCudaCores( activeGPU ) },
		      { "GPU clock rate (MHz)", static_cast< double >( Devices::CudaDeviceInfo::getClockRate( activeGPU ) ) / 1e3 },
		      { "GPU global memory (GB)", static_cast< double >( Devices::CudaDeviceInfo::getGlobalMemory( activeGPU ) ) / 1e9 },
		      { "GPU memory clock rate (MHz)", static_cast< double >( Devices::CudaDeviceInfo::getMemoryClockRate( activeGPU ) ) / 1e3 },
		      { "GPU memory ECC enabled", Devices::CudaDeviceInfo::getECCEnabled( activeGPU ) },
		#endif
		   };

		   return metadata;
		}

		} // namespace Benchmarks
		} // namespace TNL

src/Benchmarks/CMakeLists.txt

+1 −0

Original line number	Diff line number	Diff line
		add_subdirectory( HeatEquation )
		add_subdirectory( BLAS )
		add_subdirectory( SpMV )
		add_subdirectory( DistSpMV )
		add_subdirectory( LinearSolvers )

		set( headers

src/Benchmarks/DistSpMV/CMakeLists.txt

0 → 100644

+11 −0

Original line number	Diff line number	Diff line
		# Distributed SpMV benchmark built from the .cpp source.
		add_executable( tnl-benchmark-distributed-spmv tnl-benchmark-distributed-spmv.cpp )
		target_link_libraries( tnl-benchmark-distributed-spmv tnl )
		install( TARGETS tnl-benchmark-distributed-spmv RUNTIME DESTINATION bin )

		# Variant built from the .cu source, only when BUILD_CUDA is set.
		if( BUILD_CUDA )
		   cuda_add_executable( tnl-benchmark-distributed-spmv-cuda tnl-benchmark-distributed-spmv.cu )
		   target_link_libraries( tnl-benchmark-distributed-spmv-cuda tnl )
		   install( TARGETS tnl-benchmark-distributed-spmv-cuda RUNTIME DESTINATION bin )
		endif()