Python: added modules for benchmark scripts based on JSON and a general post-processing script (b0016afe) · Commits · TNL / tnl-dev

src/Python/BenchmarkLogs.py

0 → 100644

+116 −0

Original line number	Diff line number	Diff line
		#!/usr/bin/python3

		# Public API exported by ``from TNL.BenchmarkLogs import *``.
		# Every name listed here must be defined in this module, otherwise the
		# star-import fails with AttributeError.
		__all__ = [
		    "dict_to_html_table",
		    "get_benchmark_metadata",
		    "get_benchmark_dataframe",
		    "gen_dataframes_per_operation",
		]

		import os.path
		import json
		import pandas

		def dict_to_html_table(data):
		    """
		    Renders a dictionary as a two-column HTML table (key, value).

		    Rows are ordered by sorted key. Keys and values are converted with str()
		    and HTML-escaped, so characters such as "<" or "&" in the data cannot
		    break the generated markup.

		    :param dict data: mapping to render; its keys must be mutually comparable.
		    :returns: str with the markup of the table, terminated by a newline.
		    """
		    # local import: the module-level imports do not provide the html module
		    from html import escape

		    rows = [
		        f"\t<tr><td>{escape(str(key))}</td><td>{escape(str(data[key]))}</td></tr>\n"
		        for key in sorted(data)
		    ]
		    return "<table border=1>\n<tbody>\n" + "".join(rows) + "</tbody>\n</table>\n"

		def get_benchmark_metadata(filename):
		    """
		    Reads metadata of the benchmark in the given file.

		    :param str filename: path of the file with metadata or benchmark results.
		        - If it ends with ".metadata.json", metadata is read from that file.
		        - Otherwise, the extension is first replaced with ".metadata.json".
		    :returns: dict as returned by json.load, or None if the file does not exist.
		    :raises json.JSONDecodeError: if the metadata file is not valid JSON.
		    """
		    if not filename.endswith(".metadata.json"):
		        filename = os.path.splitext(filename)[0] + ".metadata.json"
		    if not os.path.isfile(filename):
		        print(f"Metadata file {filename} does not exist")
		        return None
		    print(f"Parsing metadata from file {filename}")
		    # the context manager closes the file handle even when parsing fails;
		    # the encoding is explicit so the result does not depend on the locale
		    with open(filename, "r", encoding="utf-8") as f:
		        return json.load(f)

		def get_benchmark_dataframe(logFile):
		    """
		    Get pandas dataframe with benchmark results stored in the given log file.

		    The log file is expected to contain one JSON record per line.

		    :param logFile: path to the log file
		    :returns: pandas.DataFrame instance; if a "speedup" column is present,
		        its non-numeric entries (such as "N/A") are converted to NaN
		    """
		    print(f"Parsing input file {logFile}")
		    # the context manager makes sure the file handle is closed after parsing
		    with open(logFile, "r", encoding="utf-8") as f:
		        df = pandas.read_json(f, orient="records", lines=True)

		    # convert "N/A" in the speedup column to nan
		    if "speedup" in df.columns:
		        df["speedup"] = pandas.to_numeric(df["speedup"], errors="coerce")

		    return df

		def gen_dataframes_per_operation(logFile, header_elements=None):
		    """
		    Reads benchmark results stored in the given log file and splits them into
		    multiple dataframes according to the "operation" column.

		    If the log has no "operation" column, a single pair
		    ("Dummy operation", unmodified dataframe) is yielded.

		    Various post-processing steps are done on each partial dataframe:
		    - columns with only NaN values are removed
		    - the operation column is removed
		    - the "index" and "columns" of the dataframe are set:
		        - if header_elements are given, they are set as "columns" and
		          everything else is used for the index
		        - otherwise, all columns in the dataframe before "time" are used for
		          the index, and the remaining columns (starting with "time") stay as
		          "columns"
		        - the "performer" column is set as the last column of the index
		    - note that the index is not explicitly sorted, so data is ordered as in
		      the input file

		    :param logFile: path to the log file
		    :param header_elements: optional list of column names that stay as
		        "columns" of each dataframe (see above)
		    :yields: pairs of (str, pandas.DataFrame) object, where the str denotes the
		        particular operation name
		    :raises ValueError: if header_elements is not given and the log has no
		        "time" column
		    """
		    main_df = get_benchmark_dataframe(logFile)

		    # check if there is at least one operation
		    if "operation" not in main_df.columns:
		        yield "Dummy operation", main_df
		        return

		    # extract all benchmark operations, preserve their order as found in the
		    # dataframe (dict keys keep insertion order and drop duplicates)
		    operations = list(dict.fromkeys(main_df["operation"]))

		    # set operation as index
		    main_df = main_df.set_index("operation")

		    # if header_elements was not provided, we assume that "time" and all following columns
		    # are benchmark results, and all preceding columns are metadata columns that will be
		    # set as index of the dataframe
		    if header_elements is None:
		        header_elements = list(main_df.columns)
		        header_elements = header_elements[header_elements.index("time"):]
		        # FIXME: the "rows" and "columns" (in the gemv operation) are parsed after the correct header elements, because the preceding operations don't have these metadata columns
		        # TODO: each benchmark should record the header elements in the metadata file
		        header_elements = [e for e in header_elements if e not in ["rows", "columns"]]

		    # emit one df per operation
		    for op in operations:
		        # select with a list of labels so that the result is always a
		        # DataFrame; a scalar label yields a Series when the operation has
		        # just one record, which breaks the column-wise processing below
		        df = main_df.loc[[op]]
		        # remove columns with only NaNs
		        df = df.dropna(axis=1, how="all")
		        # remove the operation column (index)
		        df = df.reset_index(drop=True)
		        # prepare index_columns and make sure that performer is the last
		        index_columns = [c for c in df.columns if c not in header_elements and c != "performer"]
		        index_columns.append("performer")
		        # set new index for the df: all columns except header_elements
		        df = df.set_index(index_columns)
		        # emit a pair (op, df)
		        yield op, df

src/Python/BenchmarkPlots.py

0 → 100644

+129 −0

Original line number	Diff line number	Diff line
		#!/usr/bin/python3

		# Names exported by a star-import of this module.
		__all__ = [
		"plot_bandwidth_vs_size",
		"heatmaps_bandwidth",
		"get_image_html_tag",
		]

		import numpy
		import matplotlib.pyplot as plt
		from cycler import cycler
		import io
		import base64

		# Default property cycle for the graph lines: the line-style cycler
		# multiplied by the color cycler.
		custom_cycler = (
		    cycler(linestyle=["-", "--", ":", "-."])
		    * cycler(
		        "color",
		        [
		            "#1f77b4",
		            "#ff7f0e",
		            "#2ca02c",
		            "#d62728",
		            "#9467bd",
		            "#8c564b",
		            "#e377c2",
		            "#7f7f7f",
		            "#bcbd22",
		            "#17becf",
		        ],
		    )
		)

		def plot_bandwidth_vs_size(df, size_name="size", prop_cycler=custom_cycler, **kwargs):
		    """
		    Creates a bandwidth-size plot. The "size" data are expected in the index of
		    the dataframe, all other columns of the index are used for labels of the
		    graph lines.

		    Error bars are computed as bandwidth * "stddev/time" when the
		    "stddev/time" column is present; otherwise no error bars are drawn.

		    :param df: a pandas.DataFrame instance with a "bandwidth" column
		    :param size_name: name of the "size" column in the index
		    :param prop_cycler:
		        property cycler for the graph lines, see the documentation for details:
		        https://matplotlib.org/stable/tutorials/intermediate/color_cycle.html
		    :param kwargs:
		        optional keyword arguments passed to matplotlib's errorbar function
		    :returns: a tuple (fig, ax) as returned by plt.subplots()
		    """
		    # prepare the dataframe
		    assert "bandwidth" in df.columns
		    assert size_name in df.index.names
		    df = df.reset_index(level=size_name).sort_index()

		    # set default parameters for the plot
		    kwargs.setdefault("capsize", 4)

		    # plot the graph
		    fig, ax = plt.subplots()
		    ax.set_xlabel(size_name)
		    ax.set_ylabel("bandwidth [GiB/s]")
		    ax.set_prop_cycle(prop_cycler)
		    for idx in df.index.unique():
		        # select with a list so that a DataFrame is obtained even for one row
		        part = df.loc[[idx]]
		        if "stddev/time" in part.columns:
		            err = part["bandwidth"] * part["stddev/time"]
		        else:
		            err = None
		        # idx is a tuple only for a multi-level index; a plain string must
		        # not be joined character by character, and non-string values
		        # have to be converted before joining
		        if isinstance(idx, tuple):
		            label = ", ".join(str(value) for value in idx)
		        else:
		            label = str(idx)
		        ax.errorbar(part[size_name], part["bandwidth"], yerr=err, label=label, **kwargs)
		    # put the legend outside of the plot, see https://stackoverflow.com/a/43439132
		    ax.legend(bbox_to_anchor=(1.04, 1), loc="upper left", borderaxespad=0.)

		    return fig, ax

		def heatmaps_bandwidth(df, x_name="columns", y_name="rows", cbar_kw=None, **kwargs):
		    """
		    Creates heatmaps of two-dimensional bandwidth data. The "size" data (i.e.
		    x_name and y_name) are expected in the index of the dataframe, all other
		    columns of the index are used to label the heatmaps. Heatmaps are generated
		    using the Python generator interface for each unique tuple of dataframe
		    index values.

		    The caller is responsible for closing the yielded figures.

		    :param df: a pandas.DataFrame instance with a "bandwidth" column
		    :param x_name: name of the column in the index to map along the x-axis
		    :param y_name: name of the column in the index to map along the y-axis
		    :param cbar_kw:
		        optional dict of arguments passed to matplotlib's colorbar function
		    :param kwargs:
		        optional keyword arguments passed to matplotlib's imshow function
		        ("interpolation" defaults to None unless given)
		    :yields: tuples (fig, ax) as returned by plt.subplots(), one per heatmap
		    """
		    # prepare the dataframe
		    assert "bandwidth" in df.columns
		    assert x_name in df.index.names
		    assert y_name in df.index.names
		    df = df.reset_index(level=[x_name, y_name]).sort_index()

		    if cbar_kw is None:
		        cbar_kw = {}

		    # a default instead of a hard-coded argument, so that the caller can
		    # override it without a "multiple values for keyword argument" error
		    kwargs.setdefault("interpolation", None)

		    for idx in df.index.unique():
		        # select with a list so that a DataFrame is obtained even for one
		        # row, then drop the index
		        part = df.loc[[idx]].reset_index(drop=True)
		        # get just the data we need
		        part = part[[x_name, y_name, "bandwidth"]].set_index([y_name, x_name])
		        # convert to a 2D array
		        bandwidth = part.stack().unstack(level=x_name)
		        # remove the column full of "bandwidth" from the index
		        bandwidth = bandwidth.reset_index(level=1, drop=True)

		        # figure setup
		        fig, ax = plt.subplots()
		        ax.set_xlabel(x_name)
		        ax.set_ylabel(y_name)
		        # idx is a tuple only for a multi-level index; a plain string must
		        # not be joined character by character
		        if isinstance(idx, tuple):
		            label = ", ".join(str(value) for value in idx)
		        else:
		            label = str(idx)
		        ax.set_title(f"{label} bandwidth [GiB/s]")

		        # plot the heatmap and colorbar
		        im = ax.imshow(bandwidth, **kwargs)
		        cbar = ax.figure.colorbar(im, ax=ax, **cbar_kw)
		        cbar.ax.set_ylabel("bandwidth", rotation=-90, va="bottom")

		        # set ticks and their labels
		        ax.set_xticks(numpy.arange(len(bandwidth.columns)))
		        ax.set_yticks(numpy.arange(len(bandwidth.index)))
		        ax.set_xticklabels([str(int(n)) for n in bandwidth.columns])
		        ax.set_yticklabels([str(int(n)) for n in bandwidth.index])

		        # rotate xtick labels and set their alignment
		        plt.setp(ax.get_xticklabels(), rotation=45, ha="right", rotation_mode="anchor")

		        yield fig, ax

		def get_image_html_tag(fig, format="svg"):
		    """
		    Renders the figure into memory and returns HTML markup embedding it.

		    For the "svg" format the decoded image data is returned as is; for any
		    other format the data is base64-encoded and wrapped in an <img> tag
		    with a data URI.

		    :param fig: a matplotlib figure instance
		    :param format: output image format (passed to fig.savefig)
		    :returns: str with the markup
		    """
		    buffer = io.BytesIO()
		    # bbox_inches="tight" expands the canvas to include the legend that was
		    # put outside the plot, see https://stackoverflow.com/a/43439132
		    fig.savefig(buffer, format=format, bbox_inches="tight")
		    raw = buffer.getvalue()

		    if format != "svg":
		        data = base64.b64encode(raw).decode("utf-8")
		        return f"<img src=\"data:image/{format};base64,{data}\">"
		    return raw.decode("utf-8")

src/Python/CMakeLists.txt

+2 −0

Original line number	Diff line number	Diff line
		@@ -6,6 +6,8 @@ set( PYTHON_SITE_PACKAGES_DIR lib/python${PYTHON_VERSION_MAJOR}.${PYTHON_VERSION
		# The Python modules are configured and installed only when a Python
		# interpreter was found.
		if( PYTHONINTERP_FOUND )
		# generate __init__.py from its template
		CONFIGURE_FILE( "__init__.py.in" "__init__.py" )
		# install the generated __init__.py together with the modules listed
		# below into the TNL subdirectory of PYTHON_SITE_PACKAGES_DIR
		INSTALL( FILES ${CMAKE_CURRENT_BINARY_DIR}/__init__.py
		BenchmarkLogs.py
		BenchmarkPlots.py
		LogParser.py
		DESTINATION ${PYTHON_SITE_PACKAGES_DIR}/TNL )
		endif()

src/Tools/CMakeLists.txt

+1 −0

Original line number	Diff line number	Diff line
		@@ -79,5 +79,6 @@ INSTALL( TARGETS tnl-init
		DESTINATION bin )

		# install the listed scripts as programs into the "bin" directory
		INSTALL( PROGRAMS tnl-err2eoc
		tnl-benchmark-to-html.py
		tnl-log-to-html.py
		DESTINATION bin )

src/Tools/tnl-benchmark-to-html.py

0 → 100755

+93 −0

Original line number	Diff line number	Diff line
		#!/usr/bin/python3

		import sys
		import os.path
		import matplotlib.pyplot as plt

		from TNL.BenchmarkLogs import *
		from TNL.BenchmarkPlots import *

		def main():
		    """
		    Converts a benchmark log (one JSON record per line) to an HTML report.

		    Command line: FILE.log [OUTPUT.html]. When the output name is not
		    given, the extension of FILE.log is replaced with ".html". The report
		    contains the benchmark metadata (when a metadata file exists), a table
		    of contents, and for each operation a table of results followed by
		    bandwidth plots and heatmaps when the required data are present.

		    Exits with status 1 and a usage message on a wrong number of arguments.
		    """
		    # local import: only needed for escaping text placed into the markup
		    from html import escape

		    if len(sys.argv) < 2 or len(sys.argv) > 3:
		        print(
		            f"usage: {sys.argv[0]} FILE.log [OUTPUT.html]\n"
		            "\n"
		            "where FILE.log contains one JSON record per line,\n"
		            "and OUTPUT.html is the output file name (by default, OUTPUT=FILE).\n",
		            file=sys.stderr,
		        )
		        sys.exit(1)

		    logFile = sys.argv[1]
		    if len(sys.argv) > 2:
		        htmlFile = sys.argv[2]
		    else:
		        htmlFile = os.path.splitext(logFile)[0] + ".html"

		    metadata = get_benchmark_metadata(logFile)
		    if metadata is not None and "title" in metadata:
		        title = metadata["title"]
		    else:
		        title = os.path.splitext(os.path.basename(logFile))[0]
		    # materialized as a list because the results are iterated twice below
		    dataframes = list(gen_dataframes_per_operation(logFile))

		    # basic style of the page
		    head = "\n".join([
		        "<head>",
		        "<meta charset=\"UTF-8\">",
		        "<style>",
		        "h1, h2 { border-bottom: solid 1px lightgray; }",
		        "table { border-collapse: collapse; }",
		        "table.benchmark td { text-align: end; }",
		        "th, td { padding: 2px; }",
		        "</style>",
		        "</head>",
		        "<body>",
		    ])

		    # formatters for specific columns of the table
		    formatters = {
		        "stddev": lambda value: f"{value:e}",
		        "bandwidth": lambda value: f"{value:.3f}",
		        "speedup": lambda value: f"{value:.3f}",
		    }

		    print(f"Writing output to {htmlFile}")
		    # the page declares charset UTF-8, so the file must be written in that
		    # encoding regardless of the locale
		    with open(htmlFile, "w", encoding="utf-8") as f:
		        print("<html>", file=f)
		        print(head, file=f)

		        print(f"<h1>{escape(str(title))}</h1>", file=f)
		        if metadata is not None:
		            print(dict_to_html_table(metadata), file=f)

		        # create a TOC
		        print("<h2>Table of contents</h2>", file=f)
		        print("<ol>", file=f)
		        for op, df in dataframes:
		            # escaped so that the operation name cannot break the attribute
		            anchor = escape(str(op).replace(" ", "_"))
		            print(f"<li><a href=\"#{anchor}\">{escape(str(op))}</a></li>", file=f)
		        print("</ol>", file=f)

		        for op, df in dataframes:
		            # section heading
		            anchor = escape(str(op).replace(" ", "_"))
		            print(f"<h2 id=\"{anchor}\">{escape(str(op))}</h2>", file=f)
		            # table
		            print(df.to_html(classes="benchmark", formatters=formatters), file=f)

		            # the plotting functions require the "bandwidth" column
		            if "bandwidth" not in df.columns:
		                continue

		            # graphs
		            size_name = None
		            if "size" in df.index.names:
		                size_name = "size"
		            elif "DOFs" in df.index.names:
		                size_name = "DOFs"
		            if size_name is not None:
		                fig, ax = plot_bandwidth_vs_size(df, size_name)
		                print(get_image_html_tag(fig, format="png"), file=f)
		                plt.close(fig)

		            # heatmaps
		            if "rows" in df.index.names and "columns" in df.index.names:
		                for fig, ax in heatmaps_bandwidth(df):
		                    print(get_image_html_tag(fig, format="png"), file=f)
		                    plt.close(fig)

		        print("</body>", file=f)
		        print("</html>", file=f)


		if __name__ == "__main__":
		    main()