Commit b0016afe authored by Jakub Klinkovský
Browse files

Python: added modules for benchmark scripts based on JSON and a general post-processing script

parent f07915f1
Loading
Loading
Loading
Loading
+116 −0
Original line number Diff line number Diff line
#!/usr/bin/python3

# Public API of this module. Every name listed here must actually be defined
# in the module, otherwise a star-import of the module fails with
# AttributeError.
__all__ = [
    "dict_to_html_table",
    "get_benchmark_metadata",
    "get_benchmark_dataframe",
    "gen_dataframes_per_operation",
]

import os.path
import json
import pandas

def dict_to_html_table(data):
    """
    Renders a dictionary as a two-column HTML table.

    Rows are ordered by the sorted keys. Keys and values are inserted using
    their default string formatting; no HTML escaping is performed.

    :param dict data: the mapping to render
    :returns: str with the HTML markup of the table
    """
    rows = [
        f"\t<tr><td>{name}</td><td>{data[name]}</td></tr>\n"
        for name in sorted(data.keys())
    ]
    pieces = ["<table border=1>\n", "<tbody>\n", *rows, "</tbody>\n", "</table>\n"]
    return "".join(pieces)

def get_benchmark_metadata(filename):
    """
    Reads metadata of the benchmark in the given file.

    :param str filename: path of the file with metadata or benchmark results.
        - If it ends with ".metadata.json", metadata is read from that file.
        - Otherwise, the extension is first replaced with ".metadata.json".
    :returns: dict as returned by json.load, or None if the file does not exist.
    """
    if not filename.endswith(".metadata.json"):
        filename = os.path.splitext(filename)[0] + ".metadata.json"
    if not os.path.isfile(filename):
        print(f"Metadata file {filename} does not exist")
        return None
    print(f"Parsing metadata from file {filename}")
    # The context manager closes the file handle deterministically; JSON text
    # is UTF-8, so do not depend on the locale's default encoding.
    with open(filename, "r", encoding="utf-8") as f:
        return json.load(f)

def get_benchmark_dataframe(logFile):
    """
    Get pandas dataframe with benchmark results stored in the given log file.

    The file is expected to contain one JSON record per line.

    :param logFile: path to the log file
    :returns: pandas.DataFrame instance
    """
    print(f"Parsing input file {logFile}")
    # the context manager makes sure the file handle is closed after parsing
    with open(logFile, "r") as f:
        df = pandas.read_json(f, orient="records", lines=True)

    # convert "N/A" in the speedup column to nan
    if "speedup" in df.columns:
        df["speedup"] = pandas.to_numeric(df["speedup"], errors="coerce")

    return df

def gen_dataframes_per_operation(logFile, header_elements=None):
    """
    Reads benchmark results stored in the given log file and splits them into
    multiple dataframes according to the "operation" column.

    Various post-processing steps are done on each partial dataframe:
    - columns with only NaN values are removed
    - the operation column is removed
    - the "index" and "columns" of the dataframe are set:
        - if header_elements are given, they are set as "columns" and everything
          else is used for the index
        - otherwise, all columns in the dataframe before "time" are used for
          the index, and the remaining columns (starting with "time") stay as
          "columns"
    - the "performer" column is set as the last column of the index
    - note that the index is not explicitly sorted, so data is ordered as in the
      input file

    If the log has no "operation" column, the whole unprocessed dataframe is
    yielded once under the name "Dummy operation".

    :param logFile: path to the log file
    :param header_elements: optional list of column names that stay as
             "columns" of the partial dataframes; see above for the default
    :yields: pairs of (str, pandas.DataFrame) object, where the str denotes the
             particular operation name
    """
    main_df = get_benchmark_dataframe(logFile)

    # check if there is at least one operation
    if "operation" not in main_df.columns:
        yield "Dummy operation", main_df
        return

    # extract all benchmark operations, preserve their order as found in the dataframe
    # (dict keys keep insertion order, which gives an order-preserving dedup)
    operations = list(dict.fromkeys(main_df["operation"]))

    # set operation as index
    main_df = main_df.set_index("operation")

    # if header_elements was not provided, we assume that "time" and all following columns
    # are benchmark results, and all preceding columns are metadata columns that will be
    # set as index of the dataframe
    if header_elements is None:
        header_elements = list(main_df.columns)
        header_elements = header_elements[header_elements.index("time"):]
        # FIXME: the "rows" and "columns" (in the gemv operation) are parsed after the correct header elements, because the preceding operations don't have these metadata columns
        # TODO: each benchmark should record the header elements in the metadata file
        header_elements = [e for e in header_elements if e not in ["rows", "columns"]]

    # emit one df per operation
    for op in operations:
        # select with a list so that the result is always a DataFrame, even
        # when the operation has just a single record (a scalar label would
        # give a Series in that case and break the steps below)
        df = main_df.loc[[op]]
        # remove columns with only NaNs
        df = df.dropna(axis=1, how="all")
        # remove the operation column (index)
        df = df.reset_index(drop=True)
        # prepare index_columns and make sure that performer is the last
        index_columns = [c for c in df.columns if c not in header_elements and c != "performer"]
        index_columns.append("performer")
        # set new index for the df: all columns except header_elements
        df = df.set_index(index_columns)
        # emit a pair (op, df)
        yield op, df
+129 −0
Original line number Diff line number Diff line
#!/usr/bin/python3

# Public API of this module: the names exported by a star-import.
__all__ = [
    "plot_bandwidth_vs_size",
    "heatmaps_bandwidth",
    "get_image_html_tag",
]

import numpy
import matplotlib.pyplot as plt
from cycler import cycler
import io
import base64

# Default property cycle for the graph lines: the product of 4 line styles and
# 10 colors, i.e. 40 distinct (linestyle, color) combinations.
custom_cycler = cycler(linestyle=["-", "--", ":", "-."]) * cycler("color", ["#1f77b4", "#ff7f0e", "#2ca02c", "#d62728", "#9467bd", "#8c564b", "#e377c2", "#7f7f7f", "#bcbd22", "#17becf"])

def plot_bandwidth_vs_size(df, size_name="size", prop_cycler=custom_cycler, **kwargs):
    """
    Creates a bandwidth-size plot. The "size" data are expected in the index of
    the dataframe, all other columns of the index are used for labels of the
    graph lines.

    The dataframe must contain the "bandwidth" and "stddev/time" columns; the
    error bars are computed as their product.

    :param df: a pandas.DataFrame instance
    :param size_name: name of the "size" column in the index
    :param prop_cycler:
        property cycler for the graph lines, see the documentation for details:
        https://matplotlib.org/stable/tutorials/intermediate/color_cycle.html
    :param kwargs:
        optional keyword arguments passed to matplotlib's errorbar function
    :returns: a tuple (fig, ax) as returned by plt.subplots()
    """
    # prepare the dataframe
    assert "bandwidth" in df.columns
    assert size_name in df.index.names
    df = df.reset_index(level=size_name).sort_index()

    # set default parameters for the plot
    kwargs.setdefault("capsize", 4)

    # plot the graph
    fig, ax = plt.subplots()
    ax.set_xlabel(size_name)
    ax.set_ylabel("bandwidth [GiB/s]")
    ax.set_prop_cycle(prop_cycler)
    for idx in df.index.unique():
        part = df.loc[idx]
        err = part["bandwidth"] * part["stddev/time"]
        # idx is a tuple only when more than one index level remains; a plain
        # string must not be passed to join (it would be split into single
        # characters) and non-string values have to be converted first
        if isinstance(idx, tuple):
            label = ", ".join(str(value) for value in idx)
        else:
            label = str(idx)
        ax.errorbar(part[size_name], part["bandwidth"], yerr=err, label=label, **kwargs)
    # see https://stackoverflow.com/a/43439132
    ax.legend(bbox_to_anchor=(1.04, 1), loc="upper left", borderaxespad=0.)

    return fig, ax

def heatmaps_bandwidth(df, x_name="columns", y_name="rows", *, cbar_kw=None, **kwargs):
    """
    Creates heatmaps of two-dimensional bandwidth data. The "size" data (i.e.
    x_name and y_name) are expected in the index of the dataframe, all other
    columns of the index are used to label the heatmaps. Heatmaps are generated
    using the Python generator interface for each unique tuple of dataframe
    index values.

    :param df: a pandas.DataFrame instance
    :param x_name: name of the column in the index to map along the x-axis
    :param y_name: name of the column in the index to map along the y-axis
    :param cbar_kw:
        optional dict of arguments passed to matplotlib's colorbar function
    :param kwargs:
        optional keyword arguments passed to matplotlib's imshow function
        (interpolation defaults to None unless given explicitly)
    :yields: tuples (fig, ax) as returned by plt.subplots(), one per heatmap
    """
    # prepare the dataframe
    assert "bandwidth" in df.columns
    assert x_name in df.index.names
    assert y_name in df.index.names
    df = df.reset_index(level=[x_name, y_name]).sort_index()

    if cbar_kw is None:
        cbar_kw = {}

    # keep the original default, but let the caller override it without
    # getting a "multiple values for keyword argument" error
    kwargs.setdefault("interpolation", None)

    for idx in df.index.unique():
        # drop the index
        part = df.loc[idx].reset_index(drop=True)
        # get just the data we need
        part = part[[x_name, y_name, "bandwidth"]].set_index([y_name, x_name])
        # convert to a 2D array
        bandwidth = part.stack().unstack(level=x_name)
        # remove the column full of "bandwidth" from the index
        bandwidth = bandwidth.reset_index(level=1, drop=True)

        # figure setup
        fig, ax = plt.subplots()
        ax.set_xlabel(x_name)
        ax.set_ylabel(y_name)
        # idx is a tuple only when more than one index level remains; a plain
        # string must not be passed to join (it would be split into single
        # characters) and non-string values have to be converted first
        if isinstance(idx, tuple):
            label = ", ".join(str(value) for value in idx)
        else:
            label = str(idx)
        ax.set_title(f"{label} bandwidth [GiB/s]")

        # plot the heatmap and colorbar
        im = ax.imshow(bandwidth, **kwargs)
        cbar = ax.figure.colorbar(im, ax=ax, **cbar_kw)
        cbar.ax.set_ylabel("bandwidth", rotation=-90, va="bottom")

        # set ticks and their labels (as lists, since matplotlib documents
        # the labels argument as a sequence)
        ax.set_xticks(numpy.arange(len(bandwidth.columns)))
        ax.set_yticks(numpy.arange(len(bandwidth.index)))
        ax.set_xticklabels([int(n) for n in bandwidth.columns])
        ax.set_yticklabels([int(n) for n in bandwidth.index])

        # rotate xtick labels and set their alignment
        plt.setp(ax.get_xticklabels(), rotation=45, ha="right", rotation_mode="anchor")

        yield fig, ax

def get_image_html_tag(fig, format="svg"):
    """
    Renders the figure into memory and returns HTML markup embedding the image.

    :param fig: a matplotlib figure instance
    :param format: output image format (passed to fig.savefig)
    :returns: for "svg", the saved SVG document decoded as text; for any other
        format, an <img> tag with the base64-encoded image in a data URI
    """
    buffer = io.BytesIO()
    # bbox_inches: expand the canvas to include the legend that was put outside the plot
    # see https://stackoverflow.com/a/43439132
    fig.savefig(buffer, format=format, bbox_inches="tight")
    raw = buffer.getvalue()

    if format != "svg":
        # binary formats are embedded as a base64 data URI
        encoded = base64.b64encode(raw).decode("utf-8")
        return f"<img src=\"data:image/{format};base64,{encoded}\">"
    # SVG is markup itself, so it is inlined directly
    return raw.decode("utf-8")
+2 −0
Original line number Diff line number Diff line
@@ -6,6 +6,8 @@ set( PYTHON_SITE_PACKAGES_DIR lib/python${PYTHON_VERSION_MAJOR}.${PYTHON_VERSION
if( PYTHONINTERP_FOUND )
   CONFIGURE_FILE( "__init__.py.in" "__init__.py" )
   INSTALL( FILES ${CMAKE_CURRENT_BINARY_DIR}/__init__.py
                  BenchmarkLogs.py
                  BenchmarkPlots.py
                  LogParser.py
            DESTINATION ${PYTHON_SITE_PACKAGES_DIR}/TNL )
endif()
+1 −0
Original line number Diff line number Diff line
@@ -79,5 +79,6 @@ INSTALL( TARGETS tnl-init
         DESTINATION bin )

INSTALL( PROGRAMS tnl-err2eoc
                  tnl-benchmark-to-html.py
                  tnl-log-to-html.py
         DESTINATION bin )
+93 −0
Original line number Diff line number Diff line
#!/usr/bin/python3

import sys
import os.path
import matplotlib.pyplot as plt

from TNL.BenchmarkLogs import *
from TNL.BenchmarkPlots import *

# Command-line handling: exactly one input log and an optional output path.
if len(sys.argv) < 2 or len(sys.argv) > 3:
    print(f"""\
usage: {sys.argv[0]} FILE.log [OUTPUT.html]

where FILE.log contains one JSON record per line,
and OUTPUT.html is the output file name (by default, OUTPUT=FILE).
""", file=sys.stderr)
    sys.exit(1)

logFile = sys.argv[1]
if len(sys.argv) > 2:
    htmlFile = sys.argv[2]
else:
    # default output: same path as the log, with the extension replaced
    htmlFile = os.path.splitext(logFile)[0] + ".html"


# Load the inputs: optional metadata and one dataframe per operation.
metadata = get_benchmark_metadata(logFile)
if metadata is not None and "title" in metadata:
    title = metadata["title"]
else:
    # fall back to the log file name without directory and extension
    title = os.path.splitext(os.path.basename(logFile))[0]
# materialize the generator, the dataframes are iterated twice below
dataframes = list(gen_dataframes_per_operation(logFile))

print(f"Writing output to {htmlFile}")
# the document declares charset UTF-8 below, so the file must be written in
# UTF-8 regardless of the locale's default encoding
with open(htmlFile, "w", encoding="utf-8") as f:
    print("<html>", file=f)
    # add some basic style
    print("""\
<head>
<meta charset="UTF-8">
<style>
    h1, h2 { border-bottom: solid 1px lightgray; }
    table { border-collapse: collapse; }
    table.benchmark td { text-align: end; }
    th, td { padding: 2px; }
</style>
</head>
<body>""", file=f)

    print(f"<h1>{title}</h1>", file=f)
    if metadata is not None:
        print(dict_to_html_table(metadata), file=f)

    # create a TOC
    print("<h2>Table of contents</h2>", file=f)
    print("<ol>", file=f)
    for op, df in dataframes:
        anchor = op.replace(" ", "_")
        print(f"<li><a href=\"#{anchor}\">{op}</a></li>", file=f)
    print("</ol>", file=f)

    # formatters for specific columns of the table
    formatters = {
        "stddev": lambda value: f"{value:e}",
        "bandwidth": lambda value: f"{value:.3f}",
        "speedup": lambda value: f"{value:.3f}",
    }

    for op, df in dataframes:
        # section heading (the anchor must match the one used in the TOC)
        anchor = op.replace(" ", "_")
        print(f"<h2 id=\"{anchor}\">{op}</h2>", file=f)
        # table
        print(df.to_html(classes="benchmark", formatters=formatters), file=f)

        # the plotting functions require the "bandwidth" column; skip the
        # plots instead of aborting the whole report when it is not present
        has_bandwidth = "bandwidth" in df.columns

        # graphs
        size_name = None
        if "size" in df.index.names:
            size_name = "size"
        elif "DOFs" in df.index.names:
            size_name = "DOFs"
        if size_name is not None and has_bandwidth:
            fig, ax = plot_bandwidth_vs_size(df, size_name)
            print(get_image_html_tag(fig, format="png"), file=f)
            # release the figure, pyplot keeps it alive otherwise
            plt.close(fig)

        # heatmaps
        if has_bandwidth and "rows" in df.index.names and "columns" in df.index.names:
            for fig, ax in heatmaps_bandwidth(df):
                print(get_image_html_tag(fig, format="png"), file=f)
                plt.close(fig)

    print("</body>", file=f)
    print("</html>", file=f)