# make_bandwidth_test_page.py

###############################################################################
# (c) Copyright 2023 CERN for the benefit of the LHCb Collaboration           #
#                                                                             #
# This software is distributed under the terms of the GNU General Public      #
# Licence version 3 (GPL Version 3), copied verbatim in the file "COPYING".   #
#                                                                             #
# In applying this licence, CERN does not waive the privileges and immunities #
# granted to it by virtue of its status as an Intergovernmental Organization  #
# or submit itself to any jurisdiction.                                       #
###############################################################################
import argparse
import jinja2
import matplotlib.pyplot as plt
import pandas as pd
import yaml
import os
from math import log10
from dataclasses import dataclass, field
from typing import List
from collections import namedtuple
from PRConfig.bandwidth_helpers import FileNameHelper

plt.ioff()

# Jinja2 template for the main report page; the script renders it to
# index.html. `{{NAME}}` fields are filled in by REPORT_TEMPLATE.render(...).
# The `$$name$$` tokens ($$version$$, $$max_rss$$, $$comparison$$, ...) are not
# Jinja syntax, so they pass through rendering unchanged -- presumably they are
# substituted later by whatever publishes the page (TODO confirm).
# NOTE(review): the `</b></b>` after $$comparison$$ has no opening tag in this
# template; check whether the $$comparison$$ substitution relies on it before
# removing it. The `&#957` entity is also missing its terminating `;`.
REPORT_TEMPLATE = jinja2.Template("""
<html>
<head></head>
<body>
<p>
    slot.build_id: $$version$$<br>
    platform: $$platform$$<br>
    hostname: $$hostname$$<br>
    cpu_info: $$cpu_info$$<br>
    testing script path: {{SCRIPTPATH}}
</p>
<ul>
    <li><a href="{{BASE_PATH}}/run.log">Logs</a></li>
</ul>
<p style="color:{{EXIT_CODE_COLOUR}}">
    <b>{{EXIT_CODE_SENTENCE}}</b>
</p>
<p>
    Results per working group and stream:
    <ul>
    <li>Inclusive retention and rate</li>
    <li>(Jaccard) similarity matrix</li>
    <li>Average DstData size and bandwidth</li>
    <li>Average event size and bandwidth</li>
    </ul>
</p>
<p>
    Results per line: all of the above, plus
    <ul>
    <li>Exclusive retention and rate</li>
    <li>Descriptives (whether persistreco and/or extra outputs is enabled)</li>
    </ul>
</p>
<p> See: <a href="https://lbfence.cern.ch/alcm/public/figure/details/32">RTA & DPA Workflow</a> for reference figures regarding bandwidth.</p>
<p>
    Input sample information:
    <ul>
    <li>Config file: {{INPUT_CONFIG_PATH}}</li>
    <li>Input rate: {{INPUT_RATE}} kHz</li>
    <li>Number of interactions per bunch crossing (&#957): {{INPUT_NU}}</li>
    <li>Radius of VELO opening: {{INPUT_VELO_RADIUS}} mm</li>
    </ul>
</p>
{{HLT2_OR_SPRUCE_TEMPLATE}}
<p>
    Other results are shown by plots or tables (in the links) below. <br>
</p>
<object type="image/png" data="lines_per_wg.png"></object>
<p>
    The number of selection lines per working group. <br>
    "Other" category contains those lines with a parsed name that doesn't belong to any known WG. <br>
    To make lines properly categorized, one should follow the naming convention,
    name of lines should start with `Hlt2/Spruce[WG]_`.
</p>
<object type="image/png" data="hist__rate.png"></object>
<p>
    Distribution of rate of selection lines. <br>
    The total distribution is shown as a stacked histogram, split into several histograms of WGs. <br>
    The distributions per WG is attached in the html page below. <br>
    A line is considered to be "problematic" if it has a rate of 0 Hz
    or larger than 1 kHz, which requires some attention. <br>
    The rates of all lines are listed in a html page attached below. <br>
</p>
<object type="image/png" data="hist__dst_data_size.png"></object>
<p>
    Distribution of DstData RawBank size of selection lines. <br>
    The total distribution is shown as a stacked histogram, split into several histograms of WGs. <br>
    The distributions per WG is attached in the html page below.
</p>
<object type="image/png" data="hist__total_size.png"></object>
<p>
    Distribution of total event size of selection lines. <br>
    The total distribution is shown as a stacked histogram, split into several histograms of WGs. <br>
    The distributions per WG is attached in the html page below. <br>
    A line is considered to be "problematic" if its DstData size or total event size
    is larger than 1 MB, which requires some attention. <br>
    The event sizes of all lines are listed in a html page attached below. <br>
</p>
<object type="image/png" data="hist__dst_bandwidth.png"></object>
<p>
    Distribution of bandwidth computed from DstData RawBank size. <br>
    The total distribution is shown as a stacked histogram, split into several histograms of WGs. <br>
    The distributions per WG is attached in the html page below.
</p>
<object type="image/png" data="hist__tot_bandwidth.png"></object>
<p>
    Distribution of bandwidth computed from total event size. <br>
    The total distribution is shown as a stacked histogram, split into several histograms of WGs. <br>
    The distributions per WG is attached in the html page below. <br>
    Currently, a line is considered to be "problematic" if its bandwidth from DstData size
    is larger than 200 MB/s, which requires some attention. This is a temporary limit. <br>
    The event sizes of all lines are listed in a html page attached below. <br>
</p>
<object type="image/png" data="memory_consumption.png"></object>
<p>
    Memory consumption as functions of Wall-time. <br>
    The virtual memory size is the total amount of memory the process may hypothetically access. <br>
    The resident set size (RSS) is the portion of memory occupied by the run that is held in main memory (RAM). <br>
    The proportional set size (PSS) is the private memory occupied by the run itself plus the proportion of shared memory with one or more other processes. <br>
    As we only launch one test at the same time, PSS should be close to RSS in this case, and PSS gives the real memory that is used by this test. <br>
    Swap memory is used when RAM is full. <br>
    The maximum resident set size usage is $$max_rss$$ GB. <br>
    The maximum proportional set size usage is $$max_pss$$ GB. <br>
</p>
<ul>
    <li><a href="{{BASE_PATH}}/other_lines.html">Show list of lines in "Other" category</a></li>
    <li><a href="{{BASE_PATH}}/plots_per_wg.html">Show plots split by WGs</a></li>
    <li><a href="{{BASE_PATH}}/all_rates.html">Show rates, event sizes and bandwidths of all lines</a></li>
    <li><a href="{{BASE_PATH}}/similarities_jaccards.html"> Show similarities Jaccards of different stream configurations</a></li>
    <li><a href="{{BASE_PATH}}/rates_streaming.html"> Show rates of streams under different configurations</a></li>
    <li><a href="{{BASE_PATH}}/{{line_descr}}"> PersistReco and ExtraOutput for selection lines</a></li>
    <li><a href="{{BASE_PATH}}/{{rate_table_split_by_wg_stream}}"> Split by working group: rates, event sizes and bandwidths of all lines</a></li>
    $$comparison$$
    </b></b>
</ul>
<p> Additional results for HLT2 Bandwidth test (not available for Sprucing test) </p>
<ul>
    <li><a href="{{BASE_PATH}}/{{rate_table_split_by_prod_stream}}"> Split by production stream: rates, event sizes and bandwidths of all lines</a></li>
</ul>
</body>
</html>
""")

# HTML fragment describing the HLT2 streaming configurations. The script
# renders it when --process is `hlt2` and inserts the result into
# REPORT_TEMPLATE through the HLT2_OR_SPRUCE_TEMPLATE field.
# `table_5stream_rates` receives the text read from the production-stream
# rate table file.
HLT2_REPORT_TEMPLATE = jinja2.Template("""<p>
    The bandwidth test was run under 3 streaming configurations: streamless (all lines written to the same output file), production-stream and wg-stream. <br>
    The definition of the production streaming and working-group streaming can be found below.
</p>
<ul>
    <li><a href="{{BASE_PATH}}/{{stream_config_json_prod}}">Production-stream configuration</a></li>
    <li><a href="{{BASE_PATH}}/{{stream_config_json_wg}}">WG-stream configuration</a></li>
</ul>
<p>
    The production stream configuration reflects the streaming we will have for data taking. <br>
    The rates, event sizes and bandwidths results from production-stream configuration is: <br>
</p>
{{table_5stream_rates}}""")

# HTML fragment describing the Sprucing streaming configurations. The script
# renders it when --process is `spruce` and inserts the result into
# REPORT_TEMPLATE through the HLT2_OR_SPRUCE_TEMPLATE field.
# `table_wgstream_rates` receives the text read from the wg-stream rate
# table file.
SPRUCE_REPORT_TEMPLATE = jinja2.Template("""<p>
    The bandwidth test was run under 2 streaming configurations: streamless and one stream per WG. <br>
    The definition of per-WG-stream configuration can be found below.
</p>
<ul>
    <li><a href="{{BASE_PATH}}/{{stream_config_json_wg}}">WG-stream configuration</a></li>
</ul>
<p>
    The wg-stream configuration is close to what we will have for data taking. <br>
    The rates, event sizes and bandwidths results from wg-stream configuration is: <br>
</p>
{{table_wgstream_rates}}""")

# Page listing the lines that fell into the "Other" category; rendered to
# other_lines.html with `table_other_lines` set to the HTML table built by
# make_other_line_table().
TABLE_OTHER_LINE_TEMPLATE = jinja2.Template("""
<p>
    List of line names that categorized to "Others".
</p>
{{table_other_lines}}
""")

# Page collecting the per-WG histograms; rendered to plots_per_wg.html with
# `plots_per_wg` set to the HTML produced by make_plots_per_wg_list().
PLOTS_PER_WG_TEMPLATE = jinja2.Template("""
<p>
    Plots of rates, event sizes and bandwidths for lines, split into different WGs.
</p>
{{plots_per_wg}}
""")

# Header of all_rates.html. After writing this rendered header the script
# appends the contents of the all-lines rate table (html) to the same file.
# `CSV_PATH` is the path of the csv version of that table, used for the link.
ALL_RATE_TEMPLATE = jinja2.Template("""
<p>
    Rates, event sizes and bandwidths of all lines, listed descending in retention rates. <br>
    The results are obtained by a per-event analysing under 5-stream configuration. <br>
    These numbers are also saved in a csv file: <a href="{{BASE_PATH}}/{{CSV_PATH}}">{{CSV_PATH}}</a>
</p>
""")

# Working-group names recognised when categorising selection lines by name.
# make_plots() uses this list as its default `wgs`; lines matching none of
# these names are counted under "Other".
known_working_groups = [
    "B2CC", "B2OC", "BandQ", "BnoC", "Calib",
    "Calo", "Charm", "DPA", "HLT", "IFT",
    "Luminosity", "PID", "QCD", "QEE", "RD",
    "RTA", "Simulation", "SL", "Tagging", "Tracking",
]


@dataclass
class WGRateBWInfo:
    '''
    Accumulator of per-line quantities for one working group.

    Each list holds one entry per line assigned to the group; every instance
    gets its own fresh, empty lists.
    '''
    # Number of lines assigned to the working group
    nlines: int = 0
    # Per-line rates
    rate: List[float] = field(default_factory=list)
    # Per-line DstData RawBank sizes
    dst_size: List[float] = field(default_factory=list)
    # Per-line total event sizes
    tot_size: List[float] = field(default_factory=list)
    # Per-line bandwidths computed from the DstData size
    dst_bw: List[float] = field(default_factory=list)
    # Per-line bandwidths computed from the total event size
    tot_bw: List[float] = field(default_factory=list)


# Immutable record of the quantities kept for a single selection line:
# rate, DstData size, total event size and the two derived bandwidths.
LineRateBWInfo = namedtuple("LineRateBWInfo",
                            "rate dst_size tot_size dst_bw tot_bw")


def histo_maker(entry_list,
                xlabel,
                title,
                plot_path,
                nbins=100,
                range=None,
                take_log=False,
                stacked=False,
                labels=None,
                legend=False):
    '''
    Draw a histogram of per-line quantities and save it as a PNG file.

    Arguments:
        entry_list: values to histogram; a list of lists (one per category)
            when `stacked` is True, a flat list otherwise
        xlabel: label of the x axis
        title: plot title; no title is set if it ends up empty
        plot_path: path of the PNG file to write
        nbins: number of bins
        range: optional 2-tuple of floats (low, high) for the x axis
        take_log: if True, histogram log10 of the values, after clamping
            them from below at 0.1
        stacked: if True, stack the histograms of the sub-lists
        labels: legend labels, one per sub-list (defaults to no labels)
        legend: if True, draw the legend
    '''
    # None replaces the former shared mutable `[]` default; same behaviour.
    if labels is None:
        labels = []

    if take_log:

        def safe_log(value):
            # Clamp at 0.1 so that zero entries do not break log10.
            return log10(max(value, 0.1))

        title = f"{title} (all values <= log10(0.1) are in the first bin)"
        if stacked:
            # entry_list is a list of lists
            entry_list = [[safe_log(value) for value in lst]
                          for lst in entry_list]
        else:
            entry_list = [safe_log(value) for value in entry_list]

    fig = plt.figure()
    try:
        # A falsy `range` means "let matplotlib choose", as before.
        plt.hist(
            entry_list,
            nbins,
            range=range if range else None,
            stacked=stacked,
            label=labels)
        plt.xlabel(xlabel)
        plt.ylabel("Number of lines")
        if title:
            plt.title(title)
        if legend:
            plt.legend(loc='upper right')
        plt.yscale('log', nonpositive='clip')
        plt.savefig(plot_path, format="png")
    finally:
        # Always release the figure, even if plotting or saving failed.
        plt.close(fig)


def make_plots_per_wg(fname_helper, wg_name, wg_bw_info, process):
    '''
    Make the rate, event-size and bandwidth histograms of a single WG.

    One PNG is written per quantity, named `hist__<quantity>__<wg_name>.png`.

    Arguments:
        fname_helper: object whose `html_page_outputs_path(filename)` gives
            the path to write each plot to
        wg_name: name of the working group
        wg_bw_info: object with list attributes `rate`, `dst_size`,
            `tot_size`, `dst_bw` and `tot_bw` holding the per-line values
        process: either `hlt2` or `spruce`
    '''
    plot_title = f"{wg_name} {process.capitalize()}"

    # (attribute, x-axis label, file-name tag, take log10?, x range)
    plot_specs = [
        ("rate", "Log10(Rate [Hz])", "rate", True, (-2, 7)),
        ("dst_size", "DstData RawBank Size [kB]", "dst_data_size", False,
         None),
        ("tot_size", "Total Event Size [kB]", "total_size", False, None),
        ("dst_bw", "Log10(Bandwidth from DstData Size [MB/s])",
         "dst_bandwidth", True, (-2, 5)),
        ("tot_bw", "Log10(Bandwidth from Total Event Size [MB/s])",
         "tot_bandwidth", True, (-2, 5)),
    ]

    for attrib, xtitle, plot_bit, use_log, hist_range in plot_specs:
        values = getattr(wg_bw_info, attrib)
        out_path = fname_helper.html_page_outputs_path(
            f"hist__{plot_bit}__{wg_name}.png")
        histo_maker(
            values,
            xtitle,
            plot_title,
            out_path,
            range=hist_range,
            take_log=use_log)


def _group_lines_by_wg(all_lines_bw_info, process, wgs):
    '''Sort lines into per-WG accumulators; return (info_per_wg, other_lines).'''
    attribs = ("rate", "dst_size", "tot_size", "dst_bw", "tot_bw")
    wgs = list(wgs)
    info_per_wg = {wg: WGRateBWInfo() for wg in [*wgs, "Other"]}
    other_lines = []
    prefix = process.capitalize()
    for line, bw_info in all_lines_bw_info.items():
        # Expect e.g {Hlt2,Spruce}<WG>_<rest-of-line-name>
        wg_guess = line.split("_")[0].removeprefix(prefix)
        # The first matching WG wins, so a line is never counted twice.
        wg = next((wg for wg in wgs if wg_guess.startswith(wg)), None)
        if wg is None:
            # Every line that ends up in "Other" is also reported by name.
            wg = "Other"
            other_lines.append(line)
        wg_info = info_per_wg[wg]
        wg_info.nlines += 1
        for attrib in attribs:
            getattr(wg_info, attrib).append(getattr(bw_info, attrib))
    return info_per_wg, other_lines


def make_plots(all_lines_bw_info,
               tot_rate,
               tot_bandwidth,
               process,
               wgs=known_working_groups,
               fname_helper=None):
    '''
    Make plots of rates, event sizes and bandwidths of all lines.
    It creates five stacked histograms (one per quantity, split by WG),
    a pie chart showing the number of lines per WG, and the per-WG
    histograms made by `make_plots_per_wg`.

    Arguments:
        all_lines_bw_info: dictionary of line names and their per-line info
            (objects with attributes `rate`, `dst_size`, `tot_size`,
            `dst_bw` and `tot_bw`)
        tot_rate: total rate of all lines, shown in kHz in the rate plot title
        tot_bandwidth: total bandwidth, shown in GB/s in the bandwidth plot title
        process: either `hlt2` or `spruce`
        wgs: working groups to categorize the lines into
        fname_helper: object providing `html_page_outputs_path(filename)`;
            if omitted, `FileNameHelper(process)` is used

    Returns:
        A 2-tuple: the names of the WGs (possibly including "Other") that
        have at least one line, ordered by increasing number of lines, as a
        dict keys view; and the list of line names put into "Other".
    '''
    if fname_helper is None:
        # Previously this silently relied on a global only defined when the
        # module is run as a script; the script builds it the same way.
        fname_helper = FileNameHelper(process)

    # Count number of lines and rates/evt sizes per WG
    info_per_wg, list_other_lines = _group_lines_by_wg(
        all_lines_bw_info, process, wgs)

    # Drop empty WGs and sort the rest by number of lines (stable sort)
    rate_info_per_wg = dict(
        sorted(
            ((wg, info) for wg, info in info_per_wg.items()
             if info.nlines != 0),
            key=lambda item: item[1].nlines))

    # Make a pie plot of lines per WG
    labels = [f"{k} ({int(v.nlines)})" for k, v in rate_info_per_wg.items()]
    fig = plt.figure()
    try:
        plt.pie([v.nlines for v in rate_info_per_wg.values()],
                radius=1,
                labels=labels,
                wedgeprops=dict(width=0.4, edgecolor="w"))
        plt.title(f"Number of {process.capitalize()} lines per WG")
        # Written next to the other plots: the report page references
        # "lines_per_wg.png" relative to its own location.
        plt.savefig(
            fname_helper.html_page_outputs_path("lines_per_wg.png"),
            format="png")
    finally:
        plt.close(fig)

    ### Make hist plots
    size_range = (0, 500 if process == 'hlt2' else 1000)
    # (attribute, x-axis label, title, file-name tag, take log10?, x range)
    plot_specs = [
        ("rate", "Log10(Rate [Hz])", f"Total Rate: {tot_rate:.2f} kHz",
         "rate", True, (-2, 7)),
        ("dst_size", "DstData RawBank Size [kB]", "", "dst_data_size", False,
         size_range),
        ("tot_size", "Total Event Size [kB]", "", "total_size", False,
         size_range),
        ("dst_bw", "Log10(Bandwidth from DstData Size [MB/s])", "",
         "dst_bandwidth", True, (-2, 5)),
        ("tot_bw", "Log10(Bandwidth from Total Event Size [MB/s])",
         f"Total bandwidth: {tot_bandwidth:.2f} GB/s", "tot_bandwidth", True,
         (-2, 5)),
    ]
    wg_labels = list(rate_info_per_wg.keys())
    for attrib, xtitle, title, plot_bit, take_log, hist_range in plot_specs:
        histo_maker(
            [getattr(info, attrib) for info in rate_info_per_wg.values()],
            xtitle,
            title,
            fname_helper.html_page_outputs_path(f"hist__{plot_bit}.png"),
            range=hist_range,
            take_log=take_log,
            stacked=True,
            legend=True,
            labels=wg_labels)

    for wg_name, bw_info_per_wg in rate_info_per_wg.items():
        make_plots_per_wg(fname_helper, wg_name, bw_info_per_wg, process)

    return rate_info_per_wg.keys(), list_other_lines


def make_other_line_table(name_list):
    '''
    Build a one-column HTML table listing the given line names.

    Arguments:
        name_list: iterable of line names, one table row each

    Returns:
        The HTML of the table as a string.
    '''
    header = '<table border = "1">\n    <tr>\n        <th> Name </th>\n    </tr>'
    row_template = '\n    <tr>\n        <td> %s </td>\n    </tr>'
    rows = ''.join(row_template % name for name in name_list)
    return header + rows + '\n</table>'


def make_plots_per_wg_list(wg_list):
    '''
    Build the HTML that embeds the five per-WG histograms of each WG.

    Arguments:
        wg_list: iterable of working-group names

    Returns:
        The concatenated HTML sections as a string (empty if no WG is given).
    '''
    # File-name tags of the plots, in the order they appear on the page
    plot_bits = ("rate", "dst_data_size", "total_size", "dst_bandwidth",
                 "tot_bandwidth")
    sections = []
    for wg_name in wg_list:
        heading = (f"\n        <p>\n            Plots of {wg_name} group:\n"
                   "        </p>\n")
        objects = "".join(
            f'        <object type="image/png" data="hist__{bit}__{wg_name}.png"></object>\n'
            for bit in plot_bits)
        sections.append(heading + objects + "        ")
    return "".join(sections)


def parse_yaml(file_path):
    '''
    Load a YAML file and return its parsed content.

    Environment variables in `file_path` are expanded before opening.
    '''
    resolved_path = os.path.expandvars(file_path)
    with open(resolved_path, 'r') as yaml_file:
        content = yaml.safe_load(yaml_file)
    return content


if __name__ == '__main__':
    # Script entry point: read the rate tables located through FileNameHelper,
    # make the plots, then write the HTML pages and a short message.txt
    # summary into the html-page output location.
    parser = argparse.ArgumentParser(description='make_bandwidth_test_page')
    parser.add_argument(
        '-p',
        '--process',
        type=str,
        choices=['hlt2', 'spruce'],
        required=True,
        help='Which stage was the test run on')
    parser.add_argument(
        '-c',
        '--input-config',
        type=str,
        required=True,
        help='Path to yaml config file defining the input.')
    parser.add_argument(
        '-s',
        '--script-path',
        type=str,
        required=True,
        help=
        'Path to the top-level testing script that is running/calling this script'
    )
    parser.add_argument(
        '-e',
        '--exit-code',
        type=int,
        required=True,
        help="Cumulative exit code of all previous jobs.")
    parser.add_argument(
        '--building-locally',
        action='store_true',
        help=
        'Makes links between pages work for building the pages locally rather than on the LHCbPR website.'
    )
    args = parser.parse_args()

    input_info = parse_yaml(args.input_config)
    fname_helper = FileNameHelper(args.process)
    # Link prefix shared by all rendered pages
    base_path = fname_helper.base_html_path(args.building_locally)

    all_jobs_successful = args.exit_code == 0
    if all_jobs_successful:
        exit_code_sentence = "All sub-jobs in this test exited successfully."
        exit_code_col = "green"
    else:
        exit_code_sentence = "There were errors in some of the sub-jobs of this test; please see the logs."
        exit_code_col = "red"

    # Read info of all lines
    df = pd.read_csv(
        fname_helper.final_rate_table_all_lines_path("csv"), sep=',')

    # Rates are converted to Hz and bandwidths to MB/s for the plots
    GB_to_MB = 1000
    kHz_to_Hz = 1000
    rate_bw_info_by_line = {
        line: LineRateBWInfo(rate * kHz_to_Hz, dst_size, tot_size,
                             dst_bw * GB_to_MB, tot_bw * GB_to_MB)
        for line, rate, dst_size, tot_size, dst_bw, tot_bw in zip(
            df['Line'], df['Rate (kHz)'], df["Avg DstData Size (kB)"],
            df["Avg Total Event Size (kB)"], df["DstData Bandwidth (GB/s)"],
            df["Total Bandwidth (GB/s)"])
    }

    # Prepare messages to GitLab
    # limits on rate: 1 kHz for Hlt2 rate and 0.5% for Sprucing retention
    tol = 1000 if args.process == 'hlt2' else 500
    n_low_rate = sum(
        1 for info in rate_bw_info_by_line.values() if info.rate == 0)
    n_high_rate = sum(
        1 for info in rate_bw_info_by_line.values() if info.rate > tol)

    prod_df = pd.read_csv(
        fname_helper.final_rate_table_all_streams_path(
            "production" if args.process == "hlt2" else "wg", ext="csv"))
    tot_rate = sum(prod_df['Rate (kHz)'])
    tot_bandwidth = sum(prod_df['Total Bandwidth (GB/s)'])

    # Make plots & tables
    wg_list, other_line_list = make_plots(
        rate_bw_info_by_line,
        tot_rate=tot_rate,
        tot_bandwidth=tot_bandwidth,
        process=args.process)

    other_line_table = make_other_line_table(other_line_list)
    plots_per_wg = make_plots_per_wg_list(wg_list)

    if args.process == 'hlt2':
        with open(
                fname_helper.final_rate_table_all_streams_path("production"),
                "r") as rate_html:
            table_5stream_rates = rate_html.read()
        hlt2_or_spruce_template = HLT2_REPORT_TEMPLATE.render(
            BASE_PATH=base_path,
            stream_config_json_prod=fname_helper.stream_config_json_path(
                "production", full_path=False),
            stream_config_json_wg=fname_helper.stream_config_json_path(
                "wg", full_path=False),
            table_5stream_rates=table_5stream_rates)
    else:
        # 'spruce': the argparse `choices` above admit nothing else
        with open(fname_helper.final_rate_table_all_streams_path("wg"),
                  "r") as rate_html:
            table_wgstream_rates = rate_html.read()
        hlt2_or_spruce_template = SPRUCE_REPORT_TEMPLATE.render(
            BASE_PATH=base_path,
            stream_config_json_wg=fname_helper.stream_config_json_path(
                "wg", full_path=False),
            table_wgstream_rates=table_wgstream_rates)

    # Main report page
    with open(fname_helper.html_page_outputs_path("index.html"),
              "w") as html_file:
        html = REPORT_TEMPLATE.render(
            SCRIPTPATH=args.script_path,
            BASE_PATH=base_path,
            HLT2_OR_SPRUCE_TEMPLATE=hlt2_or_spruce_template,
            INPUT_CONFIG_PATH=os.path.expandvars(args.input_config),
            INPUT_RATE=input_info['input_rate'],
            INPUT_NU=input_info['nu'],
            INPUT_VELO_RADIUS=input_info['velo_radial_opening'],
            EXIT_CODE_SENTENCE=exit_code_sentence,
            EXIT_CODE_COLOUR=exit_code_col,
            line_descr=fname_helper.line_descr_path(full_path=False),
            rate_table_split_by_prod_stream=fname_helper.
            final_rate_table_all_lines_split_by_stream_path(
                "production", full_path=False),
            rate_table_split_by_wg_stream=fname_helper.
            final_rate_table_all_lines_split_by_stream_path(
                "wg", full_path=False))
        html_file.write(html)

    with open(fname_helper.html_page_outputs_path("other_lines.html"),
              "w") as html_file:
        html = TABLE_OTHER_LINE_TEMPLATE.render(
            table_other_lines=other_line_table)
        html_file.write(html)

    with open(fname_helper.html_page_outputs_path("plots_per_wg.html"),
              "w") as html_file:
        html = PLOTS_PER_WG_TEMPLATE.render(plots_per_wg=plots_per_wg)
        html_file.write(html)

    # all_rates.html = rendered header followed by the all-lines rate table
    with open(fname_helper.html_page_outputs_path("all_rates.html"),
              "w") as html_file:
        html = ALL_RATE_TEMPLATE.render(
            BASE_PATH=base_path,
            CSV_PATH=fname_helper.final_rate_table_all_lines_path(
                "csv", full_path=False))
        html_file.write(html)
        with open(fname_helper.final_rate_table_all_lines_path("html"),
                  "r") as rate_table:
            html_file.write(rate_table.read())

    stream_configs = ["production", "wg"] if args.process == "hlt2" else ["wg"]
    with open(
            fname_helper.html_page_outputs_path("similarities_jaccards.html"),
            "w") as html_file:
        for stream_config in stream_configs:
            html_file.write(f"""
                <p>
                    The Jaccard similarity matrix (fractional overlap) of the {stream_config} streams is:
                </p>
                """)
            with open(
                    fname_helper.jaccard_similarities_path(stream_config),
                    "r") as jaccard:
                html_file.write(jaccard.read())

    with open(
            fname_helper.html_page_outputs_path("rates_streaming.html"),
            "w") as html_file:
        for stream_config in stream_configs:
            html_file.write(f"""
                <p>
                   The rates, event sizes and bandwidths of the {stream_config} streams are:
                </p>
                """)
            with open(
                    fname_helper.final_rate_table_all_streams_path(
                        stream_config), "r") as rate_html:
                html_file.write(rate_html.read())

    # Summary written as `key = value` lines
    with open(fname_helper.html_page_outputs_path("message.txt"),
              "w") as message:
        message.write(
            f'all_jobs_successful_bool = {1 if all_jobs_successful else 0}\n')
        message.write(f'total_rate = {tot_rate:.2f} kHz\n')
        message.write(f'total_bandwidth = {tot_bandwidth:.2f} GB/s\n')
        message.write(f'n_low_rate = {n_low_rate:d}\n')
        message.write(f'n_high_rate = {n_high_rate:d}\n')