make_bandwidth_test_page.py

###############################################################################
# (c) Copyright 2023 CERN for the benefit of the LHCb Collaboration           #
#                                                                             #
# This software is distributed under the terms of the GNU General Public      #
# Licence version 3 (GPL Version 3), copied verbatim in the file "COPYING".   #
#                                                                             #
# In applying this licence, CERN does not waive the privileges and immunities #
# granted to it by virtue of its status as an Intergovernmental Organization  #
# or submit itself to any jurisdiction.                                       #
###############################################################################
import argparse
import jinja2
import matplotlib.pyplot as plt
import pandas as pd
import os
import json
from math import log10
from dataclasses import dataclass, field
from typing import List
from collections import namedtuple
from PRConfig.bandwidth_helpers import (FileNameHelper, parse_yaml, guess_wg,
                                        KNOWN_WORKING_GROUPS,
                                        KNOWN_STREAM_CONFIGS_BY_STAGE)

# Turn off matplotlib's interactive mode; every figure in this file is
# written to disk with savefig() and then closed.
plt.ioff()

# Suffixes handed to FileNameHelper.hist_path() for the stacked histograms.
# MAIN_HISTOGRAMS are embedded on every single-test page; EXTRA_HISTOGRAMS
# (DstData-only quantities) are skipped for hlt1 by render_dst_data_hists().
MAIN_HISTOGRAMS = ["rate", "total_size", "tot_bandwidth"]
EXTRA_HISTOGRAMS = ["dst_data_size", "dst_bandwidth"]
# Per-WG bar charts: metric key (used in file names) -> column label.
# make_per_wg_bar_charts() assigns the labels, in this order, as the column
# names of the intra-stream rate tables.
MAIN_BAR_CHARTS = {
    "rate": 'Rate (kHz)',
    "bandwidth": 'Bandwidth (GB/s)',
}
EXTRA_BAR_CHARTS = {"dstbandwidth": 'DstData Bandwidth (GB/s)'}
# Reference bandwidths drawn as the "TDR" series next to the measured
# ("Current") ones. Indexed as [process][stream_config][stream | "total"];
# the values are plotted on an axis labelled 'Bandwidth (GB/s)'.
# NOTE(review): "TDR" presumably stands for Technical Design Report - confirm.
TDR_BANDWIDTHS = {
    "hlt2": {
        "production": {
            "full": 5.90,
            "turbo": 2.50,
            "turcal": 1.60,
            "total": 10.00,
        },
    },
    "spruce": {
        "full": {
            "total": 0.80
        },
        "turbo": {
            "total": 2.50
        },
        "turcal": {
            "total": 0.2
        },
        "no_bias": {
            "total": 0.0
        },
        "hlt2calib": {
            "total": 0.0
        },
        "lumi": {
            "total": 0.0
        }
    }
}
# Stream / stream-config identifier -> label shown on the headline bar
# charts. Callers look names up with .get(name, name), so identifiers that
# are not listed here are displayed unchanged.
PRETTY_STREAM_NAMES = {
    "slepton": "SL",
    "sl": "SL",
    "qee": "QEE",
    "rd": "RD",
    "bandq": "B&Q",
    "b_to_open_charm": "B2OC",
    "b2oc": "B2OC",
    "bnoc": "BnoC",
    "b_to_charmonia": "B2CC",
    "b2cc": "B2CC",
    "charm": "Charm",
    "ift": "IFT",
    "full": "Full",
    "turbo": "Turbo",
    "turcal": "TurCal",
    "Turcal_mDST": "MDST",
    "Turcal_persistreco": "PersistReco",
    "Turcal_rawbanks": "RawBanks",
    "Turcal_persistrecorawbanks": "PRRawBanks",
    "no_bias": "NoBias",
    "lumi": "Lumi",
    "hlt2calib": "Calib"
}


def render_top_level_page(script_path: str,
                          base_path: str,
                          test_configs: List[tuple[str, str]],
                          to_disk_bar_chart=False):
    """Render the top-level landing page of the bandwidth test.

    The page lists general job information, a link to the log, one link per
    sub-test and a memory-consumption section.

    Args:
        script_path: Path of the calling test script, printed on the page.
        base_path: URL/path prefix put in front of every link.
        test_configs: (process, stream_config) pairs; one link to the
            corresponding index page is emitted for each.
        to_disk_bar_chart: If true, also embed the summary bar chart of the
            bandwidth written to disk by sprucing.

    Returns:
        The HTML produced by ``_render``. The ``$$...$$`` tokens are emitted
        as-is (presumably substituted by a downstream handler - TODO confirm).
    """
    page_top = f"""
    <html>
    <head></head>
    <body>
    <p>
        slot.build_id: $$version$$<br>
        start time: $$start_time$$<br>
        end time: $$end_time$$<br>
        platform: $$platform$$<br>
        hostname: $$hostname$$<br>
        cpu_info: $$cpu_info$$<br>
        testing script path: {script_path}
    </p>
    <ul>
        <li><a href="{base_path}/run.log">Logs</a></li>
    </ul>
    <p>
        The bandwidth test ran the following sub-tests (process, streaming configuration): {test_configs}<br>
        The appropriate webpages can be found below for each test below. Scroll down for a report of the test's memory consumption.
    <ul>
    """
    sections = [page_top]

    # One bullet per sub-test, linking to that test's own index page.
    for proc, config in test_configs:
        index_page = FileNameHelper(proc, config).index_html_page_path()
        sections.append(f"""
        <li><a href="{base_path}/{index_page}">{proc.capitalize()} ({config}) results</a></li>
        """)
    sections.append("""</ul></p>""")

    if to_disk_bar_chart:
        disk_chart = FileNameHelper(
            process="spruce",
            stream_config="").to_disk_bar_chart_path(full_path=False)
        sections.append(f"""
        <p>
            Summary of bandwidth of all streams to disk (only available for those tests that run all sprucing stages):<br>
        </p>
        <object type="image/png" data="{disk_chart}"></object>
        """)

    sections.append("""
    <p>
        <b> Memory consumption of this test: </b>
    </p>
    <object type="image/png" data="memory_consumption.png"></object>
    <p>
        Memory consumption as functions of Wall-time. <br>
        The virtual memory size is the total amount of memory the process may hypothetically access. <br>
        The resident set size (RSS) is the portion of memory occupied by the run that is held in main memory (RAM). <br>
        The proportional set size (PSS) is the private memory occupied by the run itself plus the proportion of shared memory with one or more other processes. <br>
        As we only launch one test at the same time, PSS should be close to RSS in this case, and PSS gives the real memory that is used by this test. <br>
        Swap memory is used when RAM is full. <br>
        The maximum resident set size usage is $$max_rss$$ GB. <br>
        The maximum proportional set size usage is $$max_pss$$ GB. <br>
    </p>
    </body>
    </html>""")
    return _render("".join(sections))


def render_single_test_page(process: str, stream_config: str,
                            input_config_path: str, streams: List[str],
                            args: argparse.Namespace):
    """Render the index HTML page of one (process, stream_config) sub-test.

    Args:
        process: Name of the process; "hlt1" switches off several sections.
        stream_config: Name of the streaming configuration.
        input_config_path: Path of the input-sample YAML, read with
            ``parse_yaml``; its 'input_rate', 'nu' and 'velo_radial_opening'
            entries are printed on the page.
        streams: Stream names forwarded to ``render_bar_charts``.
        args: Parsed command-line arguments; only ``building_locally`` is
            read here.

    Returns:
        The HTML string returned by ``_render``.
    """

    fname_helper = FileNameHelper(process, stream_config)
    base_path = fname_helper.base_html_path(args.building_locally)
    input_info = parse_yaml(input_config_path)

    # NOTE(review): this default is never used - the block below either
    # overwrites it or raises (missing file / missing key).
    exit_code = 1  # Assume failure
    with open(fname_helper.message_path(), "r") as f:
        exit_code = int(json.load(f)[process][stream_config]["code"])

    if exit_code == 0:
        exit_code_sentence = "All sub-jobs in this test exited successfully."
    else:
        exit_code_sentence = "There were errors in some of the sub-jobs of this test; please see the logs."

    # For hlt1 the path stays empty, so the <object> tag below is emitted
    # with an empty data attribute.
    headline_bar_chart_path = ""
    if process != "hlt1":
        headline_bar_chart_path = fname_helper.headline_bar_chart_path(
            full_path=False)

    # Page header: exit status banner, table of contents, headline chart.
    html_str = f"""
    <html>
    <head></head>
    <body>
    <p style="color:{'green' if exit_code == 0 else 'red'}">
        <b>{exit_code_sentence}</b>
    </p>
    <p>
        This page contains the results of the {process} bandwidth test with the {stream_config} streaming configuration. Scroll down to see:
    <ul>
        <li> Summary of main results, </li>
        <li> Details of the streaming configuration, </li>
        <li> Links to other html pages produced by this test, </li>
        <li> Bar charts of rate and bandwidth for each WG within each stream (HLT2 only), </li>
        <li> A pie chart of all lines split by WGs (HLT2 and sprucing only), </li>
        <li> Information about the input sample, </li>
        <li> Stacked histograms of all lines, split by WG, of rate/bandwidth metrics. </li>
    </ul>
    <b>Main results:</b> <br>
    <object type="image/png" data="{headline_bar_chart_path}"></object>
    </p>
    """
    # The per-stream rate table is an HTML fragment produced elsewhere; it is
    # pasted into the page verbatim.
    with open(fname_helper.final_rate_table_all_streams_path(),
              "r") as rate_html:
        html_str += rate_html.read()
    total_rate, total_bw = total_rate_and_bw(fname_helper)
    html_str += f"""
    <p>
        <b>The total bandwidth (rate) was measured to be {total_bw:.2f} GB/s ({total_rate:.2f} kHz).</b><br>
    </p>
    """

    stream_config_json_path = fname_helper.stream_config_json_path(
        full_path=False)
    html_str += f"""
    <p>
        The streaming configuration (i.e. which lines went to each stream) can be found in JSON format
        <a href="{base_path}/{stream_config_json_path}">here</a>. <br>
        This streaming configuration is our current set of lines to be used in the next data-taking period. <br>
        "DstData" is the raw bank to which reconstructed information (candidates, other reconstructed tracks etc.) are saved. <br>
        The "DstData bandwidth" is therefore the bandwidth counting only that raw bank. <br>
        The total event size (and total bandwidth) count all raw banks (incl. DstData, and detector raw banks if present) in the file. <br>
    """
    # Both branches close the <p> opened just above; only non-hlt1 pages get
    # the note about the compression scaling factor.
    if process != "hlt1":
        html_str += """
            <b>NB:
            In real data-taking, raw banks are now left uncompressed when writing, and then the whole file is compressed afterwards.
            We account for this compression by multiplying event sizes and bandwidths by a scaling factor to give accurate per-stream bandwidths.
            The scaling factor is calculated for each file in the test as: '(size of the compressed file) / (size of the uncompressed file)'.
            </b>
            <br>
            </p>
        """
    else:
        html_str += """</p>"""

    # Links to the secondary pages, the optional charts and the input-sample
    # summary.
    # NOTE(review): `$$...comparison_str()$$` is emitted literally and is
    # presumably replaced downstream; the `</b></b>` right after it has no
    # opening tag in this template - confirm whether the substituted text
    # relies on it before removing.
    html_str += f"""
    <p>
        Further results can be found in the links below:
    </p>
    <ul>
        {list_of_links_html(fname_helper, args.building_locally)}
        $${fname_helper.comparison_str()}$$
        </b></b>
    </ul>
    <p> See: <a href="https://lbfence.cern.ch/alcm/public/figure/details/32">RTA & DPA Workflow</a> for reference figures regarding bandwidth.</p>
    {render_bar_charts(fname_helper, streams)}
    {render_lines_pie_chart(fname_helper)}
    <p>
        <b>Input sample information:</b>
        <ul>
        <li>Config file: {os.path.expandvars(input_config_path)}</li>
        <li>Input rate: {input_info['input_rate']} kHz</li>
        <li>Number of interactions per bunch crossing (&#957): {input_info['nu']}</li>
        <li>Radius of VELO opening: {input_info['velo_radial_opening']} mm</li>
        </ul>
    </p>
    <p>
        <b>Stacked histograms of all lines, split by WG, of rate/bandwidth metrics:</b> <br>
        The total distributions are shown as a stacked histogram, split into several histograms of WGs. <br>
        The distributions per WG is attached in the html page linked above. <br>
        Total event size is calculated from summing all raw banks in the file (including DstData). <br>
        Where appropriate, the DstData raw bank size and DstData bandwidth are calculated from summing only the DstData raw bank. <br>
    </p>
    """
    # Main histograms are always shown; the DstData ones are appended by
    # render_dst_data_hists(), which returns an empty string for hlt1.
    for hist_suffix in MAIN_HISTOGRAMS:
        html_str += f"""
        <object type="image/png" data="{fname_helper.hist_path(hist_suffix, full_path=False)}"></object>
        """
    html_str += f"""
    {render_dst_data_hists(fname_helper)}
    </body>
    </html>
    """
    return _render(html_str)


@dataclass
class WGRateBWInfo:
    """Accumulator of per-line quantities for a single working group.

    Every list holds one entry per line assigned to the working group; a
    fresh, independent list is created for each instance.
    """
    # Number of lines counted for this working group.
    nlines: int = 0
    # Per-line values, appended in the order the lines are visited.
    rate: List[float] = field(default_factory=list)
    dst_size: List[float] = field(default_factory=list)
    tot_size: List[float] = field(default_factory=list)
    dst_bw: List[float] = field(default_factory=list)
    tot_bw: List[float] = field(default_factory=list)


# Immutable record of the rate, event sizes and bandwidths of one line.
# Field order matters: instances are also built positionally.
LineRateBWInfo = namedtuple("LineRateBWInfo",
                            "rate dst_size tot_size dst_bw tot_bw")


def histo_maker(entry_list,
                xlabel,
                plot_path,
                nbins=100,
                range=None,
                take_log=False,
                log_th=-4,
                stacked=False,
                labels=None,
                legend=False):
    """Draw a (optionally stacked, optionally log10-x) histogram to a PNG.

    Args:
        entry_list: Values to histogram. A flat list of numbers, or a list of
            lists (one per stacked component) when ``stacked`` is true.
        xlabel: Label of the x axis.
        plot_path: Destination of the PNG file.
        nbins: Number of bins passed to ``plt.hist``.
        range: Optional ``(low, high)`` tuple of the x-axis range; any falsy
            value lets matplotlib choose. (The name shadows the builtin but
            is part of the public signature.)
        take_log: If true, histogram ``log10`` of the values instead.
        log_th: Exponent of the threshold used with ``take_log``: values not
            greater than ``1e<log_th>`` are replaced by ``log_th - 1``.
        stacked: Forwarded to ``plt.hist``; also selects whether
            ``entry_list`` is treated as a list of lists when taking logs.
        labels: Legend labels forwarded to ``plt.hist``. ``None`` (the
            default) means no labels.
        legend: If true, draw a legend in the upper-right corner.
    """
    # A None sentinel avoids sharing one mutable default list between calls.
    if labels is None:
        labels = []

    title = ""
    if take_log:
        # Computed once rather than once per value.
        threshold = float(f'1e{log_th}')
        underflow = log_th - 1

        def safe_log(value):
            # log10 is undefined for values <= 0; everything at or below
            # the threshold is parked at a fixed underflow position instead.
            return log10(value) if value > threshold else underflow

        title = f"(all values <= log10(1e{log_th}) are in the first bin)"
        if stacked:
            # entry_list is a list of lists
            entry_list = [[safe_log(value) for value in lst]
                          for lst in entry_list]
        else:
            entry_list = [safe_log(value) for value in entry_list]

    fig = plt.figure()
    # range=None is matplotlib's default, so a falsy range means "automatic".
    plt.hist(
        entry_list,
        nbins,
        range=range or None,
        stacked=stacked,
        label=labels)
    plt.xlabel(xlabel)
    plt.ylabel("Number of lines")
    if title:
        plt.title(title)
    if legend:
        plt.legend(loc='upper right')
    plt.yscale('log', nonpositive='clip')
    plt.savefig(plot_path, format="png")
    plt.close(fig)


def list_of_other_lines(process, all_lines_bw_info):
    """Return the names of the lines that ``guess_wg`` files under "Other".

    Args:
        process: Process name forwarded to ``guess_wg``.
        all_lines_bw_info: Mapping keyed by line name; only the keys are used.

    Returns:
        List of line names, in the iteration order of the mapping.
    """
    others = []
    for line_name in all_lines_bw_info:
        if guess_wg(line_name, process) == "Other":
            others.append(line_name)
    return others


def make_plots(all_lines_bw_info: dict[str, LineRateBWInfo],
               fname_helper: FileNameHelper):
    '''
    Plot rate, event-size and bandwidth distributions of all lines.

    Lines are grouped by the working group returned by guess_wg. The
    function writes a pie chart of the number of lines per WG and five
    stacked histograms (one stack component per WG) to the paths given by
    fname_helper. WGs without any line are left out of all plots.
    '''
    process = fname_helper.process
    metric_names = ("rate", "dst_size", "tot_size", "dst_bw", "tot_bw")

    # Gather, per WG, the number of lines and the per-line metric values
    per_wg = {wg: WGRateBWInfo() for wg in KNOWN_WORKING_GROUPS + ["Other"]}
    for line_name, line_info in all_lines_bw_info.items():
        wg_info = per_wg[guess_wg(line_name, process)]
        wg_info.nlines += 1
        for metric in metric_names:
            getattr(wg_info, metric).append(getattr(line_info, metric))

    populated = {wg: info for wg, info in per_wg.items() if info.nlines != 0}

    # Pie chart: number of lines per WG, counts repeated in the legend
    line_counts = [info.nlines for info in populated.values()]
    legend_entries = [
        f"{wg} ({int(info.nlines)})" for wg, info in populated.items()
    ]
    fig = plt.figure()
    wedges = plt.pie(
        line_counts, radius=1, wedgeprops=dict(width=0.4, edgecolor="w"))[0]
    plt.legend(
        wedges,
        legend_entries,
        loc='center',
        bbox_to_anchor=(1, 0.5),
        bbox_transform=plt.gcf().transFigure)
    plt.title(f"Number of {process.capitalize()} lines per WG")
    plt.savefig(
        fname_helper.pie_chart_path(full_path=True),
        format="png",
        bbox_inches='tight')
    plt.close(fig)

    # Stacked histograms: (attribute, x-axis title, log threshold, x range).
    # The output-file suffixes come, in the same order, from
    # MAIN_HISTOGRAMS + EXTRA_HISTOGRAMS.
    size_range = (0, 500 if process == 'hlt2' else 1000)
    hist_specs = [
        ("rate", "Log10(Rate [Hz])", -1, (-2, 7)),
        ("tot_size", "Total Event Size [kB]", 0, size_range),
        ("tot_bw", "Log10(Bandwidth from Total Event Size [GB/s])", -4,
         (-5, 2)),
        ("dst_size", "DstData RawBank Size [kB]", 0, size_range),
        ("dst_bw", "Log10(Bandwidth from DstData Size [GB/s])", -4, (-5, 2)),
    ]
    wg_labels = list(populated.keys())
    for (attrib, xtitle, log_th, hist_range), plot_bit in zip(
            hist_specs, MAIN_HISTOGRAMS + EXTRA_HISTOGRAMS):
        histo_maker(
            [getattr(info, attrib) for info in populated.values()],
            xtitle,
            fname_helper.hist_path(plot_bit, full_path=True),
            range=hist_range,
            # Only the quantities titled "Log10(...)" are plotted in log10
            take_log="Log10" in xtitle,
            log_th=log_th,
            stacked=True,
            legend=True,
            labels=wg_labels)


def _important_bar_chart_maker(bandwidths: dict[str, dict[str, float]],
                               process: str,
                               stream_config="",
                               is_to_total_to_disk_bar_chart=True):
    """Draw a grouped bar chart comparing bandwidth series and save it as PNG.

    Args:
        bandwidths: Series label ("Current" or "TDR") -> {bar name: value}.
            Each series is drawn as one set of bars, shifted sideways by the
            bar width per series.
        process: "hlt2" or "spruce"; selects the title and, for "spruce",
            the labelling scheme of the per-stream chart.
        stream_config: Streaming configuration, used for the output path and
            the sprucing title.
        is_to_total_to_disk_bar_chart: If true, produce the summary chart of
            the total bandwidth to disk; otherwise the per-test headline
            chart.
    """
    fname_helper = FileNameHelper(process, stream_config)
    colors = {'Current': 'tab:orange', 'TDR': 'tab:grey'}
    width = 0.4
    # For the per-stream sprucing chart only the 'Total' bar carries a value
    # label (there are no per-WG expectations).
    only_total_labelled = (process == "spruce"
                           and not is_to_total_to_disk_bar_chart)

    fig, ax = plt.subplots()
    plt.grid(True, axis='y', zorder=0, linestyle='dashed')
    for i_col, (label, series) in enumerate(bandwidths.items()):
        shift = width * i_col
        x_positions = [x + shift for x in range(len(series))]
        bars = ax.bar(
            x_positions,
            series.values(),
            width=width,
            label=label,
            zorder=3,
            color=colors[label])
        if only_total_labelled:
            bar_labels = [''] * (len(bars) - 1) + [round(series['Total'], 2)]
        else:
            bar_labels = [round(val, 2) for val in series.values()]
        ax.bar_label(bars, labels=bar_labels)
    ax.set_ylabel('Bandwidth (GB/s)')

    if is_to_total_to_disk_bar_chart:
        title = "Sprucing (output to disk)"
    else:
        title = {
            "hlt2": "Hlt2 (output to tape)",
            "spruce": f"Sprucing of {stream_config} stream to disk"
        }[process]
    ax.set_title(title)

    # Tick placement uses the last series drawn. Normally ticks sit between
    # the two bars of a group; in the "only total labelled" case just the
    # last tick is centred that way, the others sit under the first bar.
    n_bars = len(series)
    if only_total_labelled:
        tick_positions = list(range(n_bars - 1)) + [n_bars - 1 + width / 2]
    else:
        tick_positions = [x + width / 2 for x in range(n_bars)]
    ax.set_xticks(tick_positions, series.keys())

    ax.legend(loc='upper center', ncols=2)
    if is_to_total_to_disk_bar_chart:
        plot_path = fname_helper.to_disk_bar_chart_path(full_path=True)
    else:
        plot_path = fname_helper.headline_bar_chart_path(full_path=True)
    plt.savefig(plot_path, format="png")
    plt.close(fig)


def headline_bar_charts(fname_helper: FileNameHelper):
    """Make the headline bar chart of per-stream bandwidth next to the TDR.

    Reads the all-streams rate table (csv) of the test, pairs the measured
    bandwidth of every stream with the TDR_BANDWIDTHS entry for this
    process / stream configuration (0 when the stream has none), appends a
    'Total' bar to both series and draws the chart.
    """
    process = fname_helper.process
    stream_config = fname_helper.stream_config
    rates_df = pd.read_csv(
        fname_helper.final_rate_table_all_streams_path(ext='csv'))

    measured = dict(
        zip(rates_df['Stream'], rates_df['Total Bandwidth (GB/s)']))
    expected = {}
    for stream in rates_df['Stream'].to_list():
        expected[stream] = TDR_BANDWIDTHS[process][stream_config].get(
            stream, 0)

    # Swap stream identifiers for their display names where one is defined
    bandwidths = {}
    for series_name, raw in (("Current", measured), ("TDR", expected)):
        bandwidths[series_name] = {
            PRETTY_STREAM_NAMES.get(stream, stream): val
            for stream, val in raw.items()
        }

    bandwidths['Current']['Total'] = sum(bandwidths['Current'].values())
    bandwidths['TDR']['Total'] = TDR_BANDWIDTHS[process][stream_config][
        'total']

    _important_bar_chart_maker(
        bandwidths,
        process,
        stream_config,
        is_to_total_to_disk_bar_chart=False)


def _make_bar_chart(rates_df, column, stream, plot_path):
    """Save a bar chart of ``rates_df[column]`` for every WG of one stream.

    Args:
        rates_df: Table with a 'WG' column and the column to plot.
        column: Name of the plotted column; also used as y label and title.
        stream: Stream name, shown capitalised in the title.
        plot_path: Destination of the PNG file.
    """
    fig = plt.figure()
    plt.grid(True, axis='y', zorder=0, linestyle='dashed')

    wg_names = rates_df['WG']
    values = rates_df[column]
    bars = plt.bar(wg_names, values, zorder=3)
    plt.bar_label(bars, labels=[round(val, 2) for val in values])
    plt.ylabel(column)
    # Vertical tick labels, with extra bottom margin to make room for them
    plt.xticks(wg_names, wg_names, rotation='vertical')
    plt.subplots_adjust(bottom=0.25)
    plt.title(f'{column} for each WG in the {stream.capitalize()} stream')
    plt.savefig(plot_path, format="png")
    plt.close(fig)


def make_per_wg_bar_charts(fname_helper: FileNameHelper, streams: list[str]):
    """Make one per-WG bar chart per metric for each of the given streams.

    The intra-stream rate table of every stream is read as a header-less
    csv. Streams whose table is empty are skipped with a message.
    """
    all_bar_charts = MAIN_BAR_CHARTS | EXTRA_BAR_CHARTS
    # NOTE beware if the ordering of the columns ever changes in line-and-stream-rates.py
    column_names = ['WG'] + list(all_bar_charts.values())

    for stream in streams:
        print(f"Making per-WG bar charts for {stream}")
        try:
            table = pd.read_csv(
                fname_helper.tmp_rate_table_intra_stream_path(stream),
                header=None)
            table.columns = column_names
            for metric, column in all_bar_charts.items():
                chart_path = fname_helper.bar_chart_path(
                    stream, metric, full_path=True)
                _make_bar_chart(table, column, stream, chart_path)
        except pd.errors.EmptyDataError:
            print(f"Per-WG bar charts: skipping {stream} as no rates found")


def write_html_page(page_path, rendered_html):
    """Write ``rendered_html`` to ``page_path``.

    Nothing is written (and no file is created) when ``rendered_html`` is
    empty or otherwise falsy.
    """
    if not rendered_html:
        return
    with open(page_path, "w") as out:
        out.write(rendered_html)


def _render(html_str):
    """Return ``html_str`` rendered as a jinja2 template without variables."""
    template = jinja2.Template(html_str)
    return template.render()


def render_all_lines_page(fname_helper, building_locally):
    """Render the page holding the rate/bandwidth table of every line.

    The page is a short explanation (with a link to the csv version of the
    table) followed by the HTML rate table read from disk.
    """
    csv_path = fname_helper.final_rate_table_all_lines_path(
        "csv", full_path=False)
    base_url = fname_helper.base_html_path(building_locally)
    intro = f"""
    <p>
        Rates, event sizes and bandwidths of all lines, listed descending in bandwidth. <br>
        Exclusive retentions/rates are calculated by counting those events in which only that line fired. <br>
        Bandwidths are inclusive: they are calculated by summing raw bank sizes for those events in which the trigger line fired. <br>
        These numbers are also saved in a csv file: <a href="{base_url}/{csv_path}">{csv_path}</a>
    </p>
    """
    table_path = fname_helper.final_rate_table_all_lines_path("html")
    with open(table_path, "r") as rate_table:
        table_html = rate_table.read()
    return _render(intro + table_html)


def render_other_line_table(process, lines):
    """Render a one-column HTML table listing the given line names.

    Returns the rendering of an empty string for hlt1.
    """
    if process == "hlt1":
        return _render("")

    intro = """
        <p>
        List of line names that categorized to "Others".
        </p>
        """
    table_head = r'''<table border = "1">
        <tr>
            <th> Name </th>
        </tr>'''
    # One table row per line name
    rows = [
        f'''
            <tr>
                <td> {line} </td>
            </tr>''' for line in lines
    ]
    return _render(intro + table_head + ''.join(rows) + '\n</table>')


def render_dst_data_hists(fname_helper: FileNameHelper):
    """Render the <object> tags embedding the EXTRA_HISTOGRAMS images.

    Returns the rendering of an empty string for hlt1.
    """
    if fname_helper.process == "hlt1":
        return _render("")

    image_tags = [
        f"""
            <object type="image/png" data="{fname_helper.hist_path(suffix, full_path=False)}"></object>
        """ for suffix in EXTRA_HISTOGRAMS
    ]
    return _render(''.join(image_tags))


def render_lines_pie_chart(fname_helper: FileNameHelper):
    """Render the section embedding the lines-per-WG pie chart.

    Returns the rendering of an empty string for hlt1.
    """
    if fname_helper.process == "hlt1":
        return _render("")

    pie_chart = fname_helper.pie_chart_path(full_path=False)
    html_str = f"""
        <p>
            <b>The number of selection lines per working group:</b> <br>
        </p>
        <object type="image/png" data="{pie_chart}"></object>
        <p>
            "Other" category contains those lines with a parsed name that doesn't belong to any known WG. <br>
            To make lines properly categorized, one should follow the naming convention -
            name of lines should start with `Hlt2/Spruce[WG]_`.
        </p>
    """
    return _render(html_str)


def render_bar_charts(fname_helper: FileNameHelper,
                      streams: list[str],
                      metrics=MAIN_BAR_CHARTS.keys()):
    """Render the per-WG bar-chart sections, one per metric.

    Args:
        fname_helper: Provides the process name and the chart paths.
        streams: Streams for which a chart is embedded under each metric.
        metrics: Metric keys to render (defaults to the MAIN_BAR_CHARTS keys).

    Returns:
        The rendered HTML; the rendering of an empty string for any process
        other than hlt2.
    """
    if fname_helper.process != "hlt2":
        return _render("")

    html_str = ''
    for metric in metrics:
        html_str += f'''
        <p>
            <b>{metric.capitalize()} within each stream:</b>
        </p>
        <p>
            "TotalInclusive" is the physical {metric} of the stream.<br>
            Each WG-specific {metric} bar is calculated by iterating through the events in the streamed HLT2 output file, and counting {metric} due to that event if one or more lines from that WG fired.<br>
            These WG-specific {metric}s are therefore not physical (no per-WG streams exist at HLT2) but roughly indicate the proportion of the stream's {metric} due to each WG.<br>
            "SumWGs" is the simple arithmetic sum of all bars except "TotalInclusive" - the former will be larger than the latter if there is non-negligible WG-by-WG overlap.<br>
        </p>
        '''
        # Extra explanation that only applies to the bandwidth charts.
        # (A stray "," that used to follow one of the <br> tags, and was
        # therefore shown on the page, has been removed.)
        if metric == "bandwidth":
            html_str += '''
            <p>
                In the Turbo stream, the WG categorisation corresponds almost exactly to the output streams of the Sprucing passthrough of Turbo.<br>
                However, the "SumWGs" bandwidth here will not equal the sum of the sprucing passthrough streams, as the file formats and compression settings are different between HLT2 and sprucing.<br>
                Nevertheless, significant WG-by-WG overlap in Turbo here is an indicator that the sprucing passthrough streams will be significantly inflated by overlap with other WGs.<br>
                (If an event fires Turbo lines from 2 WGs at HLT2, both candidates are streamed to both WG streams by the passthrough sprucing, inflating the bandwidth of both streams).<br>
            </p>
            '''
        for stream in streams:
            html_str += f'''
                <object type="image/png" data="{fname_helper.bar_chart_path(stream, metric, full_path=False)}"></object>
            '''
    return _render(html_str)


# Explanatory HTML placed at the top of both similarity-matrix pages
# (render_sim_matrices_page and render_extra_sim_matrices).
SIM_MATRICES_DESCR = """
    <p>
        The overlap between two streams, A and B, w.r.t to one of the stream, A, is computed as |A n B| / |A|.
        It shows how much events in the stream A are covered by another stream B. <br>
        The columns in the overlap matrices are target streams (A) and the rows are comparison streams (B),
        i.e. the numbers correspond to overlaps w.r.t to the column streams. <br>
    </p>
    <p>
        The Jaccard index between two streams, A and B, is computed as |A n B| / |A u B|.
        It shows how similar the two streams are and is useful in bandwidth division. <br>
    </p>
"""


def render_sim_matrices_page(fname_helper: FileNameHelper):
    """Render the page with the stream overlap and Jaccard matrices.

    Both matrices are HTML fragments read from the paths given by
    fname_helper. Returns the rendering of an empty string for hlt1.
    """
    if fname_helper.process == "hlt1":
        return _render("")

    sections = [SIM_MATRICES_DESCR]

    sections.append(f"""
        <p>
            The overlap matrix of the {fname_helper.stream_config} streams is:
        </p>
    """)
    with open(fname_helper.overlap_matrix_path(), "r") as overlap:
        sections.append(overlap.read())

    sections.append(f"""
        <p>
            The Jaccard similarity matrix of the {fname_helper.stream_config} streams is:
        </p>
    """)
    with open(fname_helper.jaccard_similarities_path(), "r") as jaccard:
        sections.append(jaccard.read())

    return _render("".join(sections))


def render_extra_sim_matrices(fname_helper: FileNameHelper,
                              streams: list[str]):
    """Render the page with the intra-stream overlap and Jaccard matrices.

    For every stream the two matrices are read as HTML fragments from the
    paths given by fname_helper. Returns the rendering of an empty string
    for any process other than hlt2.
    """
    if fname_helper.process != "hlt2":
        return _render("")

    sections = [SIM_MATRICES_DESCR]
    for stream in streams:
        sections.append(f"""
            <p>
                The overlap matrix of the {stream.capitalize()} stream is:
            </p>
        """)
        overlap_path = fname_helper.intra_stream_overlap_matrix_path(stream)
        with open(overlap_path, "r") as overlap:
            sections.append(overlap.read())

        sections.append(f"""
            <p>
                The Jaccard similarity matrix of the {stream.capitalize()} stream is:
            </p>
        """)
        jaccard_path = fname_helper.intra_stream_jaccard_similarities_path(
            stream)
        with open(jaccard_path, "r") as jaccard:
            sections.append(jaccard.read())

    return _render("".join(sections))


def list_of_links_html(fname_helper: FileNameHelper, building_locally: bool):
    """Return the <li> links to the secondary pages of one test.

    Which links are present depends on the process: hlt1 only gets the
    all-lines table; hlt2 and sprucing also get the per-stream tables, the
    similarity matrices, the "Other" lines list and the line descriptions;
    hlt2 additionally gets the per-WG pages.

    Returns:
        The links joined with newlines.
    """
    base_path = fname_helper.base_html_path(building_locally)

    def link(rel_path, text):
        # `text` is inserted verbatim, including any leading space.
        return f"""<li><a href="{base_path}/{rel_path}">{text}</a></li>"""

    links = [
        link(
            fname_helper.all_rates_html_page_path(full_path=False),
            " A single rate/bandwidth table featuring every trigger line in all streams"
        )
    ]

    process = fname_helper.process
    if process != "hlt1":
        links.append(
            link(
                fname_helper.final_rate_table_all_lines_split_by_stream_path(
                    full_path=False),
                " Rate/bandwidth tables for each stream, with 1 row per trigger line"
            ))
        links.append(
            link(
                fname_helper.sim_matrices_html_page_path(full_path=False),
                " Jaccard similarity and overlap matrices between streams"))

        # The hlt2-only links sit between the two groups shared with sprucing
        if process == "hlt2":
            links.append(
                link(
                    fname_helper.
                    final_rate_table_all_lines_split_by_stream_by_wg_path(
                        full_path=False),
                    " Rate/bandwidth tables for each stream, split also by WG, with 1 row per trigger line"
                ))
            links.append(
                link(
                    fname_helper.extra_bar_charts_html_page_path(
                        full_path=False),
                    "Bar charts as below for DstData bandwidth"))
            links.append(
                link(
                    fname_helper.extra_sim_matrices_html_page_path(
                        full_path=False),
                    "Similarity and overlap matrices between WGs within each stream"
                ))

        links.append(
            link(
                fname_helper.other_lines_html_page_path(full_path=False),
                'List of lines in "Other" category'))
        links.append(
            link(
                fname_helper.line_descr_path(full_path=False),
                " PersistReco and ExtraOutput info for all lines in all streams"
            ))

    return "\n".join(links)


def total_rate_and_bw(fname_helper: FileNameHelper):
    """Return (total rate, total bandwidth) summed over all streams.

    The values are the sums of the 'Rate (kHz)' and 'Total Bandwidth (GB/s)'
    columns of the all-streams rate table (csv) of the test.
    """
    csv_path = fname_helper.final_rate_table_all_streams_path(ext="csv")
    streams_df = pd.read_csv(csv_path)
    total_rate = sum(streams_df['Rate (kHz)'])
    total_bw = sum(streams_df['Total Bandwidth (GB/s)'])
    return total_rate, total_bw


def write_message(fname_helper: FileNameHelper,
                  bw_info_by_line: dict[str, LineRateBWInfo]):
    """Add summary numbers of this test to message.json.

    The total rate and bandwidth, the number of lines with zero rate and the
    number of lines above the per-process rate threshold are stored under
    ``[process][stream_config]`` of the existing JSON file, which is then
    rewritten. The file is read by the BandwidthTestHandler to send info to
    Gitlab and Mattermost.

    Returns:
        0
    """
    # Rate (in Hz) above which a line counts as "high rate"
    rate_thresholds = {'hlt1': 1e6, 'hlt2': 1000, 'spruce': 500}
    high = rate_thresholds[fname_helper.process]

    line_rates = [line_info.rate for line_info in bw_info_by_line.values()]
    n_low_rate = sum(1 for rate in line_rates if rate == 0)
    n_high_rate = sum(1 for rate in line_rates if rate > high)
    tot_rate, tot_bandwidth = total_rate_and_bw(fname_helper)

    # Read, update in memory, write back
    with open(fname_helper.message_path(), "r") as message:
        info = json.load(message)

    test_entry = info[fname_helper.process][fname_helper.stream_config]
    test_entry["total_rate"] = tot_rate
    test_entry["total_bandwidth"] = tot_bandwidth
    test_entry["n_low_rate"] = n_low_rate
    test_entry["n_high_rate"] = n_high_rate

    with open(fname_helper.message_path(), "w") as f:
        json.dump(info, f, indent=4)
    return 0


def make_html_for_single_test(process: str, stream_config: str,
                              input_config_path: str,
                              args: argparse.Namespace):
    """Produce every plot and HTML page of one (process, stream_config) test.

    Steps: read the stream names from the streaming-configuration JSON, make
    the bar charts and histograms, write the secondary pages and the index
    page, and finally add the summary numbers to message.json.
    """
    fname_helper = FileNameHelper(process, stream_config)

    with open(fname_helper.stream_config_json_path(),
              "r") as stream_config_json:
        streams = list(json.load(stream_config_json).keys())

    ### Plots
    # No headline chart for the streamless configuration
    if stream_config != "streamless":
        headline_bar_charts(fname_helper)

    if process == 'hlt2':
        make_per_wg_bar_charts(fname_helper, streams)

    df = pd.read_csv(
        fname_helper.final_rate_table_all_lines_path("csv"), sep=',')
    kHz_to_Hz = 1000
    rate_bw_info_by_line = {}
    for row in range(len(df)):
        # Rates are stored in Hz here, the table has them in kHz
        rate_bw_info_by_line[df['Line'][row]] = LineRateBWInfo(
            rate=df['Rate (kHz)'][row] * kHz_to_Hz,
            dst_size=df["Avg DstData Size (kB)"][row],
            tot_size=df["Avg Total Event Size (kB)"][row],
            dst_bw=df["DstData Bandwidth (GB/s)"][row],
            tot_bw=df["Total Bandwidth (GB/s)"][row])

    make_plots(rate_bw_info_by_line, fname_helper=fname_helper)

    ### Secondary pages (write_html_page skips empty renderings)
    other_lines = list_of_other_lines(process, rate_bw_info_by_line)
    write_html_page(
        fname_helper.other_lines_html_page_path(full_path=True),
        render_other_line_table(process, other_lines))

    write_html_page(
        fname_helper.all_rates_html_page_path(full_path=True),
        render_all_lines_page(fname_helper, args.building_locally))

    write_html_page(
        fname_helper.sim_matrices_html_page_path(full_path=True),
        render_sim_matrices_page(fname_helper))

    write_html_page(
        fname_helper.extra_bar_charts_html_page_path(full_path=True),
        render_bar_charts(
            fname_helper, streams, metrics=EXTRA_BAR_CHARTS.keys()))

    write_html_page(
        fname_helper.extra_sim_matrices_html_page_path(full_path=True),
        render_extra_sim_matrices(fname_helper, streams))

    ### Index page
    with open(fname_helper.index_html_page_path(full_path=True),
              "w") as html_file:
        html_file.write(
            render_single_test_page(process, stream_config,
                                    input_config_path, streams, args))

    ### Summary for GitLab
    write_message(fname_helper, rate_bw_info_by_line)


def total_bw_to_disk_bar_chart(stream_configs: list[str]):
    """Plot current vs. TDR total bandwidth to disk per stream configuration.

    For every entry of `stream_configs`, the second element returned by
    `total_rate_and_bw` for the "spruce" process is set against the 'total'
    entry of `TDR_BANDWIDTHS["spruce"]` for that stream configuration. A
    'Total' entry holding the sum is added to both sets, the summary is
    printed, and the result is handed to `_important_bar_chart_maker`.

    Args:
        stream_configs: Stream configuration names. Each one must be a key of
            `TDR_BANDWIDTHS["spruce"]`, otherwise a KeyError is raised.
    """
    disk_process = "spruce"
    tdr_by_stream_config = TDR_BANDWIDTHS[disk_process]

    # Bars are labelled with the pretty stream name when one is defined,
    # falling back to the raw stream config name otherwise.
    current_bw = {}
    for cfg in stream_configs:
        label = PRETTY_STREAM_NAMES.get(cfg, cfg)
        helper = FileNameHelper(disk_process, cfg)
        # Index 1 of the returned pair is used as the bandwidth value.
        current_bw[label] = total_rate_and_bw(helper)[1]

    tdr_bw = {}
    for cfg in stream_configs:
        label = PRETTY_STREAM_NAMES.get(cfg, cfg)
        tdr_bw[label] = tdr_by_stream_config[cfg]['total']

    # Totals are computed before the 'Total' key exists, so they only sum
    # the per-stream values.
    current_bw['Total'] = sum(current_bw.values())
    tdr_bw['Total'] = sum(tdr_bw.values())

    bandwidths = {"Current": current_bw, "TDR": tdr_bw}
    print("Summary of bandwidths to disk:")
    print(bandwidths)

    _important_bar_chart_maker(
        bandwidths, disk_process, is_to_total_to_disk_bar_chart=True)


if __name__ == '__main__':
    # Command-line entry point: builds the HTML pages for every test given via
    # --per-test-info, optionally the total-bandwidth-to-disk bar chart, and
    # finally the top-level index page linking everything together.
    parser = argparse.ArgumentParser(description='make_bandwidth_test_page')
    parser.add_argument(
        '--per-test-info',
        type=str,
        nargs='+',
        required=True,
        help=
        "List of strings, each being a colon-separated list corresponding to <process>:<stream_config>:<input_config_yaml_path>"
    )
    parser.add_argument(
        '-s',
        '--script-path',
        type=str,
        required=True,
        help=
        'Path to the top-level testing script that is running/calling this script.'
    )
    parser.add_argument(
        '--building-locally',
        action='store_true',
        help=
        'Makes links between pages work for building the pages locally rather than on the LHCbPR website.'
    )
    args = parser.parse_args()

    processes_and_stream_configs = []
    # Unpack args.per_test_info into process, stream_config, input_config
    for per_test_info in args.per_test_info:
        # Validate with parser.error rather than assert: assert statements are
        # removed under `python -O`, which would let malformed input through.
        per_test_fields = per_test_info.split(':')
        if len(per_test_fields) != 3:
            parser.error(
                "per_test_info must be colon-separated list of "
                "<process>:<stream_config>:<input_config_yaml_path> "
                f"(got {per_test_info!r})")
        process, stream_config, input_config = per_test_fields
        if process not in ('hlt1', 'hlt2', 'spruce'):
            parser.error("process must be one of 'hlt1', 'hlt2', 'spruce' "
                         f"(got {process!r})")

        make_html_for_single_test(process, stream_config, input_config, args)
        processes_and_stream_configs.append((process, stream_config))

    # Bar chart of total bandwidth to disk
    # Only produced when the spruce tests that were run cover exactly the
    # known spruce stream configurations (compared irrespective of order).
    expected_stream_configs_to_disk = KNOWN_STREAM_CONFIGS_BY_STAGE["spruce"]
    to_disk_stream_configs = [
        stream_config
        for process, stream_config in processes_and_stream_configs
        if process == 'spruce'
    ]
    make_total_bw_to_disk_bar_chart = sorted(
        expected_stream_configs_to_disk) == sorted(to_disk_stream_configs)
    if make_total_bw_to_disk_bar_chart:
        total_bw_to_disk_bar_chart(to_disk_stream_configs)

    # Top-level page
    base_path = FileNameHelper.base_html_path(args.building_locally)
    with open(FileNameHelper.top_level_index_html_path(), "w") as html_file:
        html = render_top_level_page(
            args.script_path,
            base_path,
            processes_and_stream_configs,
            to_disk_bar_chart=make_total_bw_to_disk_bar_chart)
        html_file.write(html)