Skip to content
Snippets Groups Projects

Refactor and speed-up the periodic BW tests

Merged Ross John Hunter requested to merge rjhunter-trim-fat-from-BW-tests into master
All threads resolved!
Files
Files changed: 17 (+96 −0)
###############################################################################
# (c) Copyright 2000-2023 CERN for the benefit of the LHCb Collaboration #
# #
# This software is distributed under the terms of the GNU General Public #
# Licence version 3 (GPL Version 3), copied verbatim in the file "COPYING". #
# #
# In applying this licence, CERN does not waive the privileges and immunities #
# granted to it by virtue of its status as an Intergovernmental Organization #
# or submit itself to any jurisdiction. #
###############################################################################
''' Compare event numbers extracted from mdf outputs of different streams to
quantify the overlap between the streams by Jaccard similarity index.
Writes similarity matrix to console and out to .html for usage in BW test page.
'''
import argparse
import pandas as pd
from sklearn.metrics.pairwise import pairwise_distances
import json
from PRConfig.bandwidth_helpers import FileNameHelper
def get_all_event_numbers(args):
    """Load and merge the per-stream event-number JSON files.

    Args:
        args: parsed CLI namespace; reads ``process``, ``stream_config``
            and ``streams``.

    Returns:
        dict mapping stream name to its list of event numbers, merged
        across every requested stream.
    """
    helper = FileNameHelper(args.process)
    merged = {}
    for stream_name in args.streams:
        path = helper.event_no_fname(args.stream_config, stream_name)
        with open(path, 'r') as json_file:
            merged.update(json.load(json_file))
    return merged
def calculate_similarity_matrix(event_numbers_by_stream):
    """Compute the pairwise Jaccard similarity between streams.

    Builds a boolean event-membership table (rows = event numbers,
    columns = streams) and derives the stream-vs-stream Jaccard
    similarity from it.

    Args:
        event_numbers_by_stream: dict mapping stream name to the list of
            event numbers present in that stream's output.

    Returns:
        pd.DataFrame: square symmetric matrix of Jaccard similarities,
        indexed by stream name on both axes (diagonal is 1.0).
    """
    all_event_numbers = {
        evt_no
        for evt_no_list in event_numbers_by_stream.values()
        for evt_no in evt_no_list
    }
    print(
        f"Found {len(all_event_numbers)} unique event numbers across {len(event_numbers_by_stream.keys())} streams."
    )
    # sorted() makes the row order deterministic; it has no effect on the
    # similarity values.
    df = pd.DataFrame(
        False,
        index=sorted(all_event_numbers),
        columns=list(event_numbers_by_stream.keys()))
    for stream, evt_no_list in event_numbers_by_stream.items():
        # Single label-based write per stream. The previous form
        # (df[stream][evt_no] = True) was chained indexing, which writes to
        # a temporary under pandas copy-on-write and silently leaves df
        # unchanged on pandas >= 2 with CoW enabled (mandatory in 3.0).
        df.loc[list(evt_no_list), stream] = True
    # .T because pairwise_distances compares rows, and the fields we want
    # similarities between (the streams) are columns of df.
    jaccard = 1 - pairwise_distances(df.T.to_numpy(), metric='jaccard')
    jaccard_sim_matrix_df = pd.DataFrame(
        jaccard, columns=df.columns, index=df.columns)
    return jaccard_sim_matrix_df
def save(df, htmlpath):
    """Render the similarity matrix as an HTML table and write it out.

    Args:
        df: similarity-matrix DataFrame with values as fractions in [0, 1].
        htmlpath: destination path for the HTML file.
    """
    # Render fractions as percentages with one decimal, e.g. 0.5 -> "50.0%".
    table = df.to_html(float_format=lambda frac: f"{frac:.1%}")
    with open(htmlpath, 'w') as out:
        out.write(table)
def main():
    """CLI entry point: report per-stream event counts, then compute and
    publish the stream-overlap (Jaccard) similarity matrix."""
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '-p', '--process', type=str, required=True, choices=['hlt2', 'spruce'])
    parser.add_argument(
        '--stream-config',
        type=str,
        required=True,
        choices=["wg", "production"])
    parser.add_argument('--streams', nargs='+', type=str, required=True)
    args = parser.parse_args()

    event_numbers = get_all_event_numbers(args)
    for stream in args.streams:
        print(
            f"Found {len(event_numbers[stream])} events for {stream} stream.")

    fname_helper = FileNameHelper(args.process)
    ofile = fname_helper.jaccard_similarities_path(args.stream_config)
    sim_matrix = calculate_similarity_matrix(event_numbers)
    print(
        f"Calculated similarity matrix. Printing and saving to html at {ofile}."
    )
    print(sim_matrix)
    save(sim_matrix, ofile)


if __name__ == "__main__":
    main()
Loading