Skip to content
Snippets Groups Projects

Refactor and speed-up the periodic BW tests

Merged Ross John Hunter requested to merge rjhunter-trim-fat-from-BW-tests into master
All threads resolved!
Files
Files changed: 17 (+96 −0)
###############################################################################
# (c) Copyright 2000-2023 CERN for the benefit of the LHCb Collaboration #
# #
# This software is distributed under the terms of the GNU General Public #
# Licence version 3 (GPL Version 3), copied verbatim in the file "COPYING". #
# #
# In applying this licence, CERN does not waive the privileges and immunities #
# granted to it by virtue of its status as an Intergovernmental Organization #
# or submit itself to any jurisdiction. #
###############################################################################
''' Compare event numbers extracted from mdf outputs of different streams to
quantify the overlap between the streams by Jaccard similarity index.
Writes similarity matrix to console and out to .html for usage in BW test page.
'''
import argparse
import pandas as pd
from sklearn.metrics.pairwise import pairwise_distances
import json
from PRConfig.bandwidth_helpers import FileNameHelper
def get_all_event_numbers(args):
    """Load and merge the per-stream event-number JSON files.

    Args:
        args: parsed CLI namespace; reads ``process``, ``stream_config``
            and ``streams``.

    Returns:
        dict mapping stream name to its list of event numbers, merged
        across every requested stream.
    """
    helper = FileNameHelper(args.process)
    merged = {}
    for stream_name in args.streams:
        path = helper.event_no_fname(args.stream_config, stream_name)
        with open(path, 'r') as json_file:
            merged.update(json.load(json_file))
    return merged
def calculate_similarity_matrix(event_numbers_by_stream):
    """Compute the pairwise Jaccard similarity between streams.

    Builds a boolean event-membership table (rows = event numbers,
    columns = streams) and derives the stream-vs-stream Jaccard
    similarity from it.

    Args:
        event_numbers_by_stream: dict mapping stream name to the list of
            event numbers present in that stream's output.

    Returns:
        pd.DataFrame: square symmetric matrix of Jaccard similarities,
        indexed by stream name on both axes (diagonal is 1.0).
    """
    all_event_numbers = {
        evt_no
        for evt_no_list in event_numbers_by_stream.values()
        for evt_no in evt_no_list
    }
    print(
        f"Found {len(all_event_numbers)} unique event numbers across {len(event_numbers_by_stream.keys())} streams."
    )
    # sorted() makes the row order deterministic; it has no effect on the
    # similarity values.
    df = pd.DataFrame(
        False,
        index=sorted(all_event_numbers),
        columns=list(event_numbers_by_stream.keys()))
    for stream, evt_no_list in event_numbers_by_stream.items():
        # Single label-based write per stream. The previous form
        # (df[stream][evt_no] = True) was chained indexing, which writes to
        # a temporary under pandas copy-on-write and silently leaves df
        # unchanged on pandas >= 2 with CoW enabled (mandatory in 3.0).
        df.loc[list(evt_no_list), stream] = True
    # .T because pairwise_distances compares rows, and the fields we want
    # similarities between (the streams) are columns of df.
    jaccard = 1 - pairwise_distances(df.T.to_numpy(), metric='jaccard')
    jaccard_sim_matrix_df = pd.DataFrame(
        jaccard, columns=df.columns, index=df.columns)
    return jaccard_sim_matrix_df
def save(df, htmlpath):
    """Render the similarity matrix as an HTML table and write it out.

    Args:
        df: similarity-matrix DataFrame with values as fractions in [0, 1].
        htmlpath: destination path for the HTML file.
    """
    # Render fractions as percentages with one decimal, e.g. 0.5 -> "50.0%".
    table = df.to_html(float_format=lambda frac: f"{frac:.1%}")
    with open(htmlpath, 'w') as out:
        out.write(table)
def main():
    """CLI entry point: report per-stream event counts, then compute and
    publish the stream-overlap (Jaccard) similarity matrix."""
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '-p', '--process', type=str, required=True, choices=['hlt2', 'spruce'])
    parser.add_argument(
        '--stream-config',
        type=str,
        required=True,
        choices=["wg", "production"])
    parser.add_argument('--streams', nargs='+', type=str, required=True)
    args = parser.parse_args()

    event_numbers = get_all_event_numbers(args)
    for stream in args.streams:
        print(
            f"Found {len(event_numbers[stream])} events for {stream} stream.")

    fname_helper = FileNameHelper(args.process)
    ofile = fname_helper.jaccard_similarities_path(args.stream_config)
    sim_matrix = calculate_similarity_matrix(event_numbers)
    print(
        f"Calculated similarity matrix. Printing and saving to html at {ofile}."
    )
    print(sim_matrix)
    save(sim_matrix, ofile)


if __name__ == "__main__":
    main()
Loading