From b5122c01fb8d04230f6a5486e0628cf4470c1c91 Mon Sep 17 00:00:00 2001 From: Ryunosuke O'Neil <r.oneil@cern.ch> Date: Fri, 26 Nov 2021 17:41:47 +0100 Subject: [PATCH 01/23] Average throughput decrease checker --- .gitlab-ci.yml | 2 + checker/plotting/csv_plotter.py | 93 ++++++++++------------- checker/plotting/post_combined_message.py | 74 ++++++++++++++---- scripts/ci/jobs/publish_throughput.sh | 17 ++++- 4 files changed, 117 insertions(+), 69 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 56b036c96d6..d9147a2d513 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -15,6 +15,8 @@ variables: RUN_THROUGHPUT_OPTIONS_HIP: "-n 5000 --events-per-slice 5000 -m 3000 -t 10 -r 1000" RUN_THROUGHPUT_OPTIONS_CPU: "-n 100 -m 100 -r 200" + AVG_THROUGHPUT_DECREASE_THRESHOLD: "-2.5" # (%); fail throughput jobs if averaged throughput % change falls below -2.5% + OVERRIDE_CUDA_ARCH_FLAG: "-gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_75,code=sm_75 -gencode=arch=compute_86,code=sm_86" stages: diff --git a/checker/plotting/csv_plotter.py b/checker/plotting/csv_plotter.py index 860c9a447e3..55db8323d28 100755 --- a/checker/plotting/csv_plotter.py +++ b/checker/plotting/csv_plotter.py @@ -3,30 +3,33 @@ # (c) Copyright 2018-2020 CERN for the benefit of the LHCb Collaboration # ############################################################################### import csv -import subprocess -import traceback from optparse import OptionParser from termgraph import TermGraph import requests -import urllib -def get_master_throughput(job_name, - csvfile="devices_throughputs.csv", - scale=1.0): +def parse_throughput(content, scale=1.0): + throughput = {} + content_reader = csv.reader(content.splitlines()) + for row in content_reader: + if row: + throughput[row[0]] = float(row[1]) * scale + return throughput + + +def get_master_throughput( + job_name, + url="https://gitlab.cern.ch/lhcb/Allen/-/jobs/artifacts/master/raw/", + csvfile="devices_throughputs.csv", + scale=1.0, +): try: master_throughput = {} if job_name: - base_url = ( - "https://gitlab.cern.ch/lhcb/Allen/-/jobs/artifacts/master/raw/" - + csvfile) - r = requests.get( - base_url, params={"job": job_name}, allow_redirects=True) + base_url = url + csvfile + r = requests.get(base_url, params={"job": job_name}, allow_redirects=True) content = r.content.decode("utf-8") - content_reader = csv.reader(content.splitlines()) - for row in content_reader: - if row: - master_throughput[row[0]] = float(row[1]) * scale + master_throughput = parse_throughput(content, scale=scale) return master_throughput except Exception as e: print("get_master_throughput exception:", e) @@ -38,8 +41,7 @@ def format_text(title, plot_data, unit, x_max, master_throughput={}): final_vals = [] final_tags = [] - keylist = sorted( - plot_data.keys(), key=lambda x: plot_data[x], reverse=True) + keylist = sorted(plot_data.keys(), key=lambda x: plot_data[x], reverse=True) for k in keylist: val = plot_data[k] final_tags.append(k) @@ -54,8 +56,7 @@ def format_text(title, plot_data, unit, x_max, master_throughput={}): # Add relative throughputs if requested if master_throughput: speedup_wrt_master = { - a: plot_data.get(a, b) / b - for a, b in master_throughput.items() + a: plot_data.get(a, b) / b for a, b in master_throughput.items() } annotated_output = "" for line in output.splitlines(): @@ -72,39 +73,25 @@ def format_text(title, plot_data, unit, x_max, master_throughput={}): def send_to_mattermost(text, mattermost_url): - subprocess.call([ - "curl", - "-i", - "-X", - "POST", - "-H", - "Content-Type: application/json", - "-d", - text, - mattermost_url, - ]) + request_json = {"text": text} + response = requests.post( + mattermost_url, json=request_json, headers={"Content-Type": "application/json"} + ) + + assert response.ok, "send_to_mattermost request failed." def produce_plot( - filename, - unit="", - title="", - x_max=10, - mattermost_url=None, - scale=1.0, - normalize=False, - print_text=True, - master_throughput={}, + plot_data, + unit="", + title="", + x_max=10, + mattermost_url=None, + scale=1.0, + normalize=False, + print_text=True, + master_throughput={}, ): - plot_data = {} - with open(filename) as csvfile: - csv_reader = csv.reader(csvfile, delimiter=",") - for row in csv_reader: - try: - plot_data[row[0]] = float(row[1]) * scale - except: - print(traceback.format_exc()) - # Convert throughputs to speedups if normalize: norm = min(plot_data.values()) @@ -112,7 +99,8 @@ def produce_plot( plot_data[k] /= norm text, raw_output = format_text( - title, plot_data, unit, x_max, master_throughput=master_throughput) + title, plot_data, unit, x_max, master_throughput=master_throughput + ) if print_text: print(text) @@ -127,8 +115,8 @@ def main(): Produces a plot of the performance breakdown of the sequence under execution """ usage = ( - "%prog [options] <data_file>\n" + - 'Example: %prog data.csv -m "http://{your-mattermost-site}/hooks/xxx-generatedkey-xxx"' + "%prog [options] <data_file>\n" + + 'Example: %prog data.csv -m "http://{your-mattermost-site}/hooks/xxx-generatedkey-xxx"' ) parser = OptionParser(usage=usage) parser.add_option( @@ -142,8 +130,7 @@ def main(): "--unit", dest="unit", default="", - help= - "A unit suffix to append to evey value. Default is an empty string", + help="A unit suffix to append to evey value. Default is an empty string", ) parser.add_option( "-x", diff --git a/checker/plotting/post_combined_message.py b/checker/plotting/post_combined_message.py index 42aab4ef396..8d22421e4fa 100644 --- a/checker/plotting/post_combined_message.py +++ b/checker/plotting/post_combined_message.py @@ -3,8 +3,14 @@ # (c) Copyright 2018-2020 CERN for the benefit of the LHCb Collaboration # ############################################################################### import os +import sys from optparse import OptionParser -from csv_plotter import produce_plot, send_to_mattermost, get_master_throughput +from csv_plotter import ( + produce_plot, + send_to_mattermost, + get_master_throughput, + parse_throughput, +) def main(): @@ -13,15 +19,13 @@ def main(): """ usage = ( "%prog [options] <-t throughput_data_file> <-b throughput_breakdown_data_file>\n" - + - 'Example: %prog -t throughput_data.csv -b throughput_breakdown.csv -m "http://{your-mattermost-site}/hooks/xxx-generatedkey-xxx"' + + 'Example: %prog -t throughput_data.csv -b throughput_breakdown.csv -m "http://{your-mattermost-site}/hooks/xxx-generatedkey-xxx"' ) parser = OptionParser(usage=usage) parser.add_option( "-m", "--mattermost_url", - default=os.environ["MATTERMOST_KEY"] - if "MATTERMOST_KEY" in os.environ else "", + default=os.environ["MATTERMOST_KEY"] if "MATTERMOST_KEY" in os.environ else "", dest="mattermost_url", help="The url where to post outputs generated for mattermost", ) @@ -45,7 +49,13 @@ def main(): help="Title for your graph. (default: empty string)", ) parser.add_option( - "-j", "--job", dest="job", default="", help="Name of CI job") + "--allowed-average-decrease", + dest="min_avg_tput_change", + default=-2.5, + help="Max tolerated average throughput decrease (%).", + ) + + parser.add_option("-j", "--job", dest="job", default="", help="Name of CI job") (options, args) = parser.parse_args() @@ -54,27 +64,63 @@ def main(): "No mattermost URL was found in MATTERMOST_KEY, or passed as a command line argument." ) + with open(options.throughput) as csvfile: + throughput = parse_throughput(csvfile.read(), scale=1e-3) + with open(options.breakdown) as csvfile: + breakdown = parse_throughput(csvfile.read(), scale=1) + master_throughput = get_master_throughput( - options.job, csvfile=options.throughput, scale=1e-3) + options.job, csvfile=options.throughput, scale=1e-3 + ) + + speedup_wrt_master = { + a: throughput.get(a, b) / b for a, b in master_throughput.items() + } + + # Average throughputs across all devices and complain if we are above decr % threshold + avg_throughput_decr = False + extra_messages = "" + + n_dev = len(speedup_wrt_master.values()) + if n_dev > 0: + average_speedup = sum(speedup_wrt_master.values()) / n_dev + change = (average_speedup - 1.0) * 100.0 + + print(f"Device-averaged speedup: {average_speedup}") + print(f" % change: {change}") + + extra_messages = f"Device-averaged speedup (% change): {average_speedup:.2f} ({change:.2f} %)" + + tput_tol = float(options.min_avg_tput_change) + + if change < tput_tol: + print("*** Average throughput decrease above threshold.") + extra_messages += ( + f" :warning: decrease exceeds {abs(float(tput_tol))} % threshold" + ) + avg_throughput_decr = True + else: + print("No throughput reference available") + extra_messages = f":warning: No reference available for comparison." + throughput_text = produce_plot( - options.throughput, + throughput, master_throughput=master_throughput, unit="kHz", scale=1e-3, print_text=True, ) - breakdown_text = produce_plot(options.breakdown, unit="%", print_text=True) + breakdown_text = produce_plot(breakdown, unit="%", print_text=True) + text = f"""{{"text": "{options.title}:\n{extra_messages}\n```\n{throughput_text}```\n\nBreakdown of sequence:\n```\n{breakdown_text}```"}}""" - text = '{"text": "%s:\n```\n%s```\n\nBreakdown of sequence:\n```\n%s```"}' % ( - options.title, - throughput_text, - breakdown_text, - ) print(text) if options.mattermost_url is not None: send_to_mattermost(text, options.mattermost_url) + if avg_throughput_decr: + sys.exit(5) + if __name__ == "__main__": main() diff --git a/scripts/ci/jobs/publish_throughput.sh b/scripts/ci/jobs/publish_throughput.sh index bd0fdeb265e..b45a063b9bd 100755 --- a/scripts/ci/jobs/publish_throughput.sh +++ b/scripts/ci/jobs/publish_throughput.sh @@ -14,6 +14,8 @@ echo "" echo "run_throughput outputs:" ls -1 | grep output | grep run_throughput +THROUGHPUT_ALARM=1 +THROUGHPUT_MESSAGES="" for SEQUENCE_DATASET in $(ls -1 | grep "run_throughput" | grep -Ei "run_throughput_output_([a-z0-9_]+?)" | sed 's/^run_throughput_output_//') ; do INPUT_FILES=$(cat run_throughput_output_${SEQUENCE_DATASET}/${BREAKDOWN_DEVICE_ID}/input_files.txt) SEQUENCE=$(cat run_throughput_output_${SEQUENCE_DATASET}/${BREAKDOWN_DEVICE_ID}/sequence.txt) @@ -29,7 +31,18 @@ for SEQUENCE_DATASET in $(ls -1 | grep "run_throughput" | grep -Ei "run_throughp -j "${CI_JOB_NAME}" \ -l "Throughput of [branch **\`${CI_COMMIT_REF_NAME} (${CI_COMMIT_SHORT_SHA})\`**, sequence **\`${SEQUENCE}\`** over dataset **\`${INPUT_FILES}\`**](https://gitlab.cern.ch/lhcb/Allen/pipelines/${CI_PIPELINE_ID})" \ -t devices_throughputs_${SEQUENCE_DATASET}.csv \ - -b run_throughput_output_${SEQUENCE_DATASET}/${BREAKDOWN_DEVICE_ID}/algo_breakdown.csv - + -b run_throughput_output_${SEQUENCE_DATASET}/${BREAKDOWN_DEVICE_ID}/algo_breakdown.csv \ + --allowed-average-decrease "${AVG_THROUGHPUT_DECREASE_THRESHOLD}" # (%) python checker/plotting/post_telegraf.py -f devices_throughputs_${SEQUENCE_DATASET}.csv . -s "${SEQUENCE}" -b "${CI_COMMIT_REF_NAME}" -d "${INPUT_FILES}" + RC=$? + + if [ "$RC"= "5" ]; then + THROUGHPUT_ALARM=1 + THROUGHPUT_MESSAGES="${THROUGHPUT_MESSAGES}\n*** Device-averaged throughput change is less than ${AVG_THROUGHPUT_DECREASE_THRESHOLD} %" + fi done + + +echo ${THROUGHPUT_MESSAGES} + +exit $THROUGHPUT_ALARM -- GitLab From 5297976737df7b28c1e015bbf6d20e39c85e15b0 Mon Sep 17 00:00:00 2001 From: Gitlab CI <noreply@cern.ch> Date: Fri, 26 Nov 2021 16:42:47 +0000 Subject: [PATCH 02/23] Fixed formatting patch generated by https://gitlab.cern.ch/lhcb/Allen/-/jobs/17904736 --- checker/plotting/csv_plotter.py | 50 ++++++++++++++++++--------------- 1 file changed, 27 insertions(+), 23 deletions(-) diff --git a/checker/plotting/csv_plotter.py b/checker/plotting/csv_plotter.py index 55db8323d28..3bd1614374b 100755 --- a/checker/plotting/csv_plotter.py +++ b/checker/plotting/csv_plotter.py @@ -18,16 +18,17 @@ def parse_throughput(content, scale=1.0): def get_master_throughput( - job_name, - url="https://gitlab.cern.ch/lhcb/Allen/-/jobs/artifacts/master/raw/", - csvfile="devices_throughputs.csv", - scale=1.0, + job_name, + url="https://gitlab.cern.ch/lhcb/Allen/-/jobs/artifacts/master/raw/", + csvfile="devices_throughputs.csv", + scale=1.0, ): try: master_throughput = {} if job_name: base_url = url + csvfile - r = requests.get(base_url, params={"job": job_name}, allow_redirects=True) + r = requests.get( + base_url, params={"job": job_name}, allow_redirects=True) content = r.content.decode("utf-8") master_throughput = parse_throughput(content, scale=scale) return master_throughput @@ -41,7 +42,8 @@ def format_text(title, plot_data, unit, x_max, master_throughput={}): final_vals = [] final_tags = [] - keylist = sorted(plot_data.keys(), key=lambda x: plot_data[x], reverse=True) + keylist = sorted( + plot_data.keys(), key=lambda x: plot_data[x], reverse=True) for k in keylist: val = plot_data[k] final_tags.append(k) @@ -56,7 +58,8 @@ def format_text(title, plot_data, unit, x_max, master_throughput={}): # Add relative throughputs if requested if master_throughput: speedup_wrt_master = { - a: plot_data.get(a, b) / b for a, b in master_throughput.items() + a: plot_data.get(a, b) / b + for a, b in master_throughput.items() } annotated_output = "" for line in output.splitlines(): @@ -75,22 +78,23 @@ def format_text(title, plot_data, unit, x_max, master_throughput={}): def send_to_mattermost(text, mattermost_url): request_json = {"text": text} response = requests.post( - mattermost_url, json=request_json, headers={"Content-Type": "application/json"} - ) + mattermost_url, + json=request_json, + headers={"Content-Type": "application/json"}) assert response.ok, "send_to_mattermost request failed." def produce_plot( - plot_data, - unit="", - title="", - x_max=10, - mattermost_url=None, - scale=1.0, - normalize=False, - print_text=True, - master_throughput={}, + plot_data, + unit="", + title="", + x_max=10, + mattermost_url=None, + scale=1.0, + normalize=False, + print_text=True, + master_throughput={}, ): # Convert throughputs to speedups if normalize: @@ -99,8 +103,7 @@ def produce_plot( plot_data[k] /= norm text, raw_output = format_text( - title, plot_data, unit, x_max, master_throughput=master_throughput - ) + title, plot_data, unit, x_max, master_throughput=master_throughput) if print_text: print(text) @@ -115,8 +118,8 @@ def main(): Produces a plot of the performance breakdown of the sequence under execution """ usage = ( - "%prog [options] <data_file>\n" - + 'Example: %prog data.csv -m "http://{your-mattermost-site}/hooks/xxx-generatedkey-xxx"' + "%prog [options] <data_file>\n" + + 'Example: %prog data.csv -m "http://{your-mattermost-site}/hooks/xxx-generatedkey-xxx"' ) parser = OptionParser(usage=usage) parser.add_option( @@ -130,7 +133,8 @@ def main(): "--unit", dest="unit", default="", - help="A unit suffix to append to evey value. Default is an empty string", + help= + "A unit suffix to append to evey value. Default is an empty string", ) parser.add_option( "-x", -- GitLab From 1b1456471ffd0ad20a2819ba915c7fef4e8e380c Mon Sep 17 00:00:00 2001 From: Ryunosuke O'Neil <r.oneil@cern.ch> Date: Fri, 26 Nov 2021 18:03:08 +0100 Subject: [PATCH 03/23] Fix typo --- scripts/ci/jobs/publish_throughput.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/ci/jobs/publish_throughput.sh b/scripts/ci/jobs/publish_throughput.sh index b45a063b9bd..d1c4480dd00 100755 --- a/scripts/ci/jobs/publish_throughput.sh +++ b/scripts/ci/jobs/publish_throughput.sh @@ -14,7 +14,7 @@ echo "" echo "run_throughput outputs:" ls -1 | grep output | grep run_throughput -THROUGHPUT_ALARM=1 +THROUGHPUT_ALARM=0 THROUGHPUT_MESSAGES="" for SEQUENCE_DATASET in $(ls -1 | grep "run_throughput" | grep -Ei "run_throughput_output_([a-z0-9_]+?)" | sed 's/^run_throughput_output_//') ; do INPUT_FILES=$(cat run_throughput_output_${SEQUENCE_DATASET}/${BREAKDOWN_DEVICE_ID}/input_files.txt) -- GitLab From 0232cedd90444805f1241d806ac68074e1e7171b Mon Sep 17 00:00:00 2001 From: Ryunosuke O'Neil <r.oneil@cern.ch> Date: Fri, 26 Nov 2021 18:13:41 +0100 Subject: [PATCH 04/23] Single-device throughput decrease alarm --- .gitlab-ci.yml | 3 ++- checker/plotting/post_combined_message.py | 32 +++++++++++++++++++++-- scripts/ci/jobs/publish_throughput.sh | 9 +++++-- 3 files changed, 39 insertions(+), 5 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index d9147a2d513..d149723ddc3 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -15,7 +15,8 @@ variables: RUN_THROUGHPUT_OPTIONS_HIP: "-n 5000 --events-per-slice 5000 -m 3000 -t 10 -r 1000" RUN_THROUGHPUT_OPTIONS_CPU: "-n 100 -m 100 -r 200" - AVG_THROUGHPUT_DECREASE_THRESHOLD: "-2.5" # (%); fail throughput jobs if averaged throughput % change falls below -2.5% + AVG_THROUGHPUT_DECREASE_THRESHOLD: "-2.5" # (%); fail throughput check if averaged throughput % change falls below -2.5% + DEVICE_THROUGHPUT_DECREASE_THRESHOLD: "-5.0" # (%); fail throughput check if single device throughput % change falls below -5.0% OVERRIDE_CUDA_ARCH_FLAG: "-gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_75,code=sm_75 -gencode=arch=compute_86,code=sm_86" diff --git a/checker/plotting/post_combined_message.py b/checker/plotting/post_combined_message.py index 8d22421e4fa..e8f08bee516 100644 --- a/checker/plotting/post_combined_message.py +++ b/checker/plotting/post_combined_message.py @@ -48,6 +48,9 @@ def main(): default="", help="Title for your graph. (default: empty string)", ) + + parser.add_option("-j", "--job", dest="job", default="", help="Name of CI job") + parser.add_option( "--allowed-average-decrease", dest="min_avg_tput_change", @@ -55,7 +58,12 @@ def main(): help="Max tolerated average throughput decrease (%).", ) - parser.add_option("-j", "--job", dest="job", default="", help="Name of CI job") + parser.add_option( + "--allowed-single-decrease", + dest="min_single_tput_change", + default=-5.0, + help="Max tolerated single-device throughput decrease (%).", + ) (options, args) = parser.parse_args() @@ -79,6 +87,7 @@ def main(): # Average throughputs across all devices and complain if we are above decr % threshold avg_throughput_decr = False + single_throughput_decr = False extra_messages = "" n_dev = len(speedup_wrt_master.values()) @@ -96,13 +105,29 @@ def main(): if change < tput_tol: print("*** Average throughput decrease above threshold.") extra_messages += ( - f" :warning: decrease exceeds {abs(float(tput_tol))} % threshold" + f" :warning: decrease _exceeds_ {abs(float(tput_tol))} % threshold\n" ) avg_throughput_decr = True else: print("No throughput reference available") extra_messages = f":warning: No reference available for comparison." + if not avg_throughput_decr: + extra_messages += "\n" if len(extra_messages) > 0 else "" + tput_tol = float(options.min_single_tput_change) + + for device, speedup in speedup_wrt_master.items(): + change = (speedup - 1.0) * 100.0 + print(f"{device} speedup: {average_speedup}") + print(f"{device} % change: {change}") + + if change < tput_tol: + print( + f"*** {device} Single-device throughput decrease above threshold." + ) + extra_messages += f":warning: **{device}** throughput decrease _exceeds_ {abs(float(tput_tol))} % threshold\n" + avg_throughput_decr = True + throughput_text = produce_plot( throughput, master_throughput=master_throughput, @@ -121,6 +146,9 @@ def main(): if avg_throughput_decr: sys.exit(5) + if single_throughput_decr: + sys.exit(6) + if __name__ == "__main__": main() diff --git a/scripts/ci/jobs/publish_throughput.sh b/scripts/ci/jobs/publish_throughput.sh index d1c4480dd00..b2acc95b733 100755 --- a/scripts/ci/jobs/publish_throughput.sh +++ b/scripts/ci/jobs/publish_throughput.sh @@ -32,14 +32,19 @@ for SEQUENCE_DATASET in $(ls -1 | grep "run_throughput" | grep -Ei "run_throughp -l "Throughput of [branch **\`${CI_COMMIT_REF_NAME} (${CI_COMMIT_SHORT_SHA})\`**, sequence **\`${SEQUENCE}\`** over dataset **\`${INPUT_FILES}\`**](https://gitlab.cern.ch/lhcb/Allen/pipelines/${CI_PIPELINE_ID})" \ -t devices_throughputs_${SEQUENCE_DATASET}.csv \ -b run_throughput_output_${SEQUENCE_DATASET}/${BREAKDOWN_DEVICE_ID}/algo_breakdown.csv \ - --allowed-average-decrease "${AVG_THROUGHPUT_DECREASE_THRESHOLD}" # (%) + --allowed-average-decrease "${AVG_THROUGHPUT_DECREASE_THRESHOLD}" \ + --allowed-single-decrease "${DEVICE_THROUGHPUT_DECREASE_THRESHOLD}" # (%) python checker/plotting/post_telegraf.py -f devices_throughputs_${SEQUENCE_DATASET}.csv . -s "${SEQUENCE}" -b "${CI_COMMIT_REF_NAME}" -d "${INPUT_FILES}" RC=$? - if [ "$RC"= "5" ]; then + if [ "$RC" = "5" ]; then THROUGHPUT_ALARM=1 THROUGHPUT_MESSAGES="${THROUGHPUT_MESSAGES}\n*** Device-averaged throughput change is less than ${AVG_THROUGHPUT_DECREASE_THRESHOLD} %" fi + if [ "$RC" = "6" ]; then + THROUGHPUT_ALARM=1 + THROUGHPUT_MESSAGES="${THROUGHPUT_MESSAGES}\n*** Single-device throughput change, for at least one device, is less than ${DEVICE_THROUGHPUT_DECREASE_THRESHOLD} %" + fi done -- GitLab From a1bcebcab3302a32af4e3d21b0aa3e5a41de7cf2 Mon Sep 17 00:00:00 2001 From: Ryunosuke O'Neil <r.oneil@cern.ch> Date: Fri, 26 Nov 2021 18:20:32 +0100 Subject: [PATCH 05/23] improved formatting --- checker/plotting/post_combined_message.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/checker/plotting/post_combined_message.py b/checker/plotting/post_combined_message.py index e8f08bee516..7400e3bb8b0 100644 --- a/checker/plotting/post_combined_message.py +++ b/checker/plotting/post_combined_message.py @@ -104,9 +104,7 @@ def main(): if change < tput_tol: print("*** Average throughput decrease above threshold.") - extra_messages += ( - f" :warning: decrease _exceeds_ {abs(float(tput_tol))} % threshold\n" - ) + extra_messages += f" :warning: :eyes: decrease _exceeds_ {abs(float(tput_tol))} % threshold\n" avg_throughput_decr = True else: print("No throughput reference available") @@ -125,7 +123,7 @@ def main(): print( f"*** {device} Single-device throughput decrease above threshold." ) - extra_messages += f":warning: **{device}** throughput decrease _exceeds_ {abs(float(tput_tol))} % threshold\n" + extra_messages += f":warning: :eyes: **{device}** throughput decrease _exceeds_ {abs(float(tput_tol))} % threshold\n" avg_throughput_decr = True throughput_text = produce_plot( @@ -136,9 +134,8 @@ def main(): print_text=True, ) breakdown_text = produce_plot(breakdown, unit="%", print_text=True) - text = f"""{{"text": "{options.title}:\n{extra_messages}\n```\n{throughput_text}```\n\nBreakdown of sequence:\n```\n{breakdown_text}```"}}""" - print(text) + text = f"{options.title}:\n```\n{throughput_text}```\n{extra_messages}\n\nBreakdown of sequence:\n```\n{breakdown_text}```" if options.mattermost_url is not None: send_to_mattermost(text, options.mattermost_url) -- GitLab From 85214d6fd6f6b8fcbd2a07ad5d02f42780ea03a8 Mon Sep 17 00:00:00 2001 From: Ryunosuke O'Neil <r.oneil@cern.ch> Date: Fri, 26 Nov 2021 19:20:53 +0100 Subject: [PATCH 06/23] make threshold less sensitive at -7.5%. Catch throughput alarm properly --- .gitlab-ci.yml | 2 +- scripts/ci/jobs/publish_throughput.sh | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index d149723ddc3..02a3d41ef4c 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -16,7 +16,7 @@ variables: RUN_THROUGHPUT_OPTIONS_CPU: "-n 100 -m 100 -r 200" AVG_THROUGHPUT_DECREASE_THRESHOLD: "-2.5" # (%); fail throughput check if averaged throughput % change falls below -2.5% - DEVICE_THROUGHPUT_DECREASE_THRESHOLD: "-5.0" # (%); fail throughput check if single device throughput % change falls below -5.0% + DEVICE_THROUGHPUT_DECREASE_THRESHOLD: "-7.5" # (%); fail throughput check if single device throughput % change falls below -10.0% OVERRIDE_CUDA_ARCH_FLAG: "-gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_75,code=sm_75 -gencode=arch=compute_86,code=sm_86" diff --git a/scripts/ci/jobs/publish_throughput.sh b/scripts/ci/jobs/publish_throughput.sh index b2acc95b733..44c2c166ddd 100755 --- a/scripts/ci/jobs/publish_throughput.sh +++ b/scripts/ci/jobs/publish_throughput.sh @@ -3,7 +3,7 @@ # (c) Copyright 2018-2020 CERN for the benefit of the LHCb Collaboration # ############################################################################### -set -euxo pipefail +set -uxo pipefail setupViews @@ -34,8 +34,9 @@ for SEQUENCE_DATASET in $(ls -1 | grep "run_throughput" | grep -Ei "run_throughp -b run_throughput_output_${SEQUENCE_DATASET}/${BREAKDOWN_DEVICE_ID}/algo_breakdown.csv \ --allowed-average-decrease "${AVG_THROUGHPUT_DECREASE_THRESHOLD}" \ --allowed-single-decrease "${DEVICE_THROUGHPUT_DECREASE_THRESHOLD}" # (%) - python checker/plotting/post_telegraf.py -f devices_throughputs_${SEQUENCE_DATASET}.csv . -s "${SEQUENCE}" -b "${CI_COMMIT_REF_NAME}" -d "${INPUT_FILES}" RC=$? + + python checker/plotting/post_telegraf.py -f devices_throughputs_${SEQUENCE_DATASET}.csv . -s "${SEQUENCE}" -b "${CI_COMMIT_REF_NAME}" -d "${INPUT_FILES}" if [ "$RC" = "5" ]; then THROUGHPUT_ALARM=1 @@ -47,7 +48,6 @@ for SEQUENCE_DATASET in $(ls -1 | grep "run_throughput" | grep -Ei "run_throughp fi done - echo ${THROUGHPUT_MESSAGES} exit $THROUGHPUT_ALARM -- GitLab From 4d66a9f36f07c4981852ef1c705a7de11f6f13fc Mon Sep 17 00:00:00 2001 From: Ryunosuke O'Neil <r.oneil@cern.ch> Date: Fri, 26 Nov 2021 19:51:01 +0100 Subject: [PATCH 07/23] Fix typo in print --- checker/plotting/post_combined_message.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/checker/plotting/post_combined_message.py b/checker/plotting/post_combined_message.py index 7400e3bb8b0..60957c4d718 100644 --- a/checker/plotting/post_combined_message.py +++ b/checker/plotting/post_combined_message.py @@ -116,7 +116,7 @@ def main(): for device, speedup in speedup_wrt_master.items(): change = (speedup - 1.0) * 100.0 - print(f"{device} speedup: {average_speedup}") + print(f"{device} speedup: {speedup}") print(f"{device} % change: {change}") if change < tput_tol: -- GitLab From 91be8219a527e51dd6914ceb719be8530d7df5bc Mon Sep 17 00:00:00 2001 From: Ryunosuke O'Neil <r.oneil@cern.ch> Date: Fri, 26 Nov 2021 21:21:44 +0100 Subject: [PATCH 08/23] print error properly --- scripts/ci/jobs/publish_throughput.sh | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/scripts/ci/jobs/publish_throughput.sh b/scripts/ci/jobs/publish_throughput.sh index 44c2c166ddd..32f76bf60d3 100755 --- a/scripts/ci/jobs/publish_throughput.sh +++ b/scripts/ci/jobs/publish_throughput.sh @@ -35,16 +35,18 @@ for SEQUENCE_DATASET in $(ls -1 | grep "run_throughput" | grep -Ei "run_throughp --allowed-average-decrease "${AVG_THROUGHPUT_DECREASE_THRESHOLD}" \ --allowed-single-decrease "${DEVICE_THROUGHPUT_DECREASE_THRESHOLD}" # (%) RC=$? - + python checker/plotting/post_telegraf.py -f devices_throughputs_${SEQUENCE_DATASET}.csv . -s "${SEQUENCE}" -b "${CI_COMMIT_REF_NAME}" -d "${INPUT_FILES}" if [ "$RC" = "5" ]; then THROUGHPUT_ALARM=1 - THROUGHPUT_MESSAGES="${THROUGHPUT_MESSAGES}\n*** Device-averaged throughput change is less than ${AVG_THROUGHPUT_DECREASE_THRESHOLD} %" + THROUGHPUT_MESSAGES="${THROUGHPUT_MESSAGES} +*** Device-averaged throughput change is less than ${AVG_THROUGHPUT_DECREASE_THRESHOLD} %" fi if [ "$RC" = "6" ]; then THROUGHPUT_ALARM=1 - THROUGHPUT_MESSAGES="${THROUGHPUT_MESSAGES}\n*** Single-device throughput change, for at least one device, is less than ${DEVICE_THROUGHPUT_DECREASE_THRESHOLD} %" + THROUGHPUT_MESSAGES="${THROUGHPUT_MESSAGES} +*** Single-device throughput change, for at least one device, is less than ${DEVICE_THROUGHPUT_DECREASE_THRESHOLD} %" fi done -- GitLab From 7a198b31eb33caf7da9f4a9cc32b5394ea2db169 Mon Sep 17 00:00:00 2001 From: Ryunosuke O'Neil <r.oneil@cern.ch> Date: Fri, 26 Nov 2021 21:22:49 +0100 Subject: [PATCH 09/23] try not to print everything to make the log a bit more readable --- scripts/ci/jobs/publish_throughput.sh | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/scripts/ci/jobs/publish_throughput.sh b/scripts/ci/jobs/publish_throughput.sh index 32f76bf60d3..ced2432ad56 100755 --- a/scripts/ci/jobs/publish_throughput.sh +++ b/scripts/ci/jobs/publish_throughput.sh @@ -22,10 +22,10 @@ for SEQUENCE_DATASET in $(ls -1 | grep "run_throughput" | grep -Ei "run_throughp cat run_throughput_output_${SEQUENCE_DATASET}/*/output.txt | grep --color=none "select device" | sed 's/.*:\ [0-9]*\,\ //' > devices_${SEQUENCE_DATASET}.txt cat run_throughput_output_${SEQUENCE_DATASET}/*/output.txt | grep --color=none "events/s" | awk '{ print $1; }' > throughputs_${SEQUENCE_DATASET}.txt - cat devices_${SEQUENCE_DATASET}.txt - cat throughputs_${SEQUENCE_DATASET}.txt + # cat devices_${SEQUENCE_DATASET}.txt + # cat throughputs_${SEQUENCE_DATASET}.txt paste -d, devices_${SEQUENCE_DATASET}.txt throughputs_${SEQUENCE_DATASET}.txt > devices_throughputs_${SEQUENCE_DATASET}.csv - cat devices_throughputs_${SEQUENCE_DATASET}.csv + # cat devices_throughputs_${SEQUENCE_DATASET}.csv python checker/plotting/post_combined_message.py \ -j "${CI_JOB_NAME}" \ @@ -50,6 +50,8 @@ for SEQUENCE_DATASET in $(ls -1 | grep "run_throughput" | grep -Ei "run_throughp fi done +echo "" +echo "" echo ${THROUGHPUT_MESSAGES} exit $THROUGHPUT_ALARM -- GitLab From 81489d93a64263f70e1e861e0b1026153fad265b Mon Sep 17 00:00:00 2001 From: Ryunosuke O'Neil <r.oneil@cern.ch> Date: Fri, 26 Nov 2021 21:31:55 +0100 Subject: [PATCH 10/23] Improved logging output --- checker/plotting/csv_plotter.py | 4 +-- checker/plotting/post_combined_message.py | 30 +++++++++++------------ scripts/ci/jobs/publish_throughput.sh | 16 +++++++++--- 3 files changed, 28 insertions(+), 22 deletions(-) diff --git a/checker/plotting/csv_plotter.py b/checker/plotting/csv_plotter.py index 3bd1614374b..a5ebce82a37 100755 --- a/checker/plotting/csv_plotter.py +++ b/checker/plotting/csv_plotter.py @@ -50,8 +50,8 @@ def format_text(title, plot_data, unit, x_max, master_throughput={}): final_vals.append(val) # Plot - print(final_tags) - print(final_vals) + # print(final_tags) + # print(final_vals) tg = TermGraph(suffix=unit, x_max=x_max) output = tg.chart(final_vals, final_tags) diff --git a/checker/plotting/post_combined_message.py b/checker/plotting/post_combined_message.py index 60957c4d718..45a1c8a612c 100644 --- a/checker/plotting/post_combined_message.py +++ b/checker/plotting/post_combined_message.py @@ -98,7 +98,7 @@ def main(): print(f"Device-averaged speedup: {average_speedup}") print(f" % change: {change}") - extra_messages = f"Device-averaged speedup (% change): {average_speedup:.2f} ({change:.2f} %)" + extra_messages = f"*Device-averaged speedup (% change):* {average_speedup:.2f} ({change:.2f} %)" tput_tol = float(options.min_avg_tput_change) @@ -110,21 +110,19 @@ def main(): print("No throughput reference available") extra_messages = f":warning: No reference available for comparison." - if not avg_throughput_decr: - extra_messages += "\n" if len(extra_messages) > 0 else "" - tput_tol = float(options.min_single_tput_change) - - for device, speedup in speedup_wrt_master.items(): - change = (speedup - 1.0) * 100.0 - print(f"{device} speedup: {speedup}") - print(f"{device} % change: {change}") - - if change < tput_tol: - print( - f"*** {device} Single-device throughput decrease above threshold." - ) - extra_messages += f":warning: :eyes: **{device}** throughput decrease _exceeds_ {abs(float(tput_tol))} % threshold\n" - avg_throughput_decr = True + # single device throughput decrease check + extra_messages += "\n" if len(extra_messages) > 0 else "" + tput_tol = float(options.min_single_tput_change) + + for device, speedup in speedup_wrt_master.items(): + change = (speedup - 1.0) * 100.0 + print(f"{device} speedup: {speedup}") + print(f"{device} % change: {change}") + + if change < tput_tol: + print(f"*** {device} Single-device throughput decrease above threshold.") + extra_messages += f":warning: :eyes: **{device}** throughput decrease _exceeds_ {abs(float(tput_tol))} % threshold\n" + avg_throughput_decr = True throughput_text = produce_plot( throughput, diff --git a/scripts/ci/jobs/publish_throughput.sh b/scripts/ci/jobs/publish_throughput.sh index ced2432ad56..4bdbc0dea7d 100755 --- a/scripts/ci/jobs/publish_throughput.sh +++ b/scripts/ci/jobs/publish_throughput.sh @@ -20,6 +20,12 @@ for SEQUENCE_DATASET in $(ls -1 | grep "run_throughput" | grep -Ei "run_throughp INPUT_FILES=$(cat run_throughput_output_${SEQUENCE_DATASET}/${BREAKDOWN_DEVICE_ID}/input_files.txt) SEQUENCE=$(cat run_throughput_output_${SEQUENCE_DATASET}/${BREAKDOWN_DEVICE_ID}/sequence.txt) + echo "" + echo "********************************************************************************************************************************************" + echo "********************************************************************************************************************************************" + echo "Throughput of [branch ${CI_COMMIT_REF_NAME} (${CI_COMMIT_SHORT_SHA}), sequence ${SEQUENCE} over dataset ${INPUT_FILES}" + echo "" + echo "" cat run_throughput_output_${SEQUENCE_DATASET}/*/output.txt | grep --color=none "select device" | sed 's/.*:\ [0-9]*\,\ //' > devices_${SEQUENCE_DATASET}.txt cat run_throughput_output_${SEQUENCE_DATASET}/*/output.txt | grep --color=none "events/s" | awk '{ print $1; }' > throughputs_${SEQUENCE_DATASET}.txt # cat devices_${SEQUENCE_DATASET}.txt @@ -41,13 +47,15 @@ for SEQUENCE_DATASET in $(ls -1 | grep "run_throughput" | grep -Ei "run_throughp if [ "$RC" = "5" ]; then THROUGHPUT_ALARM=1 THROUGHPUT_MESSAGES="${THROUGHPUT_MESSAGES} -*** Device-averaged throughput change is less than ${AVG_THROUGHPUT_DECREASE_THRESHOLD} %" - fi - if [ "$RC" = "6" ]; then +*** sequence ${SEQUENCE} over dataset ${INPUT_FILES} - Device-averaged throughput change is less than ${AVG_THROUGHPUT_DECREASE_THRESHOLD} %" + elif [ "$RC" = "6" ]; then THROUGHPUT_ALARM=1 THROUGHPUT_MESSAGES="${THROUGHPUT_MESSAGES} -*** Single-device throughput change, for at least one device, is less than ${DEVICE_THROUGHPUT_DECREASE_THRESHOLD} %" +*** sequence ${SEQUENCE} over dataset ${INPUT_FILES} - Single-device throughput change, for at least one device, is less than ${DEVICE_THROUGHPUT_DECREASE_THRESHOLD} %" fi + echo "" + echo "" + done echo "" -- GitLab From cb621e7373d8a1da997a123f81f0b6b8bf8e5f7d Mon Sep 17 00:00:00 2001 From: Ryunosuke O'Neil <r.oneil@cern.ch> Date: Sun, 28 Nov 2021 19:42:55 +0100 Subject: [PATCH 11/23] grafana: also submit build options --- checker/plotting/post_telegraf.py | 6 ++++-- scripts/ci/jobs/publish_throughput.sh | 3 ++- scripts/ci/jobs/run_throughput.sh | 3 ++- 3 files changed, 8 insertions(+), 4 deletions(-) diff --git a/checker/plotting/post_telegraf.py b/checker/plotting/post_telegraf.py index 3d7648dd3da..d28c712b85a 100755 --- a/checker/plotting/post_telegraf.py +++ b/checker/plotting/post_telegraf.py @@ -20,8 +20,10 @@ def send_to_telegraf(throughput, device, options): now = time.time() timestamp = int(now) * 1000000000 - telegraf_string = "AllenCIPerformance_v3,branch=%s,device=%s,sequence=%s,dataset=%s " % ( - options.branch, device, options.sequence, options.dataset) + telegraf_string = ( + "AllenCIPerformance_v3,branch=%s,device=%s,sequence=%s,dataset=%s,build_options=%s " + % (options.branch, device, options.sequence, options.dataset, options.buildopts) + ) telegraf_string += "performance=%.2f " % (throughput) telegraf_string += " %d" % timestamp diff --git a/scripts/ci/jobs/publish_throughput.sh b/scripts/ci/jobs/publish_throughput.sh index 4bdbc0dea7d..4b78a83d7af 100755 --- a/scripts/ci/jobs/publish_throughput.sh +++ b/scripts/ci/jobs/publish_throughput.sh @@ -19,6 +19,7 @@ THROUGHPUT_MESSAGES="" for SEQUENCE_DATASET in $(ls -1 | grep "run_throughput" | grep -Ei "run_throughput_output_([a-z0-9_]+?)" | sed 's/^run_throughput_output_//') ; do INPUT_FILES=$(cat run_throughput_output_${SEQUENCE_DATASET}/${BREAKDOWN_DEVICE_ID}/input_files.txt) SEQUENCE=$(cat run_throughput_output_${SEQUENCE_DATASET}/${BREAKDOWN_DEVICE_ID}/sequence.txt) + BUILDOPTIONS=$(cat run_throughput_output_${SEQUENCE_DATASET}/${BREAKDOWN_DEVICE_ID}/buildopts.txt) echo "" echo "********************************************************************************************************************************************" @@ -42,7 +43,7 @@ for SEQUENCE_DATASET in $(ls -1 | grep "run_throughput" | grep -Ei "run_throughp --allowed-single-decrease "${DEVICE_THROUGHPUT_DECREASE_THRESHOLD}" # (%) RC=$? - python checker/plotting/post_telegraf.py -f devices_throughputs_${SEQUENCE_DATASET}.csv . -s "${SEQUENCE}" -b "${CI_COMMIT_REF_NAME}" -d "${INPUT_FILES}" + python checker/plotting/post_telegraf.py -f devices_throughputs_${SEQUENCE_DATASET}.csv . -s "${SEQUENCE}" -b "${CI_COMMIT_REF_NAME}" -d "${INPUT_FILES}" -o "${BUILDOPTIONS}" if [ "$RC" = "5" ]; then THROUGHPUT_ALARM=1 diff --git a/scripts/ci/jobs/run_throughput.sh b/scripts/ci/jobs/run_throughput.sh index 8e5f7b1afe8..1ce212b21d3 100755 --- a/scripts/ci/jobs/run_throughput.sh +++ b/scripts/ci/jobs/run_throughput.sh @@ -19,7 +19,7 @@ fi RUN_OPTIONS="--mdf ${ALLEN_DATA}/mdf_input/${DATA_TAG}.mdf --sequence ${SEQUENCE} --run-from-json 1 ${RUN_OPTIONS}" set -euxo pipefail -OUTPUT_FOLDER_REL="${TEST_NAME}_output_${SEQUENCE}_${DATA_TAG}/${DEVICE_ID}" +OUTPUT_FOLDER_REL="${TEST_NAME}_output_${SEQUENCE}_${DATA_TAG}_${OPTIONS}/${DEVICE_ID}" mkdir -p ${OUTPUT_FOLDER_REL} OUTPUT_FOLDER=$(realpath ${OUTPUT_FOLDER_REL}) @@ -120,6 +120,7 @@ echo "Throughput (kHz, 2 d.p.): ${THROUGHPUT_KHZ}" echo "${DATA_TAG}" > "${OUTPUT_FOLDER}/input_files.txt" echo "${SEQUENCE}" > "${OUTPUT_FOLDER}/sequence.txt" +echo "${OPTIONS}" > "${OUTPUT_FOLDER}/buildopts.txt" echo "${THROUGHPUT}" > "${OUTPUT_FOLDER}/throughput.txt" echo "${CI_COMMIT_SHORT_SHA}" > "${OUTPUT_FOLDER}/revision.txt" -- GitLab From b086058071401d217cb8967b24dc0c9d15d753a1 Mon Sep 17 00:00:00 2001 From: Ryunosuke O'Neil <r.oneil@cern.ch> Date: Sun, 28 Nov 2021 19:44:00 +0100 Subject: [PATCH 12/23] send "default" for empty buildopts string --- checker/plotting/post_telegraf.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/checker/plotting/post_telegraf.py b/checker/plotting/post_telegraf.py index d28c712b85a..d5094a2200b 100755 --- a/checker/plotting/post_telegraf.py +++ b/checker/plotting/post_telegraf.py @@ -20,9 +20,12 @@ def send_to_telegraf(throughput, device, options): now = time.time() timestamp = int(now) * 1000000000 - telegraf_string = ( - "AllenCIPerformance_v3,branch=%s,device=%s,sequence=%s,dataset=%s,build_options=%s " - % (options.branch, device, options.sequence, options.dataset, options.buildopts) + telegraf_string = "AllenCIPerformance_v3,branch=%s,device=%s,sequence=%s,dataset=%s,build_options=%s " % ( + options.branch, + device, + options.sequence, + options.dataset, + options.buildopts if len(options.buildopts) > 0 else "default", ) telegraf_string += "performance=%.2f " % (throughput) -- GitLab From d1d86c5cdfd42eb5dd7eafa151caedf6d501ee75 Mon Sep 17 00:00:00 2001 From: Ryunosuke O'Neil <r.oneil@cern.ch> Date: Mon, 29 Nov 2021 13:18:54 +0100 Subject: [PATCH 13/23] work around unbound variable error --- scripts/ci/common.sh | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/scripts/ci/common.sh b/scripts/ci/common.sh index 8a2f444ee76..af9f60e0388 100644 --- a/scripts/ci/common.sh +++ b/scripts/ci/common.sh @@ -52,6 +52,13 @@ function check_build_exists() { fi } +# Define OPTIONS as empty, if not already defined +if [ ! -z ${OPTIONS+x} ]; then + echo "OPTIONS is not defined - this is fine, but I will set it to empty." + OPTIONS="" +fi + + export BUILD_SEQUENCES="all" TOPLEVEL=${PWD} -- GitLab From 4774633fb7bab030c9f3e62493ad5f7d41cdcad2 Mon Sep 17 00:00:00 2001 From: Ryunosuke O'Neil <r.oneil@cern.ch> Date: Mon, 29 Nov 2021 15:11:05 +0100 Subject: [PATCH 14/23] oops! --- scripts/ci/common.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/ci/common.sh b/scripts/ci/common.sh index af9f60e0388..77f248b20a0 100644 --- a/scripts/ci/common.sh +++ b/scripts/ci/common.sh @@ -53,7 +53,7 @@ function check_build_exists() { } # Define OPTIONS as empty, if not already defined -if [ ! -z ${OPTIONS+x} ]; then +if [ -z ${OPTIONS+x} ]; then echo "OPTIONS is not defined - this is fine, but I will set it to empty." OPTIONS="" fi -- GitLab From 3458e2b7586368a698b6b4d46f44e5e90a036720 Mon Sep 17 00:00:00 2001 From: Ryunosuke O'Neil <r.oneil@cern.ch> Date: Mon, 29 Nov 2021 15:38:19 +0100 Subject: [PATCH 15/23] Add missing cmd line arg --- checker/plotting/post_telegraf.py | 85 +++++++++++++++++-------------- 1 file changed, 47 insertions(+), 38 deletions(-) diff --git a/checker/plotting/post_telegraf.py b/checker/plotting/post_telegraf.py index d5094a2200b..411a6581f20 100755 --- a/checker/plotting/post_telegraf.py +++ b/checker/plotting/post_telegraf.py @@ -32,11 +32,11 @@ def send_to_telegraf(throughput, device, options): telegraf_string += " %d" % timestamp try: - print('Sending telegraf string: %s' % telegraf_string) + print("Sending telegraf string: %s" % telegraf_string) response = session.post(options.telegraf_url, data=telegraf_string) - print('http response: %s' % response.headers) + print("http response: %s" % response.headers) except: - print('Failed to submit data string %s' % telegraf_string) + print("Failed to submit data string %s" % telegraf_string) print(traceback.format_exc()) @@ -49,64 +49,73 @@ def main(argv): global final_msg parser = OptionParser() parser.add_option( - '-f', - '--filename', - dest='filename', - default= - 'devices_throughputs_hlt1_pp_default_upgrade_mc_minbias_scifi_v5_000.csv', - help='The csv file containing the throughput and device name') + "-f", + "--filename", + dest="filename", + default="devices_throughputs_hlt1_pp_default_upgrade_mc_minbias_scifi_v5_000.csv", + help="The csv file containing the throughput and device name", + ) parser.add_option( - '-b', - '--branch', - dest='branch', - default='UNKNOWN', - help='branch tag to be forwarded to telegraf/grafana') + "-b", + "--branch", + dest="branch", + default="UNKNOWN", + help="branch tag to be forwarded to telegraf/grafana", + ) parser.add_option( - '-s', - '--sequence', - dest='sequence', - default='UNKNOWN', - help='sequence name tag to be forwarded to telegraf/grafana') + "-s", + "--sequence", + dest="sequence", + default="UNKNOWN", + help="sequence name tag to be forwarded to telegraf/grafana", + ) parser.add_option( - '-d', - '--dataset', - dest='dataset', - default='UNKNOWN', - help='dataset to be forwarded to telegraf/grafana') + "-d", + "--dataset", + dest="dataset", + default="UNKNOWN", + help="dataset to be forwarded to telegraf/grafana", + ) parser.add_option( - '-t', - '--telegraf_url', - dest='telegraf_url', - #default='http://dcinflux01.lbdaq.cern.ch:8189/telegraf', - #Unfortunately lhcb online names are still not resolved by CERN dns... IP address it is (at least for cern based machines) - default='http://10.128.124.77:8189/telegraf', - help='URL to send telegraf output to') + "-o", + "--build-options", + dest="buildopts", + default="", + help="build options to be forwarded to telegraf/grafana", + ) + parser.add_option( + "-t", + "--telegraf_url", + dest="telegraf_url", + # default='http://dcinflux01.lbdaq.cern.ch:8189/telegraf', + # Unfortunately lhcb online names are still not resolved by CERN dns... IP address it is (at least for cern based machines) + default="http://10.128.124.77:8189/telegraf", + help="URL to send telegraf output to", + ) (options, args) = parser.parse_args() if options.filename is None: parser.print_help() - print('Please specify an input file') + print("Please specify an input file") return try: os.path.isfile(options.filename) except: - print('Failed to open csv file with rates and devices: %s' % - options.filename) + print("Failed to open csv file with rates and devices: %s" % options.filename) traceback.print_exc() return with open(options.filename) as csv_file: - csv_reader = csv.reader(csv_file, delimiter=',') + csv_reader = csv.reader(csv_file, delimiter=",") for row in csv_reader: device = row[0] device_string = device.strip() - device_string = device_string.replace(' ', '\ ') + device_string = device_string.replace(" ", "\ ") throughput = float(row[1]) - print('Device: ' + device_string + - ', Throughput: %.2f' % (throughput)) + print("Device: " + device_string + ", Throughput: %.2f" % (throughput)) send_to_telegraf(throughput, device_string, options) -- GitLab From b3fde2887cd387b0d9eee4bc37d77c8d440c9e8e Mon Sep 17 00:00:00 2001 From: Gitlab CI <noreply@cern.ch> Date: Mon, 29 Nov 2021 14:39:09 +0000 Subject: [PATCH 16/23] Fixed formatting patch generated by https://gitlab.cern.ch/lhcb/Allen/-/jobs/17949083 --- checker/plotting/post_telegraf.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/checker/plotting/post_telegraf.py b/checker/plotting/post_telegraf.py index 411a6581f20..4ec00fa1cd4 100755 --- a/checker/plotting/post_telegraf.py +++ b/checker/plotting/post_telegraf.py @@ -52,7 +52,8 @@ def main(argv): "-f", "--filename", dest="filename", - default="devices_throughputs_hlt1_pp_default_upgrade_mc_minbias_scifi_v5_000.csv", + default= + "devices_throughputs_hlt1_pp_default_upgrade_mc_minbias_scifi_v5_000.csv", help="The csv file containing the throughput and device name", ) parser.add_option( @@ -104,7 +105,8 @@ def main(argv): os.path.isfile(options.filename) except: - print("Failed to open csv file with rates and devices: %s" % options.filename) + print("Failed to open csv file with rates and devices: %s" % + options.filename) traceback.print_exc() return @@ -115,7 +117,8 @@ def main(argv): device_string = device.strip() device_string = device_string.replace(" ", "\ ") throughput = float(row[1]) - print("Device: " + device_string + ", Throughput: %.2f" % (throughput)) + print("Device: " + device_string + + ", Throughput: %.2f" % (throughput)) send_to_telegraf(throughput, device_string, options) -- GitLab From 6d1ae717fc62a4b678c1cdd54a7d349eb8e124c3 Mon Sep 17 00:00:00 2001 From: Ryunosuke O'Neil <r.oneil@cern.ch> Date: Mon, 29 Nov 2021 17:25:44 +0100 Subject: [PATCH 17/23] Throughput increase/decrease label. --- checker/plotting/update_gitlab.py | 58 +++++++++++++++++++++++++++ scripts/ci/jobs/publish_throughput.sh | 14 ++++++- 2 files changed, 71 insertions(+), 1 deletion(-) create mode 100644 checker/plotting/update_gitlab.py diff --git a/checker/plotting/update_gitlab.py b/checker/plotting/update_gitlab.py new file mode 100644 index 00000000000..4e254b0996e --- /dev/null +++ b/checker/plotting/update_gitlab.py @@ -0,0 +1,58 @@ +import os +import gitlab + +import argparse + + +parser = argparse.ArgumentParser( + description="Update the current GitLab merge request with throughput CI results." +) + +parser.add_argument( + "--throughput-status", + help="Add a hlt1-throughput-decreased label to the merge request.", + choices=["decrease", "increase", "no-change", "nothing"], + default="nothing", +) + +args = parser.parse_args() + + +def get_merge_request(): + gl = gitlab.Gitlab( + "https://gitlab.cern.ch", private_token=os.environ[f"ALLENCI_PAT"] + ) + + gl.auth() + + proj_id = int(os.environ["CI_PROJECT_ID"]) + project = gl.projects.get(proj_id) + + mr = project.mergerequests.get(int(os.environ["CI_MERGE_REQUEST_IID"])) + + return mr + + +def toggle_label(mr, name, enabled): + if name in mr.labels and not enabled: + mr.labels = list([l for l in mr.labels if l != name]) + elif enabled and name not in mr.labels: + labels = mr.labels[:] + labels += [name] + mr.labels = list(set(labels)) + + +def main(): + mr = get_merge_request() + if args.throughput_status != "nothing": + toggle_label( + mr, "hlt1-throughput-decreased", args.throughput_status == "decrease" + ) + # toggle_label( + # mr, "hlt1-throughput-increased", args.throughput_status == "increase" + # ) + mr.save() + + +if __name__ == "__main__": + main() diff --git a/scripts/ci/jobs/publish_throughput.sh b/scripts/ci/jobs/publish_throughput.sh index 4b78a83d7af..b597f91f9a4 100755 --- a/scripts/ci/jobs/publish_throughput.sh +++ b/scripts/ci/jobs/publish_throughput.sh @@ -34,9 +34,15 @@ for SEQUENCE_DATASET in $(ls -1 | grep "run_throughput" | grep -Ei "run_throughp paste -d, devices_${SEQUENCE_DATASET}.txt throughputs_${SEQUENCE_DATASET}.txt > devices_throughputs_${SEQUENCE_DATASET}.csv # cat devices_throughputs_${SEQUENCE_DATASET}.csv + if [ "${BUILDOPTIONS}" = "" ]; then + BUILDOPTIONS_DISPLAY="default" + else + BUILDOPTIONS_DISPLAY=${BUILDOPTIONS} + fi + python checker/plotting/post_combined_message.py \ -j "${CI_JOB_NAME}" \ - -l "Throughput of [branch **\`${CI_COMMIT_REF_NAME} (${CI_COMMIT_SHORT_SHA})\`**, sequence **\`${SEQUENCE}\`** over dataset **\`${INPUT_FILES}\`**](https://gitlab.cern.ch/lhcb/Allen/pipelines/${CI_PIPELINE_ID})" \ + -l "Throughput of [branch **\`${CI_COMMIT_REF_NAME} (${CI_COMMIT_SHORT_SHA})\`**, sequence **\`${SEQUENCE}\`** over dataset **\`${INPUT_FILES}\`** build options \`${BUILDOPTIONS_DISPLAY}\`](https://gitlab.cern.ch/lhcb/Allen/pipelines/${CI_PIPELINE_ID})" \ -t devices_throughputs_${SEQUENCE_DATASET}.csv \ -b run_throughput_output_${SEQUENCE_DATASET}/${BREAKDOWN_DEVICE_ID}/algo_breakdown.csv \ --allowed-average-decrease "${AVG_THROUGHPUT_DECREASE_THRESHOLD}" \ @@ -59,6 +65,12 @@ for SEQUENCE_DATASET in $(ls -1 | grep "run_throughput" | grep -Ei "run_throughp done +if [ "${THROUGHPUT_ALARM}" = "1" ]; then + python checker/plotting/update-gitlab.py --throughput-status "decrease" +else + python checker/plotting/update-gitlab.py --throughput-status "no-change" +fi + echo "" echo "" echo ${THROUGHPUT_MESSAGES} -- GitLab From 73cceb9b621f8f4c9e9eb5530257b8a40c3df0c3 Mon Sep 17 00:00:00 2001 From: Ryunosuke O'Neil <r.oneil@cern.ch> Date: Mon, 29 Nov 2021 17:26:24 +0100 Subject: [PATCH 18/23] fix missing reference --- scripts/ci/jobs/run_throughput.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/ci/jobs/run_throughput.sh b/scripts/ci/jobs/run_throughput.sh index 1ce212b21d3..0470d50cc46 100755 --- a/scripts/ci/jobs/run_throughput.sh +++ b/scripts/ci/jobs/run_throughput.sh @@ -19,7 +19,7 @@ fi RUN_OPTIONS="--mdf ${ALLEN_DATA}/mdf_input/${DATA_TAG}.mdf --sequence ${SEQUENCE} --run-from-json 1 ${RUN_OPTIONS}" set -euxo pipefail -OUTPUT_FOLDER_REL="${TEST_NAME}_output_${SEQUENCE}_${DATA_TAG}_${OPTIONS}/${DEVICE_ID}" +OUTPUT_FOLDER_REL="${TEST_NAME}_output_${SEQUENCE}_${DATA_TAG}${OPTIONS}/${DEVICE_ID}" mkdir -p ${OUTPUT_FOLDER_REL} OUTPUT_FOLDER=$(realpath ${OUTPUT_FOLDER_REL}) -- GitLab From 07717de632b8c841b44513538a000f99e7fb6a69 Mon Sep 17 00:00:00 2001 From: Ryunosuke O'Neil <r.oneil@cern.ch> Date: Mon, 29 Nov 2021 17:28:38 +0100 Subject: [PATCH 19/23] missing copyright --- checker/plotting/update_gitlab.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/checker/plotting/update_gitlab.py b/checker/plotting/update_gitlab.py index 4e254b0996e..971005b70cf 100644 --- a/checker/plotting/update_gitlab.py +++ b/checker/plotting/update_gitlab.py @@ -1,3 +1,7 @@ +############################################################################### +# (c) Copyright 2018-2021 CERN for the benefit of the LHCb Collaboration # +############################################################################### + import os import gitlab -- GitLab From 9836d711e83466c76dc9cea3e0a72a579aea5161 Mon Sep 17 00:00:00 2001 From: Ryunosuke O'Neil <r.oneil@cern.ch> Date: Mon, 29 Nov 2021 18:06:03 +0100 Subject: [PATCH 20/23] don't quit on nonzero return code --- scripts/ci/jobs/publish_throughput.sh | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/scripts/ci/jobs/publish_throughput.sh b/scripts/ci/jobs/publish_throughput.sh index b597f91f9a4..9be1946c0df 100755 --- a/scripts/ci/jobs/publish_throughput.sh +++ b/scripts/ci/jobs/publish_throughput.sh @@ -3,7 +3,8 @@ # (c) Copyright 2018-2020 CERN for the benefit of the LHCb Collaboration # ############################################################################### -set -uxo pipefail +set -uo pipefail +set +xe setupViews @@ -17,6 +18,7 @@ ls -1 | grep output | grep run_throughput THROUGHPUT_ALARM=0 THROUGHPUT_MESSAGES="" for SEQUENCE_DATASET in $(ls -1 | grep "run_throughput" | grep -Ei "run_throughput_output_([a-z0-9_]+?)" | sed 's/^run_throughput_output_//') ; do + INPUT_FILES=$(cat run_throughput_output_${SEQUENCE_DATASET}/${BREAKDOWN_DEVICE_ID}/input_files.txt) SEQUENCE=$(cat run_throughput_output_${SEQUENCE_DATASET}/${BREAKDOWN_DEVICE_ID}/sequence.txt) BUILDOPTIONS=$(cat run_throughput_output_${SEQUENCE_DATASET}/${BREAKDOWN_DEVICE_ID}/buildopts.txt) -- GitLab From a8a643fc7e00a72c603c267b61c3047d8dfacf24 Mon Sep 17 00:00:00 2001 From: Ryunosuke O'Neil <r.oneil@cern.ch> Date: Mon, 29 Nov 2021 19:17:39 +0100 Subject: [PATCH 21/23] typo --- scripts/ci/jobs/publish_throughput.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/ci/jobs/publish_throughput.sh b/scripts/ci/jobs/publish_throughput.sh index 9be1946c0df..33d435bcdf0 100755 --- a/scripts/ci/jobs/publish_throughput.sh +++ b/scripts/ci/jobs/publish_throughput.sh @@ -68,9 +68,9 @@ for SEQUENCE_DATASET in $(ls -1 | grep "run_throughput" | grep -Ei "run_throughp done if [ "${THROUGHPUT_ALARM}" = "1" ]; then - python checker/plotting/update-gitlab.py --throughput-status "decrease" + python checker/plotting/update_gitlab.py --throughput-status "decrease" else - python checker/plotting/update-gitlab.py --throughput-status "no-change" + python checker/plotting/update_gitlab.py --throughput-status "no-change" fi echo "" -- GitLab From a3120b08be33c23bf4768bfc0a9f978e1b0f5a5c Mon Sep 17 00:00:00 2001 From: Ryunosuke O'Neil <r.oneil@cern.ch> Date: Mon, 29 Nov 2021 19:44:14 +0100 Subject: [PATCH 22/23] check where the PAT is --- .gitlab-ci.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 02a3d41ef4c..b1122e913b9 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -61,6 +61,8 @@ check-copyright: stage: check image: gitlab-registry.cern.ch/ci-tools/ci-worker:cc7 script: + - python -c 'import os; print(list(os.environ.keys()))' + - exit 1 - curl -o lb-check-copyright "https://gitlab.cern.ch/lhcb-core/LbDevTools/-/raw/master/LbDevTools/SourceTools.py?inline=False" - python lb-check-copyright --license=Apache-2.0 origin/${TARGET_BRANCH} needs: [] -- GitLab From 9db5f2e7aa7c241220e7a9afe71c40a8c0d9b415 Mon Sep 17 00:00:00 2001 From: Ryunosuke O'Neil <r.oneil@cern.ch> Date: Tue, 30 Nov 2021 13:49:24 +0100 Subject: [PATCH 23/23] revert last --- .gitlab-ci.yml | 2 -- 1 file changed, 2 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index b1122e913b9..02a3d41ef4c 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -61,8 +61,6 @@ check-copyright: stage: check image: gitlab-registry.cern.ch/ci-tools/ci-worker:cc7 script: - - python -c 'import os; print(list(os.environ.keys()))' - - exit 1 - curl -o lb-check-copyright "https://gitlab.cern.ch/lhcb-core/LbDevTools/-/raw/master/LbDevTools/SourceTools.py?inline=False" - python lb-check-copyright --license=Apache-2.0 origin/${TARGET_BRANCH} needs: [] -- GitLab