diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 56b036c96d601a1075b4a289dbc0fd8240ebc04f..02a3d41ef4ced7453004f8c299144b592e74101c 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -15,6 +15,9 @@ variables: RUN_THROUGHPUT_OPTIONS_HIP: "-n 5000 --events-per-slice 5000 -m 3000 -t 10 -r 1000" RUN_THROUGHPUT_OPTIONS_CPU: "-n 100 -m 100 -r 200" + AVG_THROUGHPUT_DECREASE_THRESHOLD: "-2.5" # (%); fail throughput check if averaged throughput % change falls below -2.5% + DEVICE_THROUGHPUT_DECREASE_THRESHOLD: "-7.5" # (%); fail throughput check if single device throughput % change falls below -7.5% + OVERRIDE_CUDA_ARCH_FLAG: "-gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_75,code=sm_75 -gencode=arch=compute_86,code=sm_86" stages: diff --git a/checker/plotting/csv_plotter.py b/checker/plotting/csv_plotter.py index 860c9a447e3bdc56d17a21cd8178d7722173d844..a5ebce82a37d580fb18b75b98b5323a3761cad74 100755 --- a/checker/plotting/csv_plotter.py +++ b/checker/plotting/csv_plotter.py @@ -3,30 +3,34 @@ # (c) Copyright 2018-2020 CERN for the benefit of the LHCb Collaboration # ############################################################################### import csv -import subprocess -import traceback from optparse import OptionParser from termgraph import TermGraph import requests -import urllib -def get_master_throughput(job_name, - csvfile="devices_throughputs.csv", - scale=1.0): +def parse_throughput(content, scale=1.0): + throughput = {} + content_reader = csv.reader(content.splitlines()) + for row in content_reader: + if row: + throughput[row[0]] = float(row[1]) * scale + return throughput + + +def get_master_throughput( + job_name, + url="https://gitlab.cern.ch/lhcb/Allen/-/jobs/artifacts/master/raw/", + csvfile="devices_throughputs.csv", + scale=1.0, +): try: master_throughput = {} if job_name: - base_url = ( - "https://gitlab.cern.ch/lhcb/Allen/-/jobs/artifacts/master/raw/" - + csvfile) + base_url = url + csvfile r = requests.get( base_url, params={"job": 
job_name}, allow_redirects=True) content = r.content.decode("utf-8") - content_reader = csv.reader(content.splitlines()) - for row in content_reader: - if row: - master_throughput[row[0]] = float(row[1]) * scale + master_throughput = parse_throughput(content, scale=scale) return master_throughput except Exception as e: print("get_master_throughput exception:", e) @@ -46,8 +50,8 @@ def format_text(title, plot_data, unit, x_max, master_throughput={}): final_vals.append(val) # Plot - print(final_tags) - print(final_vals) + # print(final_tags) + # print(final_vals) tg = TermGraph(suffix=unit, x_max=x_max) output = tg.chart(final_vals, final_tags) @@ -72,21 +76,17 @@ def format_text(title, plot_data, unit, x_max, master_throughput={}): def send_to_mattermost(text, mattermost_url): - subprocess.call([ - "curl", - "-i", - "-X", - "POST", - "-H", - "Content-Type: application/json", - "-d", - text, + request_json = {"text": text} + response = requests.post( mattermost_url, - ]) + json=request_json, + headers={"Content-Type": "application/json"}) + + assert response.ok, "send_to_mattermost request failed." 
def produce_plot( - filename, + plot_data, unit="", title="", x_max=10, @@ -96,15 +96,6 @@ def produce_plot( print_text=True, master_throughput={}, ): - plot_data = {} - with open(filename) as csvfile: - csv_reader = csv.reader(csvfile, delimiter=",") - for row in csv_reader: - try: - plot_data[row[0]] = float(row[1]) * scale - except: - print(traceback.format_exc()) - # Convert throughputs to speedups if normalize: norm = min(plot_data.values()) diff --git a/checker/plotting/post_combined_message.py b/checker/plotting/post_combined_message.py index 42aab4ef396586833f987c7cb63ad61480040d38..45a1c8a612c0569cac8a549f377b15f1fde60218 100644 --- a/checker/plotting/post_combined_message.py +++ b/checker/plotting/post_combined_message.py @@ -3,8 +3,14 @@ # (c) Copyright 2018-2020 CERN for the benefit of the LHCb Collaboration # ############################################################################### import os +import sys from optparse import OptionParser -from csv_plotter import produce_plot, send_to_mattermost, get_master_throughput +from csv_plotter import ( + produce_plot, + send_to_mattermost, + get_master_throughput, + parse_throughput, +) def main(): @@ -13,15 +19,13 @@ def main(): """ usage = ( "%prog [options] <-t throughput_data_file> <-b throughput_breakdown_data_file>\n" - + - 'Example: %prog -t throughput_data.csv -b throughput_breakdown.csv -m "http://{your-mattermost-site}/hooks/xxx-generatedkey-xxx"' + + 'Example: %prog -t throughput_data.csv -b throughput_breakdown.csv -m "http://{your-mattermost-site}/hooks/xxx-generatedkey-xxx"' ) parser = OptionParser(usage=usage) parser.add_option( "-m", "--mattermost_url", - default=os.environ["MATTERMOST_KEY"] - if "MATTERMOST_KEY" in os.environ else "", + default=os.environ["MATTERMOST_KEY"] if "MATTERMOST_KEY" in os.environ else "", dest="mattermost_url", help="The url where to post outputs generated for mattermost", ) @@ -44,8 +48,22 @@ def main(): default="", help="Title for your graph. 
(default: empty string)", ) + + parser.add_option("-j", "--job", dest="job", default="", help="Name of CI job") + + parser.add_option( + "--allowed-average-decrease", + dest="min_avg_tput_change", + default=-2.5, + help="Max tolerated average throughput decrease (%).", + ) + parser.add_option( - "-j", "--job", dest="job", default="", help="Name of CI job") + "--allowed-single-decrease", + dest="min_single_tput_change", + default=-5.0, + help="Max tolerated single-device throughput decrease (%).", + ) (options, args) = parser.parse_args() @@ -54,27 +72,78 @@ def main(): "No mattermost URL was found in MATTERMOST_KEY, or passed as a command line argument." ) + with open(options.throughput) as csvfile: + throughput = parse_throughput(csvfile.read(), scale=1e-3) + with open(options.breakdown) as csvfile: + breakdown = parse_throughput(csvfile.read(), scale=1) + master_throughput = get_master_throughput( - options.job, csvfile=options.throughput, scale=1e-3) + options.job, csvfile=options.throughput, scale=1e-3 + ) + + speedup_wrt_master = { + a: throughput.get(a, b) / b for a, b in master_throughput.items() + } + + # Average throughputs across all devices and complain if we are above decr % threshold + avg_throughput_decr = False + single_throughput_decr = False + extra_messages = "" + + n_dev = len(speedup_wrt_master.values()) + if n_dev > 0: + average_speedup = sum(speedup_wrt_master.values()) / n_dev + change = (average_speedup - 1.0) * 100.0 + + print(f"Device-averaged speedup: {average_speedup}") + print(f" % change: {change}") + + extra_messages = f"*Device-averaged speedup (% change):* {average_speedup:.2f} ({change:.2f} %)" + + tput_tol = float(options.min_avg_tput_change) + + if change < tput_tol: + print("*** Average throughput decrease above threshold.") + extra_messages += f" :warning: :eyes: decrease _exceeds_ {abs(float(tput_tol))} % threshold\n" + avg_throughput_decr = True + else: + print("No throughput reference available") + extra_messages = 
f":warning: No reference available for comparison." + + # single device throughput decrease check + extra_messages += "\n" if len(extra_messages) > 0 else "" + tput_tol = float(options.min_single_tput_change) + + for device, speedup in speedup_wrt_master.items(): + change = (speedup - 1.0) * 100.0 + print(f"{device} speedup: {speedup}") + print(f"{device} % change: {change}") + + if change < tput_tol: + print(f"*** {device} Single-device throughput decrease above threshold.") + extra_messages += f":warning: :eyes: **{device}** throughput decrease _exceeds_ {abs(float(tput_tol))} % threshold\n" + avg_throughput_decr = True + throughput_text = produce_plot( - options.throughput, + throughput, master_throughput=master_throughput, unit="kHz", scale=1e-3, print_text=True, ) - breakdown_text = produce_plot(options.breakdown, unit="%", print_text=True) + breakdown_text = produce_plot(breakdown, unit="%", print_text=True) - text = '{"text": "%s:\n```\n%s```\n\nBreakdown of sequence:\n```\n%s```"}' % ( - options.title, - throughput_text, - breakdown_text, - ) - print(text) + text = f"{options.title}:\n```\n{throughput_text}```\n{extra_messages}\n\nBreakdown of sequence:\n```\n{breakdown_text}```" if options.mattermost_url is not None: send_to_mattermost(text, options.mattermost_url) + if avg_throughput_decr: + sys.exit(5) + + if single_throughput_decr: + sys.exit(6) + if __name__ == "__main__": main() diff --git a/checker/plotting/post_telegraf.py b/checker/plotting/post_telegraf.py index 3d7648dd3da1676797168f267a747a2284e9b056..4ec00fa1cd4ac5d27934c5d32bb8decbae175262 100755 --- a/checker/plotting/post_telegraf.py +++ b/checker/plotting/post_telegraf.py @@ -20,18 +20,23 @@ def send_to_telegraf(throughput, device, options): now = time.time() timestamp = int(now) * 1000000000 - telegraf_string = "AllenCIPerformance_v3,branch=%s,device=%s,sequence=%s,dataset=%s " % ( - options.branch, device, options.sequence, options.dataset) + telegraf_string = 
"AllenCIPerformance_v3,branch=%s,device=%s,sequence=%s,dataset=%s,build_options=%s " % ( + options.branch, + device, + options.sequence, + options.dataset, + options.buildopts if len(options.buildopts) > 0 else "default", + ) telegraf_string += "performance=%.2f " % (throughput) telegraf_string += " %d" % timestamp try: - print('Sending telegraf string: %s' % telegraf_string) + print("Sending telegraf string: %s" % telegraf_string) response = session.post(options.telegraf_url, data=telegraf_string) - print('http response: %s' % response.headers) + print("http response: %s" % response.headers) except: - print('Failed to submit data string %s' % telegraf_string) + print("Failed to submit data string %s" % telegraf_string) print(traceback.format_exc()) @@ -44,64 +49,76 @@ def main(argv): global final_msg parser = OptionParser() parser.add_option( - '-f', - '--filename', - dest='filename', + "-f", + "--filename", + dest="filename", default= - 'devices_throughputs_hlt1_pp_default_upgrade_mc_minbias_scifi_v5_000.csv', - help='The csv file containing the throughput and device name') + "devices_throughputs_hlt1_pp_default_upgrade_mc_minbias_scifi_v5_000.csv", + help="The csv file containing the throughput and device name", + ) parser.add_option( - '-b', - '--branch', - dest='branch', - default='UNKNOWN', - help='branch tag to be forwarded to telegraf/grafana') + "-b", + "--branch", + dest="branch", + default="UNKNOWN", + help="branch tag to be forwarded to telegraf/grafana", + ) parser.add_option( - '-s', - '--sequence', - dest='sequence', - default='UNKNOWN', - help='sequence name tag to be forwarded to telegraf/grafana') + "-s", + "--sequence", + dest="sequence", + default="UNKNOWN", + help="sequence name tag to be forwarded to telegraf/grafana", + ) parser.add_option( - '-d', - '--dataset', - dest='dataset', - default='UNKNOWN', - help='dataset to be forwarded to telegraf/grafana') + "-d", + "--dataset", + dest="dataset", + default="UNKNOWN", + help="dataset to be 
forwarded to telegraf/grafana", + ) parser.add_option( - '-t', - '--telegraf_url', - dest='telegraf_url', - #default='http://dcinflux01.lbdaq.cern.ch:8189/telegraf', - #Unfortunately lhcb online names are still not resolved by CERN dns... IP address it is (at least for cern based machines) - default='http://10.128.124.77:8189/telegraf', - help='URL to send telegraf output to') + "-o", + "--build-options", + dest="buildopts", + default="", + help="build options to be forwarded to telegraf/grafana", + ) + parser.add_option( + "-t", + "--telegraf_url", + dest="telegraf_url", + # default='http://dcinflux01.lbdaq.cern.ch:8189/telegraf', + # Unfortunately lhcb online names are still not resolved by CERN dns... IP address it is (at least for cern based machines) + default="http://10.128.124.77:8189/telegraf", + help="URL to send telegraf output to", + ) (options, args) = parser.parse_args() if options.filename is None: parser.print_help() - print('Please specify an input file') + print("Please specify an input file") return try: os.path.isfile(options.filename) except: - print('Failed to open csv file with rates and devices: %s' % + print("Failed to open csv file with rates and devices: %s" % options.filename) traceback.print_exc() return with open(options.filename) as csv_file: - csv_reader = csv.reader(csv_file, delimiter=',') + csv_reader = csv.reader(csv_file, delimiter=",") for row in csv_reader: device = row[0] device_string = device.strip() - device_string = device_string.replace(' ', '\ ') + device_string = device_string.replace(" ", "\ ") throughput = float(row[1]) - print('Device: ' + device_string + - ', Throughput: %.2f' % (throughput)) + print("Device: " + device_string + + ", Throughput: %.2f" % (throughput)) send_to_telegraf(throughput, device_string, options) diff --git a/checker/plotting/update_gitlab.py b/checker/plotting/update_gitlab.py new file mode 100644 index 0000000000000000000000000000000000000000..971005b70cf91bbfbb0931a69323aa41c0da75ca --- 
/dev/null +++ b/checker/plotting/update_gitlab.py @@ -0,0 +1,62 @@ +############################################################################### +# (c) Copyright 2018-2021 CERN for the benefit of the LHCb Collaboration # +############################################################################### + +import os +import gitlab + +import argparse + + +parser = argparse.ArgumentParser( + description="Update the current GitLab merge request with throughput CI results." +) + +parser.add_argument( + "--throughput-status", + help="Add a hlt1-throughput-decreased label to the merge request.", + choices=["decrease", "increase", "no-change", "nothing"], + default="nothing", +) + +args = parser.parse_args() + + +def get_merge_request(): + gl = gitlab.Gitlab( + "https://gitlab.cern.ch", private_token=os.environ[f"ALLENCI_PAT"] + ) + + gl.auth() + + proj_id = int(os.environ["CI_PROJECT_ID"]) + project = gl.projects.get(proj_id) + + mr = project.mergerequests.get(int(os.environ["CI_MERGE_REQUEST_IID"])) + + return mr + + +def toggle_label(mr, name, enabled): + if name in mr.labels and not enabled: + mr.labels = list([l for l in mr.labels if l != name]) + elif enabled and name not in mr.labels: + labels = mr.labels[:] + labels += [name] + mr.labels = list(set(labels)) + + +def main(): + mr = get_merge_request() + if args.throughput_status != "nothing": + toggle_label( + mr, "hlt1-throughput-decreased", args.throughput_status == "decrease" + ) + # toggle_label( + # mr, "hlt1-throughput-increased", args.throughput_status == "increase" + # ) + mr.save() + + +if __name__ == "__main__": + main() diff --git a/scripts/ci/common.sh b/scripts/ci/common.sh index 8a2f444ee76e99651b2a66a77e7fcdd1eccee4ca..77f248b20a0b339aa784877fc7d569789b12180e 100644 --- a/scripts/ci/common.sh +++ b/scripts/ci/common.sh @@ -52,6 +52,13 @@ function check_build_exists() { fi } +# Define OPTIONS as empty, if not already defined +if [ -z ${OPTIONS+x} ]; then + echo "OPTIONS is not defined - this is fine, 
but I will set it to empty." + OPTIONS="" +fi + + export BUILD_SEQUENCES="all" TOPLEVEL=${PWD} diff --git a/scripts/ci/jobs/publish_throughput.sh b/scripts/ci/jobs/publish_throughput.sh index bd0fdeb265ee78dfccc8c39afbaeef03fc620a8c..33d435bcdf06bf721a1f1d369cf6dced4be34446 100755 --- a/scripts/ci/jobs/publish_throughput.sh +++ b/scripts/ci/jobs/publish_throughput.sh @@ -3,7 +3,8 @@ # (c) Copyright 2018-2020 CERN for the benefit of the LHCb Collaboration # ############################################################################### -set -euxo pipefail +set -uo pipefail +set +xe setupViews @@ -14,22 +15,66 @@ echo "" echo "run_throughput outputs:" ls -1 | grep output | grep run_throughput +THROUGHPUT_ALARM=0 +THROUGHPUT_MESSAGES="" for SEQUENCE_DATASET in $(ls -1 | grep "run_throughput" | grep -Ei "run_throughput_output_([a-z0-9_]+?)" | sed 's/^run_throughput_output_//') ; do + INPUT_FILES=$(cat run_throughput_output_${SEQUENCE_DATASET}/${BREAKDOWN_DEVICE_ID}/input_files.txt) SEQUENCE=$(cat run_throughput_output_${SEQUENCE_DATASET}/${BREAKDOWN_DEVICE_ID}/sequence.txt) + BUILDOPTIONS=$(cat run_throughput_output_${SEQUENCE_DATASET}/${BREAKDOWN_DEVICE_ID}/buildopts.txt) + echo "" + echo "********************************************************************************************************************************************" + echo "********************************************************************************************************************************************" + echo "Throughput of [branch ${CI_COMMIT_REF_NAME} (${CI_COMMIT_SHORT_SHA}), sequence ${SEQUENCE} over dataset ${INPUT_FILES}" + echo "" + echo "" cat run_throughput_output_${SEQUENCE_DATASET}/*/output.txt | grep --color=none "select device" | sed 's/.*:\ [0-9]*\,\ //' > devices_${SEQUENCE_DATASET}.txt cat run_throughput_output_${SEQUENCE_DATASET}/*/output.txt | grep --color=none "events/s" | awk '{ print $1; }' > throughputs_${SEQUENCE_DATASET}.txt - cat devices_${SEQUENCE_DATASET}.txt - cat 
throughputs_${SEQUENCE_DATASET}.txt + # cat devices_${SEQUENCE_DATASET}.txt + # cat throughputs_${SEQUENCE_DATASET}.txt paste -d, devices_${SEQUENCE_DATASET}.txt throughputs_${SEQUENCE_DATASET}.txt > devices_throughputs_${SEQUENCE_DATASET}.csv - cat devices_throughputs_${SEQUENCE_DATASET}.csv + # cat devices_throughputs_${SEQUENCE_DATASET}.csv + + if [ "${BUILDOPTIONS}" = "" ]; then + BUILDOPTIONS_DISPLAY="default" + else + BUILDOPTIONS_DISPLAY=${BUILDOPTIONS} + fi python checker/plotting/post_combined_message.py \ -j "${CI_JOB_NAME}" \ - -l "Throughput of [branch **\`${CI_COMMIT_REF_NAME} (${CI_COMMIT_SHORT_SHA})\`**, sequence **\`${SEQUENCE}\`** over dataset **\`${INPUT_FILES}\`**](https://gitlab.cern.ch/lhcb/Allen/pipelines/${CI_PIPELINE_ID})" \ + -l "Throughput of [branch **\`${CI_COMMIT_REF_NAME} (${CI_COMMIT_SHORT_SHA})\`**, sequence **\`${SEQUENCE}\`** over dataset **\`${INPUT_FILES}\`** build options \`${BUILDOPTIONS_DISPLAY}\`](https://gitlab.cern.ch/lhcb/Allen/pipelines/${CI_PIPELINE_ID})" \ -t devices_throughputs_${SEQUENCE_DATASET}.csv \ - -b run_throughput_output_${SEQUENCE_DATASET}/${BREAKDOWN_DEVICE_ID}/algo_breakdown.csv - - python checker/plotting/post_telegraf.py -f devices_throughputs_${SEQUENCE_DATASET}.csv . -s "${SEQUENCE}" -b "${CI_COMMIT_REF_NAME}" -d "${INPUT_FILES}" + -b run_throughput_output_${SEQUENCE_DATASET}/${BREAKDOWN_DEVICE_ID}/algo_breakdown.csv \ + --allowed-average-decrease "${AVG_THROUGHPUT_DECREASE_THRESHOLD}" \ + --allowed-single-decrease "${DEVICE_THROUGHPUT_DECREASE_THRESHOLD}" # (%) + RC=$? + + python checker/plotting/post_telegraf.py -f devices_throughputs_${SEQUENCE_DATASET}.csv . 
-s "${SEQUENCE}" -b "${CI_COMMIT_REF_NAME}" -d "${INPUT_FILES}" -o "${BUILDOPTIONS}" + + if [ "$RC" = "5" ]; then + THROUGHPUT_ALARM=1 + THROUGHPUT_MESSAGES="${THROUGHPUT_MESSAGES} +*** sequence ${SEQUENCE} over dataset ${INPUT_FILES} - Device-averaged throughput change is less than ${AVG_THROUGHPUT_DECREASE_THRESHOLD} %" + elif [ "$RC" = "6" ]; then + THROUGHPUT_ALARM=1 + THROUGHPUT_MESSAGES="${THROUGHPUT_MESSAGES} +*** sequence ${SEQUENCE} over dataset ${INPUT_FILES} - Single-device throughput change, for at least one device, is less than ${DEVICE_THROUGHPUT_DECREASE_THRESHOLD} %" + fi + echo "" + echo "" + done + +if [ "${THROUGHPUT_ALARM}" = "1" ]; then + python checker/plotting/update_gitlab.py --throughput-status "decrease" +else + python checker/plotting/update_gitlab.py --throughput-status "no-change" +fi + +echo "" +echo "" +echo "${THROUGHPUT_MESSAGES}" + +exit $THROUGHPUT_ALARM diff --git a/scripts/ci/jobs/run_throughput.sh b/scripts/ci/jobs/run_throughput.sh index 8e5f7b1afe8afd0e61de56d615614704394bc6ea..0470d50cc4666d5a34c76042e1e1d7257e813f1d 100755 --- a/scripts/ci/jobs/run_throughput.sh +++ b/scripts/ci/jobs/run_throughput.sh @@ -19,7 +19,7 @@ fi RUN_OPTIONS="--mdf ${ALLEN_DATA}/mdf_input/${DATA_TAG}.mdf --sequence ${SEQUENCE} --run-from-json 1 ${RUN_OPTIONS}" set -euxo pipefail -OUTPUT_FOLDER_REL="${TEST_NAME}_output_${SEQUENCE}_${DATA_TAG}/${DEVICE_ID}" +OUTPUT_FOLDER_REL="${TEST_NAME}_output_${SEQUENCE}_${DATA_TAG}${OPTIONS}/${DEVICE_ID}" mkdir -p ${OUTPUT_FOLDER_REL} OUTPUT_FOLDER=$(realpath ${OUTPUT_FOLDER_REL}) @@ -120,6 +120,7 @@ echo "Throughput (kHz, 2 d.p.): ${THROUGHPUT_KHZ}" echo "${DATA_TAG}" > "${OUTPUT_FOLDER}/input_files.txt" echo "${SEQUENCE}" > "${OUTPUT_FOLDER}/sequence.txt" +echo "${OPTIONS}" > "${OUTPUT_FOLDER}/buildopts.txt" echo "${THROUGHPUT}" > "${OUTPUT_FOLDER}/throughput.txt" echo "${CI_COMMIT_SHORT_SHA}" > "${OUTPUT_FOLDER}/revision.txt"