Draft: Handle multiple throughput options in a single job

Closed Arthur Marius Hennequin requested to merge ahennequ_merge_throughput into master
1 file  +153  -137
@@ -176,6 +176,19 @@ def send_gitlab_feedback(
        remove_labels=remove_labels,
    )
def getOptions(options, log_files):
    # Collect the distinct option names encoded in the log file names,
    # i.e. everything after the first "_" in "<test>_<option>.log".
    d = set()
    for f in log_files:
        name = f.split('/')[-1].split(".")[0]
        option = "_".join(name.split("_")[1:])
        d.add(option)
    if len(d) == 0:
        # no log files to inspect: fall back to the single option passed in
        return [options], [log_files]
    # return parallel lists: one option per entry, with its matching log files
    options = list(d)
    all_log_files = []
    for option in options:
        all_log_files.append([f for f in log_files if option in f])
    return options, all_log_files
class ThroughputProfileHandler(BaseHandler):
    def __init__(self):
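For reference, a minimal sketch of how getOptions partitions log files. The file names and the call below are hypothetical, assuming the "<test>_<option>.log" naming convention implied by the parsing above:

    log_files = [
        "results/ThroughputTest_Moore_hlt1_pp_default.log",
        "results/ThroughputTest_Moore_spruce_all_lines.log",
    ]
    options, all_log_files = getOptions("Moore_hlt1_pp_default", log_files)
    # options       -> ['Moore_hlt1_pp_default', 'Moore_spruce_all_lines']
    #                  (set iteration order, so not guaranteed stable)
    # all_log_files -> one list per option, each holding only that
    #                  option's log files (matched by substring)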
@@ -208,151 +221,154 @@ class ThroughputProfileHandler(BaseHandler):
if f.endswith(".log")
]
throughput = sum(get_throughput(f) for f in log_files if "ThroughputTest" in f)
str_tput = "{:.1f}".format(throughput)
self.saveFloat(
"max_throughput",
throughput,
description="maximum throughput",
group="throughput",
)
# measure the total bandwidth
# only for sprucing for now
measure_bandwidth = options in ['Moore_spruce_all_lines']
if measure_bandwidth:
run_log = os.path.join(directory, "run.log")
if not os.path.isfile(run_log):
log.warning('There is no run.log!')
measure_bandwidth = False
bandwidth = get_bandwidth(run_log) if measure_bandwidth else 0.
dirname = (
f"Throughput_{version}_{options}_{platform}_{startTime.replace(' ', '_')}"
)
targetRootWebDir = os.path.join(WWW_BASE_URL, dirname)
options, log_files = getOptions(options, log_files)
# concatenate log files into one file
with open("tests.log", "w") as outfile:
for fname in log_files:
outfile.write(
"\n{sep}\n{fname}\n{sep}\n\n".format(sep="=" * 80, fname=fname)
)
with open(fname) as infile:
for line in infile:
outfile.write(line)
trend_url = os.path.join(WWW_BASE_URL, f"trend_throughput_{options}_{slot}.png")
request = requests.get(trend_url)
if request.status_code != 200:
trend_url = None
with open("index.html", "w") as html_file:
html = REPORT_TEMPLATE.render(
version=version,
platform=platform,
hostname=hostname,
cpu_info=cpu_info,
options=options,
throughput=str_tput,
WWW_BASE_URL=WWW_BASE_URL,
dirname=dirname,
trend_url=trend_url,
for options, log_files in zip(options, log_files):
throughput = sum(get_throughput(f) for f in log_files if "ThroughputTest" in f)
str_tput = "{:.1f}".format(throughput)
self.saveFloat(
"max_throughput",
throughput,
description="maximum throughput",
group="throughput",
)
html_file.write(html)
log.debug("Generated HTML report:\n" + html)
for filename in [
os.path.join(directory, "flamy.svg"),
os.path.join(directory, "flamy.svg"),
os.path.join(directory, "FlameBars.pdf"),
os.path.join(directory, "FlameBars.png"),
"index.html",
"tests.log",
]:
publish.upload_eos_www(
filename,
os.path.join(dirname, os.path.basename(filename)),
# measure the total bandwidth
# only for sprucing for now
measure_bandwidth = options in ['Moore_spruce_all_lines']
if measure_bandwidth:
run_log = os.path.join(directory, "run.log")
if not os.path.isfile(run_log):
log.warning('There is no run.log!')
measure_bandwidth = False
bandwidth = get_bandwidth(run_log) if measure_bandwidth else 0.
dirname = (
f"Throughput_{version}_{options}_{platform}_{startTime.replace(' ', '_')}"
)
targetRootWebDir = os.path.join(WWW_BASE_URL, dirname)
self.saveString(
"algousage",
os.path.join(targetRootWebDir, "flamy.svg"),
description="link to algo usage plot",
group="performance",
)
# concatenate log files into one file
with open("tests.log", "w") as outfile:
for fname in log_files:
outfile.write(
"\n{sep}\n{fname}\n{sep}\n\n".format(sep="=" * 80, fname=fname)
)
with open(fname) as infile:
for line in infile:
outfile.write(line)
trend_url = os.path.join(WWW_BASE_URL, f"trend_throughput_{options}_{slot}.png")
request = requests.get(trend_url)
if request.status_code != 200:
trend_url = None
with open("index.html", "w") as html_file:
html = REPORT_TEMPLATE.render(
version=version,
platform=platform,
hostname=hostname,
cpu_info=cpu_info,
options=options,
throughput=str_tput,
WWW_BASE_URL=WWW_BASE_URL,
dirname=dirname,
trend_url=trend_url,
)
html_file.write(html)
log.debug("Generated HTML report:\n" + html)
for filename in [
os.path.join(directory, "flamy.svg"),
os.path.join(directory, "flamy.svg"),
os.path.join(directory, "FlameBars.pdf"),
os.path.join(directory, "FlameBars.png"),
"index.html",
"tests.log",
]:
publish.upload_eos_www(
filename,
os.path.join(dirname, os.path.basename(filename)),
)
# send notification on mattermost channel
cpu_model = cpu_info.split(" @")[0].replace("(R)", "").replace(" ", "-")
mattermost_message = (
"The results of latest throughput test "
f"[{options} {version} {platform} {cpu_model}]({targetRootWebDir}):\n"
f"`Throughput = {str_tput} Events/s`"
)
if measure_bandwidth:
mattermost_message += (
f", `Bandwidth = {bandwidth:.1f} MB/s`"
self.saveString(
"algousage",
os.path.join(targetRootWebDir, "flamy.svg"),
description="link to algo usage plot",
group="performance",
)
publish.post_mattermost(mattermost_message)
# let's post a reply to gitlab about the throughput test result
if (slot in ["lhcb-master-mr", "lhcb-master-ref", "lhcb-master"]) and (
options
in [
"Moore_hlt1_pp_default",
"Moore_hlt2_reco_baseline",
"Moore_hlt2_fastest_reco",
"Moore_hlt2_pp_thor",
"Moore_spruce_all_lines",
]
):
# The feedback needs to compare the results from the reference (*-ref or master)
# and the -mr builds. We don't know which completes first,
# so we must try both cases.
# For a better treatment in the future, see LBCORE-1984
for ref, test, trigger in dashboard.get_ci_test_pairs(slot, build_id):
try:
if test == (slot, build_id):
# The handler runs for the -mr build, so fetch the -ref results
new_throughput = throughput
web_link = targetRootWebDir
new_bandwidth = bandwidth
ref_throughput, ref_web_link = get_couchdb_throughput_link(
ref[0], ref[1], options
)
if measure_bandwidth:
ref_bandwidth = get_couchdb_bandwidth(
# send notification on mattermost channel
cpu_model = cpu_info.split(" @")[0].replace("(R)", "").replace(" ", "-")
mattermost_message = (
"The results of latest throughput test "
f"[{options} {version} {platform} {cpu_model}]({targetRootWebDir}):\n"
f"`Throughput = {str_tput} Events/s`"
)
if measure_bandwidth:
mattermost_message += (
f", `Bandwidth = {bandwidth:.1f} MB/s`"
)
publish.post_mattermost(mattermost_message)
# let's post a reply to gitlab about the throughput test result
if (slot in ["lhcb-master-mr", "lhcb-master-ref", "lhcb-master"]) and (
options
in [
"Moore_hlt1_pp_default",
"Moore_hlt2_reco_baseline",
"Moore_hlt2_fastest_reco",
"Moore_hlt2_pp_thor",
"Moore_spruce_all_lines",
]
):
# The feedback needs to compare the results from the reference (*-ref or master)
# and the -mr builds. We don't know which completes first,
# so we must try both cases.
# For a better treatment in the future, see LBCORE-1984
for ref, test, trigger in dashboard.get_ci_test_pairs(slot, build_id):
try:
if test == (slot, build_id):
# The handler runs for the -mr build, so fetch the -ref results
new_throughput = throughput
web_link = targetRootWebDir
new_bandwidth = bandwidth
ref_throughput, ref_web_link = get_couchdb_throughput_link(
ref[0], ref[1], options
)
else: ref_bandwidth = 0.
elif ref == (slot, build_id):
# The handler runs for the -ref build, so fetch the -mr results
ref_throughput = throughput
ref_web_link = targetRootWebDir
ref_bandwidth = bandwidth
new_throughput, web_link = get_couchdb_throughput_link(
test[0], test[1], options
)
if measure_bandwidth:
new_bandwidth = get_couchdb_bandwidth(
if measure_bandwidth:
ref_bandwidth = get_couchdb_bandwidth(
ref[0], ref[1], options
)
else: ref_bandwidth = 0.
elif ref == (slot, build_id):
# The handler runs for the -ref build, so fetch the -mr results
ref_throughput = throughput
ref_web_link = targetRootWebDir
ref_bandwidth = bandwidth
new_throughput, web_link = get_couchdb_throughput_link(
test[0], test[1], options
)
else: new_bandwidth = 0.
if measure_bandwidth:
new_bandwidth = get_couchdb_bandwidth(
test[0], test[1], options
)
else: new_bandwidth = 0.
else:
assert False
except dashboard.ResourceNotFound:
# The job for the other build hasn't finished yet => do nothing.
# The message will be posted from the other job's handler.
log.warning(
"Could not fetch results for other slot, not posting reply."
)
else:
assert False
except dashboard.ResourceNotFound:
# The job for the other build hasn't finished yet => do nothing.
# The message will be posted from the other job's handler.
log.warning(
"Could not fetch results for other slot, not posting reply."
)
else:
send_gitlab_feedback(
new_throughput,
ref_throughput,
new_bandwidth,
ref_bandwidth,
options,
web_link,
ref_web_link,
trigger,
)
send_gitlab_feedback(
new_throughput,
ref_throughput,
new_bandwidth,
ref_bandwidth,
options,
web_link,
ref_web_link,
trigger,
)
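Note on the loop header above: for options, log_files in zip(options, log_files) rebinds the list names to per-option values. This is safe, since zip() captures the two list objects before the names are rebound, but distinct names would make the scoping explicit; a minimal sketch with hypothetical names:

    options, per_option_logs = getOptions(options, log_files)
    for option, option_logs in zip(options, per_option_logs):
        # one option string and its own log files per iteration
        throughput = sum(
            get_throughput(f) for f in option_logs if "ThroughputTest" in f
        )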