From 04c916ba6e516d74d9425eff1a7fca13eb61248f Mon Sep 17 00:00:00 2001 From: Marco Clemencic <marco.clemencic@cern.ch> Date: Tue, 26 Sep 2023 14:33:02 +0200 Subject: [PATCH] Add workaround for transient apptainer failures retry invocation if a specific signature message is printed by apptainer --- python/LbNightlyTools/BuildMethods.py | 23 ++++++-- python/LbNightlyTools/tests/test_apptainer.py | 54 +++++++++++++++++++ 2 files changed, 73 insertions(+), 4 deletions(-) create mode 100644 python/LbNightlyTools/tests/test_apptainer.py diff --git a/python/LbNightlyTools/BuildMethods.py b/python/LbNightlyTools/BuildMethods.py index 56a96fbd..b3f14245 100644 --- a/python/LbNightlyTools/BuildMethods.py +++ b/python/LbNightlyTools/BuildMethods.py @@ -1,5 +1,5 @@ ############################################################################### -# (c) Copyright 2013 CERN # +# (c) Copyright 2013-2023 CERN # # # # This software is distributed under the terms of the GNU General Public # # Licence version 3 (GPL Version 3), copied verbatim in the file "COPYING". # @@ -20,8 +20,8 @@ __author__ = "Marco Clemencic <marco.clemencic@cern.ch>" import logging import os import re -import shutil from datetime import datetime +from time import sleep from LbNightlyTools.Utils import compatible_lcg_external_files, find_path from LbNightlyTools.Utils import log_call as _log_call @@ -139,7 +139,22 @@ def log_call(cmd, *args, **kwargs): cwd=kwargs.get("cwd", os.getcwd()), ) - return _log_call(cmd, *args, **kwargs) + # Workaround for https://gitlab.cern.ch/lhcb-core/LbNightlyTools/-/issues/119 + for attempt in range(5): + if attempt != 0: + sleep(10) + result = _log_call(cmd, *args, **kwargs) + if ( + b"Failed to get file information for file descriptor 3" + not in result["stdout"] + ): + if attempt != 0: + __log__.debug("apptainer successfully started on attempt %d", attempt) + break + __log__.debug("apptainer failed to start on attempt %d", attempt) + else: + __log__.warning("giving up after repeated failures of apptainer") + return result def ensure_dir(path): @@ -244,7 +259,7 @@ class make(object): target, started.isoformat(), " ".join(quote(a) for a in cmd), - result["stdout"], + result["stdout"].decode("utf-8", errors="replace"), completed.isoformat(), ) .encode("utf-8") diff --git a/python/LbNightlyTools/tests/test_apptainer.py b/python/LbNightlyTools/tests/test_apptainer.py new file mode 100644 index 00000000..ea990e02 --- /dev/null +++ b/python/LbNightlyTools/tests/test_apptainer.py @@ -0,0 +1,54 @@ +############################################################################### +# (c) Copyright 2023 CERN # +# # +# This software is distributed under the terms of the GNU General Public # +# Licence version 3 (GPL Version 3), copied verbatim in the file "COPYING". # +# # +# In applying this licence, CERN does not waive the privileges and immunities # +# granted to it by virtue of its status as an Intergovernmental Organization # +# or submit itself to any jurisdiction. # +############################################################################### +import logging + +import pytest + +import LbNightlyTools.BuildMethods as bm + + +class mock_apptainer_call: + def __init__(self, max_failures): + self.max_failures = max_failures + self.n_of_invocations = 0 + + def __call__(self, *_args, **_kwargs): + self.n_of_invocations += 1 + if self.n_of_invocations <= self.max_failures: + return {"stdout": b"Failed to get file information for file descriptor 3\n"} + else: + return {"stdout": b"all good!\n"} + + +@pytest.mark.parametrize( + "max_failures,expected_message", + [ + (1, "apptainer successfully started on attempt 1"), + (10, "giving up after repeated failures of apptainer"), + ], +) +def test_workaroud(monkeypatch, caplog, max_failures, expected_message): + """ + https://gitlab.cern.ch/lhcb-core/LbNightlyTools/-/issues/119 + """ + # avoid sleep time between retries + monkeypatch.setattr(bm, "sleep", lambda _: None) + # avoid running anything and pretend we ran apptainer and that fails a number of times + monkeypatch.setattr(bm, "_log_call", mock_apptainer_call(max_failures)) + # avoid wrapping the command with apptainer + monkeypatch.setenv("BINARY_TAG", "dummy") + # capture DEBUG logging messages + caplog.set_level(logging.DEBUG) + + bm.log_call(["true"]) + + assert "apptainer failed to start on attempt 0" in caplog.text + assert expected_message in caplog.text -- GitLab