From 04c916ba6e516d74d9425eff1a7fca13eb61248f Mon Sep 17 00:00:00 2001
From: Marco Clemencic <marco.clemencic@cern.ch>
Date: Tue, 26 Sep 2023 14:33:02 +0200
Subject: [PATCH] Add workaround for transient apptainer failures

Retry the invocation if apptainer prints a specific signature message.
---
 python/LbNightlyTools/BuildMethods.py         | 23 ++++++--
 python/LbNightlyTools/tests/test_apptainer.py | 54 +++++++++++++++++++
 2 files changed, 73 insertions(+), 4 deletions(-)
 create mode 100644 python/LbNightlyTools/tests/test_apptainer.py

diff --git a/python/LbNightlyTools/BuildMethods.py b/python/LbNightlyTools/BuildMethods.py
index 56a96fbd..b3f14245 100644
--- a/python/LbNightlyTools/BuildMethods.py
+++ b/python/LbNightlyTools/BuildMethods.py
@@ -1,5 +1,5 @@
 ###############################################################################
-# (c) Copyright 2013 CERN                                                     #
+# (c) Copyright 2013-2023 CERN                                                #
 #                                                                             #
 # This software is distributed under the terms of the GNU General Public      #
 # Licence version 3 (GPL Version 3), copied verbatim in the file "COPYING".   #
@@ -20,8 +20,8 @@ __author__ = "Marco Clemencic <marco.clemencic@cern.ch>"
 import logging
 import os
 import re
-import shutil
 from datetime import datetime
+from time import sleep
 
 from LbNightlyTools.Utils import compatible_lcg_external_files, find_path
 from LbNightlyTools.Utils import log_call as _log_call
@@ -139,7 +139,22 @@ def log_call(cmd, *args, **kwargs):
         cwd=kwargs.get("cwd", os.getcwd()),
     )
 
-    return _log_call(cmd, *args, **kwargs)
+    # Workaround for https://gitlab.cern.ch/lhcb-core/LbNightlyTools/-/issues/119
+    for attempt in range(5):
+        if attempt != 0:
+            sleep(10)
+        result = _log_call(cmd, *args, **kwargs)
+        if (
+            b"Failed to get file information for file descriptor 3"
+            not in result["stdout"]
+        ):
+            if attempt != 0:
+                __log__.debug("apptainer successfully started on attempt %d", attempt)
+            break
+        __log__.debug("apptainer failed to start on attempt %d", attempt)
+    else:
+        __log__.warning("giving up after repeated failures of apptainer")
+    return result
 
 
 def ensure_dir(path):
@@ -244,7 +259,7 @@ class make(object):
                 target,
                 started.isoformat(),
                 " ".join(quote(a) for a in cmd),
-                result["stdout"],
+                result["stdout"].decode("utf-8", errors="replace"),
                 completed.isoformat(),
             )
             .encode("utf-8")
diff --git a/python/LbNightlyTools/tests/test_apptainer.py b/python/LbNightlyTools/tests/test_apptainer.py
new file mode 100644
index 00000000..ea990e02
--- /dev/null
+++ b/python/LbNightlyTools/tests/test_apptainer.py
@@ -0,0 +1,54 @@
+###############################################################################
+# (c) Copyright 2023 CERN                                                     #
+#                                                                             #
+# This software is distributed under the terms of the GNU General Public      #
+# Licence version 3 (GPL Version 3), copied verbatim in the file "COPYING".   #
+#                                                                             #
+# In applying this licence, CERN does not waive the privileges and immunities #
+# granted to it by virtue of its status as an Intergovernmental Organization  #
+# or submit itself to any jurisdiction.                                       #
+###############################################################################
+import logging
+
+import pytest
+
+import LbNightlyTools.BuildMethods as bm
+
+
+class mock_apptainer_call:  # fake _log_call: errors max_failures times, then ok
+    def __init__(self, max_failures):
+        self.max_failures = max_failures  # leading calls that report the error
+        self.n_of_invocations = 0  # calls received so far
+
+    def __call__(self, *_args, **_kwargs):  # arguments are ignored
+        self.n_of_invocations += 1
+        if self.n_of_invocations <= self.max_failures:  # still in the failing phase
+            return {"stdout": b"Failed to get file information for file descriptor 3\n"}
+        else:
+            return {"stdout": b"all good!\n"}
+
+
+@pytest.mark.parametrize(
+    "max_failures,expected_message",
+    [
+        (1, "apptainer successfully started on attempt 1"),  # one failure, then ok
+        (10, "giving up after repeated failures of apptainer"),  # never succeeds
+    ],
+)
+def test_workaroud(monkeypatch, caplog, max_failures, expected_message):
+    """
+    Retry workaround for https://gitlab.cern.ch/lhcb-core/LbNightlyTools/-/issues/119
+    """
+    # make retries instantaneous: replace the sleep between attempts with a no-op
+    monkeypatch.setattr(bm, "sleep", lambda _: None)
+    # do not run anything: the fake reports the apptainer error max_failures times
+    monkeypatch.setattr(bm, "_log_call", mock_apptainer_call(max_failures))
+    # avoid wrapping the command with apptainer
+    monkeypatch.setenv("BINARY_TAG", "dummy")
+    # capture DEBUG messages too: per-attempt outcomes are logged at that level
+    caplog.set_level(logging.DEBUG)
+
+    bm.log_call(["true"])
+    # attempts are numbered from 0, and both scenarios fail at least once
+    assert "apptainer failed to start on attempt 0" in caplog.text
+    assert expected_message in caplog.text
-- 
GitLab