From 883f385ff87d248fd1ede52fa2aec6a2e696d1e6 Mon Sep 17 00:00:00 2001 From: Mark Stockton <mark@cern.ch> Date: Tue, 5 Nov 2019 18:13:49 +0100 Subject: [PATCH] Add log file checking to find failed child jobs and set mother return code to this result Tested by forcibly killing a child process --- .../TrigTransform/python/trigRecoExe.py | 25 ++++++++++++++++--- 1 file changed, 22 insertions(+), 3 deletions(-) diff --git a/HLT/Trigger/TrigTransforms/TrigTransform/python/trigRecoExe.py b/HLT/Trigger/TrigTransforms/TrigTransform/python/trigRecoExe.py index 62e9791b844..90060002dda 100644 --- a/HLT/Trigger/TrigTransforms/TrigTransform/python/trigRecoExe.py +++ b/HLT/Trigger/TrigTransforms/TrigTransform/python/trigRecoExe.py @@ -17,7 +17,7 @@ import subprocess from PyJobTransforms.trfExe import athenaExecutor #imports for preExecute -from PyJobTransforms.trfUtils import asetupReport, cvmfsDBReleaseCheck +from PyJobTransforms.trfUtils import asetupReport, cvmfsDBReleaseCheck, lineByLine import PyJobTransforms.trfEnv as trfEnv import PyJobTransforms.trfExceptions as trfExceptions from PyJobTransforms.trfExitCodes import trfExit as trfExit @@ -195,8 +195,27 @@ class trigRecoExecutor(athenaExecutor): def postExecute(self): - #TODO - #need to check for HLTMPPU.*Child Issue in the log file and throw an error message if there so we catch that the child died + #Adding check for HLTMPPU.*Child Issue in the log file + # Logs an error message if one is found so we catch that the child died + # Also sets the return code of the mother process to mark the job as failed + # Is based on trfValidation.scanLogFile + log = self._logFileName + msg.debug('Now scanning logfile {0}'.format(log)) + # Using the generator so that lines can be grabbed by subroutines if needed for more reporting + try: + myGen = lineByLine(log, substepName=self._substep) + except IOError as e: + msg.error('Failed to open transform logfile {0}: {1:s}'.format(log, e)) + for line, lineCounter in myGen: + # Check to see if 
any of the hlt children had an issue + if 'Child Issue' in line > -1: + try: + signal = int((re.search('signal ([0-9]*)', line)).group(1)) + except AttributeError: + #text signal not found so just return 0 + signal = 0 + msg.error('Detected issue with HLTChild, setting mother return code to %s' % (signal) ) + self._rc = signal msg.info("Check for expert-monitoring.root file") #the BS-BS step generates the files: -- GitLab