Commit 7bfde71f authored by Andrea Sciaba's avatar Andrea Sciaba
Browse files

Merge branch 'qa' into 'qa'

Timeout/logic improvements for xrootd-fallback test

See merge request !5
parents 5de0ffd0 0f6405e2
......@@ -27,9 +27,9 @@ CSWNFB_FILES = [ "/store/mc/SAM/GenericTTbar/AODSIM/" + \
"/store/mc/SAM/GenericTTbar/AODSIM/" + \
"CMSSW_9_2_6_91X_mcRun1_realistic_v2-v1/00000/" + \
"CE860B10-5D76-E711-BCA8-FA163EAA761A.root" ]
CSWNFB_SITES = ["T1_FR_CCIN2P3", "T1_RU_JINR", "T2_CN_Beijing", "T2_BE_IIHE", \
"T2_FR_GRIF_LLR", "T2_HU_Budapest", "T2_UK_London_Brunel", \
"T2_UK_London_IC", "T2_US_Nebraska", "T2_US_Wisconsin"]
CSWNFB_SITES = ["T1_FR_CCIN2P3", "T2_US_Nebraska", "T1_RU_JINR", "T2_UK_London_Brunel", \
"T2_CN_Beijing", "T2_BE_IIHE", "T2_FR_GRIF_LLR", "T2_HU_Budapest", \
"T2_UK_London_IC", "T2_US_Wisconsin"]
......@@ -47,19 +47,24 @@ process.SiteLocalConfigService = cms.Service("SiteLocalConfigService",
overrideSourceCacheHintDir = cms.untracked.string("application-only"),
)
process.dump = cms.EDAnalyzer("EventContentAnalyzer", listContent=cms.untracked.bool(False), getData=cms.untracked.bool(True))
process.dump = cms.EDAnalyzer("EventContentAnalyzer",
listContent=cms.untracked.bool(False),
verboseForModuleLabels = cms.untracked.vstring("recoTracks_generalTracks"),
getDataForModuleLabels=cms.untracked.vstring("recoTracks_generalTracks"),
getData=cms.untracked.bool(True),
)
process.load("FWCore.MessageService.MessageLogger_cfi")
process.MessageLogger.cerr.FwkReport.reportEvery = 1
process.maxEvents = cms.untracked.PSet(
input = cms.untracked.int32(10)
input = cms.untracked.int32(1)
)
process.p = cms.EndPath(process.dump)
"""
def configure_logging(lvl=logging.INFO):
logger = logging.getLogger("cms.CE.xrootd-access")
logger = logging.getLogger("cms.CE.xrootd-fallback")
logger.setLevel(lvl)
handler = logging.StreamHandler(sys.stdout)
formatter = logging.Formatter(fmt="[%(process)d] %(asctime)s [%(levelname)07s]: %(message)s")
......@@ -73,14 +78,14 @@ def print_summary(summary, retval):
log.info(summary)
else:
log.error(summary)
print "summary: %s" % summary
print "Summary: %s" % summary
return retval
def parse_opts():
parser = optparse.OptionParser()
parser.add_option("-v", "--verbose", dest="verbose", help="Increase logging verbosity", action="store_true", default=False)
parser.add_option("-H", "--host", dest="hostname", help="Hostname to use")
parser.add_option("-t", "--timeout", dest="timeout", help="Test timeout in seconds; default is 240", default=240, type="int")
parser.add_option("-t", "--timeout", dest="timeout", help="Test timeout in seconds; default is 300", default=300, type="int")
opts, args = parser.parse_args()
......@@ -108,20 +113,23 @@ def runCommandChild(cmd, args):
finally:
os._exit(127)
def runCommandParent(r, pid, opts):
def runCommandParent(r, pid, opts, cmsruntimeout):
flags = fcntl.fcntl(r, fcntl.F_GETFL)
flags |= os.O_NONBLOCK
fcntl.fcntl(r, fcntl.F_SETFL, flags)
xlist = []
rlist = [r]
wlist = []
timeout = opts.endtime - time.time()
if cmsruntimeout > 0:
endtime = min(opts.endtime, time.time() + cmsruntimeout)
else:
endtime = opts.endtime
timeout = endtime - time.time()
stdout = ""
exitCode = -1
while (timeout >= 0) and (r not in xlist):
rlist, wlist, xlist = select.select(rlist, wlist, xlist, timeout)
timeout = opts.endtime - time.time()
timeout = endtime - time.time()
if r in rlist:
newstr = os.read(r, 1024)
stdout += newstr
......@@ -141,18 +149,18 @@ def runCommandParent(r, pid, opts):
exitCode = -1
if (timeout < 0) and (exitCode < 0):
os.kill(pid, signal.SIGKILL)
print "Killed CMSSW child (pid %d) due to timeout." % pid
log.error("Killed CMSSW child (pid %d) due to timeout." % pid)
if exitCode < 0:
pid, exitCode = os.waitpid(pid, 0)
return stdout, exitCode
def runCommand(cmd, args, opts, combineStd=False):
def runCommand(cmd, args, opts, cmsruntimeout=0, combineStd=False):
r, w = os.pipe()
try:
pid = os.fork()
if pid: # parent
os.close(w)
return runCommandParent(r, pid, opts)
return runCommandParent(r, pid, opts, cmsruntimeout)
else:
os.close(r)
os.dup2(w, 1)
......@@ -370,6 +378,11 @@ def main():
no_trial = 0
while ( no_trial < 3 ):
if (opts.endtime - time.time()) < 60:
log.error("Timed out before reaching 3 attempts limit")
exitCode = 8015
break
xrootd_file = "/store/test/xrootd/" + CSWNFB_SITES[rndm_site] + CSWNFB_FILES[rndm_file]
log.info("Xrootd fullpath: %s" % xrootd_file)
......@@ -377,10 +390,11 @@ def main():
fd.write(cms_file % (xrootd_file, level))
fd.close()
stdout, exitCode = runCommand("cmsRun", ["test_xrootd.py"], opts, combineStd=True)
stdout, exitCode = runCommand("cmsRun", ["test_xrootd.py"], opts, 150, combineStd=True)
no_trial += 1
for line in stdout.split('\n'):
if re.search('opened', line) or re.search('redirect', line):
if re.search('opened', line) or re.search('redirect', line) or re.search('Reading', line) or re.search('server', line):
print line
maxlen = 12*1024
......@@ -393,18 +407,25 @@ def main():
if ( exitCode == 0 ):
break
log.error("Failed cmsRun. Output:")
if (opts.endtime - time.time()) < 60:
log.error("Not enough time left for another try")
break
log.error("Failed cmsRun output:")
print stdout
rndm_site = (rndm_site + 1) % len(CSWNFB_SITES)
if ( CSWNFB_SITES[rndm_site] == siteName ):
rndm_site = (rndm_site + 1) % len(CSWNFB_SITES)
log.info("retrying with fallback site: %s" % CSWNFB_SITES[rndm_site])
log.info("Retrying with fallback site: %s" % CSWNFB_SITES[rndm_site])
if exitCode:
returnCode = NAG_CRITICAL
if numCatalogs > 1: # Fallback correctly configured, so only WARN
if exitCode == 8015:
returnCode = NAG_WARNING
return print_summary("Test reached timeout before the third attempt; exit code %s" % exitCode, returnCode)
returnCode = NAG_CRITICAL
return print_summary("Failed cmsRun; exit code %d" % exitCode, returnCode)
log.info("Successful cmsRun.")
# Return the correct exit code.
......
......@@ -4,7 +4,7 @@
Summary: WLCG Compliant Probes from %{site}
Name: nagios-plugins-wlcg-org.cms
Version: 1.1.54
Version: 1.1.55
Release: 1%{?dist}
License: GPL
......@@ -52,6 +52,8 @@ install --directory %{buildroot}/etc/cron.d
/etc/cron.d/cms_glexec
%changelog
* Fri Feb 1 2019 Andrea Sciaba <Andrea.Sciaba@cern.ch> 1.1.55-1.
- Made xrootd fallback test critical
* Thu Dec 6 2018 Andrea Sciaba <Andrea.Sciaba@cern.ch> 1.1.54-1.
- fixes in xrootd-fallback and moved CMSSW to CMSSW_9_2_6 in tests
* Tue Nov 6 2018 Andrea Sciaba <Andrea.Sciaba@cern.ch> 1.1.53-1.
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment