diff --git a/doc/release.notes b/doc/release.notes
index 720aea8a5e0a788b1aaf35b4a9f3c319dd2dadbb..afbea0285256b3fda607d81bffcdfd5c2aface78 100644
--- a/doc/release.notes
+++ b/doc/release.notes
@@ -1,9 +1,19 @@
 !-----------------------------------------------------------------------------
 ! Package     : DBASE/MCStatTools
-! Responsible : Marc-Olivier Bettler, Alex Grecu
+! Responsible : Alex Grecu, Marc-Olivier Bettler
 ! Purpose     : Tools for MC Production statistics
 !               (https://twiki.cern.ch/twiki/bin/view/LHCb/DownloadAndBuild)
 !-----------------------------------------------------------------------------
 
+!========================== 2019-12-10 MCStatTools v4r7p6 ======================
+! 2019-12-10 - Alex Grecu
+ - add module __version__ attribute which holds the date of the last major
+   change in the module code
+! 2019-12-01 - Alex Grecu
+ - add module version debug messages
+! 2019-11-30 - Alex Grecu
+ - patch to automatically detect and process XML logs from a single ZIP
+   containing all production job logs (runtime and traffic drawback due to
+   the download and unzip steps)
 !========================== 2019-07-09 MCStatTools v4r7p5 ======================
 ! 2019-06-28 - Alex Grecu
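
The per-module __version__ pattern these notes describe is applied to every
script touched below. A minimal standalone sketch (the date value and module
are illustrative only; the 'lbMcST' logger hierarchy is the one the scripts
already use):

    # sketch of the shared __version__ / debug-logging pattern (Python 2)
    import logging

    __version__ = '20191210'  # date (YYYYMMDD) of the last major change

    mlog = logging.getLogger('lbMcST.%s' % (__name__))
    mlog.setLevel(mlog.parent.level)
    mlog.debug('Module %s v. %s loaded.' % (__name__, __version__))
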
diff --git a/scripts/DownloadAndBuildStat.py b/scripts/DownloadAndBuildStat.py
old mode 100755
new mode 100644
index e86b93c46564fedcf777ff1401b9f31825653d64..e4ba4b891e5bbe475febba6b7ed3ecd7c20c1d81
--- a/scripts/DownloadAndBuildStat.py
+++ b/scripts/DownloadAndBuildStat.py
@@ -23,7 +23,7 @@
 from glob import glob
 from datetime import date, datetime
 from optparse import OptionParser
-
+__version__ = '20190604'
 logs_server_url = 'http://lhcb-logs.cern.ch/storage/lhcb/MC/'
 dt_format = '%a, %d.%m.%Y @ %H:%M:%S'
 list_Dirac_id = []
@@ -179,8 +179,8 @@ def get_GenerationXml(cwd, nb_logs, prodId):
     return stat, max_
 
 def mkJoin_Html (pID, statDict, nb_jobs):
-    global GaussStat, prodCache
-    import lbDirac_utils as lbDrk
+    global prodCache, lbDrk
+    import GaussStat
     mlog.info('Get meta-information from DIRAC...')
     jobDescription = {}
     #retrieve production information from DIRAC - use early cached info if valid
@@ -298,9 +298,7 @@
 parse_options()
 
 import lbDirac_utils as lbDrk #fails here if LHCbDirac environment not set
 import GenXmlLogParser as XmlParser
-import GaussStat
 
-GaussStat.sayHello()
 # if using "local" JSON files
 online_pids = []
@@ -386,6 +384,8 @@ if not fallbackOldAlgo:
         dirac_logger = logging.getLogger('gfal2')
         dirac_logger.setLevel(logging.WARN)
         prodWkPath = os.path.join(workPath, 'GenLogs-%d' % (prod))
+        if not os.path.exists(prodWkPath) or not os.path.isdir(prodWkPath):
+            os.mkdir(prodWkPath, 0700)
         if use_local_logs:
             validFiles = preValidateLocalXMLs(prodWkPath)
             if len(validFiles) > 1:
@@ -396,39 +396,46 @@ if not fallbackOldAlgo:
             mlog.info('Could not find enough local XMLs for prod. #%d. Fallback to normal execution.' % (prod))
             del validFiles
         prodlfn = lbDrk.prod_lpath % prodCache[prod]['metadata']
-        lfns = lbDrk.get_LFNs_on_SE(prodlfn, nbLimit=nb_logs)
+        lfns = lbDrk.get_LFNs_on_SE(prodlfn, nbLimit=nb_logs, localPath=prodWkPath)
         if len(lfns) == 0:
             mlog.warn('No valid xml LFNs found for production #%d. Skipping...' % (prod))
             continue
         if len(lfns) < nb_logs:
             mlog.warn('Number of available logs smaller than requested. Will merge %d XML logs.' % (len(lfns)))
-        lfns.sort()
-        if not os.path.exists(prodWkPath) or not os.path.isdir(prodWkPath):
-            os.mkdir(prodWkPath, 0700)
+        # paths that are not LFNs but already downloaded XML files are prefixed with file://
+        mlFiles = filter(lambda x: x.startswith('file:///'), lfns)
         k = 0
-        mlog.debug('Downloading XML logs for prod %d. Processing...' % (prod))
-        fcnt = len(lfns)
-        pstart = datetime.now()
-        tvar = pstart
-        for i in range(0, fcnt):
-            lfn = lfns[i]
-            (head, xfname)= os.path.split(lfn)
-            (head, tail)= os.path.split(head)
-            suff = ''
-            (fbase, fext) = os.path.splitext(xfname)
-            if fext.endswith('gz'):
-                suff = '.tgz'
-            # change local name on download to process correctly with tarfile
-            fn = 'GeneratorLog-%s.xml%s' % (tail, suff)
-            if lbDrk.getSEfile(lfn, prodWkPath, localFilename=fn):
-                mlog.warn('Download of LFN %s from LogSE has failed. Skipping...' % (lfn))
-                continue
-            k += 1
-            if (datetime.now() - tvar).seconds > 15:
-                tvar = datetime.now()
-                mlog.debug('Prod %d: %3.2f%% XMLs processed...' % (prod, float(k)/float(fcnt)*100.))
-            if k == nb_logs:
-                break
+        if len(mlFiles) > 0:
+            olfns = list(lfns)
+            lfns = filter(lambda x: x not in mlFiles, olfns)
+            k = len(mlFiles)
+            mlog.info('Already downloaded %d XML logs while searching on LogSE...' % (k))
+        if len(lfns) > 0:
+            lfns.sort()
+            fcnt = len(lfns)
+            mlog.debug('Downloading %d XML logs for prod #%d. Processing...' % (fcnt, prod))
+            pstart = datetime.now()
+            tvar = pstart
+            for i in range(0, fcnt):
+                lfn = lfns[i]
+                (head, xfname) = os.path.split(lfn)
+                (head, jobid) = os.path.split(head)
+                (head, runid) = os.path.split(head)
+                suff = ''
+                (fbase, fext) = os.path.splitext(xfname)
+                if fext.endswith('gz'):
+                    suff = '.tgz'
+                # change local name on download to process correctly with tarfile
+                fn = 'GeneratorLog-%s-%s.xml%s' % (runid, jobid, suff)
+                if lbDrk.getSEfile(lfn, prodWkPath, localFilename=fn):
+                    mlog.warn('Download of LFN %s from LogSE has failed. Skipping...' % (lfn))
+                    continue
+                k += 1
+                if (datetime.now() - tvar).seconds > 15:
+                    tvar = datetime.now()
+                    mlog.debug('Prod %d: %3.2f%% XMLs processed...' % (prod, float(k)/float(fcnt)*100.))
+                if k == nb_logs:
+                    break
         validFiles = preValidateLocalXMLs(prodWkPath)
         if len(validFiles) == 0:
             mlog.warning('No downloaded/available log files for production. Skipping Prod ID %d ...' % (prod))
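
Note how the reworked loop consumes get_LFNs_on_SE() results: the returned
list may now mix plain LFNs with file:/// URLs pointing at XMLs already
extracted from single-ZIP job logs. A standalone sketch of that split (both
sample entries are invented for illustration):

    # sketch only -- the two paths below are hypothetical
    lfns = ['/lhcb/MC/2018/LOG/00012345/0000/00001234/GeneratorLog.xml.gz',
            'file:///tmp/GenLogs-12345/GeneratorLog-0000-00001235.xml']
    mlFiles = filter(lambda x: x.startswith('file:///'), lfns)  # already on disk
    lfns = filter(lambda x: x not in mlFiles, lfns)             # still to download
    k = len(mlFiles)  # pre-downloaded logs count toward the nb_logs quota
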
diff --git a/scripts/GaussStat.py b/scripts/GaussStat.py
index e41170f0fa193939ee5dbb4b05dbd3b40743ef54..1dbf25ea66e95f758253f4bf1bcac8a23a12f8f7 100644
--- a/scripts/GaussStat.py
+++ b/scripts/GaussStat.py
@@ -11,8 +11,10 @@
 #==============================================================================================
 #==============================================================================================
 
-global script_version
+"""Module implementing functions required to create/merge generator statistics tables in HTML and JSON format."""
+
 script_version = '20190531' #previous versions: '20180530', '20180323','20171020','20130920','20190123'
+__version__ = '20190531'
 simStatRepoPath = '/eos/project/l/lhcbwebsites/www/projects/STATISTICS'
 simStatFormat = '%sSTAT'
 simFmtRe = r'Sim(\d+).*'
@@ -29,6 +31,7 @@
 import re
 import json
 mlog = logging.getLogger('lbMcST.%s' % (__name__))
 mlog.setLevel(mlog.parent.level)
+mlog.debug('Module %s v. %s loaded.' % (__name__, script_version))
 
 jsonDict=dict() #global dict to hold data to write as JSON (alternative to HTML generation)
@@ -115,9 +118,6 @@ def getWGhtmlFile(jobdic):
 # ========================================================================
 # Functions relative to smart rounding
 # ========================================================================
-def sayHello():
-    global script_version
-    mlog.info('Module GaussStat.py v. %s imported.' % (script_version))
 
 def nb_digit(val, nb_sig_digit=2):
     """ compute the number of digit after the point that needs to be
diff --git a/scripts/GenXmlLogParser.py b/scripts/GenXmlLogParser.py
index bbb2fcfbc8e311ecbaa7668045464972d4eecc00..9f45f67130e1de563b8f0af238718eee5f84ceb0 100644
--- a/scripts/GenXmlLogParser.py
+++ b/scripts/GenXmlLogParser.py
@@ -15,7 +15,9 @@
 # TODO: Finish support for XML generator log versioning
 #       add __str__ for debugging
 #-------------------------------------------------------------------------------
+"""Module including all XML processing and merging code."""
 
+__version__ = '20190126'
 
 import os
 #import stat
@@ -33,7 +35,7 @@
 import tarfile
 import logging
 mlog = logging.getLogger('lbMcST.%s' % (__name__))
 mlog.setLevel(mlog.parent.level)
-mlog.info('using %s v. 20190123...' % (__name__))
+mlog.debug('Module %s v. %s loaded.' % (__name__, __version__))
 
 xml_log_name = 'GeneratorLog.xml'
diff --git a/scripts/LogServer_utils.py b/scripts/LogServer_utils.py
index 43775cea4849c01b8c464998499c51b89dc9ea45..bc5b04e1bf8e8b6f1b25955cd09a56666d3c2107 100644
--- a/scripts/LogServer_utils.py
+++ b/scripts/LogServer_utils.py
@@ -6,7 +6,7 @@
 """Module implementing helper code to access log files published on web server"""
 
 import tarfile
-import zlib
+#import zlib
 import os
 import urllib
 import urllib2
@@ -21,6 +21,7 @@
 mlog.setLevel(mlog.parent.level)
 verbose = False
 
 logs_server_url = 'http://lhcb-logs.cern.ch/storage/lhcb/MC/'
+__version__ = '20190419'
 
 xml_server_urlbase = 'https://lhcb-dirac-logse.web.cern.ch/lhcb-dirac-logse'
diff --git a/scripts/XRootD_utils.py b/scripts/XRootD_utils.py
index c0d7ef4e58e15b97efee351e06b31e9e65755460..f68bf88b9c94918531346c639f016b21db8e1763 100644
--- a/scripts/XRootD_utils.py
+++ b/scripts/XRootD_utils.py
@@ -16,7 +16,7 @@ from subprocess import Popen
 
 castor_xrd_server = 'castorlhcb.cern.ch'
 eos_xrd_server = 'eoslhcb.cern.ch'
-
+__version__ = '20190624'
 archive_location = '/castor/cern.ch/grid/lhcb/backup/log/'
 mlog = logging.getLogger('lbMcST.%s'%(__name__))
 mlog.setLevel(mlog.parent.level)
diff --git a/scripts/lbDirac_utils.py b/scripts/lbDirac_utils.py
index ff441d06f1bc74ec969c1bc0e4e14ba847f765ce..8c1f96f2b88329ae4eae5afe80c1b44e0f454412 100644
--- a/scripts/lbDirac_utils.py
+++ b/scripts/lbDirac_utils.py
@@ -6,6 +6,7 @@
 """Module implementing helper tools to access information from LHCb DIRAC system."""
 
 ## see info provided by dirac-bookkeeping-production-information
+__version__ = '20191201'
 
 import logging
 import subprocess as sp
@@ -15,6 +16,7 @@
 import sys
 import os
 import stat
 import re
+import zipfile
 #Python 2-3 compat
 try:
@@ -30,6 +32,7 @@
 import zlib
 mlog = logging.getLogger('lbMcST.%s' % (__name__))
 mlog.setLevel(mlog.parent.level)
+mlog.debug('Module %s v. %s loaded.' % (__name__, __version__))
 
 if __name__ == '__main__':
     mlog.error('This module cannot be run as a script.')
@@ -138,7 +141,59 @@ def dirac_guessGaussJobs(prodID):
         mlog.error(str(exx))
     return ret
 #####
-def get_LFNs_on_SE(prodLFN, seName=logSE_name, targetFiles=['GeneratorLog.xml.gz','GeneratorLog.xml'], logger=mlog, nbLimit=None):
+def ls_SE_path(prodLFN, seName=logSE_name, logger=mlog):
+    if len(prodLFN) == 0:
+        logger.error('LFN location cannot be empty string.')
+        return False
+    from DIRAC.Core.Base.Script import parseCommandLine
+    parseCommandLine()
+    from DIRAC.Resources.Storage.StorageElement import StorageElement
+    logSE = StorageElement(seName)
+    # find run directory LFNs
+    dd = processResponse(logSE.listDirectory(prodLFN), logger=logger)
+    print(dd)
+    return True
+#####
+def findDownloadLogLFN(containerLFN, localPath, targetFiles=['GeneratorLog.xml','GeneratorLog.xml.gz'], seName=logSE_name, logger=mlog):
+    ret = []
+    tpath, zfname = os.path.split(containerLFN)
+    tpath, jobid = os.path.split(tpath)  # remove jobid from path
+    tpath, runid = os.path.split(tpath)
+    tpath, prodid = os.path.split(tpath)
+    lzfname = '_'.join([prodid, runid, zfname])
+    if getSEfile(containerLFN, localPath, seName=seName, localFilename=lzfname):
+        return []
+    lzPath = os.path.join(localPath, lzfname)
+    try:
+        zf = zipfile.ZipFile(lzPath, 'r')
+        for fi in zf.filelist:
+            fbase = os.path.split(fi.filename)[1]
+            if fbase in targetFiles:
+                zf.extract(fi, localPath)
+                # local XML log name
+                xmlPath = os.path.join(localPath, fi.filename)
+                suff = ''
+                (fbase, fext) = os.path.splitext(fbase)
+                if fext.endswith('gz'):
+                    suff = '.tgz'
+                    fbase, fext = os.path.splitext(fbase)
+                newxfname = '-'.join([fbase, runid, jobid]) + '.xml' + suff
+                newXmlPath = os.path.join(localPath, newxfname)
+                os.rename(xmlPath, newXmlPath)
+                ret = ['file://' + newXmlPath, ]
+                break
+        zf.close()
+    except Exception, exx:
+        logger.error(exx.message)
+    # clean up
+    if os.path.exists(lzPath):
+        os.unlink(lzPath)
+    lzbPath = os.path.join(localPath, jobid)
+    if os.path.exists(lzbPath):
+        os.removedirs(lzbPath)
+    return ret
+#####
+def get_LFNs_on_SE(prodLFN, seName=logSE_name, targetFiles=['GeneratorLog.xml.gz','GeneratorLog.xml'], logger=mlog, nbLimit=None, localPath=None):
     if len(prodLFN) == 0:
         return []
     if targetFiles is None or len(targetFiles) == 0:
@@ -158,8 +213,12 @@ def get_LFNs_on_SE(prodLFN, seName=logSE_name, targetFiles=['GeneratorLog.xml.
         logger.error('Did not find any valid production run LFNs.')
         return []
     jobLFNs = []
+    if len(runLFNs) > 1:
+        runLFNs.sort(reverse=True)
+    logger.debug('Found %d run dirs under %s...' % (len(runLFNs), prodLFN))
     # find accessible job LFNs with LOGs
     for prun in runLFNs:
+        logger.debug('Getting job dirs in %s ...' % (prun))
         dd = processResponse(logSE.listDirectory(prun), logger=logger)
         if dd is None:
             continue
@@ -168,6 +227,9 @@ def get_LFNs_on_SE(prodLFN, seName=logSE_name, targetFiles=['GeneratorLog.xml.
             logger.warning('Could not find any job LFNs under %s' % (prun))
             continue
         jobLFNs += tlfn
+        #temporary for tests
+        if not nbLimit is None and len(jobLFNs) >= int(float(nbLimit)*2.1):
+            break
     jcount = len(jobLFNs)
     logger.debug('Found %d job LFNs for production LFN %s' % (jcount, prodLFN))
     if not nbLimit is None:
@@ -177,17 +239,31 @@ def get_LFNs_on_SE(prodLFN, seName=logSE_name, targetFiles=['GeneratorLog.xml.
     # find valid LFNs for XML logs
     for j in range(0, jcount):
         lfn = jobLFNs[j]
+        #logger.debug('Processing LFN path: %s' % (lfn))
+        jobName = os.path.split(lfn)[1]
         if (datetime.now()-tvar).seconds > 15:
             tvar = datetime.now()
-            mlog.debug('Validated %d/%d LFNs...' % (j, jcount))
+            logger.debug('Validated %d/%d LFNs...' % (j, jcount))
         dd = processResponse(logSE.listDirectory(lfn), logger=logger)
         if dd is None:
+            logger.debug('No LFNs under %s ...' % (lfn))
             continue
-        flfns = filter(lambda fn: any([fn.endswith(target) for target in targetFiles]), getAccessibleLFNs(dd['Successful'].items()[0][1]))
+        if len(dd['Successful']) == 0:
+            logger.debug(str(dd))
+            continue
+        jfLFNs = getAccessibleLFNs(dd['Successful'].items()[0][1])
+        flfns = filter(lambda fn: any([fn.endswith(target) for target in targetFiles]), jfLFNs)
         if len(flfns) > 0:
             validLFNs += flfns
+        else:
+            # here we actually need localPath to pre-download XML files if found
+            if len(jfLFNs) == 1 and localPath is not None:
+                zfName = os.path.split(jfLFNs[0])[1]
+                if zfName.startswith(jobName) and zfName.lower().endswith('.zip'):
+                    # dealing with latest policy of including all files in a ZIP
+                    validLFNs += findDownloadLogLFN(jfLFNs[0], localPath)
         # 30% margin for errorneous transfers!
-        if not nbLimit is None and len(validLFNs) == int(float(nbLimit)*1.3):
+        if not nbLimit is None and len(validLFNs) >= int(float(nbLimit)*1.3):
             break
     logger.debug('%d valid LFNs found under %s:%s' % (len(validLFNs), seName, prodLFN))
     return validLFNs
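
The renaming in findDownloadLogLFN() above (mirrored in the download loop of
DownloadAndBuildStat.py) keeps every XML unique within one flat working
directory. A worked example with hypothetical run and job IDs:

    # hypothetical IDs; the '.tgz' suffix applies only to *.xml.gz members
    runid, jobid = '0000', '00087654'
    fbase, suff = 'GeneratorLog', '.tgz'
    newxfname = '-'.join([fbase, runid, jobid]) + '.xml' + suff
    # -> 'GeneratorLog-0000-00087654.xml.tgz'
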
diff --git a/scripts/lbTarZ_utils.py b/scripts/lbTarZ_utils.py
index 7775a4ad639dd2e941c1bd9f48cb11f0ce9f7326..93d52045b992ebf0a57e0cbd01023c3fcd0d2d60 100644
--- a/scripts/lbTarZ_utils.py
+++ b/scripts/lbTarZ_utils.py
@@ -14,7 +14,7 @@
 import re
 mlog = logging.getLogger('lbMcST.%s' % (__name__))
 mlog.setLevel(mlog.parent.level)
-
+__version__ = '20190606'
 if __name__ == '__main__':
     mlog.error('This module cannot be run as a script.')
     raise NotImplementedError('Aborting execution...')
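
For reference, the over-fetch margins used by get_LFNs_on_SE() above work out
as follows (taking nbLimit = 100 as an example):

    nbLimit = 100
    int(float(nbLimit) * 2.1)  # -> 210 job dirs gathered before the scan stops
    int(float(nbLimit) * 1.3)  # -> 130 valid LFNs kept, i.e. ~30% headroom
                               #    for failed transfers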