From 77130d427cd1123be933d0d75dd52151770566d5 Mon Sep 17 00:00:00 2001
From: Kevin Pedro <kpedro88@gmail.com>
Date: Thu, 29 Mar 2018 04:45:24 -0500
Subject: [PATCH] rely more on phedex to get file lists

---
 Production/test/get_py.py | 98 ++++++++++++++++++++-------------
 1 file changed, 51 insertions(+), 47 deletions(-)

diff --git a/Production/test/get_py.py b/Production/test/get_py.py
index 5234fbfb..098a995f 100644
--- a/Production/test/get_py.py
+++ b/Production/test/get_py.py
@@ -1,6 +1,7 @@
 import re,sys,getopt,urllib2,json
 from dbs.apis.dbsClient import DbsApi
 from optparse import OptionParser
+from collections import defaultdict
 
 # Read parameters
 parser = OptionParser()
@@ -18,15 +19,15 @@ makese = options.se
 if not makepy and not makewp and not makese:
     parser.error("No operations selected!")
 
-#interface with DBS
+# interface with DBS
 dbs3api = DbsApi("https://cmsweb.cern.ch/dbs/prod/global/DBSReader")
 
-#format for dict entries:
-# data: [['sample'] , []]
-# MC: [['sample'] , [xsec]]
-# MC w/ extended sample: [['sample','sample_ext'] , [xsec]]
-# MC w/ negative weights (amcatnlo): [['sample'] , [xsec, neff]]
-#MC w/ negative weights (amcatnlo) + extended sample: [['sample','sample_ext'] , [xsec, neff, neff_ext]]
+# format for dict entries:
+# data: [['sample'] , []]
+# MC: [['sample'] , [xsec]]
+# MC w/ extended sample: [['sample','sample_ext'] , [xsec]]
+# MC w/ negative weights (amcatnlo): [['sample'] , [xsec, neff]]
+# MC w/ negative weights (amcatnlo) + extended sample: [['sample','sample_ext'] , [xsec, neff, neff_ext]]
 
 if makewp:
     wname = "weights_"+dictname+".txt"
@@ -41,15 +42,17 @@ for fitem in flist:
     x = fitem[1]
     nevents_all = []
     for f in ff: # in case of extended samples
+        print f
+
         if makepy:
-            #get sample name
+            # get sample name
             oname = f.split('/')[1]
-            #check for extended sample
+            # check for extended sample
             extcheck = re.search("ext[0-9]",f.split('/')[2])
             if not extcheck==None and len(extcheck.group(0))>0:
                 oname = oname+"_"+extcheck.group(0)
 
-            #make python file with preamble
+            # make python file with preamble
             pfile = open(oname+"_cff.py",'w')
             pfile.write("import FWCore.ParameterSet.Config as cms\n\n")
             pfile.write("maxEvents = cms.untracked.PSet( input = cms.untracked.int32(-1) )\n")
@@ -57,65 +60,66 @@ for fitem in flist:
             pfile.write("secFiles = cms.untracked.vstring()\n")
             pfile.write("source = cms.Source (\"PoolSource\",fileNames = readFiles, secondaryFileNames = secFiles)\n")
 
-        #get dataset info - detail only needed in makewp case
-        filelist = []
+        # get list of hosted files using PhEDEx API
+        filelist = set()
+        sitelist = defaultdict(int)
+        url='https://cmsweb.cern.ch/phedex/datasvc/json/prod/filereplicas?dataset=' + f
+        jstr = urllib2.urlopen(url).read()
+        jstr = jstr.replace("\n", " ")
+        result = json.loads(jstr)
+        for block in result['phedex']['block']:
+            for item in block['file']:
+                filelist.add(item['name'])
+                if makese:
+                    for replica in item['replica']:
+                        site = replica['node']
+                        addr = replica['se']
+                        # safety checks
+                        if site is None: continue
+                        if addr is None: addr = ""
+                        ## if (site,addr) not in sitelist.keys(): sitelist[(site,addr)] = 0
+                        sitelist[(site,addr)] += 1
+
+        # get dataset info - detail only needed in makewp case
         nevents = 0
-        print f
-        fileArrays = dbs3api.listFileArray(dataset=f,detail=makewp)
-        for fileArray in fileArrays:
-            if makepy:
-                filelist.append(fileArray["logical_file_name"])
-            if makewp:
-                nevents += fileArray["event_count"]
-        nevents_all.append(nevents)
+        if makewp:
+            fileArrays = dbs3api.listFileArray(dataset=f,detail=makewp)
+            for fileArray in fileArrays:
+                if fileArray["logical_file_name"] in filelist:
+                    nevents += fileArray["event_count"]
+            nevents_all.append(nevents)
 
-        # check for sites with 100% dataset presence (using PhEDEx API)
+        # check for sites with 100% dataset presence (based on PhEDEx)
         # refs:
         # https://github.com/dmwm/DAS/blob/master/src/python/DAS/services/combined/combined_service.py
        # https://github.com/gutsche/scripts/blob/master/PhEDEx/checkLocation.py
 
        if makese:
-            url='https://cmsweb.cern.ch/phedex/datasvc/json/prod/blockreplicas?dataset=' + f
-            jstr = urllib2.urlopen(url).read()
-            jstr = jstr.replace("\n", " ")
-            result = json.loads(jstr)
-
-            site_list = {}
-            for block in result['phedex']['block']:
-                for replica in block['replica']:
-                    site = replica['node']
-                    addr = replica['se']
-                    #safety checks
-                    if site is None: continue
-                    if addr is None: addr = ""
-                    if (site,addr) not in site_list.keys(): site_list[(site,addr)] = 0
-                    site_list[(site,addr)] += replica['files']
-
-            # get total number of expected files from DBS
-            nfiles_tot = len(fileArrays)
+            # get total number of expected files
+            nfiles_tot = len(filelist)
 
             # calculate dataset fraction (presence) in % and check for completion
             highest_percent = 0
-            for site,addr in site_list:
-                this_percent = float(site_list[(site,addr)])/float(nfiles_tot)*100
-                site_list[(site,addr)] = this_percent
+            for site,addr in sitelist:
+                this_percent = float(sitelist[(site,addr)])/float(nfiles_tot)*100
+                sitelist[(site,addr)] = this_percent
                 if this_percent > highest_percent: highest_percent = this_percent
 
             sfile.write(f+"\n")
             if highest_percent < 100: sfile.write(" !!! No site has complete dataset !!! ( Highest: "+str(highest_percent)+"% )\n")
-            for site,addr in site_list:
-                this_percent = site_list[(site,addr)]
+            for site,addr in sorted(sitelist):
+                this_percent = sitelist[(site,addr)]
                 if this_percent==highest_percent: sfile.write(" "+site+" ("+addr+")\n")
 
         if makepy:
             #sort list of files for consistency
-            filelist.sort()
+            filesort = sorted(filelist)
             counter = 0
             #split into chunks of 255
-            for lfn in filelist:
+            for lfn in filesort:
                 if counter==0: pfile.write("readFiles.extend( [\n")
                 pfile.write(" '"+lfn+"',\n")
-                if counter==254 or lfn==filelist[-1]:
+                if counter==254 or lfn==filesort[-1]:
                     pfile.write("] )\n")
                     counter = 0
                 else:
-- 
GitLab
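
Note on the change, for reviewers: the patch replaces the PhEDEx blockreplicas query (and the unconditional DBS listFileArray call) with a single filereplicas query that yields both the list of hosted LFNs and the per-site file counts; DBS is now consulted only when event counts are needed (makewp). Below is a minimal standalone sketch of that query pattern, runnable under the same Python 2 environment the script targets (urllib2); the dataset name is a placeholder, and the JSON schema is assumed to match what the patch parses.

import json, urllib2
from collections import defaultdict

# hypothetical dataset name, for illustration only
dataset = '/SomePrimaryDataset/SomeProcessedDataset/MINIAODSIM'

# one filereplicas query returns every hosted file and its replicas
url = 'https://cmsweb.cern.ch/phedex/datasvc/json/prod/filereplicas?dataset=' + dataset
result = json.loads(urllib2.urlopen(url).read())

filelist = set()             # unique LFNs hosted at any site
sitelist = defaultdict(int)  # (site, storage element) -> hosted file count
for block in result['phedex']['block']:
    for item in block['file']:
        filelist.add(item['name'])
        for replica in item['replica']:
            site = replica['node']
            addr = replica['se']
            if site is None: continue  # same safety check as the patch
            sitelist[(site, addr or '')] += 1

# a site hosts the complete dataset if its count matches the total
nfiles_tot = len(filelist)
for (site, addr), nfiles in sorted(sitelist.items()):
    print site, addr, float(nfiles)/float(nfiles_tot)*100

One practical effect visible in the diff: the file list written to the generated _cff.py now contains only files that PhEDEx reports as hosted somewhere, rather than everything DBS lists for the dataset.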
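
The chunked writing of the generated _cff.py is untouched apart from the filelist/filesort rename, but it is easier to read outside the diff. A sketch under the same assumptions (filelist as built above, a hypothetical output file name), keeping the script's own 255-entries-per-extend convention:

# write sorted LFNs into readFiles.extend( [...] ) blocks of at most 255
filesort = sorted(filelist)  # sort for consistency across runs
pfile = open('Sample_cff.py', 'w')  # hypothetical output name
pfile.write('import FWCore.ParameterSet.Config as cms\n\n')
pfile.write('readFiles = cms.untracked.vstring()\n')
pfile.write('source = cms.Source("PoolSource", fileNames = readFiles)\n')
counter = 0
for lfn in filesort:
    if counter==0: pfile.write('readFiles.extend( [\n')
    pfile.write("    '"+lfn+"',\n")
    if counter==254 or lfn==filesort[-1]:
        # close the current chunk after 255 entries or at the last file
        pfile.write('] )\n')
        counter = 0
    else:
        counter += 1
pfile.close()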