#!/usr/bin/python # # htcondor-jobfeatures script for Machine/Job Features on HTCondor # # Andrew McNab, University of Manchester. # Copyright (c) 2016. All rights reserved. # # Redistribution and use in source and binary forms, with or # without modification, are permitted provided that the following # conditions are met: # # o Redistributions of source code must retain the above # copyright notice, this list of conditions and the following # disclaimer. # o Redistributions in binary form must reproduce the above # copyright notice, this list of conditions and the following # disclaimer in the documentation and/or other materials # provided with the distribution. # # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND # CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, # INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF # MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE # DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS # BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED # TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON # ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY # OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE # POSSIBILITY OF SUCH DAMAGE. 
# Create $JOBFEATURES files following the Machine/Job Features
# specification in HSF-TN-2016-02
#
# This script creates the $JOBFEATURES directory at
# /tmp/mjf-$USER/jobfeatures-$GlobalJobId (replacing # with _)
# and attempts to populate it from HTCondor Job Ad information
# and from $MACHINEFEATURES=/etc/machinefeatures
#
# If the following variables are present in /var/run/mjf or
# /etc/sysconfig/mjf then they are used as defaults if not given
# by HTCondor:
#
# - allocated_cpu
# - wall_limit_secs
# - cpu_limit_secs
# - max_rss_bytes
# - max_swap_bytes
# - scratch_limit_bytes
# - hs06_job
#
# Values in /var/run/mjf are preferred over /etc/sysconfig/mjf.
#
# Additionally you can set mjf_tmp_dir in those files to use a
# directory other than /tmp for the mjf-$USER directories.
#
# Exit codes:
#
#  10  $_CONDOR_MACHINE_AD or $_CONDOR_JOB_AD could not be read
#  11  GlobalJobId not found in the Job Ad
#  12  the mjf-$USER directory does not exist and could not be created
#  13  the mjf-$USER directory is not owned by our UID/GID
#  14  the jobfeatures-$GlobalJobId directory could not be created
#
# The code is written to run unchanged under Python 2.6+ and Python 3.
#

import os
import re
import sys
import stat
import time

# Keys whose values are integers; also the order in which they are written
INTEGER_KEYS = ['allocated_cpu',
                'wall_limit_secs',
                'cpu_limit_secs',
                'max_rss_bytes',
                'max_swap_bytes',
                'scratch_limit_bytes']

# Only digits, spaces and arithmetic operators are accepted as the value
# of a ClassAd attribute before it is evaluated
ARITHMETIC_VALUE = '([0-9 +*/()-]*)'


def read_text(path):
    """Return the whole contents of the file at path as a string."""
    with open(path, 'r') as handle:
        return handle.read()


def read_optional_text(path):
    """Return the contents of path, or '' if it cannot be read."""
    try:
        return read_text(path)
    except (IOError, OSError, ValueError):
        return ''


def write_value(directory, name, value):
    """Write str(value) to the file called name inside directory."""
    with open(directory + '/' + name, 'w') as handle:
        handle.write(str(value))


def find_global_job_id(job_ad):
    """Return the quoted GlobalJobId value from job_ad, or None."""
    # re.MULTILINE is needed so that ^ matches at the start of every line
    # and not just at the start of the whole Job Ad
    match = re.search('^GlobalJobId *= *"(.*)"', job_ad, re.MULTILINE)

    if match is None:
        return None

    return match.group(1)


def find_tmp_dir(config_text, default='/tmp'):
    """Return the mjf_tmp_dir setting from config_text, or default.

    The first non-empty-looking mjf_tmp_dir= line wins; surrounding
    whitespace is removed and an empty value is ignored.
    """
    match = re.search('^mjf_tmp_dir=(.*)', config_text, re.MULTILINE)

    if match is None:
        return default

    return match.group(1).strip() or default


def config_integer(config_text, key):
    """Return the integer given as key=NNN in config_text, or None."""
    match = re.search(key + '=([0-9]*)', config_text)

    try:
        return int(match.group(1))
    except (AttributeError, ValueError):
        # No such key, or no digits after the = sign
        return None


def config_float(config_text, key):
    """Return the float given as key=N.N in config_text, or None."""
    match = re.search(key + '=([0-9.]*)', config_text)

    try:
        return float(match.group(1))
    except (AttributeError, ValueError):
        return None


def ad_integer(ad_text, attribute):
    """Return the integer value of attribute in ad_text, or None.

    The value may be a simple arithmetic expression such as 60 * 60.
    None is returned if the attribute is absent or its value is not
    made up purely of digits, spaces and + - * / ( ).
    """
    match = re.search('^' + attribute + ' *= *' + ARITHMETIC_VALUE,
                      ad_text, re.MULTILINE)

    if match is None:
        return None

    try:
        # NOTE(security): eval() on text from the Job/Machine Ad. The
        # regular expression above only lets digits, spaces and
        # arithmetic operators through, and builtins are disabled, so
        # no names can be referenced. A pathological expression such as
        # 9**9**9 could still burn CPU; replace with a real arithmetic
        # parser if the ads are not trusted.
        return int(eval(match.group(1), {'__builtins__': {}}, {}))
    except Exception:
        # Empty or malformed expression, division by zero etc:
        # treat the attribute as not given
        return None


def build_jobfeatures(job_ad, machine_ad, config_text):
    """Return a dict of the integer job features.

    job_ad       -- text of the HTCondor Job Ad
    machine_ad   -- text of the HTCondor Machine Ad
    config_text  -- text of /var/run/mjf followed by /etc/sysconfig/mjf

    Defaults come from config_text and are overridden by values found
    in the ads. allocated_cpu is always present (1 if nothing else is
    known); other keys are only present if a value was found.
    """
    features = {'allocated_cpu': 1}

    # Get any defaults for these (integer) values
    for key in INTEGER_KEYS:
        default = config_integer(config_text, key)
        if default is not None:
            features[key] = default

    # Examine the ads for resource limits
    cpus = ad_integer(job_ad, 'CpusProvisioned')
    if cpus is not None:
        features['allocated_cpu'] = cpus

    wall = ad_integer(machine_ad, 'MaxJobRetirementTime')
    if wall is not None:
        features['wall_limit_secs'] = wall

    if 'cpu_limit_secs' not in features and 'wall_limit_secs' in features:
        # If not given in mjf files, we create a CPU seconds limit from
        # wallclock and allocated CPUs/processors
        features['cpu_limit_secs'] = (features['wall_limit_secs'] *
                                      features['allocated_cpu'])

    memory = ad_integer(job_ad, 'MemoryProvisioned')
    if memory is not None:
        # Safer to assume powers of 1000 rather than 1024
        features['max_rss_bytes'] = memory * 1000000

    # This must use the DiskProvisioned match, not the memory one
    disk = ad_integer(job_ad, 'DiskProvisioned')
    if disk is not None:
        # Safer to assume powers of 1000 rather than 1024
        features['scratch_limit_bytes'] = disk * 1000

    return features


def compute_hs06_job(config_text, allocated_cpu,
                     machinefeatures_dir='/etc/machinefeatures'):
    """Return hs06_job as a float, or None if it cannot be determined.

    An hs06_job= value in config_text is used if present and non-zero.
    Otherwise the hs06 and total_cpu files in machinefeatures_dir are
    read and the machine's hs06 is shared pro-rata by processors.
    """
    hs06_job = config_float(config_text, 'hs06_job')

    if hs06_job:
        return hs06_job

    try:
        with open(machinefeatures_dir + '/hs06', 'r') as handle:
            hs06 = float(handle.readline())
    except (IOError, OSError, ValueError):
        hs06 = None

    try:
        with open(machinefeatures_dir + '/total_cpu', 'r') as handle:
            total_cpu = int(handle.readline())
    except (IOError, OSError, ValueError):
        total_cpu = None

    if hs06 and total_cpu:
        # Simple pro-rata allocation of total hs06 depending on
        # processors for this job
        return (allocated_cpu * hs06) / total_cpu

    return None


def main():
    """Create and populate the $JOBFEATURES directory for this job."""
    os.umask(0o022)

    try:
        machine_ad = read_text(os.environ['_CONDOR_MACHINE_AD'])
    except (KeyError, IOError, OSError, ValueError):
        sys.stderr.write('Could not read $_CONDOR_MACHINE_AD file - exiting htcondor-jobfeatures\n')
        sys.exit(10)

    try:
        job_ad = read_text(os.environ['_CONDOR_JOB_AD'])
    except (KeyError, IOError, OSError, ValueError):
        sys.stderr.write('Could not read $_CONDOR_JOB_AD file - exiting htcondor-jobfeatures\n')
        sys.exit(10)

    job_id = find_global_job_id(job_ad)
    if job_id is None:
        print('Could not get GlobalJobId from Job Ad')
        sys.exit(11)

    # Values in /var/run/mjf take precedence over the persistent
    # /etc/sysconfig/mjf, so they go first: searches take the first match
    config_text = (read_optional_text('/var/run/mjf') + '\n' +
                   read_optional_text('/etc/sysconfig/mjf'))

    # First make sure the mjfTmpDir/mjf-$USER directory exists
    user_dir = find_tmp_dir(config_text) + '/mjf-' + os.environ['USER']

    try:
        os.mkdir(user_dir)
    except OSError:
        # Ok if exists already; anything else is caught by the stat
        pass

    try:
        user_dir_stat = os.stat(user_dir)
    except OSError:
        print(user_dir + ' could not be created')
        sys.exit(12)

    # Check it is owned by us
    if (user_dir_stat.st_uid != os.getuid() or
            user_dir_stat.st_gid != os.getgid()):
        print(user_dir + ' has the wrong UID/GID')
        sys.exit(13)

    # Ensure the right permissions (rwxr-xr-x)
    os.chmod(user_dir,
             stat.S_IRWXU |
             stat.S_IRGRP | stat.S_IXGRP |
             stat.S_IROTH | stat.S_IXOTH)

    # Now the directory for this job (but with # -> _)
    jobfeatures_dir = user_dir + '/jobfeatures-' + job_id.replace('#', '_')

    try:
        os.mkdir(jobfeatures_dir)
    except OSError:
        print('Failed to create ' + jobfeatures_dir)
        sys.exit(14)

    write_value(jobfeatures_dir, 'job_id', job_id)
    write_value(jobfeatures_dir, 'jobstart_secs', int(time.time()))

    jobfeatures = build_jobfeatures(job_ad, machine_ad, config_text)

    # Write out the values which have been set from files or from the ads
    for key in INTEGER_KEYS:
        if key in jobfeatures:
            write_value(jobfeatures_dir, key, jobfeatures[key])

    # Try to get/calculate hs06_job
    hs06_job = compute_hs06_job(config_text, jobfeatures['allocated_cpu'])

    if hs06_job:
        # We got it from somewhere
        write_value(jobfeatures_dir, 'hs06_job', '%.2f' % hs06_job)


if __name__ == '__main__':
    main()