Commit 666d716e authored by Andrew McNab's avatar Andrew McNab
Browse files

First version of HTCondor $JOBFEATURES

parent 4218dfcb
......@@ -33,7 +33,7 @@ include VERSION
INSTALL_FILES=VERSION prologue.user epilogue.user mjf.init \
mjf-torque.sh mjf-torque.csh mjf-get-total-cpu.torque \
mjf-htcondor.sh mjf-htcondor.csh
mjf-htcondor.sh mjf-htcondor.csh mjf-job-wrapper htcondor-jobfeatures
TGZ_FILES=$(INSTALL_FILES) Makefile mjf-torque.spec mjf-htcondor.spec README
......@@ -77,7 +77,9 @@ htcondor-install: $(INSTALL_FILES) install
$(RPM_BUILD_ROOT)/etc/profile.d/mjf.sh
cp mjf-htcondor.csh \
$(RPM_BUILD_ROOT)/etc/profile.d/mjf.csh
cp mjf-job-wrapper htcondor-jobfeatures \
$(RPM_BUILD_ROOT)/usr/sbin
htcondor-rpm: mjf-scripts.tgz
rm -Rf RPMTMP
mkdir -p RPMTMP/SOURCES RPMTMP/SPECS RPMTMP/BUILD \
......
VERSION=00.10
VERSION=00.11
#!/usr/bin/python
#
# htcondor-jobfeatures script for Machine/Job Features on HTCondor
#
# Andrew McNab, University of Manchester.
# Copyright (c) 2016. All rights reserved.
#
# Redistribution and use in source and binary forms, with or
# without modification, are permitted provided that the following
# conditions are met:
#
# o Redistributions of source code must retain the above
# copyright notice, this list of conditions and the following
# disclaimer.
# o Redistributions in binary form must reproduce the above
# copyright notice, this list of conditions and the following
# disclaimer in the documentation and/or other materials
# provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
# CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,
# INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS
# BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
# TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
# ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
# OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
# Create $JOBFEATURES files following the Machine/Job Features
# specification in HSF-TN-2016-02
#
# This script creates the $JOBFEATURES directory at
# /tmp/mjf-$USER/jobfeatures-$GlobalJobId (replacing # with _)
# and attempts to populate it from HTCondor Job Ad information
# and from $MACHINEFEATURES=/etc/machinefeatures
#
# If the following variables are present in /var/run/mjf or
# /etc/sysconfig/mjf then they are used as defaults if not given
# by HTCondor:
#
# - allocated_cpu
# - wall_limit_secs
# - cpu_limit_secs
# - max_rss_bytes
# - max_swap_bytes
# - scratch_limit_bytes
# - hs06_job
#
# Values in /var/run/mjf are preferred over /etc/sysconfig/mjf.
#
# Additionally you can set mjf_tmp_dir in those files to use a
# directory other than /tmp for the mjf-$USER directories.
#
import os
import re
import sys
import stat
import time
# Keys holding integer limits. Each may be defaulted from the mjf
# configuration files and is written to $JOBFEATURES when a value is known.
INTEGER_KEYS = ['allocated_cpu', 'wall_limit_secs', 'cpu_limit_secs',
                'max_rss_bytes', 'max_swap_bytes', 'scratch_limit_bytes']


def read_text(path):
    """Return the contents of path, or '' if it cannot be read."""
    try:
        with open(path, 'r') as f:
            return f.read()
    except (IOError, OSError):
        return ''


def read_number(path, convert):
    """Return convert(first line of path), or None if unreadable/invalid."""
    try:
        with open(path, 'r') as f:
            return convert(f.readline())
    except (IOError, OSError, ValueError):
        return None


def write_value(directory, key, value):
    """Write str(value) to the file directory/key and close it promptly."""
    with open(directory + '/' + key, 'w') as f:
        f.write(str(value))


def search_number(pattern, text, convert=int):
    """Return convert(group 1) of the first match of pattern, else None.

    The search uses re.MULTILINE so that a leading ^ in pattern matches
    at the start of any line of text, not only at the start of the string.
    """
    matchObject = re.search(pattern, text, re.MULTILINE)
    if matchObject is None:
        return None
    try:
        return convert(matchObject.group(1))
    except ValueError:
        return None


def parse_job_id(fromJobAd):
    """Return the quoted GlobalJobId value from the Job Ad text, or None."""
    matchObject = re.search(r'^GlobalJobId *= *"(.*)"', fromJobAd,
                            re.MULTILINE)
    if matchObject is None:
        return None
    return matchObject.group(1)


def parse_mjf_tmp_dir(defaultsText):
    """Return the mjf_tmp_dir setting from defaultsText, or '/tmp'."""
    matchObject = re.search(r'^mjf_tmp_dir=(.*)', defaultsText, re.MULTILINE)
    if matchObject is not None:
        value = matchObject.group(1).strip()
        if value:
            return value
    return '/tmp'


def collect_jobfeatures(fromJobAd, defaultsText):
    """Return a dict of integer job features.

    Values start from the defaults found in defaultsText (the text of
    /var/run/mjf followed by /etc/sysconfig/mjf, so the former wins) and
    are then overridden by the *Provisioned attributes of the Job Ad.
    allocated_cpu is always present and defaults to 1.
    """
    jobfeatures = {'allocated_cpu': 1}

    # Get any defaults for these (integer) values
    for key in INTEGER_KEYS:
        value = search_number(key + '=([0-9]+)', defaultsText)
        if value is not None:
            jobfeatures[key] = value

    # Examine the Job Ad, for resource limits
    cpus = search_number(r'^CpusProvisioned *= *([0-9]+)', fromJobAd)
    if cpus is not None:
        jobfeatures['allocated_cpu'] = cpus

    rssInt = search_number(r'^MemoryProvisioned *= *([0-9]+)', fromJobAd)
    if rssInt is not None:
        # Safer to assume powers of 1000 rather than 1024
        jobfeatures['max_rss_bytes'] = rssInt * 1000000

    # Read the disk figure from its own match, not the memory one
    scratchInt = search_number(r'^DiskProvisioned *= *([0-9]+)', fromJobAd)
    if scratchInt is not None:
        # Safer to assume powers of 1000 rather than 1024
        jobfeatures['scratch_limit_bytes'] = scratchInt * 1000

    return jobfeatures


def compute_hs06_job(defaultsText, allocated_cpu,
                     machinefeaturesDir='/etc/machinefeatures'):
    """Return hs06_job as a float, or None/0.0 if it cannot be determined.

    An explicit hs06_job= value in defaultsText is used if present and
    non-zero. Otherwise the hs06 and total_cpu files in machinefeaturesDir
    are read and the total is shared pro-rata by allocated_cpu.
    """
    hs06_job = search_number('hs06_job=([0-9.]+)', defaultsText, float)
    if not hs06_job:
        hs06 = read_number(machinefeaturesDir + '/hs06', float)
        total_cpu = read_number(machinefeaturesDir + '/total_cpu', int)
        if hs06 and total_cpu:
            # Simple pro-rata allocation of total hs06 depending on
            # processors for this job
            hs06_job = (allocated_cpu * hs06) / total_cpu
    return hs06_job


def prepare_user_dir(mjfTmpDir, user):
    """Create/verify mjfTmpDir/mjf-<user> and return its path.

    Exits with status 12 if the directory cannot be created and 13 if it
    is not a real directory owned by the current UID/GID.
    """
    userDir = mjfTmpDir + '/mjf-' + user

    try:
        os.mkdir(userDir)
    except OSError:
        # Ok if exists already
        pass

    # lstat rather than stat: the parent is world-writable by default, so
    # do not follow a symlink someone else may have planted there
    try:
        userDirStat = os.lstat(userDir)
    except OSError:
        sys.stderr.write(userDir + ' could not be created\n')
        sys.exit(12)

    if not stat.S_ISDIR(userDirStat.st_mode):
        sys.stderr.write(userDir + ' is not a directory\n')
        sys.exit(13)

    # Check it is owned by us
    if userDirStat.st_uid != os.getuid() or userDirStat.st_gid != os.getgid():
        sys.stderr.write(userDir + ' has the wrong UID/GID\n')
        sys.exit(13)

    # Ensure the right permissions
    os.chmod(userDir,
             stat.S_IRWXU | stat.S_IRGRP | stat.S_IXGRP |
             stat.S_IROTH | stat.S_IXOTH)
    return userDir


def main():
    """Create and populate the per-job $JOBFEATURES directory.

    Diagnostics go to stderr; the only thing written to stdout is the
    path of the directory created, so that a job wrapper capturing stdout
    obtains the value of $JOBFEATURES.
    """
    # 0o022 is accepted by Python 2.6+ and Python 3
    os.umask(0o022)

    try:
        with open(os.environ['_CONDOR_JOB_AD'], 'r') as f:
            fromJobAd = f.read()
    except (KeyError, IOError, OSError):
        sys.stderr.write('Could not read $_CONDOR_JOB_AD file - exiting htcondor-jobfeatures\n')
        sys.exit(10)

    job_id = parse_job_id(fromJobAd)
    if not job_id:
        sys.stderr.write('Could not get GlobalJobId from Job Ad\n')
        sys.exit(11)

    user = os.environ.get('USER')
    if not user:
        sys.stderr.write('$USER is not set - exiting htcondor-jobfeatures\n')
        sys.exit(15)

    # Values in /var/run/mjf take precedence over the persistent
    # /etc/sysconfig/mjf, so its text comes first in the combined string
    defaultsText = read_text('/var/run/mjf') + '\n' + read_text('/etc/sysconfig/mjf')

    # First make sure the mjfTmpDir/mjf-$USER directory exists
    userDir = prepare_user_dir(parse_mjf_tmp_dir(defaultsText), user)

    # Now the directory for this job (but with # -> _)
    jobfeaturesDir = userDir + '/jobfeatures-' + job_id.replace('#', '_')

    try:
        os.mkdir(jobfeaturesDir)
    except OSError:
        sys.stderr.write('Failed to create ' + jobfeaturesDir + '\n')
        sys.exit(14)

    write_value(jobfeaturesDir, 'job_id', job_id)
    write_value(jobfeaturesDir, 'jobstart_secs', int(time.time()))

    # Write out the values that have been set from the files or the Job Ad
    jobfeatures = collect_jobfeatures(fromJobAd, defaultsText)
    for key in INTEGER_KEYS:
        if key in jobfeatures:
            write_value(jobfeaturesDir, key, jobfeatures[key])

    # Try to get/calculate hs06_job
    hs06_job = compute_hs06_job(defaultsText, jobfeatures['allocated_cpu'])
    if hs06_job:
        # We got it from somewhere
        write_value(jobfeaturesDir, 'hs06_job', '%.2f' % hs06_job)

    # Output the value of $JOBFEATURES for whatever ran this script
    sys.stdout.write(jobfeaturesDir + '\n')


if __name__ == '__main__':
    main()
......@@ -10,7 +10,7 @@ Vendor: GridPP
Packager: Andrew McNab <Andrew.McNab@cern.ch>
%description
Currently this only creates $MACHINEFEATURES!
MJF implementations following HSF-TN-2016-02
%prep
......@@ -35,3 +35,4 @@ fi
%files
/etc/rc.d/init.d/*
/etc/profile.d/*
/usr/sbin/*
#!/bin/bash
#
# mjf-job-wrapper script for Machine/Job Features on HTCondor
#
# Andrew McNab, University of Manchester.
# Copyright (c) 2016. All rights reserved.
#
# Redistribution and use in source and binary forms, with or
# without modification, are permitted provided that the following
# conditions are met:
#
# o Redistributions of source code must retain the above
# copyright notice, this list of conditions and the following
# disclaimer.
# o Redistributions in binary form must reproduce the above
# copyright notice, this list of conditions and the following
# disclaimer in the documentation and/or other materials
# provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
# CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,
# INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS
# BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
# TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
# ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
# OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
# Runs the job with its arguments (as given by "$@"), after running
# htcondor-jobfeatures to create the per-job $JOBFEATURES directory
#
# If you are not already using a job wrapper, you can set
#
# USER_JOB_WRAPPER = /usr/sbin/mjf-job-wrapper
#
# in your HTCondor configuration to run this script. Otherwise,
# you must modify your existing job wrapper to run (not source)
# the htcondor-jobfeatures Python script to create $JOBFEATURES
#
# The standard output of htcondor-jobfeatures is recorded as the
# value of $JOBFEATURES so the directory can be tidied up afterwards.
# (We could just run the script without recording the value.)
jobfeatures=$(/usr/sbin/htcondor-jobfeatures)

# "$@" passes every argument through exactly as received; an unquoted
# $* would re-split arguments that contain whitespace.
"$@"

# Remember the job's exit status so the tidy up below does not replace it
job_status=$?

# We tidy these up. They are in /tmp by default so we could just
# leave them to be removed by the system instead.
if [ -d "$jobfeatures" ] ; then
  rm -Rf "$jobfeatures"
fi

# Report the job's own exit status to HTCondor, not that of the tidy up
exit $job_status
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment