#!/usr/bin/python
#
#  htcondor-jobfeatures script for Machine/Job Features on HTCondor
#
#  Andrew McNab, University of Manchester.
#  Copyright (c) 2016. All rights reserved.
#
#  Redistribution and use in source and binary forms, with or
#  without modification, are permitted provided that the following
#  conditions are met:
#
#    o Redistributions of source code must retain the above
#      copyright notice, this list of conditions and the following
#      disclaimer. 
#    o Redistributions in binary form must reproduce the above
#      copyright notice, this list of conditions and the following
#      disclaimer in the documentation and/or other materials
#      provided with the distribution. 
#
#  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
#  CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,
#  INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
#  MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
#  DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS
#  BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
#  EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
#  TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
#  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
#  ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
#  OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
#  OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
#  POSSIBILITY OF SUCH DAMAGE.

#  Create $JOBFEATURES files following the Machine/Job Features 
#  specification in HSF-TN-2016-02
#
#  This script creates the $JOBFEATURES directory at 
#  /tmp/mjf-$USER/jobfeatures-$GlobalJobId (replacing # with _)
#  and attempts to populate it from HTCondor Job Ad information 
#  and from $MACHINEFEATURES=/etc/machinefeatures
#
#  If the following variables are present in /var/run/mjf or 
#  /etc/sysconfig/mjf then they are used as defaults if not given
#  by HTCondor:
#
#  - allocated_cpu
#  - wall_limit_secs
#  - cpu_limit_secs
#  - max_rss_bytes
#  - max_swap_bytes
#  - scratch_limit_bytes
#  - hs06_job
#
# Values in /var/run/mjf are preferred over /etc/sysconfig/mjf.
#
# Additionally you can set mjf_tmp_dir in those files to use a 
# directory other than /tmp for the mjf-$USER directories.
#

import os
import re
import sys
import stat
import time

# Clear the group/other write bits on everything this script creates.
# The 0o prefix is accepted by Python 2.6+ and Python 3; the bare 0022
# form is a syntax error on Python 3.
os.umask(0o022)
# Read the HTCondor Machine Ad from the file named by $_CONDOR_MACHINE_AD.
# Without it the script gives up with exit status 10.
try:
  with open(os.environ['_CONDOR_MACHINE_AD'], 'r') as machineAdFile:
    fromMachineAd = machineAdFile.read()
except (KeyError, IOError, OSError):
  # KeyError: variable not set; IOError/OSError: file missing or unreadable
  sys.stderr.write('Could not read $_CONDOR_MACHINE_AD file - exiting htcondor-jobfeatures\n')
  sys.exit(10)
# Read the HTCondor Job Ad from the file named by $_CONDOR_JOB_AD.
# Without it the script gives up with exit status 10.
try:
  with open(os.environ['_CONDOR_JOB_AD'], 'r') as jobAdFile:
    fromJobAd = jobAdFile.read()
except (KeyError, IOError, OSError):
  # KeyError: variable not set; IOError/OSError: file missing or unreadable
  sys.stderr.write('Could not read $_CONDOR_JOB_AD file - exiting htcondor-jobfeatures\n')
  sys.exit(10)

# Extract GlobalJobId from the Job Ad; exit with status 11 if it is absent.
# re.MULTILINE is required so that ^ matches at the start of every line of
# the Job Ad text, not only at the very start of the file contents.
matchObject = re.search(r'^GlobalJobId *= *"(.*)"', fromJobAd, re.MULTILINE)

if matchObject is None:
  print('Could not get GlobalJobId from Job Ad')
  sys.exit(11)

job_id = matchObject.group(1)

# Values in /var/run/mjf take precedence
try:
  with open('/var/run/mjf', 'r') as runFile:
    fromRun = runFile.read()
except (IOError, OSError):
  # The file is optional: a missing or unreadable file counts as empty
  fromRun = ''

# Also look in persistent /etc/sysconfig/mjf
try:
  with open('/etc/sysconfig/mjf', 'r') as sysconfigFile:
    fromSysconfig = sysconfigFile.read()
except (IOError, OSError):
  # The file is optional: a missing or unreadable file counts as empty
  fromSysconfig = ''

# Base directory for the mjf-$USER directories. A mjf_tmp_dir= line in
# either mjf file overrides the /tmp default; /var/run/mjf is searched first.
mjfTmpDir = '/tmp'

# re.MULTILINE lets ^ match at the start of any line, so the setting is found
# wherever it appears rather than only on the first line of /var/run/mjf.
matchObject = re.search(r'^mjf_tmp_dir=(.*)', fromRun + '\n' + fromSysconfig,
                        re.MULTILINE)

if matchObject is not None:
  tmpDirSetting = matchObject.group(1).strip()

  # An empty value would place the mjf-$USER directories directly under /,
  # so keep the default in that case
  if tmpDirSetting:
    mjfTmpDir = tmpDirSetting

# First make sure the mjfTmpDir/mjf-$USER directory exists.
# The path is built once; a missing $USER now fails here with a plain
# KeyError instead of being swallowed and re-raised from an error handler.
userDir = mjfTmpDir + '/mjf-' + os.environ['USER']

try:
  os.mkdir(userDir)
except OSError:
  # Ok if exists already; any other failure is detected by the stat below
  pass

try:
  userDirStat = os.stat(userDir)
except OSError:
  print(userDir + ' could not be created')
  sys.exit(12)

# Check it is owned by us
# NOTE(review): os.stat follows symbolic links, so a symlink planted at this
# path that points at something we own would pass this check - consider
# os.lstat plus stat.S_ISDIR if mjfTmpDir is world-writable.
if userDirStat.st_uid != os.getuid() or userDirStat.st_gid != os.getgid():
  print(userDir + ' has the wrong UID/GID')
  sys.exit(13)

# Ensure the right permissions (rwxr-xr-x)
os.chmod(userDir,
         stat.S_IRWXU | stat.S_IRGRP | stat.S_IXGRP | stat.S_IROTH | stat.S_IXOTH)

# Now the directory for this job (but with # -> _ in the GlobalJobId).
# It must not exist already: any mkdir failure ends the script with
# exit status 14.
jobfeaturesDir = '%s/mjf-%s/jobfeatures-%s' % (mjfTmpDir,
                                               os.environ['USER'],
                                               job_id.replace('#', '_'))

try:
  os.mkdir(jobfeaturesDir)
except OSError:
  print('Failed to create ' + jobfeaturesDir)
  sys.exit(14)

# Record the job id; the with-block closes the file as soon as it is written
with open(jobfeaturesDir + '/job_id', 'w') as jobIdFile:
  jobIdFile.write(job_id)

# Values collected for the $JOBFEATURES files; allocated_cpu starts at one
# processor and may be replaced by the mjf files or the Job Ad further down
jobfeatures = {}
jobfeatures['allocated_cpu'] = 1

# The job start time is taken to be the moment this script runs
jobstart_secs = int(time.time())
with open(jobfeaturesDir + '/jobstart_secs', 'w') as jobstartFile:
  jobstartFile.write(str(jobstart_secs))

# Get any defaults for these (integer) values from the mjf files.
# /var/run/mjf comes first in the searched text, so its value is the one
# found when both files define the same key.
mjfDefaults = fromRun + '\n' + fromSysconfig

for key in ['allocated_cpu', 'wall_limit_secs', 'cpu_limit_secs',
            'max_rss_bytes', 'max_swap_bytes', 'scratch_limit_bytes']:

  matchObject = re.search(key + '=([0-9]*)', mjfDefaults)

  try:
    jobfeatures[key] = int(matchObject.group(1))
  except (AttributeError, ValueError):
    # AttributeError: key not present in either file;
    # ValueError: no digits after the = sign
    pass

# Examine the Job Ad, for resource limits
try:
  cpusMatchObject = re.search('CpusProvisioned *= *([0-9 +*/()-]*)', fromJobAd)
  # NOTE(review): eval() is applied to text taken from the Job Ad. The
  # character class above only admits digits, spaces and arithmetic
  # operators, so no names can be evaluated, but an expression such as
  # 9**9**9 could still be very expensive - consider a dedicated
  # arithmetic parser.
  jobfeatures['allocated_cpu'] = int(eval(cpusMatchObject.group(1)))
except Exception:
  # Best effort: keep the existing allocated_cpu when the attribute is
  # absent or its value cannot be evaluated
  pass
# Take the wallclock limit from MaxJobRetirementTime in the Machine Ad
try:
  # re.MULTILINE so that ^ matches at the start of any Machine Ad line, not
  # only at the very start of the file contents
  wallMatchObject = re.search(r'^MaxJobRetirementTime *= *([0-9 +*/()-]*)',
                              fromMachineAd, re.MULTILINE)
  # NOTE(review): eval() on Machine Ad text; the character class restricts it
  # to digits, spaces and arithmetic operators
  jobfeatures['wall_limit_secs'] = int(eval(wallMatchObject.group(1)))
except Exception:
  # Best effort: leave wall_limit_secs as it was when the attribute is
  # absent or its value cannot be evaluated
  pass

# When no cpu_limit_secs has been set so far but a wallclock limit is known,
# derive the CPU seconds limit as the wallclock limit scaled by the number
# of allocated CPUs/processors
if 'wall_limit_secs' in jobfeatures and 'cpu_limit_secs' not in jobfeatures:
  jobfeatures['cpu_limit_secs'] = (jobfeatures['allocated_cpu']
                                   * jobfeatures['wall_limit_secs'])
  
# Memory limit from MemoryProvisioned in the Job Ad
try:
  # re.MULTILINE so that ^ matches at the start of any Job Ad line, not only
  # at the very start of the file contents
  rssMatchObject = re.search(r'^MemoryProvisioned *= *([0-9 +*/()-]*)',
                             fromJobAd, re.MULTILINE)
  # NOTE(review): eval() on Job Ad text; the character class restricts it to
  # digits, spaces and arithmetic operators
  rssInt = int(eval(rssMatchObject.group(1)))
except Exception:
  # Best effort: leave max_rss_bytes as it was
  pass
else:
  # Safer to assume powers of 1000 rather than 1024
  # (presumably MemoryProvisioned is in megabytes - confirm against HTCondor)
  jobfeatures['max_rss_bytes'] = rssInt * 1000000

# Scratch space limit from DiskProvisioned in the Job Ad
try:
  scratchMatchObject = re.search(r'^DiskProvisioned *= *([0-9 +*/()-]*)',
                                 fromJobAd, re.MULTILINE)
  # Evaluate the disk match itself; using the memory match here would write
  # the memory figure out as the scratch limit
  scratchInt = int(eval(scratchMatchObject.group(1)))
except Exception:
  # Best effort: leave scratch_limit_bytes as it was
  pass
else:
  # Safer to assume powers of 1000 rather than 1024
  # (presumably DiskProvisioned is in kilobytes - confirm against HTCondor)
  jobfeatures['scratch_limit_bytes'] = scratchInt * 1000

# Write out each value that has been set, whether from the mjf files or
# from the HTCondor Job/Machine Ads; keys that were never set get no file
for key in ['allocated_cpu', 'wall_limit_secs', 'cpu_limit_secs',
            'max_rss_bytes', 'max_swap_bytes', 'scratch_limit_bytes']:

  if key in jobfeatures:
    with open(jobfeaturesDir + '/' + key, 'w') as keyFile:
      keyFile.write(str(jobfeatures[key]))

# Try to get/calculate hs06_job: first look for an explicit hs06_job= value
# in the mjf files (/var/run/mjf is searched before /etc/sysconfig/mjf)
hs06_job = None

matchObject = re.search('hs06_job=([0-9.]*)', fromRun + '\n' + fromSysconfig)

try:
  hs06_job = float(matchObject.group(1))
except (AttributeError, ValueError):
  # AttributeError: hs06_job= not present in either file;
  # ValueError: the text after the = sign is not a valid number
  pass

# No usable hs06_job so far (None or zero): try to derive it from the
# whole-machine values published under /etc/machinefeatures
if not hs06_job:
  try:
    with open('/etc/machinefeatures/hs06', 'r') as hs06File:
      hs06 = float(hs06File.readline())
  except (IOError, OSError, ValueError):
    # File missing/unreadable, or its first line is not a number
    hs06 = None

  try:
    with open('/etc/machinefeatures/total_cpu', 'r') as totalCpuFile:
      total_cpu = int(totalCpuFile.readline())
  except (IOError, OSError, ValueError):
    # File missing/unreadable, or its first line is not an integer
    total_cpu = None

  if hs06 and total_cpu:
    # Simple pro-rata allocation of total hs06 depending on processors for this job
    hs06_job = (jobfeatures['allocated_cpu'] * hs06) / total_cpu
    
# Only publish hs06_job when a non-zero value was obtained, written with
# two decimal places
if hs06_job:
  # We got it from somewhere
  with open(jobfeaturesDir + '/hs06_job', 'w') as hs06JobFile:
    hs06JobFile.write('%.2f' % hs06_job)