make-jobfeatures.htcondor 8.71 KB
Newer Older
#!/usr/bin/python
#
#  make-jobfeatures script for Machine/Job Features on HTCondor
#
#  Andrew McNab, University of Manchester.
#  Copyright (c) 2016. All rights reserved.
#
#  Redistribution and use in source and binary forms, with or
#  without modification, are permitted provided that the following
#  conditions are met:
#
#    o Redistributions of source code must retain the above
#      copyright notice, this list of conditions and the following
#      disclaimer. 
#    o Redistributions in binary form must reproduce the above
#      copyright notice, this list of conditions and the following
#      disclaimer in the documentation and/or other materials
#      provided with the distribution. 
#
#  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
#  CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,
#  INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
#  MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
#  DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS
#  BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
#  EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
#  TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
#  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
#  ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
#  OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
#  OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
#  POSSIBILITY OF SUCH DAMAGE.

#  Create $JOBFEATURES files following the Machine/Job Features 
#  specification in HSF-TN-2016-02
#
#  This script creates the $JOBFEATURES directory at 
#  /tmp/mjf-$USER/jobfeatures-$GlobalJobId (replacing # with _)
#  and attempts to populate it from HTCondor Job Ad information 
#  and from $MACHINEFEATURES=/etc/machinefeatures
#
#  If the following variables are present in /var/run/mjf or 
#  /etc/sysconfig/mjf then they are used as defaults if not given
#  by HTCondor:
#
#  - allocated_cpu
#  - wall_limit_secs
#  - cpu_limit_secs
#  - max_rss_bytes
#  - max_swap_bytes
#  - scratch_limit_bytes
#  - hs06_job
#  - db12_job
#
#  Values in /var/run/mjf are preferred over /etc/sysconfig/mjf.
#
#  The following per-cpu values in either mjf file will be used
#  to calculate the corresponding per-job values if not given in
#  either file or obtainable from HTCondor:
#
#  - cpu_limit_secs_per_cpu
#  - max_rss_bytes_per_cpu
#  - max_swap_bytes_per_cpu
#  - scratch_limit_bytes_per_cpu
#
#  Additionally you can set mjf_tmp_dir in those files to use a
#  directory other than /tmp for the mjf-$USER directories.
#

import os
import re
72
import pwd
73
74
75
76
77
78
import sys
import stat
import time

# Clear the group/other write bits on everything this script creates.
# 0o022 is the octal spelling accepted by both Python 2.6+ and Python 3.
os.umask(0o022)

# Read the whole HTCondor Machine Ad from the file named by the
# $_CONDOR_MACHINE_AD environment variable; exit with status 10 if we can't
try:
  with open(os.environ['_CONDOR_MACHINE_AD'], 'r') as machineAdFile:
    fromMachineAd = machineAdFile.read()
except (KeyError, IOError, OSError):
  # KeyError: variable not set; IOError/OSError: file missing or unreadable
  sys.stderr.write('Could not read $_CONDOR_MACHINE_AD file - exiting make-jobfeatures\n')
  sys.exit(10)

# Read the whole HTCondor Job Ad from the file named by the
# $_CONDOR_JOB_AD environment variable; exit with status 10 if we can't
try:
  with open(os.environ['_CONDOR_JOB_AD'], 'r') as jobAdFile:
    fromJobAd = jobAdFile.read()
except (KeyError, IOError, OSError):
  # KeyError: variable not set; IOError/OSError: file missing or unreadable
  sys.stderr.write('Could not read $_CONDOR_JOB_AD file - exiting make-jobfeatures\n')
  sys.exit(10)

# Extract the quoted GlobalJobId value from the Job Ad
matchObject = re.search('GlobalJobId *= *"(.*)"', fromJobAd)

if matchObject is None:
  # Diagnostics go to stderr: this script prints the $JOBFEATURES path on
  # stdout, so an error message there could be mistaken for that path
  sys.stderr.write('Could not get GlobalJobId from Job Ad\n')
  sys.exit(11)

job_id = matchObject.group(1)

def _readTextOrEmpty(path):
  # Return the full contents of the file at path, or '' if it cannot be
  # opened or read (the mjf files are optional)
  try:
    with open(path, 'r') as mjfFile:
      return mjfFile.read()
  except (IOError, OSError):
    return ''

# Values in /var/run/mjf take precedence
fromRun = _readTextOrEmpty('/var/run/mjf')

# Also look in persistent /etc/sysconfig/mjf
fromSysconfig = _readTextOrEmpty('/etc/sysconfig/mjf')

# Parent directory of the mjf-$USER directories: /tmp unless mjf_tmp_dir is
# set in one of the mjf files. /var/run/mjf comes first in the searched
# text, so its setting is the one found when both files have one.
mjfTmpDir = '/tmp'

matchObject = re.search("mjf_tmp_dir=(.*)", fromRun + '\n' + fromSysconfig)

if matchObject is not None:
  # Drop surrounding whitespace (e.g. a trailing carriage return) and keep
  # the default for an empty setting rather than building paths from ''
  configuredTmpDir = matchObject.group(1).strip()

  if configuredTmpDir:
    mjfTmpDir = configuredTmpDir

# Name used in the mjf-$USER directory: the account name for our UID, or
# the numeric UID itself if the password database has no entry for it
try:
  userName = pwd.getpwuid(os.getuid())[0]
except KeyError:
  userName = str(os.getuid())

# First make sure the mjfTmpDir/mjf-$USER directory exists
userDir = mjfTmpDir + '/mjf-' + userName

try:
  os.mkdir(userDir)
except OSError:
  # Ok if exists already; any other failure shows up in the stat below
  pass

try:
  userDirStat = os.stat(userDir)
except OSError:
  # Diagnostics go to stderr: this script prints the $JOBFEATURES path on
  # stdout, so an error message there could be mistaken for that path
  sys.stderr.write(userDir + ' could not be created\n')
  sys.exit(12)

# Check it is owned by us
# NOTE(review): os.stat() and os.chmod() follow symbolic links, so a symlink
# at this path is judged by its target - consider os.lstat() plus a
# directory check if mjfTmpDir is writable by other users
if userDirStat.st_uid != os.getuid() or userDirStat.st_gid != os.getgid():
  sys.stderr.write(userDir + ' has the wrong UID/GID\n')
  sys.exit(13)

# Ensure the right permissions (rwxr-xr-x)
os.chmod(userDir,
         stat.S_IRWXU | stat.S_IRGRP | stat.S_IXGRP | stat.S_IROTH | stat.S_IXOTH)
# Now the directory for this job (but with # -> _)
jobfeaturesDir = mjfTmpDir + '/mjf-' + userName + '/jobfeatures-' + job_id.replace('#', '_')

try:
  os.mkdir(jobfeaturesDir)
except OSError:
  # Also reached if the directory already exists. The message goes to
  # stderr because stdout is used for the $JOBFEATURES path itself.
  sys.stderr.write('Failed to create ' + jobfeaturesDir + '\n')
  sys.exit(14)

# Record the job ID exactly as given in the Job Ad
with open(jobfeaturesDir + '/job_id', 'w') as featureFile:
  featureFile.write(job_id)

# Per-job values collected by the rest of the script; one processor is
# assumed until something says otherwise
jobfeatures = {'allocated_cpu': 1}

# The time at which this script runs is recorded as the job start time
jobstart_secs = int(time.time())
with open(jobfeaturesDir + '/jobstart_secs', 'w') as featureFile:
  featureFile.write(str(jobstart_secs))

# Get any defaults for these (integer) values. /var/run/mjf comes first in
# the searched text, so its setting is the one found when both files have it.
mjfText = fromRun + '\n' + fromSysconfig

for key in ['allocated_cpu', 'wall_limit_secs', 'cpu_limit_secs',
            'max_rss_bytes', 'max_swap_bytes', 'scratch_limit_bytes']:

  matchObject = re.search(key + "=([0-9]*)", mjfText)

  try:
    jobfeatures[key] = int(matchObject.group(1))
  except (AttributeError, ValueError):
    # AttributeError: key not present; ValueError: no digits after the '='
    pass

# Examine the Job Ad, for number of CPUs allocated
# NOTE(review): eval() is applied to text taken from the Job Ad. The pattern
# only admits digits, spaces and + * / ( ) - so no names can be evaluated,
# but a chain of ** operators could still consume a lot of CPU or memory.
try:
  cpusMatchObject = re.search('CpusProvisioned *= *([0-9 +*/()-]*)', fromJobAd)
  jobfeatures['allocated_cpu'] = int(eval(cpusMatchObject.group(1)))
except Exception:
  # Best effort: keep the current allocated_cpu if the attribute is absent
  # or its expression cannot be evaluated
  pass

# Look for any per-cpu values given in either file, and multiply them by the
# number of allocated CPUs to get per-job values. A per-job value that has
# already been set is left alone.
#
# The last key must be scratch_limit_bytes (not scratch_limit) so that the
# documented scratch_limit_bytes_per_cpu setting is found and the result is
# stored under the scratch_limit_bytes key that is later written out.
mjfText = fromRun + '\n' + fromSysconfig

for key in ['cpu_limit_secs', 'max_rss_bytes',
            'max_swap_bytes', 'scratch_limit_bytes']:

  if key in jobfeatures:
    continue

  matchObject = re.search(key + "_per_cpu=([0-9]*)", mjfText)

  try:
    jobfeatures[key] = int(matchObject.group(1)) * jobfeatures['allocated_cpu']
  except (AttributeError, ValueError):
    # No per-cpu setting for this key, or no digits after the '='
    pass

# Examine the Machine Ad, for the wallclock limit (MaxJobRetirementTime)
# NOTE(review): eval() is applied to text taken from the Machine Ad. The
# pattern only admits digits, spaces and + * / ( ) - so no names can be
# evaluated, but a chain of ** operators could still be expensive.
try:
  wallMatchObject = re.search('MaxJobRetirementTime *= *([0-9 +*/()-]*)', fromMachineAd)
  jobfeatures['wall_limit_secs'] = int(eval(wallMatchObject.group(1)))
except Exception:
  # Best effort: keep any wall_limit_secs from the mjf files if the
  # attribute is absent or its expression cannot be evaluated
  pass

# With no CPU seconds limit set so far, derive one from the wallclock limit
# (when there is one) and the allocated CPUs/processors
if 'wall_limit_secs' in jobfeatures and 'cpu_limit_secs' not in jobfeatures:
  cpuCount = jobfeatures['allocated_cpu']
  jobfeatures['cpu_limit_secs'] = jobfeatures['wall_limit_secs'] * cpuCount

# MemoryProvisioned in the Job Ad overrides any max_rss_bytes set so far
# NOTE(review): eval() is applied to text taken from the Job Ad. The pattern
# only admits digits, spaces and + * / ( ) - so no names can be evaluated,
# but a chain of ** operators could still be expensive.
# NOTE(review): the multiplier implies MemoryProvisioned is in megabytes -
# confirm against the HTCondor manual
try:
  rssMatchObject = re.search('MemoryProvisioned *= *([0-9 +*/()-]*)', fromJobAd)
  rssInt = int(eval(rssMatchObject.group(1)))
except Exception:
  # Best effort: attribute absent or not evaluable, so leave things as they are
  pass
else:
  # Safer to assume powers of 1000 rather than 1024
  jobfeatures['max_rss_bytes'] = rssInt * 1000000

# DiskProvisioned in the Job Ad overrides any scratch_limit_bytes set so far
# NOTE(review): eval() is applied to text taken from the Job Ad. The pattern
# only admits digits, spaces and + * / ( ) - so no names can be evaluated,
# but a chain of ** operators could still be expensive.
# NOTE(review): the multiplier implies DiskProvisioned is in kilobytes -
# confirm against the HTCondor manual
try:
  scratchMatchObject = re.search('DiskProvisioned *= *([0-9 +*/()-]*)', fromJobAd)
  scratchInt = int(eval(scratchMatchObject.group(1)))
except Exception:
  # Best effort: attribute absent or not evaluable, so leave things as they are
  pass
else:
  # Safer to assume powers of 1000 rather than 1024
  jobfeatures['scratch_limit_bytes'] = scratchInt * 1000

# Write out whichever of these have been set, from the mjf files or from
# the HTCondor Job/Machine Ads: one file per key, holding just the value
for key in ['allocated_cpu', 'wall_limit_secs', 'cpu_limit_secs',
            'max_rss_bytes', 'max_swap_bytes', 'scratch_limit_bytes']:

  if key in jobfeatures:
    with open(jobfeaturesDir + '/' + key, 'w') as featureFile:
      featureFile.write(str(jobfeatures[key]))

# Try to get/calculate hs06_job
hs06_job = None

# A per-job value given in either mjf file is used as it stands
matchObject = re.search("hs06_job=([0-9.]*)", fromRun + '\n' + fromSysconfig)

try:
  hs06_job = float(matchObject.group(1))
except (AttributeError, ValueError):
  # AttributeError: not present; ValueError: not a number
  pass

if not hs06_job:
  # Otherwise fall back on the whole-machine figures in /etc/machinefeatures
  try:
    with open('/etc/machinefeatures/hs06', 'r') as featureFile:
      hs06 = float(featureFile.readline())
  except (IOError, OSError, ValueError):
    hs06 = None

  try:
    with open('/etc/machinefeatures/total_cpu', 'r') as featureFile:
      total_cpu = int(featureFile.readline())
  except (IOError, OSError, ValueError):
    total_cpu = None

  if hs06 and total_cpu:
    # Simple pro-rata allocation of total hs06 depending on processors for this job
    hs06_job = (jobfeatures['allocated_cpu'] * hs06) / total_cpu

if hs06_job:
  # We got it from somewhere
  with open(jobfeaturesDir + '/hs06_job', 'w') as featureFile:
    featureFile.write('%.2f' % hs06_job)

# Try to get/calculate db12_job
db12_job = None

# A per-job value given in either mjf file is used as it stands
matchObject = re.search("db12_job=([0-9.]*)", fromRun + '\n' + fromSysconfig)

try:
  db12_job = float(matchObject.group(1))
except (AttributeError, ValueError):
  # AttributeError: not present; ValueError: not a number
  pass

if not db12_job:
  # Otherwise fall back on the whole-machine figures in /etc/machinefeatures
  try:
    with open('/etc/machinefeatures/db12', 'r') as featureFile:
      db12 = float(featureFile.readline())
  except (IOError, OSError, ValueError):
    db12 = None

  try:
    with open('/etc/machinefeatures/total_cpu', 'r') as featureFile:
      total_cpu = int(featureFile.readline())
  except (IOError, OSError, ValueError):
    total_cpu = None

  if db12 and total_cpu:
    # Simple pro-rata allocation of total db12 depending on processors for this job
    db12_job = (jobfeatures['allocated_cpu'] * db12) / total_cpu

if db12_job:
  # We got it from somewhere
  with open(jobfeaturesDir + '/db12_job', 'w') as featureFile:
    featureFile.write('%.2f' % db12_job)

# Finally report the $JOBFEATURES directory on stdout, in case the calling
# script needs it
print(jobfeaturesDir)