#!/usr/bin/python
#
#  make-jobfeatures script for Machine/Job Features on HTCondor
#
#  Andrew McNab, University of Manchester.
#  Copyright (c) 2016. All rights reserved.
#
#  Redistribution and use in source and binary forms, with or
#  without modification, are permitted provided that the following
#  conditions are met:
#
#    o Redistributions of source code must retain the above
#      copyright notice, this list of conditions and the following
#      disclaimer. 
#    o Redistributions in binary form must reproduce the above
#      copyright notice, this list of conditions and the following
#      disclaimer in the documentation and/or other materials
#      provided with the distribution. 
#
#  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
#  CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,
#  INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
#  MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
#  DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS
#  BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
#  EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
#  TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
#  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
#  ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
#  OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
#  OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
#  POSSIBILITY OF SUCH DAMAGE.

#  Create $JOBFEATURES files following the Machine/Job Features 
#  specification in HSF-TN-2016-02
#
#  This script creates the $JOBFEATURES directory at 
#  /tmp/mjf-$USER/jobfeatures-$GlobalJobId (replacing # with _)
#  and attempts to populate it from HTCondor Job Ad information 
#  and from $MACHINEFEATURES=/etc/machinefeatures
#
#  If the following variables are present in /var/run/mjf or 
#  /etc/sysconfig/mjf then they are used as defaults if not given
#  by HTCondor:
#
#  - allocated_cpu
#  - wall_limit_secs
#  - cpu_limit_secs
#  - max_rss_bytes
#  - max_swap_bytes
#  - scratch_limit_bytes
#  - hs06_job
#  - db12_job
#
# Values in /var/run/mjf are preferred over /etc/sysconfig/mjf.
#
# Additionally you can set mjf_tmp_dir in those files to use a 
# directory other than /tmp for the mjf-$USER directories.
#

import os
import re
63
import pwd
64
65
66
67
68
69
import sys
import stat
import time

# Clear the group/other write bits on every file and directory created
# below. Written as 0o022 rather than the legacy 0022 form, which is a
# syntax error on Python 3; Python 2.6 and later accept both spellings.
os.umask(0o022)

def _readCondorAd(envName):
  """Return the text of the ClassAd file named by environment variable envName.

  If the variable is not set or the file cannot be read, a message is
  written to stderr and the script exits with status 10.
  """
  try:
    # The with statement closes the file even if read() fails
    with open(os.environ[envName], 'r') as adFile:
      return adFile.read()
  except Exception:
    sys.stderr.write('Could not read $' + envName + ' file - exiting make-jobfeatures\n')
    sys.exit(10)

# Both ads are required: the rest of the script takes the job ID and the
# resource limits from them
fromMachineAd = _readCondorAd('_CONDOR_MACHINE_AD')
fromJobAd = _readCondorAd('_CONDOR_JOB_AD')

# The GlobalJobId string attribute of the Job Ad identifies this job; it is
# written to the job_id file and used to name the $JOBFEATURES directory
matchObject = re.search('GlobalJobId *= *"(.*)"', fromJobAd)

if matchObject is None:
  # Reported on stderr, like the ad-reading failures above: stdout carries
  # the $JOBFEATURES path printed at the end of the script, so an error
  # message there could be mistaken for that path by the calling script
  sys.stderr.write('Could not get GlobalJobId from Job Ad\n')
  sys.exit(11)

job_id = matchObject.group(1)

# Values in /var/run/mjf take precedence. A missing or unreadable file is
# not an error: it just contributes no settings.
try:
  with open('/var/run/mjf', 'r') as runFile:
    fromRun = runFile.read()
except Exception:
  fromRun = ''

# Also look in persistent /etc/sysconfig/mjf
try:
  with open('/etc/sysconfig/mjf', 'r') as sysconfigFile:
    fromSysconfig = sysconfigFile.read()
except Exception:
  fromSysconfig = ''

# Parent directory of the mjf-$USER directories unless overridden below
mjfTmpDir = '/tmp'

# mjf_tmp_dir= in either file overrides /tmp. fromRun is searched first, so
# /var/run/mjf wins. Surrounding whitespace is dropped and an empty value is
# skipped, so a blank or padded setting cannot produce an empty or mistyped
# directory name.
for matchObject in re.finditer('mjf_tmp_dir=(.*)', fromRun + '\n' + fromSysconfig):
  candidateDir = matchObject.group(1).strip()
  if candidateDir:
    mjfTmpDir = candidateDir
    break

# The mjf-$USER directory is named after our account. If the password
# database lookup for our UID fails, fall back to the numeric UID as a string.
try:
  userName = pwd.getpwuid(os.getuid())[0]
except Exception:
  # Deliberately not a bare except, so that KeyboardInterrupt and
  # SystemExit raised during the lookup are not swallowed
  userName = str(os.getuid())

# First make sure the mjfTmpDir/mjf-$USER directory exists, is a real
# directory owned by us, and has the right permissions
userDir = mjfTmpDir + '/mjf-' + userName

try:
  os.mkdir(userDir)
except OSError:
  # Ok if exists already; any other failure is caught by the checks below
  pass

try:
  # lstat() rather than stat(): the parent may be a shared directory such as
  # /tmp, so a symbolic link planted at this path must be examined itself
  # instead of being followed to whatever it points at
  userDirStat = os.lstat(userDir)
except OSError:
  sys.stderr.write(userDir + ' could not be created\n')
  sys.exit(12)

# Refuse anything that is not an actual directory (e.g. a symbolic link)
if not stat.S_ISDIR(userDirStat.st_mode):
  sys.stderr.write(userDir + ' is not a directory\n')
  sys.exit(13)

# Check it is owned by us
if userDirStat.st_uid != os.getuid() or userDirStat.st_gid != os.getgid():
  sys.stderr.write(userDir + ' has the wrong UID/GID\n')
  sys.exit(13)

# Ensure the right permissions (rwxr-xr-x), whatever mode an existing
# directory was left with
os.chmod(userDir,
         stat.S_IRWXU | stat.S_IRGRP | stat.S_IXGRP | stat.S_IROTH | stat.S_IXOTH)

# Now the directory for this job (but with # -> _)
# NOTE(review): apart from '#', job_id is used in the path as-is; this
# presumes GlobalJobId never contains '/' - confirm
jobfeaturesDir = mjfTmpDir + '/mjf-' + userName + '/jobfeatures-' + job_id.replace('#', '_')

try:
  # Fails, and so stops the script, if the directory already exists
  os.mkdir(jobfeaturesDir)
except Exception:
  # Reported on stderr: stdout carries the $JOBFEATURES path printed at the
  # end of the script
  sys.stderr.write('Failed to create ' + jobfeaturesDir + '\n')
  sys.exit(14)

# Record the GlobalJobId exactly as it appears in the Job Ad
with open(jobfeaturesDir + '/job_id', 'w') as featureFile:
  featureFile.write(job_id)

# Values collected for this job, keyed by the name of the $JOBFEATURES file
# each one is written to. One processor is assumed until something below
# says otherwise.
jobfeatures = {'allocated_cpu': 1}

# The time this script runs is recorded as the start time of the job
jobstart_secs = int(time.time())

# The with statement closes (and so flushes) the file as soon as it is written
with open(jobfeaturesDir + '/jobstart_secs', 'w') as featureFile:
  featureFile.write(str(jobstart_secs))

# Get any defaults for these (integer) values from the mjf files.
# fromRun comes first in the searched text, so /var/run/mjf is preferred
# over /etc/sysconfig/mjf.
mjfDefaults = fromRun + '\n' + fromSysconfig

for key in ['allocated_cpu', 'wall_limit_secs', 'cpu_limit_secs',
            'max_rss_bytes', 'max_swap_bytes', 'scratch_limit_bytes']:

  # At least one digit is required ([0-9]+), so an assignment with an empty
  # or non-numeric value is passed over and cannot hide a usable assignment
  # of the same key later in the text
  matchObject = re.search(key + '=([0-9]+)', mjfDefaults)

  if matchObject:
    jobfeatures[key] = int(matchObject.group(1))

def _adInteger(attribute, adText):
  """Return the integer value of attribute in the ClassAd text adText, or None.

  The attribute name must be the first thing on a line, so the name
  occurring inside the string value of some other attribute is not mistaken
  for it. The value is limited by the regular expression to digits, spaces,
  parentheses and the operators + - * /, and is evaluated as arithmetic.
  None is returned if the attribute is absent or its value does not
  evaluate to a number.
  """
  matchObject = re.search('^[ \t]*' + attribute + ' *= *([0-9 +*/()-]*)',
                          adText, re.MULTILINE)

  if matchObject is None:
    return None

  expression = matchObject.group(1)

  # Two adjacent stars would be Python's power operator, which can take
  # unbounded time and memory to evaluate
  # NOTE(review): presumes ClassAd expressions have no ** operator - confirm
  if '**' in expression:
    return None

  try:
    # NOTE(review): eval() is applied to text read from the ad file. The
    # character class above restricts it to arithmetic, but replacing it
    # with a dedicated arithmetic parser should be considered.
    return int(eval(expression))
  except Exception:
    # Empty or malformed expression, division by zero, etc.
    return None

# Examine the Job Ad, for resource limits
cpusProvisioned = _adInteger('CpusProvisioned', fromJobAd)
if cpusProvisioned is not None:
  jobfeatures['allocated_cpu'] = cpusProvisioned

# The wallclock limit is taken from MaxJobRetirementTime in the Machine Ad
wallLimit = _adInteger('MaxJobRetirementTime', fromMachineAd)
if wallLimit is not None:
  jobfeatures['wall_limit_secs'] = wallLimit

if not 'cpu_limit_secs' in jobfeatures and 'wall_limit_secs' in jobfeatures:
  # If not given in mjf files, we create a CPU seconds limit from wallclock
  # and allocated CPUs/processors
  jobfeatures['cpu_limit_secs'] = jobfeatures['wall_limit_secs'] * jobfeatures['allocated_cpu']

# MemoryProvisioned is treated as a count of megabytes
rssInt = _adInteger('MemoryProvisioned', fromJobAd)
if rssInt is not None:
  # Safer to assume powers of 1000 rather than 1024
  jobfeatures['max_rss_bytes'] = rssInt * 1000000

# DiskProvisioned is treated as a count of kilobytes
scratchInt = _adInteger('DiskProvisioned', fromJobAd)
if scratchInt is not None:
  # Safer to assume powers of 1000 rather than 1024
  jobfeatures['scratch_limit_bytes'] = scratchInt * 1000

# Write out each of these that has been set, whether from the mjf files or
# from the Job/Machine Ads; a key with no value gets no file
for key in ['allocated_cpu', 'wall_limit_secs', 'cpu_limit_secs',
            'max_rss_bytes', 'max_swap_bytes', 'scratch_limit_bytes']:

  if key in jobfeatures:
    # The with statement closes (and so flushes) each file as it is written
    with open(jobfeaturesDir + '/' + key, 'w') as featureFile:
      featureFile.write(str(jobfeatures[key]))

def _makeBenchmarkFeature(benchmark):
  """Get or calculate the per-job value of benchmark ('hs06' or 'db12').

  A <benchmark>_job value given in the mjf files is used if present.
  Otherwise the whole-machine value in /etc/machinefeatures/<benchmark> is
  shared out pro-rata, using this job's allocated_cpu and the machine's
  total_cpu. If a non-zero value results it is written to the
  <benchmark>_job file in the job features directory.

  Returns the value as a float, or None if it could not be determined.
  """
  key = benchmark + '_job'
  value = None

  # fromRun is searched first, so /var/run/mjf is preferred. At least one
  # character is required ([0-9.]+), so an empty assignment is passed over.
  matchObject = re.search(key + '=([0-9.]+)', fromRun + '\n' + fromSysconfig)

  if matchObject:
    try:
      value = float(matchObject.group(1))
    except ValueError:
      # Digits and dots that do not form a number, e.g. '1.2.3'
      pass

  if not value:
    try:
      with open('/etc/machinefeatures/' + benchmark, 'r') as machineFile:
        machineValue = float(machineFile.readline())
    except Exception:
      machineValue = None

    try:
      with open('/etc/machinefeatures/total_cpu', 'r') as machineFile:
        totalCpu = int(machineFile.readline())
    except Exception:
      totalCpu = None

    # Both must be known and non-zero; this also avoids dividing by zero
    if machineValue and totalCpu:
      # Simple pro-rata allocation of the machine total depending on
      # processors for this job
      value = (jobfeatures['allocated_cpu'] * machineValue) / totalCpu

  if value:
    # We got it from somewhere
    with open(jobfeaturesDir + '/' + key, 'w') as featureFile:
      featureFile.write('%.2f' % value)

  return value

# Try to get/calculate hs06_job
hs06_job = _makeBenchmarkFeature('hs06')

# Try to get/calculate db12_job
db12_job = _makeBenchmarkFeature('db12')

# The last thing written to stdout is the $JOBFEATURES directory path, so
# that a calling script can capture it if it needs it
sys.stdout.write(jobfeaturesDir + '\n')