/*
  Copyright (C) 2002-2019 CERN for the benefit of the ATLAS collaboration
*/

#ifndef EVENT_LOOP_JOB_HH
#define EVENT_LOOP_JOB_HH

//          Copyright Nils Krumnack 2011.
// Distributed under the Boost Software License, Version 1.0.
//    (See accompanying file LICENSE_1_0.txt or copy at
//          http://www.boost.org/LICENSE_1_0.txt)

// Please feel free to contact me (krumnack@iastate.edu) for bug
// reports, feature suggestions, praise and complaints.


/// This module defines a class that manages a complete job
/// description.  The interface provided in this module is intended
/// for the general user.  The module is considered to be in the
/// pre-alpha stage.



#include <EventLoop/Global.h>

#include <memory>
#include <string>
#include <vector>
#include <AnaAlgorithm/Global.h>
#include <EventLoop/JobConfig.h>
#include <SampleHandler/SampleHandler.h>
#include <SampleHandler/MetaObject.h>

// Forward declaration only: this header refers to the service
// configuration type purely by const reference, so the full
// definition is not needed here.
namespace asg { class AsgServiceConfig; }

namespace EL
{
  /// \brief standard swap of the contents of two \ref Job objects
  /// \par Guarantee
  ///   no-fail
  void swap (Job& a, Job& b);


  class Job
  {
    //
    // public interface
    //

    /// effects: test the invariant of this object
    /// guarantee: no-fail
  public:
    void testInvariant () const;


    /// effects: standard default constructor
    /// guarantee: strong
    /// failures: low level errors I
  public:
    Job ();


    /// effects: standard copy constructor
    /// guarantee: strong
    /// failures: out of memory II
  public:
    Job (const Job& that);


    /// effects: standard destructor
    /// guarantee: no-fail
  public:
    ~Job ();


    /// effects: standard assignment operator
    /// returns: *this
    /// guarantee: strong
    /// failures: out of memory II
  public:
    Job& operator = (const Job& that);


    /// description: the sample handler used
    /// guarantee: no-fail / strong
    /// failures: out of memory II
  public:
    const SH::SampleHandler& sampleHandler () const;
    void sampleHandler (const SH::SampleHandler& val_sampleHandler);


    /// description: the list of algorithms used
    /// guarantee: no-fail / strong
    /// failures: out of memory II
    /// invariant: alg != 0
  public:
97
    void algsAdd (std::unique_ptr<IAlgorithmWrapper> val_algorithm);
98
    void algsAdd (std::unique_ptr<Algorithm> val_algorithm);
99
    void algsAdd (Algorithm *alg_swallow);
100
    void algsAdd (const AnaAlgorithmConfig& config);
Nils Krumnack's avatar
Nils Krumnack committed
101
    void algsAdd (const AnaReentrantAlgorithmConfig& config);
102
    void algsAdd (const asg::AsgServiceConfig& config);
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165


    /// \brief add a clone of the given algorithm
    ///
    /// This is to be used when the user wants to reuse an algorithm
    /// for multiple Job objects or for some reason needs to delete it
    /// himself.
    /// \par Guarantee
    ///   strong
    /// \par Failures
    ///   algorithm cloning failures\n
    ///   out of memory I
  public:
    void algsAddClone (const Algorithm& alg);


    /// returns: whether we have the algorithm with the given name
    /// guarantee: no-fail
  public:
    bool algsHas (const std::string& name) const;


    /// description: the list of output datasets used
    /// guarantee: no-fail / strong
    /// failures(3): out of memory II
    /// requires(3,soft): !outputHas (val_output.label())
    /// note: while it is not a requirement it is highly recommended
    ///   that you add the outputs from Algorithm::doSetupJob.  That
    ///   way they get automatically enabled/disabled when you
    ///   enable/disable the algorithms.  Plus your code will be
    ///   slightly less spread out.
  public:
    typedef OutputStream* outputMIter;
    typedef const OutputStream* outputIter;
    outputMIter outputBegin ();
    outputIter outputBegin () const;
    outputMIter outputEnd ();
    outputIter outputEnd () const;
    void outputAdd (const OutputStream& val_output);


    /// returns: whether we have an output with the given name
    /// guarantee: no-fail
  public:
    bool outputHas (const std::string& name) const;


    /// effects: register this job to use XAODs
    /// guarantee: strong
    /// failures: out of memory II
    /// failures: TEventSvc not available
  public:
    void useXAOD ();


    /// description: the list of options to the job
    /// guarantee: no-fail
    /// postcondition: result != 0
  public:
    SH::MetaObject *options ();
    const SH::MetaObject *options () const;


166
167
168
169
170
171
172
    /// \brief the \ref JobConfig object we are wrapping
    /// \par Guarantee
    ///   no-fail
  public:
    const JobConfig& jobConfig () const noexcept;


173
174
175
176
177
178
179
180
181
182
183
184
    /// description: the name of the option for overwriting the
    ///   submission directory.  if you set this to a non-zero value
    ///   it will remove any existing submit-directory before trying
    ///   to create a new one.
    /// rationale: normally you don't want to silently remove an
    ///   existing submission directory, since it may contain valuable
    ///   data, but for some cases like debugging you may be annoyed
    ///   to delete it manually.
  public:
    static const std::string optRemoveSubmitDir;


185
186
187
188
189
190
191
192
193
194
195
    /// \brief the submit-dir mode (allowed values: "no-clobber",
    /// "overwrite", "unique", "unique-link")
  public:
    static const std::string optSubmitDirMode;

    /// \brief the date-format to use when generating unique
    /// submission directory names
  public:
    static const std::string optUniqueDateFormat;


196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
    /// description: the name of the option used for setting the
    ///   maximum number of events to process per sample
    /// rationale: this is used for test runs where you don't want to
    ///   process all events, but just want a quick look
  public:
    static const std::string optMaxEvents;


    /// description: the name of the option used for skipping a
    ///   certain number of events in the beginning
    /// rationale: this is used for test runs where you are only
    ///   interested in a particular set of events
  public:
    static const std::string optSkipEvents;


    /// description: the name of the option for selecting the number
    ///   of files per batch job.  (only BatchDriver and derived
    ///   drivers).

    /// rationale: this is typically used if you are either processing
    ///   fairly small input files or are producing a skim with a very
    ///   high reduction factor.  it will reduce the inefficiency
    ///   associated with starting a lot of jobs or reading a lot of
    ///   output files
  public:
    static const std::string optFilesPerWorker;


    /// description: the name of the option for selecting the number
    ///   of events per batch job.  (only BatchDriver and derived
    ///   drivers).
    /// warning: this option will be ignored unless you have called
    ///   SH::scanNEvents first.
    /// rationale: this allows to make your jobs all approximately
    ///   equal size and gives you much finer control of the lengths
    ///   of your jobs.  if you did run SH::scanNEvents, but didn't
    ///   specify a number of events per worker this will produce the
    ///   same number of jobs as you would have otherwise, but spread
    ///   the workload more evenly.
  public:
    static const std::string optEventsPerWorker;


    /// description: the name of the option for supplying extra submit
    ///   parameters to batch systems
    /// rationale: the primary purpose of this is to allow selecting
    ///   the right queue on your batch system, but it can be used for
    ///   all kind of submission options that EventLoop doesn't
    ///   support directly.
  public:
    static const std::string optSubmitFlags;


    /// description: the name of the option for supplying extra
    ///   parameters for condor systems
  public:
    static const std::string optCondorConf;


    /// description: this option allows to configure the TTreeCache
    ///   size for this job.  if it is smaller or equal to 0, the
    ///   cache is turned off.
    /// rationale: if you read data from across the network using the
    ///   cache mechanism will greatly increase your performance.  for
    ///   local disks you may gain or lose performance.
  public:
    static const std::string optCacheSize;


    /// description: this option allows to configure the number of
    ///   tree entries used for learning cache behavior.
    /// rationale: there is a trade-off here, if you set this too low
    ///   you will fail to cache more rarely used variables.  if you
    ///   set it too high it will take too long until the cache kicks
    ///   in.
    /// warning: right now this is performed on a per-file basis,
    ///   which at some point will be raised to a per-job basis, which
    ///   is the limit of how far caching can go in this approach.  if
    ///   you need multi-job support, either use D3PDReader or contact
    ///   me on how to do this.
  public:
    static const std::string optCacheLearnEntries;


    /// description: the name of the option for turning on
    ///   D3PDPerfStats.  To use D3PDPerfStats set this to a non-zero
    ///   value.
    /// warning: this only works if you read data through D3PDReader
    /// rationale: this can be used for fine-tuning TTreeCache or
    ///   generally optimizing i/o performance
  public:
    static const std::string optD3PDPerfStats;


    /// description: the name of the D3PDPerfStats object produced as
    ///   I gather it, as well as the name of the option passed into
    ///   the job.
    /// rationale: I use the same name in two places to allow reading
    ///   the stats object from one output and passing it to the next
  public:
    static const std::string optD3PDReadStats;


    /// description: the name of the option for turning on
    ///   XAODPerfStats.  To use XAODPerfStats set this to a non-zero
    ///   value.
    /// warning: this only works if you read data through XAODReader
    /// rationale: this can be used for fine-tuning TTreeCache or
    ///   generally optimizing i/o performance
  public:
    static const std::string optXAODPerfStats;


    /// description: the name of the XAODPerfStats object produced as
    ///   I gather it, as well as the name of the option passed into
    ///   the job.
    /// rationale: I use the same name in two places to allow reading
    ///   the stats object from one output and passing it to the next
  public:
    static const std::string optXAODReadStats;


    /// description: these options configure the D3PDReader TTreeCache
    ///   settings.  if you use more than one option, the result is
    ///   undefined.
    /// warning: this only works if you use D3PDReader
    /// rationale: the idea is that in your first job you create the
    ///   D3PDReadStats object, which you then pass to subsequent
    ///   jobs.
  public:
    static const std::string optD3PDCacheMinEvent;
    static const std::string optD3PDCacheMinEventFraction;
    static const std::string optD3PDCacheMinByte;
    static const std::string optD3PDCacheMinByteFraction;


    /// description: the option to turn on the performance tree in
    ///   PROOF.  if this is set to 1, it will write out the tree
    /// rationale: this can help in tuning your PROOF cluster or
    ///   tuning what you do in PROOF, but it may have some overhead,
    ///   which is why we don't do it by default
  public:
    static const std::string optPerfTree;


342
343
344
345
    /// \brief the option to select whether our input is xAODs
  public:
    static const std::string optXAODInput;

346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
    /// description: the option to select the access mode for xAODs.
    ///   this can be "branch" for branch access, or "class" for
    ///   access.  if this option isn't specified EventLoop will pick
    ///   whatever is currently recommended.
  public:
    static const std::string optXaodAccessMode;
    static const std::string optXaodAccessMode_branch;
    static const std::string optXaodAccessMode_class;
    static const std::string optXaodAccessMode_athena;


    /// \brief the option to turn on/off the xAOD summary reporting at
    /// the end of the job
  public:
    static const std::string optXAODSummaryReport;


    /// description: the option to turn on printing of i/o statistics
    ///   at the end of each file
    /// rationale: while this is not as complete as whole sample
    ///   statistics, it can be helpful in some circumstances when
    ///   debugging the performance
    /// warning: this is not supported for all drivers
  public:
    static const std::string optPrintPerFileStats;


    /// description: the option to turn off collection of performance data
  public:
    static const std::string optDisableMetrics;


    /// description: the option to reset the shell on the worker nodes
    /// rationale: this is currently only used by the LSFDriver where
    ///   it is enabled by default to reset it on lxbatch.
  public:
    static const std::string optResetShell;

    /// \brief the option not to unsetup the environment in \ref
    /// LocalDriver
  public:
    static const std::string optLocalNoUnsetup;


    /// \brief the option to do processing in a background process in PROOF
  public:
    static const std::string optBackgroundProcess;


395
396
397
398
399
    /// \brief the output sample name
  public:
    static const std::string optOutputSampleName;


400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
    /// description: grid-specific options
    /// rationale: these are named so as to correspond to prun equivalents,
    ///   bare the optGrid prefix.
  public:
    static const std::string optGridDestSE;
    static const std::string optGridSite;
    static const std::string optGridExcludedSite;
    static const std::string optGridNGBPerJob;
    static const std::string optGridMemory;
    static const std::string optGridMaxCpuCount;
    static const std::string optGridNFiles;
    static const std::string optGridNFilesPerJob;
    static const std::string optGridNJobs;
    static const std::string optGridMaxFileSize;
    static const std::string optGridMaxNFilesPerJob;
    static const std::string optGridExpress;
    static const std::string optGridNoSubmit;
    static const std::string optGridMergeOutput;
418
419
420
    static const std::string optGridAddNthFieldOfInDSToLFN;
    static const std::string optGridWorkingGroup;
    static const std::string optGridShowCmd;
421
422
    static const std::string optGridCpuTimePerEvent;
    static const std::string optGridMaxWalltime;
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
    static const std::string optTmpDir;
    static const std::string optRootVer;
    static const std::string optCmtConfig;
    static const std::string optGridDisableAutoRetry;
    static const std::string optOfficial;
    static const std::string optVoms;

    /// these options are defined in \ref SH::MetaNames
    /// \{

    static const std::string optRetries;
    static const std::string optRetriesWait;

    /// \}


439
440
441
442
    /// a list of files that need to be available within the worker job
    static const std::string optUserFiles;


443
444
445
446
447

    /// description: batch-specific options
    /// rationale: these options are for configuring batch drivers
  public:
    static const std::string optBatchSharedFileSystem;
448
449
    /// The content of this string will be executed in the job script on the worker node
    /// before the main executable is run.
450
    static const std::string optBatchSlurmExtraConfigLines;
451
452
453
    /// Append a command before the main executable is called
    /// This is useful is you want to execute the command e.g. within shifter.
    static const std::string optBatchSlurmWrapperExec;
454
455
    /// This overrides the asetup command if you need to use a custom one
    static const std::string optBatchSetupCommand;
456

457
458
459
460
461
462
463
    /// \brief this is the name of the docker image, when using docker
    /// with a supported batch driver
    static const std::string optDockerImage;

    /// \brief any extra options we may want to pass to docker
    static const std::string optDockerOptions;

464
465
466
    /// \brief the job submission configuration file (used by some
    /// drivers that need more complex configuration)
    static const std::string optBatchConfigFile;
Nils Krumnack's avatar
Nils Krumnack committed
467

468
469
470
471
472
    /// \brief the job submission setup file.  unlike \ref
    /// optBatchConfigFile this only gets used once per submission
    /// instead of once per job.
    static const std::string optBatchSetupFile;

473

474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
  public:
    /// @name Options controlling the memory monitoring behaviour of the job
    /// @{

    /// The minimal per-event resident memory increase for triggering an error
    ///
    /// This is the main setting for triggering a failure in memory-leaking
    /// analysis jobs. It sets the limit on the per-event resident memory
    /// increase of the job for it to be still successful. It is an integer
    /// property, setting the limit in kilobytes.
    ///
    static const std::string optMemResidentPerEventIncreaseLimit;

    /// The minimal per-event virtual memory increase for triggering an error
    ///
    /// Implemented very similarly to @c optMemResidentPerEventIncreaseLimit.
    /// But since normally we don't care about the virtual memory usage of the
    /// jobs that much, it is set to zero by default. Making
    /// @c optMemResidentPerEventIncreaseLimit control the behaviour of the
    /// job.
    ///
    static const std::string optMemVirtualPerEventIncreaseLimit;

    /// The minimal resident memory increase necessary to trigger an error
    ///
    /// It is an integer property, setting the limit in kilobytes. Jobs have
    /// to increase their resident memory usage by this amount to trigger a
    /// failure.
    ///
    static const std::string optMemResidentIncreaseLimit;

    /// The minimal virtual memory increase necessary to trigger an error
    ///
    /// Implemented very similarly to @c optMemResidentIncreaseLimit. Since
    /// normally the virtual memory usage is not considered in producing a
    /// failure, it is set to zero by default.
    ///
    static const std::string optMemVirtualIncreaseLimit;

    /// Failure behaviour of the code when a "significant memory leak" is found
    ///
    /// This flag allows the user to select what should happen when the code
    /// finds a memory leak in the job that is larger than the values set by
    /// @c optMemResidentPerEventIncreaseLimit,
    /// @c optMemVirtualPerEventIncreaseLimit, @c optMemResidentIncreaseLimit
    /// and @c optMemVirtualIncreaseLimit.
    ///
    /// It's a boolean property. When set to @c true, the job fails if a
    /// significant memory leak is detected. If set to @c false, only a warning
    /// is printed.
    ///
    static const std::string optMemFailOnLeak;

    /// @}


530
531
532
533
534
535
536
537
538
539
    /// \brief the name of the histogram output stream
    ///
    /// Normally users don't need to worry about the histogram output
    /// stream, and it just gets created for you automatically, but in
    /// some situations you will need to reconfigure the histogram
    /// output stream.
    static const std::string histogramStreamName;



540
541
542
543
544
545
546
547
548
549
550
551
552
    //
    // private interface
    //

    friend void swap (Job& a, Job& b);

    /// description: members directly corresponding to accessors
  private:
    SH::SampleHandler m_sampleHandler;
  private:
    std::vector<EL::OutputStream> m_output;
  private:
    SH::MetaObject m_options;
553
554
555
556

    /// \brief the \ref JobConfig object we use
  private:
    EL::JobConfig m_jobConfig;
557
558
559
560
  };
}

#endif