Job.h 18.8 KB
 1 /*  2  Copyright (C) 2002-2019 CERN for the benefit of the ATLAS collaboration  3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 */ #ifndef EVENT_LOOP_JOB_HH #define EVENT_LOOP_JOB_HH // Copyright Nils Krumnack 2011. // Distributed under the Boost Software License, Version 1.0. // (See accompanying file LICENSE_1_0.txt or copy at // http://www.boost.org/LICENSE_1_0.txt) // Please feel free to contact me (krumnack@iastate.edu) for bug // reports, feature suggestions, praise and complaints. /// This module defines a class that manages a complete job /// description. The interface provided in this module is intended /// for the general user. The module is considered to be in the /// pre-alpha stage. #include #include  Nils Krumnack committed Aug 29, 2017 27 #include  Nils Krumnack committed Feb 03, 2018 28 #include  29 30 31 #include #include  Nils Krumnack committed Mar 29, 2021 32 33 34 35 36 namespace asg { class AsgServiceConfig; }  37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 namespace EL { /// effects: standard swap /// guarantee: no-fail void swap (Job& a, Job& b); class Job { // // public interface // /// effects: test the invariant of this object /// guarantee: no-fail public: void testInvariant () const; /// effects: standard default constructor /// guarantee: strong /// failures: low level errors I public: Job (); /// effects: standard copy constructor /// guarantee: strong /// failures: out of memory II public: Job (const Job& that); /// effects: standard destructor /// guarantee: no-fail public: ~Job (); /// effects: standard assignment operator /// returns: *this /// guarantee: strong /// failures: out of memory II public: Job& operator = (const Job& that); /// description: the sample handler used /// guarantee: no-fail / strong /// failures: out of memory II public: const SH::SampleHandler& 
sampleHandler () const; void sampleHandler (const SH::SampleHandler& val_sampleHandler); /// description: the list of algorithms used /// guarantee: no-fail / strong /// failures: out of memory II /// invariant: alg != 0 public:  Nils Krumnack committed Mar 29, 2021 97  void algsAdd (std::unique_ptr val_algorithm);  Nils Krumnack committed Feb 03, 2018 98  void algsAdd (std::unique_ptr val_algorithm);  99  void algsAdd (Algorithm *alg_swallow);  Nils Krumnack committed Aug 10, 2017 100  void algsAdd (const AnaAlgorithmConfig& config);  Nils Krumnack committed Mar 29, 2021 101  void algsAdd (const AnaReentrantAlgorithmConfig& config);  Nils Krumnack committed Mar 29, 2021 102  void algsAdd (const asg::AsgServiceConfig& config);  103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165  /// \brief add a clone of the given algorithm /// /// This is to be used when the user wants to reuse an algorithm /// for multiple Job objects or for some reason needs to delete it /// himself. /// \par Guarantee /// strong /// \par Failures /// algorithm cloning failures\n /// out of memory I public: void algsAddClone (const Algorithm& alg); /// returns: whether we have the algorithm with the given name /// guarantee: no-fail public: bool algsHas (const std::string& name) const; /// description: the list of output datasets used /// guarantee: no-fail / strong /// failures(3): out of memory II /// requires(3,soft): !outputHas (val_output.label()) /// note: while it is not a requirement it is highly recommended /// that you add the outputs from Algorithm::doSetupJob. That /// way they get automatically enabled/disabled when you /// enable/disable the algorithms. Plus your code will be /// slightly less spread out. 
public: typedef OutputStream* outputMIter; typedef const OutputStream* outputIter; outputMIter outputBegin (); outputIter outputBegin () const; outputMIter outputEnd (); outputIter outputEnd () const; void outputAdd (const OutputStream& val_output); /// returns: whether we have an output with the given name /// guarantee: no-fail public: bool outputHas (const std::string& name) const; /// effects: register this job to use XAODs /// guarantee: strong /// failures: out of memory II /// failures: TEventSvc not available public: void useXAOD (); /// description: the list of options to the job /// guarantee: no-fail /// postcondition: result != 0 public: SH::MetaObject *options (); const SH::MetaObject *options () const;  Nils Krumnack committed Feb 03, 2018 166 167 168 169 170 171 172  /// \brief the \ref JobConfig object we are wrapping /// \par Guarantee /// no-fail public: const JobConfig& jobConfig () const noexcept;  173 174 175 176 177 178 179 180 181 182 183 184  /// description: the name of the option for overwriting the /// submission directory. if you set this to a non-zero value /// it will remove any existing submit-directory before trying /// to create a new one. /// rationale: normally you don't want to silently remove an /// existing submission directory, since it may contain valuable /// data, but for some cases like debugging you may be annoyed /// to delete it manually. 
public: static const std::string optRemoveSubmitDir;  Nils Krumnack committed Aug 22, 2019 185 186 187 188 189 190 191 192 193 194 195  /// \brief the submit-dir mode (allowed values: "no-clobber", /// "overwrite", "unique", "unique-link") public: static const std::string optSubmitDirMode; /// \brief the date-format to use when generating unique /// submission directory names public: static const std::string optUniqueDateFormat;  196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341  /// description: the name of the option used for setting the /// maximum number of events to process per sample /// rationale: this is used for test runs where you don't want to /// process all events, but just want a quick look public: static const std::string optMaxEvents; /// description: the name of the option used for skipping a /// certain number of events in the beginning /// rationale: this is used for test runs where you are only /// interested in a particular set of events public: static const std::string optSkipEvents; /// description: the name of the option for selecting the number /// of files per batch job. (only BatchDriver and derived /// drivers). /// rationale: this is typically used if you are either processing /// fairly small input files or are producing a skim with a very /// high reduction factor. 
it will reduce the inefficiency /// associated with starting a lot of jobs or reading a lot of /// output files public: static const std::string optFilesPerWorker; /// description: the name of the option for selecting the number /// of events per batch job. (only BatchDriver and derived /// drivers). /// warning: this option will be ignored unless you have called /// SH::scanNEvents first. /// rationale: this allows you to make your jobs all approximately /// equal size and gives you much finer control of the lengths /// of your jobs. if you did run SH::scanNEvents, but didn't /// specify a number of events per worker this will produce the /// same number of jobs as you would have otherwise, but spread /// the workload more evenly. public: static const std::string optEventsPerWorker; /// description: the name of the option for supplying extra submit /// parameters to batch systems /// rationale: the primary purpose of this is to allow selecting /// the right queue on your batch system, but it can be used for /// all kinds of submission options that EventLoop doesn't /// support directly. public: static const std::string optSubmitFlags; /// description: the name of the option for supplying extra /// parameters for condor systems public: static const std::string optCondorConf; /// description: this option allows you to configure the TTreeCache /// size for this job. if it is smaller than or equal to 0, the /// cache is turned off. /// rationale: if you read data from across the network, using the /// cache mechanism will greatly increase your performance. for /// local disks you may gain or lose performance. public: static const std::string optCacheSize; /// description: this option allows you to configure the number of /// tree entries used for learning cache behavior. /// rationale: there is a trade-off here, if you set this too low /// you will fail to cache more rarely used variables. if you /// set it too high it will take too long until the cache kicks /// in. 
/// warning: right now this is performed on a per-file basis, /// which at some point will be raised to a per-job basis, which /// is the limit of how far caching can go in this approach. if /// you need multi-job support, either use D3PDReader or contact /// me on how to do this. public: static const std::string optCacheLearnEntries; /// description: the name of the option for turning on /// D3PDPerfStats. To use D3PDPerfStats set this to a non-zero /// value. /// warning: this only works if you read data through D3PDReader /// rationale: this can be used for fine-tuning TTreeCache or /// generally optimizing i/o performance public: static const std::string optD3PDPerfStats; /// description: the name of the D3PDPerfStats object produced as /// I gather it, as well as the name of the option passed into /// the job. /// rationale: I use the same name in two places to allow reading /// the stats object from one output and passing it to the next public: static const std::string optD3PDReadStats; /// description: the name of the option for turning on /// XAODPerfStats. To use XAODPerfStats set this to a non-zero /// value. /// warning: this only works if you read data through XAODReader /// rationale: this can be used for fine-tuning TTreeCache or /// generally optimizing i/o performance public: static const std::string optXAODPerfStats; /// description: the name of the XAODPerfStats object produced as /// I gather it, as well as the name of the option passed into /// the job. /// rationale: I use the same name in two places to allow reading /// the stats object from one output and passing it to the next public: static const std::string optXAODReadStats; /// description: these options configure the D3PDReader TTreeCache /// settings. if you use more than one option, the result is /// undefined. 
/// warning: this only works if you use D3PDReader /// rationale: the idea is that in your first job you create the /// D3PDReadStats object, which you then pass to subsequent /// jobs. public: static const std::string optD3PDCacheMinEvent; static const std::string optD3PDCacheMinEventFraction; static const std::string optD3PDCacheMinByte; static const std::string optD3PDCacheMinByteFraction; /// description: the option to turn on the performance tree in /// PROOF. if this is set to 1, it will write out the tree /// rationale: this can help in tuning your PROOF cluster or /// tuning what you do in PROOF, but it may have some overhead, /// which is why we don't do it by default public: static const std::string optPerfTree;  Nils Krumnack committed Feb 06, 2019 342 343 344 345  /// \brief the option to select whether our input is xAODs public: static const std::string optXAODInput;  346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394  /// description: the option to select the access mode for xAODs. /// this can be "branch" for branch access, or "class" for /// class access. if this option isn't specified EventLoop will pick /// whatever is currently recommended. 
public: static const std::string optXaodAccessMode; static const std::string optXaodAccessMode_branch; static const std::string optXaodAccessMode_class; static const std::string optXaodAccessMode_athena; /// \brief the option to turn on/off the xAOD summary reporting at /// the end of the job public: static const std::string optXAODSummaryReport; /// description: the option to turn on printing of i/o statistics /// at the end of each file /// rationale: while this is not as complete as whole sample /// statistics, it can be helpful in some circumstances when /// debugging the performance /// warning: this is not supported for all drivers public: static const std::string optPrintPerFileStats; /// description: the option to turn off collection of performance data public: static const std::string optDisableMetrics; /// description: the option to reset the shell on the worker nodes /// rationale: this is currently only used by the LSFDriver where /// it is enabled by default to reset it on lxbatch. public: static const std::string optResetShell; /// \brief the option not to unsetup the environment in \ref /// LocalDriver public: static const std::string optLocalNoUnsetup; /// \brief the option to do processing in a background process in PROOF public: static const std::string optBackgroundProcess;  Tadej Novak committed Jul 23, 2019 395 396 397 398 399  /// \brief the output sample name public: static const std::string optOutputSampleName;  400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417  /// description: grid-specific options /// rationale: these are named so as to correspond to prun equivalents, /// except for the optGrid prefix. 
public: static const std::string optGridDestSE; static const std::string optGridSite; static const std::string optGridExcludedSite; static const std::string optGridNGBPerJob; static const std::string optGridMemory; static const std::string optGridMaxCpuCount; static const std::string optGridNFiles; static const std::string optGridNFilesPerJob; static const std::string optGridNJobs; static const std::string optGridMaxFileSize; static const std::string optGridMaxNFilesPerJob; static const std::string optGridExpress; static const std::string optGridNoSubmit; static const std::string optGridMergeOutput;  Tadej Novak committed Jul 23, 2019 418 419 420  static const std::string optGridAddNthFieldOfInDSToLFN; static const std::string optGridWorkingGroup; static const std::string optGridShowCmd;  Tadej Novak committed Oct 29, 2019 421 422  static const std::string optGridCpuTimePerEvent; static const std::string optGridMaxWalltime;  423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438  static const std::string optTmpDir; static const std::string optRootVer; static const std::string optCmtConfig; static const std::string optGridDisableAutoRetry; static const std::string optOfficial; static const std::string optVoms; /// these options are defined in \ref SH::MetaNames /// \{ static const std::string optRetries; static const std::string optRetriesWait; /// \}  Nils Krumnack committed May 19, 2021 439 440 441 442  /// a list of files that need to be available within the worker job static const std::string optUserFiles;  443 444 445 446 447  /// description: batch-specific options /// rationale: these options are for configuring batch drivers public: static const std::string optBatchSharedFileSystem;  Simone Pagan Griso committed Aug 03, 2017 448 449  /// The content of this string will be executed in the job script on the worker node /// before the main executable is run.  
Simone Pagan Griso committed Aug 03, 2017 450  static const std::string optBatchSlurmExtraConfigLines;  Simone Pagan Griso committed Aug 03, 2017 451 452 453  /// Append a command before the main executable is called /// This is useful if you want to execute the command e.g. within shifter. static const std::string optBatchSlurmWrapperExec;  Giordon Holtsberg Stark committed Aug 17, 2017 454 455  /// This overrides the asetup command if you need to use a custom one static const std::string optBatchSetupCommand;  456   Nils Krumnack committed Jul 23, 2019 457 458 459 460 461 462 463  /// \brief this is the name of the docker image, when using docker /// with a supported batch driver static const std::string optDockerImage; /// \brief any extra options we may want to pass to docker static const std::string optDockerOptions;  Nils Krumnack committed Jul 24, 2019 464 465 466  /// \brief the job submission configuration file (used by some /// drivers that need more complex configuration) static const std::string optBatchConfigFile;  Nils Krumnack committed Jul 24, 2019 467   Nils Krumnack committed Jul 24, 2019 468 469 470 471 472  /// \brief the job submission setup file. unlike \ref /// optBatchConfigFile this only gets used once per submission /// instead of once per job. static const std::string optBatchSetupFile;  473   474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529  public: /// @name Options controlling the memory monitoring behaviour of the job /// @{ /// The minimal per-event resident memory increase for triggering an error /// /// This is the main setting for triggering a failure in memory-leaking /// analysis jobs. It sets the limit on the per-event resident memory /// increase of the job for it to be still successful. It is an integer /// property, setting the limit in kilobytes. 
/// static const std::string optMemResidentPerEventIncreaseLimit; /// The minimal per-event virtual memory increase for triggering an error /// /// Implemented very similarly to @c optMemResidentPerEventIncreaseLimit. /// But since normally we don't care about the virtual memory usage of the /// jobs that much, it is set to zero by default. Making /// @c optMemResidentPerEventIncreaseLimit control the behaviour of the /// job. /// static const std::string optMemVirtualPerEventIncreaseLimit; /// The minimal resident memory increase necessary to trigger an error /// /// It is an integer property, setting the limit in kilobytes. Jobs have /// to increase their resident memory usage by this amount to trigger a /// failure. /// static const std::string optMemResidentIncreaseLimit; /// The minimal virtual memory increase necessary to trigger an error /// /// Implemented very similarly to @c optMemResidentIncreaseLimit. Since /// normally the virtual memory usage is not considered in producing a /// failure, it is set to zero by default. /// static const std::string optMemVirtualIncreaseLimit; /// Failure behaviour of the code when a "significant memory leak" is found /// /// This flag allows the user to select what should happen when the code /// finds a memory leak in the job that is larger than the values set by /// @c optMemResidentPerEventIncreaseLimit, /// @c optMemVirtualPerEventIncreaseLimit, @c optMemResidentIncreaseLimit /// and @c optMemVirtualIncreaseLimit. /// /// It's a boolean property. When set to @c true, the job fails if a /// significant memory leak is detected. If set to @c false, only a warning /// is printed. 
/// static const std::string optMemFailOnLeak; /// @}  Nils Krumnack committed Jan 24, 2019 530 531 532 533 534 535 536 537 538 539  /// \brief the name of the histogram output stream /// /// Normally users don't need to worry about the histogram output /// stream, and it just gets created for you automatically, but in /// some situations you will need to reconfigure the histogram /// output stream. static const std::string histogramStreamName;  540 541 542 543 544 545 546 547 548 549 550 551 552  // // private interface // friend void swap (Job& a, Job& b); /// description: members directly corresponding to accessors private: SH::SampleHandler m_sampleHandler; private: std::vector m_output; private: SH::MetaObject m_options;  Nils Krumnack committed Feb 03, 2018 553 554 555 556  /// \brief the \ref JobConfig object we use private: EL::JobConfig m_jobConfig;  557 558 559 560  }; } #endif