Compare revisions

Changes are shown as if the source revision was being merged into the target revision.

Commits on Source (410)
Showing with 836 additions and 57 deletions
[flake8]
# To be removed after the whole systematics business: E266, E712
extend-ignore = E203, E231, E501, E722, W503, B950, E266, E712
select = C,E,F,W,T,B,B9,I
exclude = higgs_dna/scripts/*
per-file-ignores =
tests/*: T
noxfile.py: T
higgs_dna/metaconditions/*.json filter=lfs diff=lfs merge=lfs -text
higgs_dna/metaconditions/*/*.json filter=lfs diff=lfs merge=lfs -text
higgs_dna/metaconditions/*/*/*.json filter=lfs diff=lfs merge=lfs -text
# Editors
.vscode/
.idea/
.swp
# Vagrant
.vagrant/
......@@ -135,6 +136,63 @@ tests/log
dask-report.html
*.cc
#sshfs on mac
# sshfs on mac
._*
Diphoton_CDFs.pkl.gx
# Sample directory (input txts and output jsons)
higgs_dna/samples/*.txt
higgs_dna/samples/*.json
### The following ingredients can be pulled with scripts/pull_files.py and are therefore not added to the index
## golden JSON
higgs_dna/metaconditions/CAF/
## JSONs for systematics
# Weight-based Egamma objects
higgs_dna/systematics/JSONs/TriggerSF/
higgs_dna/systematics/JSONs/Preselection/
higgs_dna/systematics/JSONs/ElectronVetoSF/
higgs_dna/systematics/JSONs/SF_photon_ID/
higgs_dna/systematics/JSONs/LooseMvaSF/
# Weight-based jets
higgs_dna/systematics/JSONs/cTagSF/
higgs_dna/systematics/JSONs/bTagSF/
# Weight-based event level
higgs_dna/systematics/JSONs/pileup
# Variable-based Egamma (potentially custom for Hgg)
higgs_dna/systematics/JSONs/scaleAndSmearing/
higgs_dna/systematics/JSONs/FNUF/
higgs_dna/systematics/JSONs/Material
higgs_dna/systematics/JSONs/ShowerShape/
# Variable-based central (e.g. JME)
higgs_dna/systematics/JSONs/POG/
# B-Tagging efficiencies (analysis-specific), keep midRun3.json.gz as an example file
higgs_dna/systematics/JSONs/bTagEff/
!higgs_dna/systematics/JSONs/bTagEff/*/midRun3.json.gz
# JEC and JER (not sure why it is named data to be honest)
higgs_dna/systematics/data/
# CDFs for decorrelation
higgs_dna/tools/Diphoton_CDFs.pkl.gz
higgs_dna/tools/Smeared_Diphoton_CDFs.pkl.gz
higgs_dna/tools/decorrelation_CDFs/
higgs_dna/tools/*HHbbgg.json
# Flow models
higgs_dna/tools/flows/
# analysis specific stuff
higgs_dna/tools/lowmass_diphoton_mva/
higgs_dna/tools/lowmass_dykiller/
# metaconditions
higgs_dna/metaconditions/corrections/2017_Legacy_xgb/
higgs_dna/metaconditions/corrections/corrections_summary_2017_Legacy_xgb.json
higgs_dna/metaconditions/diphoton/altDiphoModel_coffea.json
higgs_dna/metaconditions/diphoton/weights071123_even.xgb
higgs_dna/metaconditions/diphoton/weights071123_odd.xgb
higgs_dna/metaconditions/photon_id_mva_weights/*.json
higgs_dna/metaconditions/hpc_bdt/*.h5
higgs_dna/metaconditions/hpc_bdt/*.xgb
image: "python:3.9"
image: "registry.cern.ch/docker.io/library/python:3.10-slim"
stages:
- build
......@@ -25,7 +25,8 @@ variables:
build:
stage: build
script:
- pip install -e .[dev]
- pip install --no-cache-dir -e .[dev]
- rm -rf /root/.cache /tmp/*
build_docker:
stage: build docker
......@@ -35,9 +36,9 @@ build_docker:
# To push to a specific docker tag other than latest (the default), amend the --destination parameter, e.g. --destination $CI_REGISTRY_IMAGE:$CI_BUILD_REF_NAME
# See https://docs.gitlab.com/ee/ci/variables/predefined_variables.html#variables-reference for available variables
#IMAGE_DESTINATION: ${CI_REGISTRY_IMAGE}:latest
BUILD_ARGS: "FROM_IMAGE=gitlab-registry.cern.ch/batch-team/dask-lxplus/lxdask-cc7:latest CLUSTER=lxplus"
IMAGE_DESTINATION: ${CI_REGISTRY_IMAGE}:lxplus-${CI_COMMIT_SHORT_SHA}
image:
BUILD_ARGS: "FROM_IMAGE=gitlab-registry.cern.ch/batch-team/dask-lxplus/lxdask-al9:latest CLUSTER=lxplus-el9"
IMAGE_DESTINATION: ${CI_REGISTRY_IMAGE}:lxplus-el9-${CI_COMMIT_SHORT_SHA}
image:
# The kaniko debug image is recommended because it has a shell, and a shell is required for an image to be used with GitLab CI/CD.
name: gcr.io/kaniko-project/executor:debug
entrypoint: [""]
......@@ -55,26 +56,32 @@ build_docker_latest:
rules:
- if: '$CI_PIPELINE_SOURCE == "push" && $CI_COMMIT_REF_NAME == "master"'
variables:
IMAGE_ORIGIN_TAG: ${CI_REGISTRY_IMAGE}:lxplus-${CI_COMMIT_SHORT_SHA}
IMAGE_ORIGIN_TAG: ${CI_REGISTRY_IMAGE}:lxplus-el9-${CI_COMMIT_SHORT_SHA}
#keep only latest for now as we have only the lxplus one
#change in the future in case we need more
IMAGE_DESTINATION_TAG: ${CI_REGISTRY_IMAGE}:latest
IMAGE_DESTINATION_TAG: ${CI_REGISTRY_IMAGE}:lxplus-el9-latest
static_analysis:
stage: static analysis
before_script:
- pip install flake8
- pip install mypy --quiet
- pip install -e.[dev]
- pip install --no-cache-dir -e .[dev]
- rm -rf /root/.cache /tmp/*
script:
- flake8 higgs_dna/*
# mypy will pick up what is specified in pyproject.toml
#- mypy
unit_test:
# Provide access to CMVFS for BTagging SF
tags:
- k8s-cvmfs
stage: test
before_script:
- pip install -e .[dev]
- pip install --no-cache-dir -e .[dev]
- rm -rf /root/.cache /tmp/*
- apt-get update && apt-get install -y wget
script:
- pwd
- ls -l
......
ARG FROM_IMAGE=gitlab-registry.cern.ch/batch-team/dask-lxplus/lxdask-cc7:latest
ARG FROM_IMAGE=gitlab-registry.cern.ch/batch-team/dask-lxplus/lxdask-al9:latest
FROM ${FROM_IMAGE}
ARG CLUSTER=lxplus
ARG CLUSTER=lxplus-el9
ADD . .
RUN echo "=======================================" && \
echo "Installing HiggsDNA" && \
echo "on cluster environment: $CLUSTER" && \
echo "Current time:" $(date) && \
echo "=======================================" && \
if [[ ${CLUSTER} == "lxplus" ]]; then \
yum -y update && \
yum -y install git-lfs && \
echo "Fixing dependencies in the image" && \
conda install -y numba>=0.57.0 llvmlite==0.40.0 numpy>=1.22.0 && \
pip install --upgrade dask-lxplus; \
if [[ ${CLUSTER} == "lxplus-cc7" ]]; then \
echo "Fixing dependencies in the image" && \
conda install -y numba>=0.57.0 llvmlite==0.40.0 numpy>=1.22.0 && \
python -m pip install -U dask-lxplus==0.3.2 dask-jobqueue==0.8.2; \
fi && \
pip3 install .
\ No newline at end of file
installed_version=$(pip show pyarrow | grep 'Version:' | awk '{print $2}'); \
if [ "$(printf '%s\n' "$installed_version" "11.0.0" | sort -V | head -n1)" = "11.0.0" ]; then \
pip install --upgrade pyarrow; \
fi && \
echo "Installing HiggsDNA" && \
python -m pip install . --verbose
recursive-include higgs_dna/metaconditions *.json
\ No newline at end of file
......@@ -43,7 +43,7 @@ where ``simple_analysis.json`` looks like this:
],
"year":[
"SampleName1": ["2022preEE"],
"SampleName2": ["2017"]
"SampleName2": ["2017"]
]
"systematics": {
"SampleName1": [
......@@ -70,6 +70,7 @@ The next two flags that you will want to specify are ``dump`` and ``executor``:
* ``iterative``
* ``futures``
* ``dask/local``
* ``dask/condor``
* ``dask/slurm``
* ``dask/lpc``
......@@ -91,7 +92,6 @@ As usual, a description of all the options is printed when running::
run_analysis.py --help
.. _def-processor:
----------
......
......@@ -13,6 +13,10 @@ Please lint, format and run tests before sending a PR:
black higgs_dna
pytest
We follow certain conventions in the codebase. Please make sure to follow them when contributing:
#. Make use of the common abbreviations for frequent packages, e.g., ``np`` for ``numpy`` and ``ak`` for ``awkward``.
#. When using the ``argparse`` package, use kebab case for the argument names, e.g., ``--input-file``; do not use snake case (underscores). Internally the dashes are converted back to underscores, so the argument is accessed as ``args.input_file``, but kebab case on the command line follows the Unix convention (see the sketch below).
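As a short illustration of the ``argparse`` convention above (the argument name and value are made up):

.. code-block:: python

    import argparse

    parser = argparse.ArgumentParser()
    # Kebab case on the command line...
    parser.add_argument("--input-file", required=True)

    args = parser.parse_args(["--input-file", "events.parquet"])
    # ...but argparse converts the dashes to underscores internally,
    # so the value is accessed with snake case:
    print(args.input_file)  # prints "events.parquet"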
--------------------
Update Documentation
......
......@@ -5,6 +5,7 @@ Examples
notebooks/basics
standalone_examples/tnp
standalone_examples/base
standalone_examples/btagging
standalone_examples/Zmmy
standalone_examples/jerc
\ No newline at end of file
docs/source/images/docker.png: image updated (106 KiB to 536 KiB).
......@@ -12,7 +12,7 @@ HiggsDNA - Higgs to Diphoton NanoAOD Framework
installation
concepts
jobs
output_grooming
postprocessing
developers
examples
api
......
......@@ -28,7 +28,7 @@ Docker & Singularity
In case you experience issues using the conda environment, docker images are available to try. These are especially useful on LXPLUS, where instead of ``docker`` the ``singularity`` command should be used (also called ``apptainer``).
Since the main goal of the Docker images used on LXPLUS is to ease the experience of submitting jobs using Dask, the base image used is the one provided with the `dask-lxplus <https://gitlab.cern.ch/batch-team/dask-lxplus>`_ package (as can be seen in the `Dockerfile <https://gitlab.cern.ch/HiggsDNA-project/HiggsDNA/-/blob/master/docker/Dockerfile_lxplus>`_).
Since the main goal of the Docker images used on LXPLUS is to ease the experience of submitting jobs using Dask, the base image used is the one provided with the `dask-lxplus <https://gitlab.cern.ch/batch-team/dask-lxplus>`_ package (as can be seen in the `Dockerfile <https://gitlab.cern.ch/HiggsDNA-project/HiggsDNA/-/blob/master/Dockerfile?ref_type=heads>`_).
Docker images are built in CI every time a commit is pushed on the master branch, with the most recent one tagged as ``latest``.
......@@ -36,18 +36,18 @@ Docker images are built in CI every time a commit is pushed on the master branch
If you want to use the latest image with HiggsDNA already installed in it (i.e. no development), you can pull it with::
apptainer shell --bind /afs -B /cvmfs/cms.cern.ch \
--bind /tmp --bind /eos/cms/ \
--env KRB5CCNAME=$KRB5CCNAME --bind /etc/sysconfig/ngbauth-submit \
docker://gitlab-registry.cern.ch/higgsdna-project/higgsdna:latest
apptainer shell -B /afs -B /cvmfs/cms.cern.ch \
-B /tmp -B /eos/cms/ \
-B /etc/sysconfig/ngbauth-submit -B ${XDG_RUNTIME_DIR} --env KRB5CCNAME="FILE:${XDG_RUNTIME_DIR}/krb5cc" \
/cvmfs/unpacked.cern.ch/gitlab-registry.cern.ch/higgsdna-project/higgsdna:latest
If you want to also develop you can pull the same image and create a virtual environment inside it::
# pull and access the image
apptainer shell --bind /afs -B /cvmfs/cms.cern.ch \
--bind /tmp --bind /eos/cms/ \
--env KRB5CCNAME=$KRB5CCNAME --bind /etc/sysconfig/ngbauth-submit \
docker://gitlab-registry.cern.ch/higgsdna-project/higgsdna:latest
apptainer shell -B /afs -B /cvmfs/cms.cern.ch \
-B /tmp -B /eos/cms/ \
-B /etc/sysconfig/ngbauth-submit -B ${XDG_RUNTIME_DIR} --env KRB5CCNAME="FILE:${XDG_RUNTIME_DIR}/krb5cc" \
/cvmfs/unpacked.cern.ch/gitlab-registry.cern.ch/higgsdna-project/higgsdna:latest
# create virtual environment
python -m venv --system-site-packages myenv
......
......@@ -60,22 +60,22 @@ If you work on a remote cluster (so pretty much all the time) you can see the da
LXPLUS Vanilla Submitter
------------------------
In order to provide an alternative to Dask on LXPLUS, a vanilla submitter was implemented. Since it is very basic and a temporary solution until Dask is mature, the distribution model is quite simple: one ROOT file is assigned to each job.
A directory called ``.higgs_dna_vanilla_lxplus`` is created in the current path, with a subdirectory named after the ``json-analysis`` file plus a date-time suffix of the form ``YMD_HMS``. This suffix is used to avoid overwriting previous submissions. There, two subdirectories called ``input`` and ``jobs`` are created: the former contains the new JSON files split by ROOT file, while the latter contains the submit files that are passed to ``condor_submit``.
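For orientation, a minimal sketch of the resulting layout, built with ``pathlib`` (the analysis name and timestamp are made up):

.. code-block:: python

    from pathlib import Path

    # Submission directory created by the vanilla submitter for a hypothetical
    # analysis JSON called "my_analysis", submitted on 2024-01-01 at 12:00:00
    base = Path(".higgs_dna_vanilla_lxplus") / "my_analysis_20240101_120000"
    input_dir = base / "input"  # per-ROOT-file JSON files derived from the analysis JSON
    jobs_dir = base / "jobs"    # submit files passed to condor_submit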
By default all jobs (files) for a given sample are submitted to the same cluster. You can change this behaviour by setting ``cluster_per_sample=False`` in the ``LXPlusVanillaSubmitter`` class constructor. In this case, each job will be submitted to a separate cluster.
An example of command line is the following:
.. code-block:: bash
run_analysis.py --json-analysis /afs/cern.ch/work/g/gallim/devel/HiggsDNA/analyses/json_analysis_file_example.json --dump /afs/cern.ch/work/g/gallim/devel/vanilla_lxplus_tests --skipCQR --executor vanilla_lxplus --queue espresso
The arguments ``--queue`` and ``--memory`` are the same as those used for ``dask/lxplus``.
.. warning::
When working from ``eos``, specific care has to be taken in order to correctly fetch log and error files. As explained `here <https://batchdocs.web.cern.ch/troubleshooting/eos.html#no-eos-submission-allowed>`_ one can run:
.. code-block:: bash
......@@ -86,6 +86,55 @@ the arguments ``--queue`` and ``--memory`` are the same used also for ``dask/lxp
to fetch logs and error files and then remove the finished jobs that otherwise would be kept showing when running ``condor_q``.
.. note::
As already stated above, this submitter is just a temporary solution and it is not meant to be complete nor maximally efficient.
A smarter solution would have to be implemented as an executor directly in Coffea, and it is on the to-do list.
Getting the list of unprocessed samples
---------------------------------------
.. _previous section:
Parquet naming convention
~~~~~~~~~~~~~~~~~~~~~~~~~
By default, the parquet file names include the ``UUID`` contained in the ``ROOT`` file header (`see header format <https://root.cern/doc/v632/header.html>`_).
However, this ``UUID`` differs from the one used in `DAS <https://cmsweb.cern.ch/das/>`_, which can make it difficult to quickly identify which files have successfully been processed.
To address this, the ``choose_naming_convention`` function allows users to select the naming convention for the output files, either aligning with the DAS ``UUID`` or retaining the legacy format.
The naming convention can be specified using the ``self.name_convention`` attribute:
* ``"DAS"``: Replaces the default ``ROOT``-specific UUID with the DAS UUID, extracted from the source ``.root`` file's name. This makes it easier to match the processed files with their corresponding DAS entries.
* ``"Legacy"``: Retains the original parquet filename format, using the UUID as contained in the ``ROOT`` file header.
How to get the list of unprocessed samples
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
During the workflow execution, timeouts when attempting to access the root files, and/or other issues, might result in incomplete or missing processing of certain data chunks or files.
To avoid rerunning all jobs, you can extract the list of unprocessed samples and run the workflow exclusively on those files.
The script ``get_unprocessed_files.py`` can be used to generate such list both for the ``DAS`` and the ``Legacy`` file naming convention.
The script can be called on a ``source`` directory containing all the datasets which were processed so far. It will go through the datasets as defined in the original ``json``,
and will look into their ``nominal`` subfolder for all the missing or partially processed parquets. From there it will produce a new ``output`` json, which will contain the list of unprocessed samples.
.. note::
The script will be specifically searching through the datasets **as defined** in the ``json`` file.
The script can be used as follows:
.. code-block:: bash
python get_unprocessed_files.py --convention myconvention --source ./run3_analysis/my_processed_datasets/ --json my_samples.json --output my_unprocessed_samples.json
where ``--convention`` is used to specify the naming convention used (options are ``DAS`` or ``Legacy``), ``--source`` should point to the directory containing the datasets, ``--json`` should be the json for which we wish to obtain the missing parquets, and
``--output`` will be the newly produced json file.
.. warning::
This script works best with the ``DAS`` parquet naming (see `previous section`_).
For the ``DAS`` convention, the script uses ``dasgoclient`` to retrieve the `DAS <https://cmsweb.cern.ch/das/>`_ ``UUID`` in a single query and compares it to the available parquets (an illustrative query is sketched after this note).
In contrast, the ``UUID`` for the ``Legacy`` convention is retrieved from the ROOT file headers, which are read using ``xrootd``.
This method is not only prone to the usual ``xrootd`` errors but is also slower, as each header must be read individually.
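For reference, the kind of single DAS query mentioned above can be reproduced by hand; a sketch (the dataset name is a placeholder, and this is not the exact call made by the script):

.. code-block:: python

    import subprocess

    # List the files of a dataset with dasgoclient; the UUID-like basenames of the
    # returned logical file names are what the DAS convention matches against.
    query = "file dataset=/MyDataset/Run3Summer22NanoAODv12-placeholder/NANOAODSIM"
    result = subprocess.run(
        ["dasgoclient", "--query", query],
        capture_output=True, text=True, check=True,
    )
    das_files = result.stdout.splitlines()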
......@@ -20,6 +20,14 @@ higgs\_dna.workflows.base module
:undoc-members:
:show-inheritance:
higgs\_dna.workflows.btagging module
-------------------------------------
.. automodule:: higgs_dna.workflows.btagging
:members:
:undoc-members:
:show-inheritance:
higgs\_dna.workflows.dystudies module
-------------------------------------
......
Workspace preparation for FinalFit interface
Postprocessing
============================================
Standard Procedure
......@@ -12,7 +12,7 @@ The script will perform multiple steps:
All the steps can be performed in one go with a command more or less like this::
python3 prepare_output_file.py --input [path to output dir] --merge --root --ws --syst --cats --args "--do_syst"
python3 prepare_output_file.py --input [path to output dir] --merge --root --ws --syst --cats --args "--do-syst"
or the single steps can be performed by running the auxiliary files (``merge_parquet.py``, ``convert_parquet_to_root.py``, ``Tree2WS``) separately.
A complete set of options for the main script is listed below.
......@@ -57,13 +57,75 @@ During the merging step MC samples can also be normalised to the ``efficiency x
Root step
---------
During this step the script calls the script ``convert_parquet_to_root.py`` multiple times. The arguments to pass to the script, for instance if you want the systematic variations included in the output ``ROOT`` tree, are specified when calling ``prepare_output_file.py`` using ``--args "--do_syst"``.
During this step the script calls the script ``convert_parquet_to_root.py`` multiple times. The arguments to pass to the script, for instance if you want the systematic variations included in the output ``ROOT`` tree, are specified when calling ``prepare_output_file.py`` using ``--args "--do-syst"``.
As before, the script creates a new directory called ``root`` under ``out_dir``; if this directory already exists, it will throw an error and exit. In the script there is a dictionary called ``outfiles`` that contains the name of the output ROOT file that will be created according to the process type; if the workflow is run using the main script, this corresponds to the processes contained in ``process_dict``.
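Purely for orientation, the kind of mapping ``outfiles`` holds (the process keys and file names below are made up; the actual ones are defined in ``prepare_output_file.py``):

.. code-block:: python

    # Hypothetical example of the process-to-output-file mapping described above
    outfiles = {
        "ggh": "output_GluGluHtoGG.root",
        "vbf": "output_VBFHtoGG.root",
        "data": "allData.root",
    }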
By default, ``prepare_output_file.py`` uses local execution to process files. If one wants to process the files via HTCondor (tested on LXPLUS), the ``--apptainer`` flag is to be used. It uses a Docker image of the HiggsDNA master branch in conjunction with HTCondor to facilitate the work.
Data processing locally
--------------------------
To process the data locally, a few things need to be specified. First, we need the absolute input path (``--input``), which leads to your output of ``run_analysis.py`` (unmerged parquet files). The output folder in which the merged parquet files are stored needs to be specified with ``--output``. If one wants to categorize the files, the ``--cats`` keyword is used in conjunction with ``--catDict``, which points to the ``category.json`` to be considered. If systematics are desired, they have to be activated with ``--syst``.
In order to merge the parquet files according to the categories and produce the ROOT files in the same step, the following command is to be used:
.. code-block:: bash
python prepare_output_file.py --input /absolute/input/path --cats --catDict /absolute/path/to/cat_data.json --varDict /absolute/path/to/varDict_data.json --syst --merge --root --output /absolute/output/path
When using the condor way, one has to pay attention when processing data: an additional step with respect to the local way is required, and the merge and ROOT-production steps have to be separated:
Data processing with Docker
---------------------------
The first step is to merge the data parquet files according to the chosen categories. Since the data come in so-called eras (era ``A``, era ``B``, etc.), they have to be merged such that we have one parquet file per era and category. This is the purpose of the following command, which has to be executed first:
.. code-block:: bash
python prepare_output_file.py --input /absolute/input/path --cats --catDict /absolute/path/to/cat_data.json --varDict /absolute/path/to/varDict_data.json --syst --merge --output /absolute/output/path --apptainer
Studies in the past showed that for 2022 data there is not much of a difference significance-wise between splitting the ``preEE`` and ``postEE`` datasets (referring to the ECAL endcap water leak in 2022) and merging them. For this reason, they were merged into one big dataset for HIG-23-014. The following command merges the era datasets into an ``allData.parquet`` file according to the categories. In addition, one needs the flag ``--merge-data-only``:
.. code-block:: bash
python prepare_output_file.py --input /absolute/input/path --cats --catDict /absolute/path/to/cat_data.json --varDict /absolute/path/to/varDict_data.json --syst --merge --output /absolute/output/path --merge-data-only --apptainer
Finally, we convert the parquet files to ROOT:
.. code-block:: bash
python prepare_output_file.py --input /absolute/input_path/to_folder_with_merged --cats --catDict /absolute/path/to/cat_data.json --varDict /absolute/path/to/varDict_data.json --syst --root --output /absolute/input_path/to_folder_with_merged --apptainer
Whenever the parquet files are merged (after the first step), a folder ``merged`` is created in ``/absolute/output/path``. To get the ROOT files, one has to use ``/absolute/output/path`` (which now contains the ``merged`` subfolders) as the new input folder. The processing of MC samples works in a similar way:
MC processing with Docker
-------------------------
Similar to data, the MC samples can be processed with HTCondor. Here we only have two steps. The first consists of merging the parquet files according to the categories just like in the data case:
.. code-block:: bash
python prepare_output_file.py --input /absolute/input/path --cats --catDict /absolute/path/to/cat_mc.json --varDict /absolute/path/to/varDict_mc.json --syst --merge --output /absolute/output/path --apptainer
In order to convert the parquet files to ROOT, one executes:
.. code-block:: bash
python prepare_output_file.py --input /absolute/input_path/to_folder_with_merged --cats --catDict /absolute/path/to/cat_mc.json --varDict /absolute/path/to/varDict_mc.json --syst --root --output /absolute/input_path/to_folder_with_merged --apptainer
One can specify a separate path hosting all the ``.sub`` and ``.sh`` files with ``--condor-logs``. If the condor log, err, and out files are desired (e.g. for debugging purposes), they can be explicitly produced with ``--make-condor-logs``.
A valid command would for example be:
.. code-block:: bash
python prepare_output_file.py --input /absolute/input/path --cats --catDict /absolute/path/to/cat_mc.json --varDict /absolute/path/to/varDict_mc.json --syst --merge --output /absolute/output/path --condor-logs /absolute/path/to/condor/logs --make-condor-logs --apptainer
Workspace step
--------------
During this step the main script uses ``Flashgg_FinalFit`` multiple times: it moves to the directory defined in the ``--final_fit`` option (improvable) and uses the ``Tree2WS`` script there on the content of the ``root`` directory previously created. The output is stored in ``out_dir/root/sample_name/ws/``.
During this step the main script uses ``Flashgg_FinalFit`` multiple times: it moves to the directory defined in the ``--final-fit`` option (improvable) and uses the ``Tree2WS`` script there on the content of the ``root`` directory previously created. The output is stored in ``out_dir/root/sample_name/ws/``.
Commands
--------
......@@ -72,34 +134,42 @@ The workflow is meant to be run in one go using the ``prepare_output_file.py`` s
To run everything starting from the output of HiggsDNA with categories and systematic variations one can use::
python3 prepare_output_file.py --input [path to output dir] --merge --root --ws --syst --cats --args "--do_syst"
python3 prepare_output_file.py --input [path to output dir] --merge --root --ws --syst --cats --args "--do-syst"
and everything should run smoothly (these scripts have not been exercised in a while, so some details may have to be adjusted in this document).
Some options can be removed. If you want to use ``--syst`` and ``--root`` you should also add ``--args "--do_syst"``.
Some options can be removed. If you want to use ``--syst`` and ``--root`` you should also add ``--args "--do-syst"``.
The complete list of options for the main file is here:
* ``--merge``, "Do merging of the .parquet files"
* ``--root``, "Do root conversion step"
* ``--ws``, "Do root to workspace conversion step"
* ``--ws_config``, "configuration file for Tree2WS, as it is now it must be stored in Tree2WS directory in FinalFit",
* ``--final_fit``, "FlashggFinalFit path" # the default is just for me, it should be changed but I don't see a way to make this generally valid
* ``--ws-config``, "configuration file for Tree2WS, as it is now it must be stored in Tree2WS directory in FinalFit",
* ``--final-fit``, "FlashggFinalFit path" # the default is just for me, it should be changed but I don't see a way to make this generally valid
* ``--syst``, "Do systematics variation treatment"
* ``--cats``, "Split into categories",
* ``--args``, "additional options for root converter: --do_syst, --notag",
* ``--args``, "additional options for root converter: --do-syst, --notag",
* ``--skip-normalisation``, "Independent of file type, skip normalisation step",
* ``--verbose``, "verbose level for the logger: INFO (default), DEBUG",
* ``--output``, "Output path for the merged and ROOT files.",
* ``--folder-structure``, "Uses the given folder structure for the dirlist. Mainly used for debug purposes.",
* ``--apptainer``, "Run HTCondor with Docker image of HiggsDNA's current master branch.",
* ``--merge-data-only``, "Flag for merging data to an allData file. Only used when --condor is used, and only when we process data.",
* ``--make-condor-logs``, "Create condor log files.",
* ``--condor-logs``, "Output path of the Condor log files.",
The merging step can also be run separately using::
python3 merge_parquet.py --source [path to the directory containing .paruets] --target [target directory path] --cats [cat_dict]
python3 merge_parquet.py --source [path to the directory containing .parquets] --target [target directory path] --cats [cat_dict]
The script also works without the ``--cats`` option; it creates a dummy selection of ``Pt > -1`` and calls the category ``UNTAGGED``.
Same for the root step::
python3 convert_parquet_to_root.py [/path/to/merged.parquet] [path to output file containing also the filename] mc (or data depending what you're doing) --process [process name (should match one of the outfiles dict entries)] --do_syst --cats [cat_dict] --vars [variation.json]
python3 convert_parquet_to_root.py [/path/to/merged.parquet] [path to output file containing also the filename] mc (or data depending what you're doing) --process [process name (should match one of the outfiles dict entries)] --do-syst --cats [cat_dict] --vars [variation.json]
``--do-syst`` is not mandatory, but if it is given, the dictionary containing the variations must also be specified with the ``--var`` option. As before, the script also works without the ``--cats`` option.
``--do_syst`` is not mandatory, but if it is given, the dictionary containing the variations must also be specified with the ``--var`` option. As before, the script also works without the ``--cats`` option.
......@@ -27,7 +27,7 @@ We save this file as ``runnerJSON.json`` (for example in the root directory of H
{
"samplejson": "<path_to_sampleJSON.json>",
"workflow": "dystudies",
"workflow": "base",
"metaconditions": "Era2017_legacy_xgb_v1",
"taggers": [],
"systematics": {
......@@ -38,8 +38,6 @@ We save this file as ``runnerJSON.json`` (for example in the root directory of H
}
}
For unknown reasons, the workflow has to be specified with the key `dystudies`. This nomenclature might be changed in the future.
Finally, we have the runner command:
.. code-block:: bash
......
BTagging workflow
=================
The BTagging workflow is a slightly changed base workflow. The main difference is that the BTagging workflow includes derivation of BTagging efficiencies for a given analysis as recommended by [BTV](https://btv-wiki.docs.cern.ch/PerformanceCalibration/fixedWPSFRecommendations/#b-tagging-efficiencies-in-simulation). The processor that implements the main operations performed can be found at ``higgs_dna/workflows/btagging:BTaggingProcessor``.
To use b-jet related variables and weights, one has to adhere to the following procedures:
- Variables
* Execute first `pull_files.py --target bTag`
* Choose an appropriate btagging MVA: `deepJet` (all NanoAOD versions \>= v11), `particleNet` (NanoAOD \>= v12) and `robustParticleTransformer` (NanoAOD \>= v12)
* Set it in the processor variable `self.bjet_mva`
* Choose the Working Point: `L` (Loose), `M` (Medium), `T` (Tight), `XT` (extra Tight) or `XXT` (extra extra Tight)
* Set it in the processor variable `self.bjet_wp` (see the sketch after this list)
- Weights
* Execute first `pull_files.py --target bTag`
* Since the btagging efficiency weights have to be computed **per analysis**, we have to produce them first with the `BTagging` processor
* An example `BTagging` processor can be found in `./higgs_dna/workflows`
* Important: You have to apply your selections **before** the indicated MANDATORY PART. The latter must not be changed.
* In your `runner.json`, select the workflow `BTagging` and run the processor with no systematics (they are not necessary for the `BTagging` processor) over all the samples of your analysis
* Pickle `.pkl` files are produced that contain the btagging efficiencies binned as per recommendation by [BTV](https://btv-wiki.docs.cern.ch/PerformanceCalibration/fixedWPSFRecommendations/)
* pT in `[20, 30, 50, 70, 100, 140, 200, 300, 600, 1000]`
* abs(eta): No Binning
* hadronFlavour: `[0, 4, 5]`
* Once `.pkl` files are produced, generate the correctionlib file by calling `btagging_eff.py --input path/to/BTagging/output --output-name analysis-name` , where `analysis-name` is the name the correctionlib will be saved as (per default in `./higgs_dna/systematics/JSONs/bTagEff/year/analysis-name.json.gz` )
* When the correctionlib is produced and stored in `./higgs_dna/systematics/JSONs/bTagEff/year` , we have to add an additional string to the dictionary in `runner.json`: `"bTagEffFileName": "analysis-name.json.gz"` (**Important**: Only state the name of the file. Do not add the path, as HiggsDNA looks for it in `./higgs_dna/systematics/JSONs/bTagEff/` by itself!)
* Now, you can add the desired corrections and systematics to your `runner.json` and launch the production of your samples
* Currently only `bTagFixedWP_PNetTight` (MVA: ParticleNet, Tight WP) is implemented. But feel free to add more with the new function for fixed btagging working points `bTagFixedWP` found in `higgs_dna/systematics/event_weight_systematics.py`
* If a systematic or correction from `bTagFixedWP` is used, the weights `weight` and `weight_central` do **not** contain the btagging scale factor weight. Instead, it is added separately as `weight_bTagFixedWP` to be used later.
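As a minimal sketch of the MVA and working-point choice mentioned in the list above (whether you edit the processor in place or subclass it is up to you; the class below is hypothetical):

.. code-block:: python

    from higgs_dna.workflows.btagging import BTaggingProcessor

    class MyBTaggingProcessor(BTaggingProcessor):
        def __init__(self, *args, **kwargs):
            super().__init__(*args, **kwargs)
            self.bjet_mva = "particleNet"  # or "deepJet", "robustParticleTransformer"
            self.bjet_wp = "T"             # one of L, M, T, XT, XXT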
An example JSON for the `BTaggingProcessor` could look like this:
.. code-block:: json
{
"samplejson": "./samples.json",
"workflow": "BTagging",
"metaconditions": "Era2022_v1",
"year": {
"Channel_postEE": ["2022postEE"],
"Channel_preEE": ["2022preEE"]
},
"corrections": {
"Channel_postEE": ["Pileup", "Et_dependent_Smearing", "energyErrShift"],
"Channel_preEE": ["Pileup", "Et_dependent_Smearing", "energyErrShift"]
},
"systematics": {
"Channel_postEE": [],
"Channel_preEE": []
}
}
Then, after the production of the `.pkl` files, the final `runner.json` for your analysis involving the Particle Net MVA with a tight working point could look like this:
.. code-block:: json
{
"samplejson": "./samples.json",
"workflow": "base",
"metaconditions": "Validation_Plots_22_23",
"bTagEffFileName": "DY_PNetT_Base.json.gz",
"year": {
"Channel_postEE": ["2022postEE"],
"Channel_preEE": ["2022preEE"]
},
"corrections": {
"Channel_postEE": ["bTagFixedWP_PNetTight", "jerc_jet_syst", "Pileup", "Et_dependent_Smearing", "energyErrShift"],
"Channel_preEE": ["bTagFixedWP_PNetTight", "jerc_jet_syst", "Pileup", "Et_dependent_Smearing", "energyErrShift"]
},
"systematics": {
"Channel_postEE": ["bTagFixedWP_PNetTight", "Pileup", "Et_dependent_ScaleEB", "Et_dependent_ScaleEE", "Et_dependent_Smearing", "energyErrShift"],
"Channel_preEE": ["bTagFixedWP_PNetTight", "Pileup", "Et_dependent_ScaleEB", "Et_dependent_ScaleEE", "Et_dependent_Smearing", "energyErrShift"]
}
}
Right now, the following btagging corrections and systematics are implemented:
- Particle Net
- `bTagFixedWP_PNetLoose` (Loose WP)
- `bTagFixedWP_PNetMedium` (Medium WP)
- `bTagFixedWP_PNetTight` (Tight WP)
- `bTagFixedWP_PNetExtraTight` (Extra Tight WP)
- `bTagFixedWP_PNetExtraExtraTight` (Extra Extra Tight WP)
- Deep Jet
- `bTagFixedWP_deepJetLoose` (Loose WP)
- `bTagFixedWP_deepJetMedium` (Medium WP)
- `bTagFixedWP_deepJetTight` (Tight WP)
- `bTagFixedWP_deepJetExtraTight` (Extra Tight WP)
- `bTagFixedWP_deepJetExtraExtraTight` (Extra Extra Tight WP)
- Robust Particle Transformer
- `bTagFixedWP_robustParticleTransformerLoose` (Loose WP)
- `bTagFixedWP_robustParticleTransformerMedium` (Medium WP)
- `bTagFixedWP_robustParticleTransformerTight` (Tight WP)
- `bTagFixedWP_robustParticleTransformerExtraTight` (Extra Tight WP)
- `bTagFixedWP_robustParticleTransformerExtraExtraTight` (Extra Extra Tight WP)
IMPORTANT: When the systematics are applied, the weights `weight` and `weight_central` do **not** contain the btagging scale factor weight. Instead, it is added separately as `weight_bTagFixedWP` to be used later.
......@@ -40,7 +40,7 @@ An example to do ``JEC`` and ``JER`` with considering their systematics is provi
{
"samplejson": "samples_nanov12_EE_v0.json",
"workflow": "dystudies",
"workflow": "base",
"metaconditions": "Era2022_v1",
"taggers": [],
"year": {
......@@ -114,7 +114,7 @@ Now, for Run3, only ``total`` uncertainty or ``full splitted JEC systematics`` a
{
"samplejson": "samples_nanov12_dy_v0.json",
"workflow": "dystudies",
"workflow": "base",
"metaconditions": "Era2022_v1",
"taggers": [],
"year": {
......@@ -215,7 +215,7 @@ The example json configuration is
{
"samplejson": "samples_nanov9_v0.json",
"workflow": "dystudies",
"workflow": "base",
"metaconditions": "Era2018_legacy_v1",
"taggers": [],
"year": {
......
......@@ -3,20 +3,23 @@ channels:
- conda-forge
- pytorch
dependencies:
- python>=3.6
- python>=3.8
- setuptools<71
- ipython
- coffea<2023
- awkward<2
- correctionlib<=2.5.0
- pyaml
- xrootd # use 5.6.4 if you experience problems, recommended as of March 2024
- numba
- pyarrow
- pyarrow>=11.0.0
- pandas
- rich
- matplotlib
- vector
- vector<=1.4.3
- conda-pack
- tqdm
- xgboost==1.5.1
- xgboost
- dask
- distributed
- dask-jobqueue
......@@ -31,5 +34,7 @@ dependencies:
- pandoc
- pytorch
- torchvision
- pip:
- zuko==1.0.1
- requests
- onnx
- onnxruntime
- zuko
Source diff could not be displayed: it is stored in LFS.