From 8adb99dd0b4ea58761e3f948ac0746dcc7f0dc6b Mon Sep 17 00:00:00 2001
From: Matteo Marchegiani <matmarcheg@gmail.com>
Date: Tue, 4 Apr 2023 18:32:13 +0200
Subject: [PATCH 01/22] Pass trigger dict instead of finalstate key
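
Cut functions now receive the trigger configuration directly as a
dictionary, instead of looking it up in the global `triggers` parameter
by finalstate key. A sketch of the new call, with an illustrative
trigger dictionary:

    trigger_dict = {"2018": {"SingleMuon": ["IsoMu24"]}}
    cut = get_HLTsel("dimuon", trigger_dict, primaryDatasets=["SingleMuon"])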

---
 pocket_coffea/lib/cut_functions.py | 6 +++---
 pocket_coffea/lib/triggers.py      | 6 ++----
 2 files changed, 5 insertions(+), 7 deletions(-)

diff --git a/pocket_coffea/lib/cut_functions.py b/pocket_coffea/lib/cut_functions.py
index cad82693..aa2a3e28 100644
--- a/pocket_coffea/lib/cut_functions.py
+++ b/pocket_coffea/lib/cut_functions.py
@@ -18,11 +18,11 @@ def _get_trigger_mask_proxy(events, params, year, isMC, **kwargs):
     Helper function to call the HLT trigger mask
     '''
     return get_trigger_mask(
-        events, params["key"], year, isMC, params["primaryDatasets"], params["invert"]
+        events, params["trigger_dict"], year, isMC, params["primaryDatasets"], params["invert"]
     )
 
 
-def get_HLTsel(key, primaryDatasets=None, invert=False):
+def get_HLTsel(key, trigger_dict, primaryDatasets=None, invert=False):
     '''Create the HLT trigger mask
 
     The Cut function reads the triggers configuration and create the mask.
@@ -47,7 +47,7 @@ def get_HLTsel(key, primaryDatasets=None, invert=False):
         name += "_NOT"
     return Cut(
         name=name,
-        params={"key": key, "primaryDatasets": primaryDatasets, "invert": invert},
+        params={"trigger_dict": trigger_dict, "primaryDatasets": primaryDatasets, "invert": invert},
         function=_get_trigger_mask_proxy,
     )
 
diff --git a/pocket_coffea/lib/triggers.py b/pocket_coffea/lib/triggers.py
index 3a5e4d68..7d004d5c 100644
--- a/pocket_coffea/lib/triggers.py
+++ b/pocket_coffea/lib/triggers.py
@@ -3,7 +3,7 @@ import awkward as ak
 from ..parameters.triggers import triggers
 
 
-def get_trigger_mask(events, key, year, isMC, primaryDatasets=None, invert=False):
+def get_trigger_mask(events, trigger_dict, year, isMC, primaryDatasets=None, invert=False):
     '''Computes the HLT trigger mask
 
     The function reads the triggers configuration and create the mask.
@@ -21,9 +21,7 @@ def get_trigger_mask(events, key, year, isMC, primaryDatasets=None, invert=False
     :param invert: Invert the mask, returning which events do not path ANY of the triggers
     :returns: the events mask.
     '''
-    if key not in triggers:
-        raise Exception("Requested trigger config not found!")
-    cfg = triggers[key][year]
+    cfg = trigger_dict[year]
     # If is MC
     triggers_to_apply = []
     if primaryDatasets:
-- 
GitLab


From f395b44dc3092e41fffb37c7c52a031ac072ee6d Mon Sep 17 00:00:00 2001
From: Matteo Marchegiani <matmarcheg@gmail.com>
Date: Tue, 4 Apr 2023 18:32:40 +0200
Subject: [PATCH 02/22] Custom object preselection

---
 .../parameters/object_preselection.py         | 21 +++++++++++++++++++
 1 file changed, 21 insertions(+)

diff --git a/pocket_coffea/parameters/object_preselection.py b/pocket_coffea/parameters/object_preselection.py
index e93c0453..e7247f51 100644
--- a/pocket_coffea/parameters/object_preselection.py
+++ b/pocket_coffea/parameters/object_preselection.py
@@ -1,6 +1,27 @@
 # Common cuts applied to objects
 
 object_preselection = {
+    "dimuon": {
+        "Muon": {
+            "pt": 15,
+            "eta": 2.4,
+            "iso": 0.25, #PFIsoLoose
+            "id": "tightId",
+        },
+        "Electron": {
+            "pt": 15,
+            "eta": 2.4,
+            "iso": 0.06,
+            "id": "mvaFall17V2Iso_WP80",
+        },
+        "Jet": {
+            "dr": 0.4,
+            "pt": 30,
+            "eta": 2.4,
+            "jetId": 2,
+            "puId": {"wp": "L", "value": 4, "maxpt": 50.0},
+        },
+    },
     "dilepton": {
         "Muon": {
             "pt": 15,
-- 
GitLab


From a45f0b298fd32c0cb77de078d1164ba0df48b612 Mon Sep 17 00:00:00 2001
From: Matteo Marchegiani <matmarcheg@gmail.com>
Date: Tue, 4 Apr 2023 18:33:08 +0200
Subject: [PATCH 03/22] Ignore subsamples

---
 pocket_coffea/workflows/genweights.py | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/pocket_coffea/workflows/genweights.py b/pocket_coffea/workflows/genweights.py
index 1a404377..4137b8d4 100644
--- a/pocket_coffea/workflows/genweights.py
+++ b/pocket_coffea/workflows/genweights.py
@@ -24,10 +24,10 @@ class genWeightsProcessor(BaseProcessorABC):
             self._xsec = self.events.metadata["xsec"]
 
         # Check if the user specified any subsamples without performing any operation
-        if self._sample in self._subsamplesCfg:
-            self._hasSubsamples = True
-        else:
-            self._hasSubsamples = False
+        #if self._sample in self._subsamples:
+        #    self._hasSubsamples = True
+        #else:
+        #    self._hasSubsamples = False
 
     def apply_object_preselection(self, variation):
         pass
@@ -45,7 +45,7 @@ class genWeightsProcessor(BaseProcessorABC):
         self.output['cutflow']['initial'][self._dataset] += self.nEvents_initial
         if self._isMC:
             self.output['sum_genweights'][self._dataset] = ak.sum(self.events.genWeight)
-            if self._hasSubsamples:
-                raise Exception("This processor cannot compute the sum of genweights of subsamples.")
+            #if self._hasSubsamples:
+            #    raise Exception("This processor cannot compute the sum of genweights of subsamples.")
 
         return self.output
-- 
GitLab


From 7affb6064b44c6d230cc9664173907767330a7f4 Mon Sep 17 00:00:00 2001
From: Matteo Marchegiani <matmarcheg@gmail.com>
Date: Tue, 4 Apr 2023 19:11:12 +0200
Subject: [PATCH 04/22] Tweak plotting script defaults

---
 pocket_coffea/utils/plot_utils.py | 10 +++++++---
 scripts/plot/make_plots.py        |  4 ++--
 2 files changed, 9 insertions(+), 5 deletions(-)

diff --git a/pocket_coffea/utils/plot_utils.py b/pocket_coffea/utils/plot_utils.py
index de39bc9c..3af4092a 100644
--- a/pocket_coffea/utils/plot_utils.py
+++ b/pocket_coffea/utils/plot_utils.py
@@ -51,9 +51,13 @@ class PlotManager:
         for name, h_dict in hist_cfg.items():
             self.shape_objects[name] = Shape(h_dict, name, plot_dir, only_cat=self.only_cat, style_cfg=style_cfg, data_key=self.data_key, log=self.log, density=self.density)
 
-    def plot_datamc_all(self, ratio=True, syst=True, spliteras=False):
+    def plot_datamc_all(self, syst=True, spliteras=False):
         '''Plots all the histograms contained in the dictionary, for all years and categories.'''
         for name, datamc in self.shape_objects.items():
+            ratio = not (datamc.is_mc_only or datamc.is_data_only)
             datamc.plot_datamc_all(ratio, syst, spliteras, save=self.save)
 
 
@@ -282,7 +286,7 @@ class Shape:
         '''Formats the figure's axes, labels, ticks, xlim and ylim.'''
         ylabel = "Counts" if not self.density else "A.U."
         self.ax.set_ylabel(ylabel, fontsize=self.style.fontsize)
-        self.ax.legend(fontsize=self.style.fontsize, ncols=2, loc="upper right")
+        self.ax.legend(fontsize=self.style.fontsize, ncol=2, loc="upper right")
         self.ax.tick_params(axis='x', labelsize=self.style.fontsize)
         self.ax.tick_params(axis='y', labelsize=self.style.fontsize)
         self.ax.set_xlim(self.xedges[0], self.xedges[-1])
@@ -324,7 +328,7 @@ class Shape:
                 handles_new.append(handles[i])
             labels = labels_new
             handles = handles_new
-            self.ax.legend(handles, labels, fontsize=self.style.fontsize, ncols=2, loc="upper right")
+            self.ax.legend(handles, labels, fontsize=self.style.fontsize, ncol=2, loc="upper right")
 
     def plot_mc(self, ax=None):
         '''Plots the MC histograms as a stacked plot.'''
diff --git a/scripts/plot/make_plots.py b/scripts/plot/make_plots.py
index 3b565783..61f803db 100644
--- a/scripts/plot/make_plots.py
+++ b/scripts/plot/make_plots.py
@@ -33,7 +33,7 @@ parser.add_argument('--plot_dir', default=None, help='Sub-directory inside the p
 parser.add_argument('-v', '--version', type=str, default=None, help='Version of output (e.g. `v01`, `v02`, etc.)')
 parser.add_argument('-j', '--workers', type=int, default=8, help='Number of parallel workers to use for plotting')
 parser.add_argument('-o', '--only', type=str, default='', help='Filter histograms name with string', required=False)
-parser.add_argument('-oc', '--only_cat', type=str, nargs="+", help='Filter categories with string', required=False)
+parser.add_argument('-oc', '--only_cat', type=str, default=[''], nargs="+", help='Filter categories with string', required=False)
 parser.add_argument('-os', '--only_syst', type=str, nargs="+", default='', help='Filter systematics with a list of strings', required=False)
 parser.add_argument('--split_systematics', action='store_true', help='Split systematic uncertainties in the ratio plot')
 parser.add_argument('--partial_unc_band', action='store_true', help='Plot only the partial uncertainty band corresponding to the systematics specified as the argument `only_syst`')
@@ -77,7 +77,7 @@ def make_plots(entrystart, entrystop):
         density=args.density,
         save=True
     )
-    plotter.plot_datamc_all(ratio=True, syst=True, spliteras=False)
+    plotter.plot_datamc_all(syst=True, spliteras=False)
 
 # Filter dictionary of histograms with `args.only`
 accumulator['variables'] = { k : v for k,v in accumulator['variables'].items() if args.only in k }
-- 
GitLab


From c8ded014f516fe4196ee72deec32406189eb629d Mon Sep 17 00:00:00 2001
From: mmarchegiani <matmarcheg@gmail.com>
Date: Wed, 5 Apr 2023 12:07:32 +0200
Subject: [PATCH 05/22] Fix argument exclude in make_plots

---
 scripts/plot/make_plots.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/scripts/plot/make_plots.py b/scripts/plot/make_plots.py
index e9804be7..aa3362b1 100644
--- a/scripts/plot/make_plots.py
+++ b/scripts/plot/make_plots.py
@@ -35,7 +35,7 @@ parser.add_argument('-j', '--workers', type=int, default=8, help='Number of para
 parser.add_argument('-o', '--only', type=str, default='', help='Filter histograms name with string', required=False)
 parser.add_argument('-oc', '--only_cat', type=str, default=[''], nargs="+", help='Filter categories with string', required=False)
 parser.add_argument('-os', '--only_syst', type=str, nargs="+", default='', help='Filter systematics with a list of strings', required=False)
-parser.add_argument('-e', '--exclude', type=str, default='', help='Exclude categories with string', required=False)
+parser.add_argument('-e', '--exclude', type=str, default=None, help='Exclude categories with string', required=False)
 parser.add_argument('--split_systematics', action='store_true', help='Split systematic uncertainties in the ratio plot')
 parser.add_argument('--partial_unc_band', action='store_true', help='Plot only the partial uncertainty band corresponding to the systematics specified as the argument `only_syst`')
 parser.add_argument('--overwrite', action='store_true', help='Overwrite plots in output folder')
@@ -82,7 +82,8 @@ def make_plots(entrystart, entrystop):
 
 # Filter dictionary of histograms with `args.only`
 accumulator['variables'] = { k : v for k,v in accumulator['variables'].items() if args.only in k }
-accumulator['variables'] = { k : v for k,v in accumulator['variables'].items() if not args.exclude in k }
+if args.exclude:
+    accumulator['variables'] = { k : v for k,v in accumulator['variables'].items() if args.exclude not in k }
 HistsToPlot = list(accumulator['variables'].keys())
 
 NtotHists = len(HistsToPlot)
-- 
GitLab


From e6fd8b72fc82cf07811f00073c62abb80b3d1fa7 Mon Sep 17 00:00:00 2001
From: mmarchegiani <matmarcheg@gmail.com>
Date: Wed, 5 Apr 2023 15:35:40 +0200
Subject: [PATCH 06/22] Documentation of full analysis example

---
 docs/analysis_example.rst | 58 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 58 insertions(+)
 create mode 100644 docs/analysis_example.rst

diff --git a/docs/analysis_example.rst b/docs/analysis_example.rst
new file mode 100644
index 00000000..54cf1300
--- /dev/null
+++ b/docs/analysis_example.rst
@@ -0,0 +1,58 @@
+Full analysis example
+########################
+
+A full example of all the steps needed to run a full analysis with PocketCoffea is reported, starting from the creation of the dataset, the customization of parameters and the production of the final shapes and plots.
+As an example, a simplified version of the Drell-Yan analysis targeting the Z->mumu channel is implemented.
+The main steps that need to be performed are the following:
+
+* Build the json datasets
+* Compute the sum of genweights for Monte Carlo datasets and rebuild json datasets
+* Define selections: trigger, skim, object preselection, event preselection and categorization
+* Define weights and variations
+* Define histograms
+* Run the processor
+* Make the plots from processor output
+
+Configuration file
+================
+
+The parameters specific to the analysis have to be specified in the configuration file. This file contains a pure python dictionary named `cfg` that is read by the `Configurator` module.
+
+Build dataset
+================
+
+The datasets include a Drell-Yan Monte Carlo dataset and a `SingleMuon` dataset. We have to look for the DAS key of the 
+The list of datasets has to be written in a structured dictionary together with the corresponding metadata in a json file. 
+Follow the instructions in the Build dataset example.
+
+Compute the sum of genweights
+================
+
+The sum of the genweights of Monte Carlo samples needs to be computed in order to normalize Monte Carlo datasets
+To compute the sum of genweights, we need to run a dedicated Coffea processor, `genWeightsProcessor`, that just opens all the files, reads the genweight
+
+Define selections
+================
+
+The selections are performed at two levels:
+* Object preselection: selecting the "good" objects that will be used in the final analysis (e.g. `JetGood`, `MuonGood`, `ElectronGood`...). These selections include the detector acceptance cuts, the object identification working points, the muon isolation, the b-tagging working point, etc.
+* Event selection: selections on the events that enter the final analysis, done in three steps:
+   * Skim: loose cut on the events. The following steps of the analysis are performed only on the events passing the skim selection.
+   * Preselection: baseline selection for the analysis.
+   * Categorization: selection to split the events passing the event preselection into different categories.
+
+
+Define weights and variations
+================
+
+
+Define histograms
+================
+
+
+Run the processor
+================
+
+
+Produce plots
+================
-- 
GitLab


From 6d027d996744e543f7cd240be6482d42501794be Mon Sep 17 00:00:00 2001
From: Matteo Marchegiani <matmarcheg@gmail.com>
Date: Wed, 5 Apr 2023 23:47:04 +0200
Subject: [PATCH 07/22] Nicer defaults in config/base.py

---
 config/base.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/config/base.py b/config/base.py
index 69da8ade..e8a27747 100644
--- a/config/base.py
+++ b/config/base.py
@@ -9,7 +9,7 @@ import numpy as np
 
 cfg =  {
     "dataset" : {
-        "jsons": ["datasets/backgrounds_MC_ttbar_2018.json",
+        "jsons": ["datasets/backgrounds_MC_ttbar.json",
                     ],
         "filter" : {
             "samples": ["TTToSemiLeptonic"],
@@ -27,14 +27,15 @@ cfg =  {
     "run_options" : {
         "executor"       : "dask/lxplus",
         "workers"        : 1,
-        "scaleout"       : 120,
+        "scaleout"       : 20,
         "queue"          : "microcentury",
         "walltime"       : "00:40:00",
         "mem_per_worker" : "4GB", # GB
+        "disk_per_worker" : "1GB", # GB
         "exclusive"      : False,
         "chunk"          : 200000,
         "retries"        : 50,
-        "treereduction"  : None,
+        "treereduction"  : 20,
         "max"            : None,
         "skipbadfiles"   : None,
         "voms"           : None,
-- 
GitLab


From e6e689dc0cb45c3785cfee37e0dd8cbefe4eafe8 Mon Sep 17 00:00:00 2001
From: Matteo Marchegiani <matmarcheg@gmail.com>
Date: Wed, 5 Apr 2023 23:47:32 +0200
Subject: [PATCH 08/22] Improved runner.py

---
 scripts/runner.py | 21 ++++++++++-----------
 1 file changed, 10 insertions(+), 11 deletions(-)

diff --git a/scripts/runner.py b/scripts/runner.py
index 879a0340..69353cd3 100644
--- a/scripts/runner.py
+++ b/scripts/runner.py
@@ -10,18 +10,13 @@ print("""
 
 import os
 import sys
-import json
 import argparse
-import time
 import pickle
 import socket
-import logging 
+import logging
 
-import uproot
-from coffea.nanoevents import NanoEventsFactory
-from coffea.util import load, save
+from coffea.util import save
 from coffea import processor
-from pprint import pprint
 
 from pocket_coffea.utils.configurator import Configurator
 from pocket_coffea.utils.network import get_proxy_path
@@ -52,7 +47,7 @@ if __name__ == '__main__':
                         log_line_template="%(color_on)s[%(levelname)-8s] %(message)s%(color_off)s")):
         print("Failed to setup logging, aborting.")
         exit(1) 
-    
+
     if args.cfg[-3:] == ".py":
         config = Configurator(args.cfg, overwrite_output_dir=args.outputdir)
     elif args.cfg[-4:] == ".pkl":
@@ -90,12 +85,15 @@ if __name__ == '__main__':
         os.system(f'cp {_x509_localpath} {_x509_path}')
         
     if (run_env:=config.run_options.get("env", "singularity")) == "singularity":
         env_extra = [
             'export XRD_RUNFORKHANDLER=1',
             f'export X509_USER_PROXY={_x509_path}',
             # f'export X509_CERT_DIR={os.environ["X509_CERT_DIR"]}',
             f'source {sys.prefix}/bin/activate',
         ]
+        if "configs_path" in config.run_options:
+            env_extra += f'export PYTHONPATH={_configs_path}'
     elif run_env == "conda":
         env_extra = [
             'export XRD_RUNFORKHANDLER=1',
@@ -225,8 +223,9 @@ if __name__ == '__main__':
                                         },
                                         chunksize=config.run_options['chunk'], maxchunks=config.run_options['max']
                                         )
-            save(output, config.outfile)
-            print(f"Saving output to {config.outfile}")
+            outfile = config.outfile.replace("{dataset}","all")
+            save(output, outfile)
+            print(f"Saving output to {outfile}")
 
 
     # DASK runners
@@ -274,7 +273,7 @@ if __name__ == '__main__':
             cluster = CernCluster(
                 cores=1,
                 memory=config.run_options['mem_per_worker'],
-                disk=config.run_options.get('disk_per_worker', "20GB"),
+                disk=config.run_options.get('disk_per_worker', "1GB"),
                 image_type="singularity",
                 worker_image="/cvmfs/unpacked.cern.ch/gitlab-registry.cern.ch/batch-team/dask-lxplus/lxdask-cc7:latest",
                 death_timeout="3600",
-- 
GitLab


From d520c87c4e8e7e88cee1f24814a5ac8165b66c9f Mon Sep 17 00:00:00 2001
From: Matteo Marchegiani <matmarcheg@gmail.com>
Date: Thu, 6 Apr 2023 02:22:44 +0200
Subject: [PATCH 09/22] Clean runner.py

---
 scripts/runner.py | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/scripts/runner.py b/scripts/runner.py
index 69353cd3..6caaf95a 100644
--- a/scripts/runner.py
+++ b/scripts/runner.py
@@ -85,15 +85,12 @@ if __name__ == '__main__':
         os.system(f'cp {_x509_localpath} {_x509_path}')
         
     if (run_env:=config.run_options.get("env", "singularity")) == "singularity":
         env_extra = [
             'export XRD_RUNFORKHANDLER=1',
             f'export X509_USER_PROXY={_x509_path}',
             # f'export X509_CERT_DIR={os.environ["X509_CERT_DIR"]}',
             f'source {sys.prefix}/bin/activate',
         ]
-        if "configs_path" in config.run_options:
-            env_extra += f'export PYTHONPATH={_configs_path}'
     elif run_env == "conda":
         env_extra = [
             'export XRD_RUNFORKHANDLER=1',
-- 
GitLab


From 1319d43e74f42c293a72b59603a3cef3b8f62920 Mon Sep 17 00:00:00 2001
From: Matteo Marchegiani <matmarcheg@gmail.com>
Date: Thu, 6 Apr 2023 02:27:09 +0200
Subject: [PATCH 10/22] Update documentation

---
 docs/analysis_example.rst | 168 +++++++++++++++++++++++++++++++++++---
 1 file changed, 158 insertions(+), 10 deletions(-)

diff --git a/docs/analysis_example.rst b/docs/analysis_example.rst
index 54cf1300..3c52e6b8 100644
--- a/docs/analysis_example.rst
+++ b/docs/analysis_example.rst
@@ -1,7 +1,7 @@
 Full analysis example
 ########################
 
-A full example of all the steps needed to run a full analysis with PocketCoffea is reported, starting from the creation of the dataset, the customization of parameters and the production of the final shapes and plots.
+A full example of all the steps needed to run a full analysis with PocketCoffea is reported, starting from the creation of the datasets list, the customization of parameters and the production of the final shapes and plots.
 As an example, a simplified version of the Drell-Yan analysis targeting the Z->mumu channel is implemented.
 The main steps that need to be performed are the following:
 
@@ -13,23 +13,139 @@ The main steps that need to be performed are the following:
 * Run the processor
 * Make the plots from processor output
 
+Set up PocketCoffea
+================
+
+The PocketCoffea code used in this example refers to a dedicated `branch <https://github.com/mmarchegiani/PocketCoffea/tree/analysis_example/>`_. Check out the ``analysis_example`` branch of PocketCoffea:
+
+.. code-block:: bash
+
+	cd path/to/PocketCoffea
+	git checkout -b analysis_example
+	git pull $REMOTE analysis_example
+
 Configuration file
 ================
 
-The parameters specific to the analysis have to be specified in the configuration file. This file contains a pure python dictionary named `cfg` that is read by the `Configurator` module.
+The parameters specific to the analysis have to be specified in the configuration file. This file contains a pure python dictionary named ``cfg`` that is read and manipulated by the ``Configurator`` module.
+A dedicated `repository <https://github.com/PocketCoffea/AnalysisConfigs>`_ is set up to collect the config files from different analyses. Clone the repository and install it as an editable package in the ``pocket-coffea`` environment:
+
+.. code-block:: bash
+
+	git clone https://github.com/PocketCoffea/AnalysisConfigs
+	cd AnalysisConfigs
+	pip install -e .
+
+The repository contains a pre-existing config file ``configs/base.py`` that can be used as a template to write the custom config file for our analysis.
+Create a dedicated folder ``zmumu`` under the ``configs`` folder. This folder will contain all the config files, the datasets definition json file, the workflow files and possibly extra files with parameters that are needed for the analysis:
+
+.. code-block:: bash
+
+	mkdir configs/zmumu
+	mkdir configs/zmumu/datasets
+	touch configs/zmumu/datasets/datasets_definitions.json
+	cp configs/base.py configs/zmumu/config.py
+
+Now the ``configs/zmumu/config.py`` file is ready to be edited and customized for our analysis.
+
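+A rough skeleton of the ``cfg`` dictionary, with the keys that will be filled in the following sections (a sketch for orientation, not a complete config; ``...`` stands for the content discussed later):
+
+.. code-block:: python
+
+   cfg = {
+       "dataset": { ... },        # input json datasets and filters
+       "finalstate": "dimuon",    # key for the object preselection parameters
+       "skim": [ ... ],           # skim and trigger cuts
+       "preselections": [ ... ],  # baseline event selection
+       "categories": { ... },     # event categories
+       "weights": { ... },        # nominal weights
+       "variations": { ... },     # systematic variations
+       "variables": { ... },      # histogram definitions
+       "run_options": { ... },    # executor settings
+   }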
 
 Build dataset
 ================
 
-The datasets include a Drell-Yan Monte Carlo dataset and a `SingleMuon` dataset. We have to look for the DAS key of the 
-The list of datasets has to be written in a structured dictionary together with the corresponding metadata in a json file. 
-Follow the instructions in the Build dataset example.
+The datasets include a Drell-Yan Monte Carlo dataset and a ``SingleMuon`` dataset. We have to look for the corresponding `DAS <https://cmsweb.cern.ch/das/>`_ keys:
+
+::
+
+	/DYJetsToLL_M-50_TuneCP5_13TeV-amcatnloFXFX-pythia8/RunIISummer20UL18NanoAODv9-106X_upgrade2018_realistic_v16_L1v1-v2/NANOAODSIM
+	/SingleMuon/Run2018C-UL2018_MiniAODv2_NanoAODv9-v2/NANOAOD
+
+The list of datasets has to be written in a structured dictionary together with the corresponding metadata in a json file. This json file is then read by the ``build_dataset.py`` script to produce the actual json datasets that are passed as input to the Coffea processor. The steps are the following:
+
+1. Create a json file that contains the required datasets, ``datasets_definitions.json``.
+Each entry of the dictionary corresponds to a dataset. Datasets include a list of DAS keys, the output json dataset path and the metadata. In addition a label ``sample`` is specified to group datasets under the same sample (e.g. group QCD datasets from different HT bins in a single sample).
+The general idea is the following:
+
+* The **dataset** key uniquely identifies a dataset.
+* The **sample** key is the one used internally in the framework to fill histograms with events from the same sample.
+* The names of the data samples should all start with the same prefix (e.g. ``DATA_``).
+* The **json_output** key defines the path of the destination json file which will store the list of files.
+* Several **files** entries can be defined for each dataset, including a list of DAS names and a dedicated metadata dictionary.
+* The **metadata** keys should include:
+	* For **Monte Carlo**: ``year``, ``isMC`` and ``xsec``.
+	* For **Data**: ``year``, ``isMC``, ``era`` and ``primaryDataset``.
+
+When the json datasets are built, the metadata parameters are linked to the files list, defining a unique dataset entry with the corresponding files.
+The `primaryDataset` key for Data datasets is needed in order to apply a trigger selection only to the corresponding dataset (e.g. apply the `SingleMuon` trigger only to datasets having `primaryDataset=SingleMuon`).
+The structure of the ``datasets_definitions.json`` file after filling in the dictionary with the parameters relevant to our Drell-Yan and SingleMuon datasets should be the following:
+
+.. code-block:: json
+
+   "DYJetsToLL_M-50":{
+        "sample": "DYJetsToLL",
+        "json_output"    : "datasets/DYJetsToLL_M-50.json",
+        "files":[
+            { "das_names": ["/DYJetsToLL_M-50_TuneCP5_13TeV-amcatnloFXFX-pythia8/RunIISummer20UL18NanoAODv9-106X_upgrade2018_realistic_v16_L1v1-v2/NANOAODSIM"],
+              "metadata": {
+                  "year":"2018",
+                  "isMC": true,
+                  "xsec": 6077.22,
+                  }
+            }
+        ]
+    },
+    "DATA_SingleMuon": {
+        "sample": "DATA_SingleMuonC",
+        "json_output": "datasets/DATA_SingleMuonC.json",
+        "files": [
+            
+            {
+                "das_names": [
+                    "/SingleMuon/Run2018C-UL2018_MiniAODv2_NanoAODv9-v2/NANOAOD"
+                ],
+                "metadata": {
+                    "year": "2018",
+                    "isMC": false,
+                    "primaryDataset": "SingleMuon",
+                    "era": "C"
+                },
+                "das_parents_names": [
+                    "/SingleMuon/Run2018C-UL2018_MiniAODv2-v2/MINIAOD"
+                ]
+            }
+        ]
+    }
+    
+2. To produce the json files containing the file lists, run the following command:
+
+.. code-block:: bash
+
+	cd configs/zmumu
+	build_dataset.py --cfg datasets/datasets_definitions.json
+
+Four ``json`` files are produced as output, two for each dataset: one version includes file paths with the prefix of a specific site (the site currently available, e.g. ``dcache-cms-xrootd.desy.de:1094``), while the other has a global redirector prefix (e.g. ``xrootd-cms.infn.it``) and is named with the suffix ``_redirector.json``.
+If one has to rebuild the dataset to include more datasets, the extra argument ``--overwrite`` can be provided to the script.
+
+.. code-block:: bash
+
+	ls configs/zmumu/datasets
+	datasets_definitions.json DATA_SingleMuonC.json DATA_SingleMuonC_redirector.json DYJetsToLL_M-50.json DYJetsToLL_M-50_redirector.json
+
 
 Compute the sum of genweights
 ================
 
-The sum of the genweights of Monte Carlo samples needs to be computed in order to normalize Monte Carlo datasets
-To compute the sum of genweights, we need to run a dedicated Coffea processor, `genWeightsProcessor`, that just opens all the files, reads the genweight
+The sum of the genweights of Monte Carlo samples needs to be computed in order to properly normalize Monte Carlo datasets.
+To compute the sum of genweights, we need to run a dedicated Coffea processor, ``genWeightsProcessor``, that just opens all the files, reads the genweight.
+
+1. Take the `genWeight.py` configuration file, modify the ``samples`` in ``dataset`` dict
+2. Run ``runner.py --cfg configs/$dir/genweights.py`` to get the coffea file contains genweight info. 
+3. Embed the ``sum_genweight`` info to ``sample_defintion.json``  
+
+.. code-block:: bash
+
+	python ../PocketCoffea/scripts/dataset/append_genweights.py --cfg configs/zmumu/datasets/datasets_definitions_zmm.json -i output/genweights/genweights_2018/output_all.coffea  --overwrite
+
+4. Run ``build_dataset.py --cfg dataset_definitions.json`` again to embed the info
+
 
 Define selections
 ================
@@ -48,11 +164,43 @@ Define weights and variations
 
 Define histograms
 ================
-
-
+Histograms are defined in the ``variables`` dictionary in ``config.py``.
+
+- Create a custom histogram by mapping a name to a ``HistConf`` object, passing its ``Axis`` objects as a list (one element for a 1D histogram, two elements for a 2D histogram).
+
+
+.. code-block:: python
+
+   "variables":
+       {
+           # 1D plots
+           # coll: collection/object in the events; field: field of the collection
+           # bins, start, stop: number of bins, axis minimum, axis maximum
+           # label: axis label
+           "mll" : HistConf( [Axis(coll="ll", field="mass", bins=100, start=50, stop=150, label=r"$M_{\ell\ell}$ [GeV]")] ),
+       }
+
+.. _hist: https://github.com/PocketCoffea/PocketCoffea/blob/main/pocket_coffea/parameters/histograms.py
+
+- Predefined histogram factory functions are also available (see `hist`_):
+
+.. code-block:: python
+
+   "variables":
+       {
+           # Jet multiplicity
+           **count_hist(name="nJets", coll="JetGood", bins=8, start=0, stop=8),
+           # Muon kinematics
+           **muon_hists(coll="MuonGood", pos=0),
+           # Jet kinematics
+           **jet_hists(coll="JetGood", pos=0),
+       }
+
+	
 Run the processor
 ================
 
 
+
 Produce plots
-================
+================
\ No newline at end of file
-- 
GitLab


From e4d16250f1636495aa2a236550e699ea9d38c678 Mon Sep 17 00:00:00 2001
From: Matteo Marchegiani <matmarcheg@gmail.com>
Date: Thu, 6 Apr 2023 02:56:34 +0200
Subject: [PATCH 11/22] Documenting sum of genweights

---
 docs/analysis_example.rst | 37 ++++++++++++++++++++++++++-----------
 1 file changed, 26 insertions(+), 11 deletions(-)

diff --git a/docs/analysis_example.rst b/docs/analysis_example.rst
index 3c52e6b8..1053238a 100644
--- a/docs/analysis_example.rst
+++ b/docs/analysis_example.rst
@@ -60,7 +60,7 @@ The datasets include a Drell-Yan Monte Carlo dataset and a ``SingleMuon`` datase
 
 The list of datasets has to be written in a structured dictionary together with the corresponding metadata in a json file. This json file is then read by the ``build_dataset.py`` script to produce the actual json datasets that are passed as input to the Coffea processor. The steps are the following:
 
-1. Create a json file that contains the required datasets, ``datasets_definitions.json``.
+1) Create a json file that contains the required datasets, ``datasets_definitions.json``.
 Each entry of the dictionary corresponds to a dataset. Datasets include a list of DAS keys, the output json dataset path and the metadata. In addition a label ``sample`` is specified to group datasets under the same sample (e.g. group QCD datasets from different HT bins in a single sample).
 The general idea is the following:
 
@@ -96,7 +96,7 @@ The structure of the ``datasets_definitions.json`` file after filling in the dic
         "sample": "DATA_SingleMuonC",
         "json_output": "datasets/DATA_SingleMuonC.json",
         "files": [
-            
+
             {
                 "das_names": [
                     "/SingleMuon/Run2018C-UL2018_MiniAODv2_NanoAODv9-v2/NANOAOD"
@@ -113,8 +113,8 @@ The structure of the ``datasets_definitions.json`` file after filling in the dic
             }
         ]
     }
-    
-2. To produce the json files containing the file lists, run the following command:
+
+2) To produce the json files containing the file lists, run the following command:
 
 .. code-block:: bash
 
@@ -133,18 +133,33 @@ If one has to rebuild the dataset to include more datasets, the extra argument `
 Compute the sum of genweights
 ================
 
-The sum of the genweights of Monte Carlo samples needs to be computed in order to properly normalize Monte Carlo datasets.
-To compute the sum of genweights, we need to run a dedicated Coffea processor, ``genWeightsProcessor``, that just opens all the files, reads the genweight.
+The sum of the genweights of Monte Carlo datasets needs to be computed in order to properly normalize them.
+To compute the sum of genweights, we need to run a dedicated Coffea processor, ``genWeightsProcessor``, that just opens all the files, reads the genweight of each event and stores their sum in a dictionary in the output file.
+Copy the config and workflows file for the genweights from PocketCoffea, run the ``genWeightsProcessor`` and append the 
 
-1. Take the `genWeight.py` configuration file, modify the ``samples`` in ``dataset`` dict
-2. Run ``runner.py --cfg configs/$dir/genweights.py`` to get the coffea file contains genweight info. 
-3. Embed the ``sum_genweight`` info to ``sample_defintion.json``  
+#. Copy the config and workflows file for the genweights from PocketCoffea and modify the ``samples`` in the ``dataset`` dictionary:
+
+.. code-block:: bash
+
+   cp PocketCoffea/config/genweights/genweights_2018.py zmumu/genweights_2018.py
+
+#. Run the ``genWeightsProcessor`` to get the coffea output containing the sum of genweights:
+
+.. code-block:: bash
+
+   runner.py --cfg zmumu/genweights_2018.py --full
+
+#. Append the ``sum_genweights`` metadata to ``datasets_definitions.json`` using the ``append_genweights.py`` script:
 
 .. code-block:: bash
 
-	python ../PocketCoffea/scripts/dataset/append_genweights.py --cfg configs/zmumu/datasets/datasets_definitions_zmm.json -i output/genweights/genweights_2018/output_all.coffea  --overwrite
+	python ../PocketCoffea/scripts/dataset/append_genweights.py --cfg configs/zmumu/datasets/datasets_definitions.json -i output/genweights/genweights_2018/output_all.coffea --overwrite
+
+#. Run the ``build_dataset.py`` script again to produce the new json datasets updated with the ``sum_genweights`` metadata:
+
+.. code-block:: bash
 
-4. Run ``build_dataset.py --cfg dataset_definitions.json`` again to embed the info
+   build_dataset.py --cfg dataset_definitions.json
 
 
 Define selections
-- 
GitLab


From 76b4c06a8289d122bcd0d9368478c558fbbe7a23 Mon Sep 17 00:00:00 2001
From: Matteo Marchegiani <matmarcheg@gmail.com>
Date: Thu, 6 Apr 2023 02:57:38 +0200
Subject: [PATCH 12/22] Typo

---
 docs/analysis_example.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/analysis_example.rst b/docs/analysis_example.rst
index 1053238a..35570935 100644
--- a/docs/analysis_example.rst
+++ b/docs/analysis_example.rst
@@ -159,7 +159,7 @@ Copy the config and workflows file for the genweights from PocketCoffea, run the
 
 .. code-block:: bash
 
-   build_dataset.py --cfg dataset_definitions.json
+   build_dataset.py --cfg datasets_definitions.json --overwrite
 
 
 Define selections
-- 
GitLab


From 43c64373710ed655a857344bcdbb95366e286cfd Mon Sep 17 00:00:00 2001
From: Matteo Marchegiani <matmarcheg@gmail.com>
Date: Thu, 6 Apr 2023 03:14:14 +0200
Subject: [PATCH 13/22] Object preselection

---
 docs/analysis_example.rst | 61 ++++++++++++++++++++++++++++++---------
 1 file changed, 48 insertions(+), 13 deletions(-)

diff --git a/docs/analysis_example.rst b/docs/analysis_example.rst
index 35570935..90e2b220 100644
--- a/docs/analysis_example.rst
+++ b/docs/analysis_example.rst
@@ -60,8 +60,8 @@ The datasets include a Drell-Yan Monte Carlo dataset and a ``SingleMuon`` datase
 
 The list of datasets has to be written in a structured dictionary together with the corresponding metadata in a json file. This json file is then read by the ``build_dataset.py`` script to produce the actual json datasets that are passed as input to the Coffea processor. The steps are the following:
 
-1) Create a json file that contains the required datasets, ``datasets_definitions.json``.
-Each entry of the dictionary corresponds to a dataset. Datasets include a list of DAS keys, the output json dataset path and the metadata. In addition a label ``sample`` is specified to group datasets under the same sample (e.g. group QCD datasets from different HT bins in a single sample).
+1) Create a json file that contains the required datasets, ``datasets_definitions.json``. Each entry of the dictionary corresponds to a dataset. Datasets include a list of DAS keys, the output json dataset path and the metadata. In addition a label ``sample`` is specified to group datasets under the same sample (e.g. group QCD datasets from different HT bins in a single sample).
+
 The general idea is the following:
 
 * The **dataset** key uniquely identifies a dataset.
@@ -135,42 +135,77 @@ Compute the sum of genweights
 
 The sum of the genweights of Monte Carlo datasets needs to be computed in order to properly normalize them.
 To compute the sum of genweights, we need to run a dedicated Coffea processor, ``genWeightsProcessor``, that just opens all the files, reads the genweight of each event and stores their sum in a dictionary in the output file.
-Copy the config and workflows file for the genweights from PocketCoffea, run the ``genWeightsProcessor`` and append the 
 
-#. Copy the config and workflows file for the genweights from PocketCoffea and modify the ``samples`` in the ``dataset`` dictionary:
+1) Copy the config and workflow files for the genweights from PocketCoffea and modify the ``samples`` in the ``dataset`` dictionary:
 
 .. code-block:: bash
 
    cp PocketCoffea/config/genweights/genweights_2018.py zmumu/genweights_2018.py
 
-#. Run the ``genWeightsProcessor`` to get the coffea output containing the sum of genweights:
+2) Run the ``genWeightsProcessor`` to get the coffea output containing the sum of genweights:
 
 .. code-block:: bash
 
    runner.py --cfg zmumu/genweights_2018.py --full
 
-#. Append the ``sum_genweights`` metadata to ``datasets_definitions.json`` using the ``append_genweights.py`` script:
+3) Append the ``sum_genweights`` metadata to ``datasets_definitions.json`` using the ``append_genweights.py`` script:
 
 .. code-block:: bash
 
 	python ../PocketCoffea/scripts/dataset/append_genweights.py --cfg configs/zmumu/datasets/datasets_definitions.json -i output/genweights/genweights_2018/output_all.coffea --overwrite
 
-#. Run the ``build_dataset.py`` script again to produce the new json datasets updated with the ``sum_genweights`` metadata:
+4) Run the ``build_dataset.py`` script again to produce the new json datasets updated with the ``sum_genweights`` metadata:
 
 .. code-block:: bash
 
    build_dataset.py --cfg datasets_definitions.json --overwrite
 
+Now the json datasets contain all the necessary information to run the full analysis.
 
 
 Define selections
 ================
 
 The selections are performed at two levels:
-* Object preselection: selecting the "good" objects that will be used in the final analysis (e.g. `JetGood`, `MuonGood`, `ElectronGood`...). These selections include the detector acceptance cuts, the object identification working points, the muon isolation, the b-tagging working point, etc.
+* Object preselection: selecting the "good" objects that will be used in the final analysis (e.g. `JetGood`, `MuonGood`, `ElectronGood`...).
 * Event selection: selections on the events that enter the final analysis, done in three steps:
-   * Skim: loose cut on the events. The following steps of the analysis are performed only on the events passing the skim selection.
-   * Preselection: baseline selection for the analysis.
-   * Categorization: selection to split the events passing the event preselection into different categories.
+   1. Skim: loose cut on the events. The following steps of the analysis are performed only on the events passing the skim selection, while the others are discarded from the branch ``events``.
+   2. Preselection: baseline event selection for the analysis.
+   3. Categorization: selection to split the events passing the event preselection into different categories (e.g. signal region, control region).
+
+Object preselection
+----------------
+
+To select the objects entering the final analysis, we need to specify a series of cut parameters for the leptons and jets in the file ``PocketCoffea/pocket_coffea/parameters/object_preselection.py``. These selections include the pT, eta acceptance cuts, the object identification working points, the muon isolation, the b-tagging working point, etc.
+For the Z->mumu analysis, we just use the standard definitions for the muon, electron and jet objects, which we include as a dictionary under the key ``dimuon``:
+
+.. code-block:: json
+
+   object_preselection = {
+      "dimuon": {
+         "Muon": {
+               "pt": 15,
+               "eta": 2.4,
+               "iso": 0.25, #PFIsoLoose
+               "id": "tightId",
+         },
+         "Electron": {
+               "pt": 15,
+               "eta": 2.4,
+               "iso": 0.06,
+               "id": "mvaFall17V2Iso_WP80",
+         },
+         "Jet": {
+               "dr": 0.4,
+               "pt": 30,
+               "eta": 2.4,
+               "jetId": 2,
+               "puId": {"wp": "L", "value": 4, "maxpt": 50.0},
+         },
+   ...
+
+Event selection
+----------------
+
+The skim selection is defined as a
 
 
 Define weights and variations
-- 
GitLab


From 5510d5cb679bb54723b8b89b8932f8ea99a5b0b7 Mon Sep 17 00:00:00 2001
From: Matteo Marchegiani <matmarcheg@gmail.com>
Date: Thu, 6 Apr 2023 04:01:58 +0200
Subject: [PATCH 14/22] Documenting event selection

---
 docs/analysis_example.rst | 122 ++++++++++++++++++++++++++++++++++++--
 1 file changed, 117 insertions(+), 5 deletions(-)

diff --git a/docs/analysis_example.rst b/docs/analysis_example.rst
index 90e2b220..9f29a014 100644
--- a/docs/analysis_example.rst
+++ b/docs/analysis_example.rst
@@ -2,7 +2,7 @@ Full analysis example
 ########################
 
 A full example of all the steps needed to run a full analysis with PocketCoffea is reported, starting from the creation of the datasets list, the customization of parameters and the production of the final shapes and plots.
-As an example, a simplified version of the Drell-Yan analysis targeting the Z->mumu channel is implemented.
+As an example, a toy version of the Drell-Yan analysis targeting the Z->mumu channel is implemented.
 The main steps that need to be performed are the following:
 
 * Build the json datasets
@@ -136,7 +136,7 @@ Compute the sum of genweights
 The sum of the genweights of Monte Carlo datasets needs to be computed in order to properly normalize them.
 To compute the sum of genweights, we need to run a dedicated Coffea processor, ``genWeightsProcessor``, that just opens all the files, reads the genweight of each event and stores their sum in a dictionary in the output file.
 
-1) Copy the config and workflow files for the genweights from PocketCoffea and modify the ``samples`` in the ``dataset`` dictionary:
+1) Copy the config and workflow files for the genweights from PocketCoffea and modify the ``samples`` in the ``dataset`` dictionary with the names of our samples:
 
 .. code-block:: bash
 
@@ -165,9 +165,11 @@ Define selections
 ================
 
 The selections are performed at two levels:
+
 * Object preselection: selecting the "good" objects that will be used in the final analysis (e.g. `JetGood`, `MuonGood`, `ElectronGood`...).
 * Event selection: selections on the events that enter the final analysis, done in three steps:
-   1. Skim: loose cut on the events. The following steps of the analysis are performed only on the events passing the skim selection, while the others are discarded from the branch ``events``.
+
+   1. Skim and trigger: loose cut on the events and trigger requirements.
    2. Preselection: baseline event selection for the analysis.
    3. Categorization: selection to split the events passing the event preselection into different categories (e.g. signal region, control region).
 
@@ -177,7 +179,7 @@ Object preselection
 To select the objects entering the final analysis, we need to specify a series of cut parameters for the leptons and jets in the file ``PocketCoffea/pocket_coffea/parameters/object_preselection.py``. These selections include the pT, eta acceptance cuts, the object identification working points, the muon isolation, the b-tagging working point, etc.
 For the Z->mumu analysis, we just use the standard definitions for the muon, electron and jet objects, which we include as a dictionary under the key ``dimuon``:
 
-.. code-block:: json
+.. code-block:: python
 
    object_preselection = {
       "dimuon": {
@@ -202,11 +204,120 @@ For the Z->mumu analysis, we just use the standard definitions for the muon, ele
          },
    ...
 
+The ``finalstate`` label has to be changed to ``dimuon`` such that the processor can query the corresponding parameters for the object preselection defined above:
+
+.. code-block:: python
+
+   cfg = {
+    ...
+    "finalstate" : "dimuon",
+    ...
+   }
+
+
 Event selection
 ----------------
 
-The skim selection is defined as a
+In PocketCoffea, the event selections are implemented with a dedicated ``Cut`` object, which stores both the cut function and its input parameters.
+Several factory ``Cut`` objects are available in ``pocket_coffea.lib.cut_functions``, otherwise the user can define its custom ``Cut`` objects.
+
 
+Skim
+~~~~~~~~~~~~~~~~~~~~~
+
+The skim selection of the events is performed "on the fly" to reduce the number of processed events. At this stage we also apply the HLT trigger requirements of the analysis.
+The following steps of the analysis are performed only on the events passing the skim selection, while the others are discarded from the branch ``events``, therefore reducing the computational load on the processor.
+In the config file, we specify two skim cuts: one is selecting events with at least one 15 GeV muon and the second is requiring the HLT ``SingleMuon`` path.
+In the preamble of ``config.py``, we define our custom trigger dictionary, which we pass as an argument to the factory function ``get_HLTsel()``:
+
+.. code-block:: python
+
+   trigger_dict = {
+      "2018": {
+         "SingleEle": [
+               "Ele32_WPTight_Gsf",
+               "Ele28_eta2p1_WPTight_Gsf_HT150",
+         ],
+         "SingleMuon": [
+               "IsoMu24",
+         ],
+      },
+   }
+
+   cfg = {
+    ...
+    "skim": [get_nObj_min(1, 15., "Muon"),
+             get_HLTsel("dimuon", trigger_dict, primaryDatasets=["SingleMuon"])],
+    ...
+   }
+
+
+Event preselection
+~~~~~~~~~~~~~~~~~~~~~
+
+In the Z->mumu analysis, we want to select events with exactly two muons with opposite charge. In addition, we require a cut on the leading muon pT and on the dilepton invariant mass, to select the Z boson mass window.
+The parameters are directly passed to the constructor of the ``Cut`` object as the dictionary ``params``. We can define the function ``dimuon`` and the ``Cut`` object ``dimuon_presel`` in the preamble of the config file (the dilepton collection ``ll`` is built in the analysis workflow):
+
+.. code-block:: python
+
+   def dimuon(events, params, year, sample, **kwargs):
+
+      # Masks for same-flavor (SF) and opposite-sign (OS)
+      SF = ((events.nMuonGood == 2) & (events.nElectronGood == 0))
+      OS = events.ll.charge == 0
+
+      mask = (
+         (events.nLeptonGood == 2)
+         & (ak.firsts(events.MuonGood.pt) > params["pt_leading_muon"])
+         & OS & SF
+         & (events.ll.mass > params["mll"]["low"])
+         & (events.ll.mass < params["mll"]["high"])
+      )
+
+      # Pad None values with False
+      return ak.where(ak.is_none(mask), False, mask)
+
+   dimuon_presel = Cut(
+      name="dilepton",
+      params={
+         "pt_leading_muon": 25,
+         "mll": {'low': 25, 'high': 2000},
+      },
+      function=dimuon,
+   )
+
+For an analysis requiring several different cuts, a dedicated library of cuts and functions can be defined in a separate file and imported in the config file.
+The ``preselections`` field in the config file is updated accordingly:
+
+
+
+   cfg = {
+      ...
+      "preselections" : [dimuon_presel],
+      ...
+   }
+
+
+Categorization
+~~~~~~~~~~~~~~~~~~~~~
+
+In the toy Z->mumu analysis, no further categorization of the events is performed. Only a ``baseline`` category is defined, with the ``passthrough`` factory cut that simply passes the events through without any further selection:
+
+.. code-block:: python
+
+   cfg = {
+      ...
+      # Cuts and plots settings
+      "finalstate" : "dimuon",
+      "skim": [get_nObj_min(1, 15., "Muon"),
+               get_HLTsel("dimuon", trigger_dict, primaryDatasets=["SingleMuon"])],
+      "preselections" : [dimuon_presel],
+      "categories": {
+         "baseline": [passthrough],
+      },
+      ...
+   }
 
 Define weights and variations
 ================
@@ -214,6 +325,7 @@ Define weights and variations
 
 Define histograms
 ================
+
 Histograms are defined in the ``variables`` dictionary in ``config.py``.
 
 - Create a custom histogram by mapping a name to a ``HistConf`` object, passing its ``Axis`` objects as a list (one element for a 1D histogram, two elements for a 2D histogram).
-- 
GitLab


From c0d0670165529a5f42f3af0af8b58568fd1eb436 Mon Sep 17 00:00:00 2001
From: Matteo Marchegiani <matmarcheg@gmail.com>
Date: Thu, 6 Apr 2023 04:23:21 +0200
Subject: [PATCH 15/22] Define weights and variations

---
 docs/analysis_example.rst | 57 ++++++++++++++++++++++++++++++++++++---
 1 file changed, 53 insertions(+), 4 deletions(-)

diff --git a/docs/analysis_example.rst b/docs/analysis_example.rst
index 9f29a014..54eb1749 100644
--- a/docs/analysis_example.rst
+++ b/docs/analysis_example.rst
@@ -169,9 +169,9 @@ The selections are performed at two levels:
 * Object preselection: selecting the "good" objects that will be used in the final analysis (e.g. `JetGood`, `MuonGood`, `ElectronGood`...).
 * Event selection: selections on the events that enter the final analysis, done in three steps:
 
-   1. Skim and trigger: loose cut on the events and trigger requirements.
-   2. Preselection: baseline event selection for the analysis.
-   3. Categorization: selection to split the events passing the event preselection into different categories (e.g. signal region, control region).
+   1) Skim and trigger: loose cut on the events and trigger requirements.
+   2) Preselection: baseline event selection for the analysis.
+   3) Categorization: selection to split the events passing the event preselection into different categories (e.g. signal region, control region).
 
 Object preselection
 ----------------
@@ -219,7 +219,7 @@ Event selection
 ----------------
 
 In PocketCoffea, the event selections are implemented with a dedicated ``Cut`` object, which stores both the cut function and its input parameters.
-Several factory ``Cut`` objects are available in ``pocket_coffea.lib.cut_functions``, otherwise the user can define its custom ``Cut`` objects.
+Several factory ``Cut`` objects are available in ``pocket_coffea.lib.cut_functions``, otherwise the user can define their own custom ``Cut`` objects.
 
 
 Skim
@@ -319,9 +319,58 @@ In the toy Z->mumu analysis, no further categorization of the events is performe
       ...
    }
 
+If, for example, Z->ee events were also included in the analysis, one could define a more general "dilepton" preselection and categorize the events as ``2e`` or ``2mu``, depending on whether they contain two electrons or two muons, respectively.
+
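+A minimal sketch of such a flavour-splitting categorization, reusing the ``Cut`` pattern introduced above (the ``two_mu`` and ``two_e`` objects below are illustrative, not part of the framework):
+
+.. code-block:: python
+
+   # Illustrative flavour-splitting cuts, following the same Cut pattern
+   # used for the dimuon preselection above.
+   def two_muons(events, params, year, sample, **kwargs):
+      mask = (events.nMuonGood == 2) & (events.nElectronGood == 0)
+      return ak.where(ak.is_none(mask), False, mask)
+
+   def two_electrons(events, params, year, sample, **kwargs):
+      mask = (events.nElectronGood == 2) & (events.nMuonGood == 0)
+      return ak.where(ak.is_none(mask), False, mask)
+
+   two_mu = Cut(name="2mu", params={}, function=two_muons)
+   two_e = Cut(name="2e", params={}, function=two_electrons)
+
+   cfg = {
+      ...
+      "categories": {
+         "baseline": [passthrough],
+         "2mu": [two_mu],
+         "2e": [two_e],
+      },
+      ...
+   }
+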
 Define weights and variations
 ================
 
+The application of the nominal value of scale factors and weights is switched on and off simply by adding or removing the corresponding key in the ``weights`` dictionary:
+
+.. code-block:: python
+
+   cfg = {
+      ...
+      "weights": {
+         "common": {
+            "inclusive": ["genWeight","lumi","XS",
+                          "pileup",
+                          "sf_mu_id","sf_mu_iso",
+                          ],
+            "bycategory" : {
+            }
+         },
+         "bysample": {
+         }
+      },
+      ...
+   }
+
+In our case, we are applying the nominal Monte Carlo normalization ``genWeight * XS * lumi / sum_genweights`` together with the pileup reweighting and the muon ID and isolation scale factors.
+The reweighting of the events is managed internally by the module ``WeightsManager``.
+To store also the up and down systematic variations corresponding to a given weight, one can specify it in the ``variations`` dictionary:
+
+.. code-block:: python
+
+   cfg = {
+      ...
+      "variations": {
+         "weights": {
+            "common": {
+               "inclusive": [ "pileup",
+                              "sf_mu_id", "sf_mu_iso"
+                           ],
+               "bycategory" : {
+               }
+            },
+         "bysample": {
+         }    
+         },  
+      },
+      ...
+   }
+
+In this case we store the up and down variations of the pileup weight and of the muon ID and isolation scale factors.
+These systematic uncertainties will be included in the final plots.
 
 Define histograms
 ================
-- 
GitLab


From a7d6a4a4936e5c33f4dc041f150efc94b2d45799 Mon Sep 17 00:00:00 2001
From: Matteo Marchegiani <matmarcheg@gmail.com>
Date: Thu, 6 Apr 2023 04:25:42 +0200
Subject: [PATCH 16/22] Formatting doc

---
 docs/analysis_example.rst | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/docs/analysis_example.rst b/docs/analysis_example.rst
index 54eb1749..960e44dc 100644
--- a/docs/analysis_example.rst
+++ b/docs/analysis_example.rst
@@ -3,6 +3,7 @@ Full analysis example
 
 A full example of all the steps needed to run a full analysis with PocketCoffea is reported, starting from the creation of the datasets list, the customization of parameters and the production of the final shapes and plots.
 As an example, a toy version of the Drell-Yan analysis targeting the Z->mumu channel is implemented.
+
 The main steps that need to be performed are the following:
 
 * Build the json datasets
@@ -28,6 +29,7 @@ Configuration file
 ================
 
 The parameters specific to the analysis have to be specified in the configuration file. This file contains a pure python dictionary named ``cfg`` that is read and manipulated by the ``Configurator`` module.
+
 A dedicated `repository <https://github.com/PocketCoffea/AnalysisConfigs>`_ is set up to collect the config files from different analyses. Clone the repository and install it as an editable package in the ``pocket-coffea`` environment:
 
 .. code-block:: bash
@@ -36,6 +38,7 @@ A dedicated `repository <https://github.com/PocketCoffea/AnalysisConfigs>`_ is s
 	pip install -e .
 
 The repository contains a pre-existing config file ``configs/base.py`` that can be used as a template to write the custom config file for our analysis.
+
 Create a dedicated folder ``zmumu`` under the ``configs`` folder. This folder will contain all the config files, the datasets definition json file, the workflow files and possibly extra files with parameters that are needed for the analysis:
 
 .. code-block:: bash
@@ -75,6 +78,7 @@ The general idea is the following:
 
 When the json datasets are built, the metadata parameters are linked to the files list, defining a unique dataset entry with the corresponding files.
 The `primaryDataset` key for Data datasets is needed in order to apply a trigger selection only to the corresponding dataset (e.g. apply the `SingleMuon` trigger only to datasets having `primaryDataset=SingleMuon`).
+
 The structure of the ``datasets_definitions.json`` file after filling in the dictionary with the parameters relevant to our Drell-Yan and SingleMuon datasets should be the following:
 
 .. code-block:: json
@@ -177,6 +181,7 @@ Object preselection
 -------------------
 
 To select the objects entering the final analysis, we need to specify a series of cut parameters for the leptons and jets in the file ``PocketCoffea/pocket_coffea/parameters/object_preselection.py``. These selections include the pT and eta acceptance cuts, the object identification working points, the muon isolation, the b-tagging working point, etc.
+
 For the Z->mumu analysis, we just use the standard definitions for the muon, electron and jet objects, that we include as a dictionary under the key ``dimuon``:
 
 .. code-block:: python
@@ -228,6 +233,7 @@ Skim
 The skim selection of the events is performed "on the fly" to reduce the number of processed events. At this stage we also apply the HLT trigger requirements of the analysis.
 The following steps of the analysis are performed only on the events passing the skim selection, while the others are discarded from the ``events`` branch, thus reducing the computational load on the processor.
 In the config file, we specify two skim cuts: one selects events with at least one muon with pT > 15 GeV, and the other requires the HLT ``SingleMuon`` path.
+
 In the preamble of ``config.py``, we define our custom trigger dictionary, which we pass as an argument to the factory function ``get_HLTsel()``:
 
 .. code-block:: python
@@ -287,6 +293,7 @@ The parameters are directly passed to the constructor of the ``Cut`` object as t
    )
 
 In a scenario of an analysis requiring several different cuts, a dedicated library of cuts and functions can be defined in a separate file and imported in the config file.
+
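+As a sketch, such a separate cut library could look like the following (the module name, the ``dimuon`` function and its parameters are hypothetical, and we assume the ``Cut`` class is importable from ``pocket_coffea.lib.cut_definition``):
+
+.. code-block:: python
+
+   # configs/zmumu/custom_cut_functions.py (hypothetical module)
+   import awkward as ak
+   from pocket_coffea.lib.cut_definition import Cut
+
+   def dimuon(events, params, **kwargs):
+       # boolean mask keeping events with at least two muons above threshold
+       return ak.sum(events.Muon.pt > params["pt"], axis=1) >= 2
+
+   dimuon_presel = Cut(
+       name="dimuon",
+       params={"pt": 15},
+       function=dimuon,
+   )
+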
 The ``preselections`` field in the config file is updated accordingly:
 
 .. code-block:: python
@@ -347,6 +354,7 @@ The application of the nominal value of scale factors and weights is switched on
 
 In our case, we apply the nominal scaling of the Monte Carlo events by ``lumi * XS / genWeight``, together with the pileup reweighting and the muon ID and isolation scale factors.
 The reweighting of the events is managed internally by the module ``WeightsManager``.
+
 To also store the up and down systematic variations corresponding to a given weight, one can specify it in the ``variations`` dictionary:
 
 .. code-block:: python
-- 
GitLab


From 8d33eb9ce130088523197b3f5d54348eaf200c5c Mon Sep 17 00:00:00 2001
From: Matteo Marchegiani <matmarcheg@gmail.com>
Date: Thu, 6 Apr 2023 09:28:01 +0200
Subject: [PATCH 17/22] Update index

---
 docs/index.rst | 1 +
 1 file changed, 1 insertion(+)

diff --git a/docs/index.rst b/docs/index.rst
index 2df6a7f4..cbf791b1 100644
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -21,6 +21,7 @@ The user can customize the process from the configuration file or by redefining w
    concepts
    configuration
    examples
+   analysis_example
    performance
    api
 
-- 
GitLab


From 876284c118c5b0c07c6ac6a46d5102410a1bd1e6 Mon Sep 17 00:00:00 2001
From: Matteo Marchegiani <matmarcheg@gmail.com>
Date: Thu, 6 Apr 2023 10:46:43 +0200
Subject: [PATCH 18/22] Document runner and plotting

---
 docs/analysis_example.rst | 37 ++++++++++++++++++++++++++++++++++++-
 1 file changed, 36 insertions(+), 1 deletion(-)

diff --git a/docs/analysis_example.rst b/docs/analysis_example.rst
index 960e44dc..9bfa19fe 100644
--- a/docs/analysis_example.rst
+++ b/docs/analysis_example.rst
@@ -418,8 +418,43 @@ Wrapped in the ``variable`` dictionary under ``config.py``.
 	
 Run the processor
 =================
+Run the coffea processor to get ``.coffea`` files! The ``coffea`` executor can be run locally with ``iterative, futures`` and scaleout to clusters. We now test the setup on ``lxplus, naf-desy`` more sites can also be added later.
 
+.. code-block:: bash
 
+	# read all information from the config file
+	runner.py --cfg configs/zmumu/config.py 
+	# iteractive run is also possible
+	## run --test for the iterative processor with ``--limit-chunks/-lc`` (default: 2) and ``--limit-files/-lf`` (default: 1)
+	runner.py --cfg configs/zmumu/config.py --test --lf 1 --lc  2
+	## change the --executor and the number of jobs with -s/--scaleout
+	runner.py --cfg configs/zmumu/config.py --executor futures -s 10
+	
+The scaleout configuration depends on the cluster and scheduler available at each site (lxplus, LPC, naf-desy).
+
+.. code-block:: python
+
+	## Example for naf-desy
+	"run_options" : {
+		"executor"       : "parsl/condor/naf-desy", # scheduler/cluster-type/site
+		"workers"        : 1, # cpus for each job
+		"scaleout"       : 300, # number of jobs
+		"queue"          : "microcentury", # condor job queue (runtime class)
+		"walltime"       : "00:40:00", # walltime for condor jobs
+		"disk_per_worker": "4GB", # disk size for each job (stored files)
+		"mem_per_worker" : "2GB", # RAM size for each job
+		"exclusive"      : False, # not used for condor
+		"chunk"          : 200000, # chunk size (events per chunk)
+		"retries"        : 20, # number of retries when a job fails
+		"max"            : None, # maximum number of chunks to process
+		"skipbadfiles"   : None, # skip bad files
+		"voms"           : None, # path to the voms certificate directory
+		"limit"          : None, # limit the number of processed files
+	    },
+	
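+A minimal setup for a quick local test with the ``futures`` executor could look like the following sketch (same keys as above; the values are illustrative and should be adapted to the local machine):
+
+.. code-block:: python
+
+	## Example for a local test run
+	"run_options" : {
+		"executor"       : "futures", # local multiprocessing executor
+		"workers"        : 1, # cpus for each process
+		"scaleout"       : 4, # number of parallel processes
+		"chunk"          : 50000, # chunk size (events per chunk)
+		"max"            : None, # process all chunks
+		"skipbadfiles"   : None, # skip bad files
+		"voms"           : None, # path to the voms certificate directory
+		"limit"          : 2, # process only a couple of files per dataset
+	    },
+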
 
 Produce plots
-================
\ No newline at end of file
+================
+
+``python ../PocketCoffea/scripts/plot/make_plots.py --cfg configs/zmumu/config.py -i output/test_zmumu_v01/output_all.coffea``
+
-- 
GitLab


From 1e7d4da1130d84ec4ae9636854f46a1631a15577 Mon Sep 17 00:00:00 2001
From: Matteo Marchegiani <matmarcheg@gmail.com>
Date: Thu, 6 Apr 2023 13:52:34 +0200
Subject: [PATCH 19/22] Update doc

---
 docs/analysis_example.rst | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/docs/analysis_example.rst b/docs/analysis_example.rst
index 9bfa19fe..a7b120fc 100644
--- a/docs/analysis_example.rst
+++ b/docs/analysis_example.rst
@@ -418,17 +418,17 @@ Wrapped in the ``variable`` dictionary under ``config.py``.
 	
 Run the processor
 =================
-Run the coffea processor to get ``.coffea`` files! The ``coffea`` executor can be run locally with ``iterative, futures`` and scaleout to clusters. We now test the setup on ``lxplus, naf-desy`` more sites can also be added later.
+Run the coffea processor to get ``.coffea`` output files. The ``coffea`` processor can be run locally with the ``iterative`` or ``futures`` executors, or scaled out to clusters. The setup is currently tested on ``lxplus`` and ``naf-desy``, but more sites can be added later.
 
 .. code-block:: bash
 
 	# read all information from the config file
-	runner.py --cfg configs/zmumu/config.py 
-	# iteractive run is also possible
+	runner.py --cfg configs/zmumu/config.py --full
+	# iterative run is also possible
 	## run --test for the iterative processor with ``--limit-chunks/-lc`` (default: 2) and ``--limit-files/-lf`` (default: 1)
-	runner.py --cfg configs/zmumu/config.py --test --lf 1 --lc  2
+	runner.py --cfg configs/zmumu/config.py --full --test --lf 1 --lc 2
 	## change the --executor and the number of jobs with -s/--scaleout
-	runner.py --cfg configs/zmumu/config.py --executor futures -s 10
+	runner.py --cfg configs/zmumu/config.py --full --executor futures -s 10
 	
 The scaleout configuration depends on the cluster and scheduler available at each site (lxplus, LPC, naf-desy).
 
@@ -456,5 +456,7 @@ The scaleout configurations really depends on cluster and schedulers with differ
 Produce plots
 ================
 
-``python ../PocketCoffea/scripts/plot/make_plots.py --cfg configs/zmumu/config.py -i output/test_zmumu_v01/output_all.coffea``
+.. code-block:: bash
+
+   python ../PocketCoffea/scripts/plot/make_plots.py --cfg configs/zmumu/config.py -i output/test_zmumu/output_all.coffea
 
-- 
GitLab


From 5e2c446c82cbda8352d9682844b4f8b004626272 Mon Sep 17 00:00:00 2001
From: mmarchegiani <matmarcheg@gmail.com>
Date: Thu, 6 Apr 2023 20:59:59 +0200
Subject: [PATCH 20/22] Scaleout on naf-desy cluster

---
 scripts/runner.py | 38 +++++++++++++++++++++++++++++++++-----
 1 file changed, 33 insertions(+), 5 deletions(-)

diff --git a/scripts/runner.py b/scripts/runner.py
index 6caaf95a..2db3fc95 100644
--- a/scripts/runner.py
+++ b/scripts/runner.py
@@ -102,6 +102,12 @@ if __name__ == '__main__':
             'ulimit -u 32768',
             'export MALLOC_TRIM_THRESHOLD_=0'
         ]
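+    # commands executed by each condor job before starting a worker, to
+    # recreate the local environment (this assumes a CondaSetup.sh script
+    # exists in the submission directory)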
+    condor_extra = [
+        f"cd {os.getcwd()}",
+        f'source {os.environ["HOME"]}/.bashrc',
+        f"source {os.getcwd()}/CondaSetup.sh",
+        f'conda activate {os.environ["CONDA_PREFIX"]}',
+    ]
     logging.debug(env_extra)
 
 
@@ -123,6 +129,7 @@ if __name__ == '__main__':
                                     executor_args={
                                         'skipbadfiles':config.run_options['skipbadfiles'],
                                         'schema': processor.NanoAODSchema,
+                                        'xrootdtimeout': config.run_options.get('xrootdtimeout', 600),
                                         'workers': config.run_options['scaleout']},
                                     chunksize=config.run_options['chunk'],
                                     maxchunks=config.run_options['max']
@@ -139,7 +146,7 @@ if __name__ == '__main__':
         from parsl.config import Config
         from parsl.executors import HighThroughputExecutor
         from parsl.launchers import SrunLauncher, SingleNodeLauncher
-        from parsl.addresses import address_by_hostname
+        from parsl.addresses import address_by_hostname, address_by_query
 
         if 'slurm' in config.run_options['executor']:
             slurm_htex = Config(
@@ -185,7 +192,6 @@ if __name__ == '__main__':
         elif 'condor' in config.run_options['executor']:
             #xfer_files = [process_worker_pool, _x509_path]
             #print(xfer_files)
-
             condor_htex = Config(
                 executors=[
                     HighThroughputExecutor(
@@ -205,8 +211,30 @@ if __name__ == '__main__':
                         ),
                     )
                 ],
-                #retries=20,
+                retries=config.run_options["retries"],
             )
+            ## Site config for naf-desy
+            if "naf-desy" in config.run_options['executor']:
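+                # redefine the HTCondor executor configuration with
+                # naf-desy specific settings before loading it into parsl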
+                condor_htex = Config(
+                    executors=[
+                        HighThroughputExecutor(
+                            label="coffea_parsl_condor",
+                            address=address_by_query(),
+                            max_workers=1,
+                            worker_debug=True,
+                            provider=CondorProvider(
+                                nodes_per_block=1,
+                                cores_per_slot=config.run_options["workers"],
+                                mem_per_slot=config.run_options["mem_per_worker"],
+                                init_blocks=config.run_options["scaleout"],
+                                max_blocks=config.run_options["scaleout"] + 10,
+                                worker_init="\n".join(env_extra + condor_extra),
+                                walltime=config.run_options["walltime"],
+                            ),
+                        )
+                    ],
+                    retries=config.run_options["retries"],
+                )
             dfk = parsl.load(condor_htex)
 
             output = processor.run_uproot_job(config.fileset,
@@ -248,10 +276,10 @@ if __name__ == '__main__':
         elif 'condor' in config.run_options['executor']:
             log_folder = "condor_log"
             cluster = HTCondorCluster(
-                 cores=1,
+                 cores=config.run_options['workers'],
                  memory=config.run_options['mem_per_worker'],
                  disk=config.run_options.get('disk_per_worker', "20GB"),
-                 env_extra=env_extra,
+                 job_script_prologue=env_extra,
             )
         elif 'lxplus' in config.run_options["executor"]:
             log_folder = "condor_log"
-- 
GitLab


From fd9a3993b25176b95cb90a2d85944335b6b23a5b Mon Sep 17 00:00:00 2001
From: mmarchegiani <matmarcheg@gmail.com>
Date: Thu, 6 Apr 2023 21:01:04 +0200
Subject: [PATCH 21/22] Custom scaling by processed luminosity in plotting

---
 pocket_coffea/utils/plot_utils.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/pocket_coffea/utils/plot_utils.py b/pocket_coffea/utils/plot_utils.py
index e26c6f53..cabbf97a 100644
--- a/pocket_coffea/utils/plot_utils.py
+++ b/pocket_coffea/utils/plot_utils.py
@@ -25,10 +25,13 @@ class Style:
             setattr(self, key, item)
         self.has_labels = False
         self.has_samples_map = False
+        self.has_lumi = False
         if "labels" in style_cfg:
             self.has_labels = True
         if "samples_map" in style_cfg:
             self.has_samples_map = True
+        if "lumi_processed" in style_cfg:
+            self.has_lumi = True
         self.set_defaults()
 
     def set_defaults(self):
@@ -74,6 +77,8 @@ class Shape:
         self.plot_dir = plot_dir
         self.only_cat = only_cat
         self.style = Style(style_cfg)
+        if self.style.has_lumi:
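+            # fraction of the total integrated luminosity that was actually
+            # processed, per data-taking year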
+            self.lumi_fraction = {year : l / lumi[year]['tot'] for year, l in self.style.lumi_processed.items()}
         self.data_key = data_key
         self.log = log
         self.density = density
-- 
GitLab


From cdf108c38fe94d07985ad2ce58cf00019698d0f9 Mon Sep 17 00:00:00 2001
From: mmarchegiani <matmarcheg@gmail.com>
Date: Sun, 9 Apr 2023 10:18:19 +0200
Subject: [PATCH 22/22] Fix setup.cfg

---
 setup.cfg | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/setup.cfg b/setup.cfg
index 11288f88..552a1242 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -77,7 +77,7 @@ docs =
     myst-parser>=0.13
     sphinx-book-theme>=0.1.0
     sphinx-copybutton
-    #sphinx-apidoc
+    sphinx-apidoc
     nox
 test =
     pytest>=6
-- 
GitLab