Add split method

2bc18bff · Chengyang Pan · 7de06add · 2bc18bff
Commit 2bc18bff authored 11 months ago by Chengyang Pan
--- a/scripts/postprocessing/convert_parquet_to_root.py
+++ b/scripts/postprocessing/convert_parquet_to_root.py
@@ -33,6 +33,20 @@ parser.add_argument(
    default=False,
    help="create branches for systematic variations",
 )
+parser.add_argument(
+    "--do_fiducial",
+    dest="do_fiducial",
+    action="store_true",
+    default=False,
+    help="Do the fiducial analysis",
+)
+parser.add_argument(
+    "--do_differential",
+    dest="do_differential",
+    action="store_true",
+    default=False,
+    help="Do the njets fiducial differential analysis",
+)
 parser.add_argument(
    "--cats",
    type=str,
@@ -66,6 +80,11 @@ rename_dict = {
 }
 df_dict = {}
+df_dict_in = {}
+df_dict_out = {}
+df_dict_in_0jets = {}
+df_dict_in_1jets = {}
+df_dict_in_2jets = {}
 outfiles = {
    "ch": target_path.replace(
        "merged.root", "output_cHToGG_M125_13TeV_amcatnloFXFX_pythia8.root"
@@ -163,6 +182,11 @@ if args.do_syst:
    # object systematics come from a different file (you are supposed to have merged .parquet with the merge_parquet.py script)
    for var in variation_dict:
        df_dict[var] = {}
+        df_dict_in_0jets[var] = {}
+        df_dict_in_1jets[var] = {}
+        df_dict_in_2jets[var] = {}
+        df_dict_in[var] = {}
+        df_dict_out[var] = {}
        for cat in cat_dict:
            var_path = source_path.replace(
                "merged.parquet", f"{variation_dict[var]}/{cat}_merged.parquet"
@@ -175,12 +199,54 @@ if args.do_syst:
            logger.info("Successfully read from parquet file with awkward.")
-            dict = {}
+            if args.do_fiducial:
-            for i in eve.fields:
+                # split it into in/out
-                i_re = rename_dict[i] if i in rename_dict else i
+                eve_in = eve[eve.fiducialGeometricTagger_20 == 21]
-                dict[i_re] = eve[i]
+                eve_out = eve[eve.fiducialGeometricTagger_20 == 20]
-            df_dict[var][cat] = dict
+                if args.do_differential:
+                    # Based on fiducial in/out splitting, split it into 3 njets
+                    eve_in_0jets = eve_in[eve_in.njetsTagger_20 == 20]
+                    eve_in_1jets = eve_in[eve_in.njetsTagger_20 == 21]
+                    eve_in_2jets = eve_in[eve_in.njetsTagger_20 >= 22]
+                    dict_in_0jets = {}
+                    dict_in_1jets = {}
+                    dict_in_2jets = {}
+                    dict_out = {}
+                    for i in eve.fields:
+                        i_re = rename_dict[i] if i in rename_dict else i
+                        dict_in_0jets[i_re] = eve_in_0jets[i]
+                        dict_in_1jets[i_re] = eve_in_1jets[i]
+                        dict_in_2jets[i_re] = eve_in_2jets[i]
+                        dict_out[i_re] = eve_out[i]
+                    df_dict_in_0jets[var][cat] = dict_in_0jets
+                    df_dict_in_1jets[var][cat] = dict_in_1jets
+                    df_dict_in_2jets[var][cat] = dict_in_2jets
+                    df_dict_out[var][cat] = dict_out
+                # only inclusive fiducial splitting
+                else:
+                    dict_in = {}
+                    dict_out = {}
+                    for i in eve.fields:
+                        i_re = rename_dict[i] if i in rename_dict else i
+                        dict_in[i_re] = eve_in[i]
+                        dict_out[i_re] = eve_out[i]
+                    df_dict_in[var][cat] = dict_in
+                    df_dict_out[var][cat] = dict_out
+            # no fiducial splitting
+            else:
+                dict = {}
+                for i in eve.fields:
+                    i_re = rename_dict[i] if i in rename_dict else i
+                    dict[i_re] = eve[i]
+                df_dict[var][cat] = dict
            logger.debug(
                f"Successfully created dict from awkward arrays for {var} variation for category: {cat}."
@@ -198,12 +264,54 @@ else:
        logger.info("Successfully read from parquet file with awkward.")
-        dict = {}
+        if args.do_fiducial:
-        for i in eve.fields:
+            # split it into in/out
-            i_re = rename_dict[i] if i in rename_dict else i
+            eve_in = eve[eve.fiducialGeometricTagger_20 == 21]
-            dict[i_re] = eve[i]
+            eve_out = eve[eve.fiducialGeometricTagger_20 == 20]
-        df_dict[cat] = dict
+            if args.do_differential:
+                # Based on fiducial in/out splitting, split it into 3 njets
+                eve_in_0jets = eve_in[eve_in.njetsTagger_20 == 20]
+                eve_in_1jets = eve_in[eve_in.njetsTagger_20 == 21]
+                eve_in_2jets = eve_in[eve_in.njetsTagger_20 >= 22]
+                dict_in_0jets = {}
+                dict_in_1jets = {}
+                dict_in_2jets = {}
+                dict_out = {}
+                for i in eve.fields:
+                    i_re = rename_dict[i] if i in rename_dict else i
+                    dict_in_0jets[i_re] = eve_in_0jets[i]
+                    dict_in_1jets[i_re] = eve_in_1jets[i]
+                    dict_in_2jets[i_re] = eve_in_2jets[i]
+                    dict_out[i_re] = eve_out[i]
+                df_dict_in_0jets[cat] = dict_in_0jets
+                df_dict_in_1jets[cat] = dict_in_1jets
+                df_dict_in_2jets[cat] = dict_in_2jets
+                df_dict_out[cat] = dict_out
+            # only inclusive fiducial splitting
+            else:
+                dict_in = {}
+                dict_out = {}
+                for i in eve.fields:
+                    i_re = rename_dict[i] if i in rename_dict else i
+                    dict_in[i_re] = eve_in[i]
+                    dict_out[i_re] = eve_out[i]
+                df_dict_in[cat] = dict_in
+                df_dict_out[cat] = dict_out
+        # no fiducial splitting
+        else:
+            dict = {}
+            for i in eve.fields:
+                i_re = rename_dict[i] if i in rename_dict else i
+                dict[i_re] = eve[i]
+            df_dict[cat] = dict
        logger.debug(
            f"Successfully created dict from awkward arrays without variation for category: {cat}."
@@ -286,76 +394,106 @@ else:
 # Now we want to write the dictionary to a root file, since object systematics don't come from
 # the nominal file we have to separate again the treatment of them from the object ones
-with uproot.recreate(outfiles[process]) as file:
-    logger.debug(outfiles[process])
-    # Final fit want a separate tree for each category and variation,
-    # the naming of the branches are quite rigid:
-    # For MC: {inputTreeDir}/{production-mode}_{mass}_{sqrts}_{category}_{syst}
-    # For data: {inputTreeDir}/Data_{sqrts}_{category}
-    for cat in cat_dict:
-        logger.debug(f"writing category: {cat}")
-        if args.do_syst:
-            # check that the category actually contains something, otherwise the flattening step will make the script crash,
-            # an improvement (not sure if needed) may be to also write an empty TTree to not confuse FinalFit
-            if len(df_dict["NOMINAL"][cat]["weight"]):
-                for branch in df_dict["NOMINAL"][cat]:
-                    # here I had to add a flattening step to help uproot with the type of the awkward arrays,
-                    # if you don't flatten (event if you don't have a nested field) you end up having a type like (len_of_array) * ?type, which make uproot very mad apparently
-                    df_dict["NOMINAL"][cat][branch] = ak.flatten(df_dict["NOMINAL"][cat][branch], axis=0)
-                file[names[cat]] = df_dict["NOMINAL"][cat]
-                if notag:
-                    file[name_notag] = df_dict["NOMINAL"][cat]  # this is wrong, to be fixed
-                for syst_name, weight, syst_, c in labels[cat]:
-                    # Skip "NOMINAL" as information included in nominal tree
-                    if syst_ == "NOMINAL":
-                        continue
-                    logger.debug(f"{syst_name}, {weight}, {syst_}, {c}")
-                    # If the name is not in the variation dictionary it is assumed to be a weight systematic
-                    if syst_ not in variation_dict:
-                        logger.debug(f"found weight syst {syst_}")
-                        red_dict = {}
-                        for key, new_key in [
-                            ["CMS_hgg_mass", "CMS_hgg_mass"],
-                            [weight, "weight"],
-                            ["fiducialGeometricTagger_20", "fiducialGeometricTagger_20"],
-                            ["HTXS_Higgs_pt", "HTXS_Higgs_pt"],
-                            ["HTXS_Higgs_y", "HTXS_Higgs_y"]
-                        ]:
-                            if "NOMINAL" in df_dict and cat in df_dict["NOMINAL"] and key in df_dict["NOMINAL"][cat]:
-                                red_dict[new_key] = df_dict["NOMINAL"][cat][key]
-                        logger.info(f"Adding {syst_name}01sigma to out tree...")
-                        file[syst_name + "01sigma"] = red_dict
-                    else:
-                        red_dict = {}
-                        for key, new_key in [
-                            ["CMS_hgg_mass", "CMS_hgg_mass"],
-                            [weight, "weight"],
-                            ["fiducialGeometricTagger_20", "fiducialGeometricTagger_20"],
-                            ["HTXS_Higgs_pt", "HTXS_Higgs_pt"],
-                            ["HTXS_Higgs_y", "HTXS_Higgs_y"]
-                        ]:
-                            if syst_ in df_dict and cat in df_dict[syst_] and key in df_dict[syst_][cat]:
-                                red_dict[new_key] = ak.flatten(df_dict[syst_][cat][key], 0)
-                        logger.info(f"Adding {syst_name}01sigma to out tree...")
-                        file[syst_name + "01sigma"] = red_dict
-            else:
-                logger.info(f"no events survived category selection for cat: {cat}")
+# choose the right df_dict
+if args.do_fiducial:
+    if args.do_differential:
+        df_dict_all = [df_dict_in_0jets, df_dict_in_1jets, df_dict_in_2jets, df_dict_out]
+    else:
+        df_dict_all = [df_dict_in, df_dict_out]
+else:
+    df_dict_all = [df_dict]
+# Loop over all df_dict
+for index, df_dict in enumerate(df_dict_all):
+    if args.do_fiducial:
+        if args.do_differential:
+            if index == 0:
+                outfiles_name = outfiles[process].split(".root")[0] + "_in" + "_0jets" + ".root"
+            elif index == 1:
+                outfiles_name = outfiles[process].split(".root")[0] + "_in" + "_1jets" + ".root"
+            elif index == 2:
+                outfiles_name = outfiles[process].split(".root")[0] + "_in" + "_2jets" + ".root"
+            elif index == 3:
+                outfiles_name = outfiles[process].split(".root")[0] + "_out" + ".root"
        else:
-            # if there are no syst there is no df_dict["NOMINAL"] entry in the dict
+            if index == 0:
-            if len(df_dict[cat][[*df_dict[cat]][0]]):
+                outfiles_name = outfiles[process].split(".root")[0] + "_in" + ".root"
-                # same as before
+            elif index == 1:
-                for branch in df_dict[cat]:
+                outfiles_name = outfiles[process].split(".root")[0] + "_out" + ".root"
-                    df_dict[cat][branch] = ak.flatten(df_dict[cat][branch], axis=0)
+    else:
-                file[names[cat]] = df_dict[cat]
+        outfiles_name = outfiles[process]
-                if notag:
-                    file[name_notag] = df_dict[cat]  # this is wrong, to be fixed
+    with uproot.recreate(outfiles_name) as file:  # per-split name from above; outfiles[process] would be recreated (overwritten) on every loop iteration
+        logger.debug(outfiles_name)
+        # Final fit want a separate tree for each category and variation,
+        # the naming of the branches are quite rigid:
+        # For MC: {inputTreeDir}/{production-mode}_{mass}_{sqrts}_{category}_{syst}
+        # For data: {inputTreeDir}/Data_{sqrts}_{category}
+        for cat in cat_dict:
+            logger.debug(f"writing category: {cat}")
+            if args.do_syst:
+                # check that the category actually contains something, otherwise the flattening step will make the script crash,
+                # an improvement (not sure if needed) may be to also write an empty TTree to not confuse FinalFit
+                if len(df_dict["NOMINAL"][cat]["weight"]):
+                    for branch in df_dict["NOMINAL"][cat]:
+                        # here I had to add a flattening step to help uproot with the type of the awkward arrays,
+                        # if you don't flatten (even if you don't have a nested field) you end up having a type like (len_of_array) * ?type, which makes uproot very mad apparently
+                        df_dict["NOMINAL"][cat][branch] = ak.flatten(df_dict["NOMINAL"][cat][branch], axis=0)
+                    file[names[cat]] = df_dict["NOMINAL"][cat]
+                    if notag:
+                        file[name_notag] = df_dict["NOMINAL"][cat]  # this is wrong, to be fixed
+                    for syst_name, weight, syst_, c in labels[cat]:
+                        # Skip "NOMINAL" as information included in nominal tree
+                        if syst_ == "NOMINAL":
+                            continue
+                        logger.debug(f"{syst_name}, {weight}, {syst_}, {c}")
+                        # If the name is not in the variation dictionary it is assumed to be a weight systematic
+                        if syst_ not in variation_dict:
+                            logger.debug(f"found weight syst {syst_}")
+                            red_dict = {}
+                            for key, new_key in [
+                                ["CMS_hgg_mass", "CMS_hgg_mass"],
+                                [weight, "weight"],
+                                ["fiducialGeometricTagger_20", "fiducialGeometricTagger_20"],
+                                ["HTXS_Higgs_pt", "HTXS_Higgs_pt"],
+                                ["HTXS_Higgs_y", "HTXS_Higgs_y"]
+                            ]:
+                                if "NOMINAL" in df_dict and cat in df_dict["NOMINAL"] and key in df_dict["NOMINAL"][cat]:
+                                    red_dict[new_key] = df_dict["NOMINAL"][cat][key]
+                            logger.info(f"Adding {syst_name}01sigma to out tree...")
+                            file[syst_name + "01sigma"] = red_dict
+                        else:
+                            red_dict = {}
+                            for key, new_key in [
+                                ["CMS_hgg_mass", "CMS_hgg_mass"],
+                                [weight, "weight"],
+                                ["fiducialGeometricTagger_20", "fiducialGeometricTagger_20"],
+                                ["HTXS_Higgs_pt", "HTXS_Higgs_pt"],
+                                ["HTXS_Higgs_y", "HTXS_Higgs_y"]
+                            ]:
+                                if syst_ in df_dict and cat in df_dict[syst_] and key in df_dict[syst_][cat]:
+                                    red_dict[new_key] = ak.flatten(df_dict[syst_][cat][key], 0)
+                            logger.info(f"Adding {syst_name}01sigma to out tree...")
+                            file[syst_name + "01sigma"] = red_dict
+                else:
+                    logger.info(f"no events survived category selection for cat: {cat}")
            else:
-                logger.info(f"no events survived category selection for cat: {cat}")
+                # if there are no syst there is no df_dict["NOMINAL"] entry in the dict
+                if len(df_dict[cat][[*df_dict[cat]][0]]):
+                    # same as before
+                    for branch in df_dict[cat]:
+                        df_dict[cat][branch] = ak.flatten(df_dict[cat][branch], axis=0)
+                    file[names[cat]] = df_dict[cat]
+                    if notag:
+                        file[name_notag] = df_dict[cat]  # this is wrong, to be fixed
+                else:
+                    logger.info(f"no events survived category selection for cat: {cat}")
-    logger.info(
+        logger.info(
-        f"Successfully converted parquet file to ROOT file for process {process}."
+            f"Successfully converted parquet file to ROOT file for process {process}."
-    )
+        )