Skip to content
Snippets Groups Projects
Commit 2bc18bff authored by Chengyang Pan
Browse files

Add split method

parent 7de06add
No related branches found
No related tags found
No related merge requests found
Pipeline #7115604 passed
...@@ -33,6 +33,20 @@ parser.add_argument( ...@@ -33,6 +33,20 @@ parser.add_argument(
default=False, default=False,
help="create branches for systematic variations", help="create branches for systematic variations",
) )
# Optional analysis modes. Both are plain boolean switches: they are False
# unless given on the command line, and argparse derives the attribute names
# (args.do_fiducial / args.do_differential) from the option strings.
parser.add_argument(
    "--do_fiducial",
    help="Do the fiducial analysis",
    action="store_true",
)
parser.add_argument(
    "--do_differential",
    help="Do the njets fiducial differential analysis",
    action="store_true",
)
parser.add_argument( parser.add_argument(
"--cats", "--cats",
type=str, type=str,
...@@ -66,6 +80,11 @@ rename_dict = { ...@@ -66,6 +80,11 @@ rename_dict = {
} }
df_dict = {} df_dict = {}
# Containers for the fiducial split: events inside / outside the fiducial
# region, plus the "in" events further divided into three njets bins.
# Every name gets its own, independent dictionary.
df_dict_in, df_dict_out = {}, {}
df_dict_in_0jets, df_dict_in_1jets, df_dict_in_2jets = {}, {}, {}
outfiles = { outfiles = {
"ch": target_path.replace( "ch": target_path.replace(
"merged.root", "output_cHToGG_M125_13TeV_amcatnloFXFX_pythia8.root" "merged.root", "output_cHToGG_M125_13TeV_amcatnloFXFX_pythia8.root"
...@@ -163,6 +182,11 @@ if args.do_syst: ...@@ -163,6 +182,11 @@ if args.do_syst:
# object systematics come from a different file (you are supposed to have merged .parquet with the merge_parquet.py script) # object systematics come from a different file (you are supposed to have merged .parquet with the merge_parquet.py script)
for var in variation_dict: for var in variation_dict:
df_dict[var] = {} df_dict[var] = {}
# Give every split container its own empty per-category dict for this variation.
for split_container in (
    df_dict_in_0jets,
    df_dict_in_1jets,
    df_dict_in_2jets,
    df_dict_in,
    df_dict_out,
):
    split_container[var] = {}
for cat in cat_dict: for cat in cat_dict:
var_path = source_path.replace( var_path = source_path.replace(
"merged.parquet", f"{variation_dict[var]}/{cat}_merged.parquet" "merged.parquet", f"{variation_dict[var]}/{cat}_merged.parquet"
...@@ -175,12 +199,54 @@ if args.do_syst: ...@@ -175,12 +199,54 @@ if args.do_syst:
logger.info("Successfully read from parquet file with awkward.") logger.info("Successfully read from parquet file with awkward.")
# Decide which container(s) receive the events of this (variation, category)
# pair. Each entry pairs a destination container with the events stored in it.
if args.do_fiducial:
    # Split into in/out of the fiducial region: tagger value 21 -> "in",
    # 20 -> "out". Events with any other tagger value are not stored.
    eve_in = eve[eve.fiducialGeometricTagger_20 == 21]
    eve_out = eve[eve.fiducialGeometricTagger_20 == 20]
    if args.do_differential:
        # On top of the in/out split, divide the "in" events into three
        # njets bins (tagger 20 -> 0 jets, 21 -> 1 jet, >= 22 -> 2 jets bin).
        regions = [
            (df_dict_in_0jets, eve_in[eve_in.njetsTagger_20 == 20]),
            (df_dict_in_1jets, eve_in[eve_in.njetsTagger_20 == 21]),
            (df_dict_in_2jets, eve_in[eve_in.njetsTagger_20 >= 22]),
            (df_dict_out, eve_out),
        ]
    else:
        # Only the inclusive fiducial splitting
        regions = [(df_dict_in, eve_in), (df_dict_out, eve_out)]
else:
    # No fiducial splitting
    regions = [(df_dict, eve)]

# Build one {branch name: array} mapping per region, applying rename_dict to
# the field names. A comprehension is used instead of a variable called
# `dict`, which would shadow the builtin for the rest of the script.
for target, events in regions:
    target[var][cat] = {
        rename_dict.get(field, field): events[field] for field in eve.fields
    }
logger.debug( logger.debug(
f"Successfully created dict from awkward arrays for {var} variation for category: {cat}." f"Successfully created dict from awkward arrays for {var} variation for category: {cat}."
...@@ -198,12 +264,54 @@ else: ...@@ -198,12 +264,54 @@ else:
logger.info("Successfully read from parquet file with awkward.") logger.info("Successfully read from parquet file with awkward.")
# Decide which container(s) receive the events of this category (no
# systematic variations). Each entry pairs a destination container with the
# events stored in it.
if args.do_fiducial:
    # Split into in/out of the fiducial region: tagger value 21 -> "in",
    # 20 -> "out". Events with any other tagger value are not stored.
    eve_in = eve[eve.fiducialGeometricTagger_20 == 21]
    eve_out = eve[eve.fiducialGeometricTagger_20 == 20]
    if args.do_differential:
        # On top of the in/out split, divide the "in" events into three
        # njets bins (tagger 20 -> 0 jets, 21 -> 1 jet, >= 22 -> 2 jets bin).
        regions = [
            (df_dict_in_0jets, eve_in[eve_in.njetsTagger_20 == 20]),
            (df_dict_in_1jets, eve_in[eve_in.njetsTagger_20 == 21]),
            (df_dict_in_2jets, eve_in[eve_in.njetsTagger_20 >= 22]),
            (df_dict_out, eve_out),
        ]
    else:
        # Only the inclusive fiducial splitting
        regions = [(df_dict_in, eve_in), (df_dict_out, eve_out)]
else:
    # No fiducial splitting
    regions = [(df_dict, eve)]

# Build one {branch name: array} mapping per region, applying rename_dict to
# the field names. A comprehension is used instead of a variable called
# `dict`, which would shadow the builtin for the rest of the script.
for target, events in regions:
    target[cat] = {
        rename_dict.get(field, field): events[field] for field in eve.fields
    }
logger.debug( logger.debug(
f"Successfully created dict from awkward arrays without variation for category: {cat}." f"Successfully created dict from awkward arrays without variation for category: {cat}."
...@@ -286,76 +394,106 @@ else: ...@@ -286,76 +394,106 @@ else:
# Now we want to write the dictionary to a root file, since object systematics don't come from # Now we want to write the dictionary to a root file, since object systematics don't come from
# the nominal file we have to separate again the treatment of them from the object ones # the nominal file we have to separate again the treatment of them from the object ones
# Choose the event containers to write. Each entry pairs a container with the
# suffix inserted before ".root" in the output file name, so that every
# fiducial region (and njets bin, for the differential split) gets its own file.
if args.do_fiducial:
    if args.do_differential:
        split_outputs = [
            (df_dict_in_0jets, "_in_0jets"),
            (df_dict_in_1jets, "_in_1jets"),
            (df_dict_in_2jets, "_in_2jets"),
            (df_dict_out, "_out"),
        ]
    else:
        split_outputs = [(df_dict_in, "_in"), (df_dict_out, "_out")]
else:
    split_outputs = [(df_dict, "")]

# The loop variable is deliberately NOT called `df_dict`, so the inclusive
# container defined earlier in the script is not rebound.
for split_dict, suffix in split_outputs:
    if suffix:
        outfiles_name = outfiles[process].split(".root")[0] + suffix + ".root"
    else:
        outfiles_name = outfiles[process]

    # Open the per-region file name. Opening outfiles[process] here would make
    # every region recreate the same file, so only the last one would survive.
    with uproot.recreate(outfiles_name) as file:
        logger.debug(outfiles_name)
        # Final fit want a separate tree for each category and variation,
        # the naming of the branches are quite rigid:
        # For MC: {inputTreeDir}/{production-mode}_{mass}_{sqrts}_{category}_{syst}
        # For data: {inputTreeDir}/Data_{sqrts}_{category}
        for cat in cat_dict:
            logger.debug(f"writing category: {cat}")
            if args.do_syst:
                nominal = split_dict["NOMINAL"][cat]
                # Check that the category actually contains something, otherwise
                # the flattening step will make the script crash. An improvement
                # (not sure if needed) may be to also write an empty TTree to not
                # confuse FinalFit.
                if len(nominal["weight"]):
                    for branch in nominal:
                        # Flattening helps uproot with the type of the awkward
                        # arrays: without it (even if there is no nested field)
                        # the type ends up like (len_of_array) * ?type, which
                        # uproot cannot write.
                        nominal[branch] = ak.flatten(nominal[branch], axis=0)
                    file[names[cat]] = nominal
                    if notag:
                        file[name_notag] = nominal  # this is wrong, to be fixed
                    for syst_name, weight, syst_, c in labels[cat]:
                        # Skip "NOMINAL" as information included in nominal tree
                        if syst_ == "NOMINAL":
                            continue
                        logger.debug(f"{syst_name}, {weight}, {syst_}, {c}")
                        # If the name is not in the variation dictionary it is
                        # assumed to be a weight systematic, which is read from
                        # the (already flattened) nominal arrays. Object
                        # systematics come from their own variation entry and
                        # still need flattening.
                        is_weight_syst = syst_ not in variation_dict
                        if is_weight_syst:
                            logger.debug(f"found weight syst {syst_}")
                            source = nominal
                        else:
                            source = split_dict.get(syst_, {}).get(cat, {})
                        red_dict = {}
                        for key, new_key in [
                            ["CMS_hgg_mass", "CMS_hgg_mass"],
                            [weight, "weight"],
                            ["fiducialGeometricTagger_20", "fiducialGeometricTagger_20"],
                            ["HTXS_Higgs_pt", "HTXS_Higgs_pt"],
                            ["HTXS_Higgs_y", "HTXS_Higgs_y"],
                        ]:
                            # Branches missing from the source are silently skipped
                            if key not in source:
                                continue
                            if is_weight_syst:
                                red_dict[new_key] = source[key]
                            else:
                                red_dict[new_key] = ak.flatten(source[key], 0)
                        logger.info(f"Adding {syst_name}01sigma to out tree...")
                        file[syst_name + "01sigma"] = red_dict
                else:
                    logger.info(f"no events survived category selection for cat: {cat}")
            else:
                # If there are no syst there is no "NOMINAL" entry in the dict,
                # so the length of the first stored branch is used as the
                # emptiness check.
                cat_branches = split_dict[cat]
                if len(cat_branches[[*cat_branches][0]]):
                    # Same flattening as in the systematics case
                    for branch in cat_branches:
                        cat_branches[branch] = ak.flatten(cat_branches[branch], axis=0)
                    file[names[cat]] = cat_branches
                    if notag:
                        file[name_notag] = cat_branches  # this is wrong, to be fixed
                else:
                    logger.info(f"no events survived category selection for cat: {cat}")
logger.info( logger.info(
f"Successfully converted parquet file to ROOT file for process {process}." f"Successfully converted parquet file to ROOT file for process {process}."
) )
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment