Skip to content
Snippets Groups Projects

Merging full workflow into Tanay's HiggsDNA

Open Sergi Castells requested to merge castells/higgs-dna-4-gamma-tanays-copy:master into master
Compare and Show latest version
1 file
+ 15
11
Compare changes
  • Side-by-side
  • Inline
+ 15
11
@@ -751,11 +751,6 @@ class HggBaseProcessor(processor.ProcessorABC): # type: ignore
logger.info("No surviving events in this run, return now!")
return histos_etc
if "EvtMix" in dataset_name:
pseudos["eventMixing"] = numpy.full(len(pseudos), 1)
else:
pseudos["eventMixing"] = numpy.full(len(pseudos), 0)
if self.output_location is not None:
# Need to convert to ROOT after doing plots + BDT + categories.
# Will need to sort out a middle step so the samples can be saved as ROOT directly if that is desired.
@@ -789,23 +784,32 @@ class HggBaseProcessor(processor.ProcessorABC): # type: ignore
"selections": Nevents["selections"],
}
assert ~awkward.any(awkward.is_none(diphotons)) and ~awkward.any(awkward.is_none(pseudos)), f"{~awkward.any(awkward.is_none(diphotons))} {~awkward.any(awkward.is_none(pseudos))}"
assert len(diphotons) == len(pseudos)
df = diphoton_list_to_pandas(self, diphotons)
df_ps = ps_list_to_pandas(pseudos)
evtMix = pandas.Series(awkward.to_numpy(awkward.flatten(pseudos["eventMixing"])), name="eventMixing")
evtMix = pandas.Series([1 if "evtMix" in dataset_name else 0] * len(pseudos), name="eventMixing")
if "Signal" in dataset_name:
eff_N = pandas.Series(list(Nevents_clean.values()), name="Nevents")
#final_df = pandas.concat([df, df_ps, evtMix, eff_N], axis=1)
final_df = pandas.concat([df, eff_N], axis=1)
final_df = pandas.concat([df, df_ps, evtMix, eff_N], axis=1)
# NOTE: If there are NaN values in events then look at the assert below. len(final_df) will be a multiple of 5 (for eff_N) and the others will not be.
# When there are fewer than 5 events, eff_N is longer than the other arrays, so the extra rows in the other columns are filled with NaN. This is fine/expected behavior.
assert not final_df.drop(columns=["Nevents"]).isnull().values.any() or not (len(final_df) == len(df) == len(df_ps) == len(evtMix)), f"final_df: {final_df.drop(columns=['Nevents']).isnull().values.any()} diphoton: {df.isnull().values.any()} pseudos: {df_ps.isnull().values.any()} evtMix: {evtMix.isnull().values.any()} eff_N {eff_N.isnull().values.any()}\n {len(final_df)} {len(df)} {len(df_ps)} {len(evtMix)}"
else:
final_df = pandas.concat([df, df_ps, evtMix], axis=1)
fname = (events.behavior["__events_factory__"]._partition_key.replace("/", "_") + ".parquet")
# NOTE: If there are NaN values in events then look at the assert below. len(final_df) will be a multiple of 5 (for eff_N) and the others will not be.
# When there are fewer than 5 events, eff_N is longer than the other arrays, so the extra rows in the other columns are filled with NaN. This is fine/expected behavior.
assert not final_df.drop(columns=["Nevents"]).isnull().values.any() or not (len(final_df) == len(df) == len(df_ps) == len(evtMix)), f"final_df: {final_df.drop(columns=['Nevents']).isnull().values.any()} diphoton: {df.isnull().values.any()} pseudos: {df_ps.isnull().values.any()} evtMix: {evtMix.isnull().values.any()} eff_N {eff_N.isnull().values.any()}\n {len(final_df)} {len(df)} {len(df_ps)} {len(evtMix)}"
fname = (events.behavior["__events_factory__"]._partition_key.replace("/", "_") + ".parquet")
subdirs = []
if "dataset" in events.metadata:
subdirs.append(events.metadata["dataset"])
subdirs.append(variation)
logger.debug(f"Num Events (after selections): {len(events)}")
dump_pandas(self, final_df, fname, self.output_location, subdirs)
return histos_etc
Loading