From af6291e96a23918a4233fc4128a47f785b4c0846 Mon Sep 17 00:00:00 2001 From: Maximilian <maximilianwrabetz@rwth-aachen.de> Date: Tue, 10 Dec 2024 14:51:07 +0100 Subject: [PATCH 01/17] corrected read in of decorrelator.py --- higgs_dna/tools/decorrelation_CDFs_dumper/01_dumpCdfs_decorr.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/higgs_dna/tools/decorrelation_CDFs_dumper/01_dumpCdfs_decorr.py b/higgs_dna/tools/decorrelation_CDFs_dumper/01_dumpCdfs_decorr.py index 84e67482..65ec7172 100644 --- a/higgs_dna/tools/decorrelation_CDFs_dumper/01_dumpCdfs_decorr.py +++ b/higgs_dna/tools/decorrelation_CDFs_dumper/01_dumpCdfs_decorr.py @@ -1,7 +1,7 @@ # import root_pandas import numpy as np import pandas as pd -from decorrelator import cdfCalc +from higgs_dna.tools.decorrelator import cdfCalc import argparse import glob -- GitLab From 3e26d367580c653bc8f25ba01869f698f4cafd7a Mon Sep 17 00:00:00 2001 From: Maximilian <maximilianwrabetz@rwth-aachen.de> Date: Tue, 10 Dec 2024 14:52:18 +0100 Subject: [PATCH 02/17] removed unnecessary flag --- higgs_dna/tools/decorrelation_CDFs_dumper/01_dumpCdfs_decorr.py | 1 - 1 file changed, 1 deletion(-) diff --git a/higgs_dna/tools/decorrelation_CDFs_dumper/01_dumpCdfs_decorr.py b/higgs_dna/tools/decorrelation_CDFs_dumper/01_dumpCdfs_decorr.py index 65ec7172..6079efa6 100644 --- a/higgs_dna/tools/decorrelation_CDFs_dumper/01_dumpCdfs_decorr.py +++ b/higgs_dna/tools/decorrelation_CDFs_dumper/01_dumpCdfs_decorr.py @@ -35,7 +35,6 @@ if __name__ == "__main__": parser = argparse.ArgumentParser() requiredArgs = parser.add_argument_group() requiredArgs.add_argument('-i', '--infile', action='store', type=str, required=True) - requiredArgs.add_argument('-i', '--infile', action='store', type=str, required=True) requiredArgs.add_argument('-c', '--cdfsFile', action='store', type=str, required=True) requiredArgs.add_argument('-t', '--tree', action='store', type=str, required=True) options = parser.parse_args() -- GitLab From 
7262b4a86c5c030f797c144201f2797fc2691118 Mon Sep 17 00:00:00 2001 From: Maximilian <maximilianwrabetz@rwth-aachen.de> Date: Tue, 10 Dec 2024 14:53:33 +0100 Subject: [PATCH 03/17] removed repeating code --- .../tools/decorrelation_CDFs_dumper/01_dumpCdfs_decorr.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/higgs_dna/tools/decorrelation_CDFs_dumper/01_dumpCdfs_decorr.py b/higgs_dna/tools/decorrelation_CDFs_dumper/01_dumpCdfs_decorr.py index 6079efa6..2cd78fe2 100644 --- a/higgs_dna/tools/decorrelation_CDFs_dumper/01_dumpCdfs_decorr.py +++ b/higgs_dna/tools/decorrelation_CDFs_dumper/01_dumpCdfs_decorr.py @@ -20,11 +20,6 @@ def main(options): print(f"INFO: found {len(events)} events") - df["sigma_m_over_m"] = events[options].to_numpy() - - df["mass"] = events.mass.to_numpy() - df["weight"] = events.weight.to_numpy() - # Evaluating and dumping the CDFs in bins of mass calc = cdfCalc(df, options.tree,'mass',np.linspace(100, 180, 161)) calc.dumpCdfs(options.cdfsFile) -- GitLab From 696aa85555c10cf10702a1d484b32c1caa333fe3 Mon Sep 17 00:00:00 2001 From: Maximilian <maximilianwrabetz@rwth-aachen.de> Date: Tue, 10 Dec 2024 15:15:48 +0100 Subject: [PATCH 04/17] flag was not read in correctly --- higgs_dna/tools/decorrelation_CDFs_dumper/01_dumpCdfs_decorr.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/higgs_dna/tools/decorrelation_CDFs_dumper/01_dumpCdfs_decorr.py b/higgs_dna/tools/decorrelation_CDFs_dumper/01_dumpCdfs_decorr.py index 2cd78fe2..84168f8b 100644 --- a/higgs_dna/tools/decorrelation_CDFs_dumper/01_dumpCdfs_decorr.py +++ b/higgs_dna/tools/decorrelation_CDFs_dumper/01_dumpCdfs_decorr.py @@ -14,7 +14,7 @@ def main(options): data = [pd.read_parquet(f) for f in files] events = pd.concat(data,ignore_index=True) - df["sigma_m_over_m"] = events[options].to_numpy() + df["sigma_m_over_m"] = events[options.tree].to_numpy() df["mass"] = events.mass.to_numpy() df["weight"] = events.weight.to_numpy() -- GitLab From 
7dcb35c901495c8f8322e0c6aaa386630f5cad2f Mon Sep 17 00:00:00 2001 From: Maximilian <maximilianwrabetz@rwth-aachen.de> Date: Tue, 10 Dec 2024 15:24:09 +0100 Subject: [PATCH 05/17] for consistency and comfort, the infile flag is now infilepath, matching 02_decorrelate --- .../tools/decorrelation_CDFs_dumper/01_dumpCdfs_decorr.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/higgs_dna/tools/decorrelation_CDFs_dumper/01_dumpCdfs_decorr.py b/higgs_dna/tools/decorrelation_CDFs_dumper/01_dumpCdfs_decorr.py index 84168f8b..0da3fd36 100644 --- a/higgs_dna/tools/decorrelation_CDFs_dumper/01_dumpCdfs_decorr.py +++ b/higgs_dna/tools/decorrelation_CDFs_dumper/01_dumpCdfs_decorr.py @@ -10,7 +10,7 @@ def main(options): df = pd.DataFrame() - files = glob.glob(options.infile) + files = glob.glob(str(options.infilepath) + "*.parquet") data = [pd.read_parquet(f) for f in files] events = pd.concat(data,ignore_index=True) @@ -29,7 +29,7 @@ if __name__ == "__main__": parser = argparse.ArgumentParser() requiredArgs = parser.add_argument_group() - requiredArgs.add_argument('-i', '--infile', action='store', type=str, required=True) + requiredArgs.add_argument('-i', '--infilepath', action='store', type=str, required=True) requiredArgs.add_argument('-c', '--cdfsFile', action='store', type=str, required=True) requiredArgs.add_argument('-t', '--tree', action='store', type=str, required=True) options = parser.parse_args() -- GitLab From c4fc65ba24b305707fde0954250dcc4c0799c1d0 Mon Sep 17 00:00:00 2001 From: Maximilian <maximilianwrabetz@rwth-aachen.de> Date: Tue, 10 Dec 2024 15:57:37 +0100 Subject: [PATCH 06/17] column names are now correct and the flags are improved --- .../tools/decorrelation_CDFs_dumper/01_dumpCdfs_decorr.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/higgs_dna/tools/decorrelation_CDFs_dumper/01_dumpCdfs_decorr.py b/higgs_dna/tools/decorrelation_CDFs_dumper/01_dumpCdfs_decorr.py index 0da3fd36..7d393288 
100644 --- a/higgs_dna/tools/decorrelation_CDFs_dumper/01_dumpCdfs_decorr.py +++ b/higgs_dna/tools/decorrelation_CDFs_dumper/01_dumpCdfs_decorr.py @@ -14,7 +14,7 @@ def main(options): data = [pd.read_parquet(f) for f in files] events = pd.concat(data,ignore_index=True) - df["sigma_m_over_m"] = events[options.tree].to_numpy() + df[options.tree] = events[options.tree].to_numpy() df["mass"] = events.mass.to_numpy() df["weight"] = events.weight.to_numpy() @@ -29,8 +29,8 @@ if __name__ == "__main__": parser = argparse.ArgumentParser() requiredArgs = parser.add_argument_group() - requiredArgs.add_argument('-i', '--infilepath', action='store', type=str, required=True) - requiredArgs.add_argument('-c', '--cdfsFile', action='store', type=str, required=True) - requiredArgs.add_argument('-t', '--tree', action='store', type=str, required=True) + requiredArgs.add_argument('-i', '--infilepath', action='store', type=str, required=True, help="Input file path") + requiredArgs.add_argument('-c', '--cdfsFile', action='store', type=str, required=True, help="CDFs output file") + parser.add_argument('-t', '--tree', default='sigma_m_over_m', action='store', type=str, help="Tree name (default: sigma_m_over_m)") options = parser.parse_args() main(options) -- GitLab From 678dca0b1fa339af59c3c00924400c9c809acbc6 Mon Sep 17 00:00:00 2001 From: Maximilian <maximilianwrabetz@rwth-aachen.de> Date: Tue, 10 Dec 2024 16:16:58 +0100 Subject: [PATCH 07/17] made it process faster --- .../decorrelation_CDFs_dumper/01_dumpCdfs_decorr.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/higgs_dna/tools/decorrelation_CDFs_dumper/01_dumpCdfs_decorr.py b/higgs_dna/tools/decorrelation_CDFs_dumper/01_dumpCdfs_decorr.py index 7d393288..6c725e59 100644 --- a/higgs_dna/tools/decorrelation_CDFs_dumper/01_dumpCdfs_decorr.py +++ b/higgs_dna/tools/decorrelation_CDFs_dumper/01_dumpCdfs_decorr.py @@ -4,15 +4,22 @@ import pandas as pd from higgs_dna.tools.decorrelator import cdfCalc import 
argparse import glob +import pyarrow.parquet as pq +import concurrent.futures +def read_parquet_file(file_path): + return pd.read_parquet(file_path) def main(options): df = pd.DataFrame() files = glob.glob(str(options.infilepath) + "*.parquet") - data = [pd.read_parquet(f) for f in files] - events = pd.concat(data,ignore_index=True) + + with concurrent.futures.ThreadPoolExecutor() as executor: + data = list(executor.map(read_parquet_file, files)) + + events = pd.concat(data, ignore_index=True) df[options.tree] = events[options.tree].to_numpy() df["mass"] = events.mass.to_numpy() -- GitLab From 606486907aaed5245584c20f3df28ad0c01d3fb2 Mon Sep 17 00:00:00 2001 From: Maximilian <maximilianwrabetz@rwth-aachen.de> Date: Tue, 10 Dec 2024 16:21:51 +0100 Subject: [PATCH 08/17] some more explanations for the trees --- .../tools/decorrelation_CDFs_dumper/01_dumpCdfs_decorr.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/higgs_dna/tools/decorrelation_CDFs_dumper/01_dumpCdfs_decorr.py b/higgs_dna/tools/decorrelation_CDFs_dumper/01_dumpCdfs_decorr.py index 6c725e59..a66ea8fd 100644 --- a/higgs_dna/tools/decorrelation_CDFs_dumper/01_dumpCdfs_decorr.py +++ b/higgs_dna/tools/decorrelation_CDFs_dumper/01_dumpCdfs_decorr.py @@ -36,8 +36,8 @@ if __name__ == "__main__": parser = argparse.ArgumentParser() requiredArgs = parser.add_argument_group() - requiredArgs.add_argument('-i', '--infilepath', action='store', type=str, required=True, help="Input file path") - requiredArgs.add_argument('-c', '--cdfsFile', action='store', type=str, required=True, help="CDFs output file") + requiredArgs.add_argument('-i', '--infilepath', action='store', type=str, required=True, help="Input file path, e.g. 
/net/.../samples/") + requiredArgs.add_argument('-c', '--cdfsFile', action='store', type=str, required=True, help="CDFs output file, is cdf_input for 02_decorrelate.py") parser.add_argument('-t', '--tree', default='sigma_m_over_m', action='store', type=str, help="Tree name (default: sigma_m_over_m)") options = parser.parse_args() main(options) -- GitLab From 2f70f2bc3227d74d3dd0e2a659ea413d4f673817 Mon Sep 17 00:00:00 2001 From: Maximilian <maximilianwrabetz@rwth-aachen.de> Date: Tue, 10 Dec 2024 16:34:19 +0100 Subject: [PATCH 09/17] created default output file, only the input path remains necessary --- .../tools/decorrelation_CDFs_dumper/01_dumpCdfs_decorr.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/higgs_dna/tools/decorrelation_CDFs_dumper/01_dumpCdfs_decorr.py b/higgs_dna/tools/decorrelation_CDFs_dumper/01_dumpCdfs_decorr.py index a66ea8fd..583b79c5 100644 --- a/higgs_dna/tools/decorrelation_CDFs_dumper/01_dumpCdfs_decorr.py +++ b/higgs_dna/tools/decorrelation_CDFs_dumper/01_dumpCdfs_decorr.py @@ -29,7 +29,7 @@ def main(options): # Evaluating and dumping the CDFs in bins of mass calc = cdfCalc(df, options.tree,'mass',np.linspace(100, 180, 161)) - calc.dumpCdfs(options.cdfsFile) + calc.dumpCdfs(str(options.cdfsFile) + ".pkl.gz") if __name__ == "__main__": @@ -37,7 +37,7 @@ if __name__ == "__main__": parser = argparse.ArgumentParser() requiredArgs = parser.add_argument_group() requiredArgs.add_argument('-i', '--infilepath', action='store', type=str, required=True, help="Input file path, e.g. 
/net/.../samples/") - requiredArgs.add_argument('-c', '--cdfsFile', action='store', type=str, required=True, help="CDFs output file, is cdf_input for 02_decorrelate.py") + requiredArgs.add_argument('-c', '--cdfsFile', default='output_01_dumpCDFs_decorr', action='store', type=str, help="CDFs output file in pkl.gz format, is cdf_input for 02_decorrelate.py") parser.add_argument('-t', '--tree', default='sigma_m_over_m', action='store', type=str, help="Tree name (default: sigma_m_over_m)") options = parser.parse_args() main(options) -- GitLab From a8eb12566994669b24e06a87f21ef7c7041ece99 Mon Sep 17 00:00:00 2001 From: Maximilian <maximilianwrabetz@rwth-aachen.de> Date: Tue, 10 Dec 2024 16:45:15 +0100 Subject: [PATCH 10/17] added some error messages and output for help --- .../decorrelation_CDFs_dumper/01_dumpCdfs_decorr.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/higgs_dna/tools/decorrelation_CDFs_dumper/01_dumpCdfs_decorr.py b/higgs_dna/tools/decorrelation_CDFs_dumper/01_dumpCdfs_decorr.py index 583b79c5..28027165 100644 --- a/higgs_dna/tools/decorrelation_CDFs_dumper/01_dumpCdfs_decorr.py +++ b/higgs_dna/tools/decorrelation_CDFs_dumper/01_dumpCdfs_decorr.py @@ -21,12 +21,18 @@ def main(options): events = pd.concat(data, ignore_index=True) - df[options.tree] = events[options.tree].to_numpy() + if options.tree in events: + df[options.tree] = events[options.tree].to_numpy() + else: + print(f"ERROR: tree not in columns of parquet files from {options.infilepath}") + print(f"Please choose from {[col for col in events.columns if col.startswith('sigma')]}") + exit() + df["mass"] = events.mass.to_numpy() df["weight"] = events.weight.to_numpy() print(f"INFO: found {len(events)} events") - + print(f"INFO: output CDF file contains: {df.columns.tolist()}") # Evaluating and dumping the CDFs in bins of mass calc = cdfCalc(df, options.tree,'mass',np.linspace(100, 180, 161)) calc.dumpCdfs(str(options.cdfsFile) + ".pkl.gz") -- GitLab From 
d53694db97ae87bece4cf96e1d97dff5a21ca38b Mon Sep 17 00:00:00 2001 From: Maximilian <maximilianwrabetz@rwth-aachen.de> Date: Tue, 10 Dec 2024 16:51:00 +0100 Subject: [PATCH 11/17] made flake8 happy --- .../tools/decorrelation_CDFs_dumper/01_dumpCdfs_decorr.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/higgs_dna/tools/decorrelation_CDFs_dumper/01_dumpCdfs_decorr.py b/higgs_dna/tools/decorrelation_CDFs_dumper/01_dumpCdfs_decorr.py index 28027165..36a63c6b 100644 --- a/higgs_dna/tools/decorrelation_CDFs_dumper/01_dumpCdfs_decorr.py +++ b/higgs_dna/tools/decorrelation_CDFs_dumper/01_dumpCdfs_decorr.py @@ -1,26 +1,23 @@ -# import root_pandas import numpy as np import pandas as pd from higgs_dna.tools.decorrelator import cdfCalc import argparse import glob -import pyarrow.parquet as pq import concurrent.futures + def read_parquet_file(file_path): return pd.read_parquet(file_path) + def main(options): df = pd.DataFrame() files = glob.glob(str(options.infilepath) + "*.parquet") - with concurrent.futures.ThreadPoolExecutor() as executor: data = list(executor.map(read_parquet_file, files)) - events = pd.concat(data, ignore_index=True) - if options.tree in events: df[options.tree] = events[options.tree].to_numpy() else: -- GitLab From 11ea4feb5aae3c676a46023285e9b42f98425503 Mon Sep 17 00:00:00 2001 From: Maximilian <maximilianwrabetz@rwth-aachen.de> Date: Wed, 11 Dec 2024 10:25:40 +0100 Subject: [PATCH 12/17] updated the read in of decorr and removed personal lines --- .../decorrelation_CDFs_dumper/02_decorrelate.py | 12 +----------- 1 file changed, 1 insertion(+), 11 deletions(-) diff --git a/higgs_dna/tools/decorrelation_CDFs_dumper/02_decorrelate.py b/higgs_dna/tools/decorrelation_CDFs_dumper/02_decorrelate.py index d2446aa1..04c6dc50 100644 --- a/higgs_dna/tools/decorrelation_CDFs_dumper/02_decorrelate.py +++ b/higgs_dna/tools/decorrelation_CDFs_dumper/02_decorrelate.py @@ -1,4 +1,4 @@ -import decorrelator as decorr +import 
higgs_dna.tools.decorrelator as decorr import awkward as ak import argparse import os @@ -45,7 +45,6 @@ def main(options): files = glob.glob(str(options.infile) + "*.parquet") data = [pd.read_parquet(f) for f in files] - # data = pd.read_parquet("/net/scratch_cms3a/daumann/massresdecorrhiggsdna/big_bkg/Diphoton.parquet") events = pd.concat(data,ignore_index=True) df = pd.DataFrame() @@ -59,15 +58,6 @@ def main(options): df['{}_decorr'.format(options.var)] = decl.doDecorr(options.ref) - if 'sigmaMoM_decorr' in df.columns: - df['sigmaMoM_decorrOld'] = df['sigmaMoM_decorr'] - - if options.var == 'sigmarv': - df['sigmaMoM_decorr'] = df['sigmarv_decorr'] - - if options.var == 'sigmaRV': - df['sigmaMoM_decorr'] = df['sigmaRV_decorr'] - events["sigma_m_over_m_decorr"] = decl.doDecorr(options.ref) events.to_parquet(options.outFile) -- GitLab From 08c8782195108fb5638cf8e94340a415baca400a Mon Sep 17 00:00:00 2001 From: Maximilian <maximilianwrabetz@rwth-aachen.de> Date: Wed, 11 Dec 2024 13:10:28 +0100 Subject: [PATCH 13/17] script can now directly plot the pkl,gz file + other improvements --- .../02_decorrelate.py | 53 +++++++++++-------- 1 file changed, 32 insertions(+), 21 deletions(-) diff --git a/higgs_dna/tools/decorrelation_CDFs_dumper/02_decorrelate.py b/higgs_dna/tools/decorrelation_CDFs_dumper/02_decorrelate.py index 04c6dc50..e3c182fb 100644 --- a/higgs_dna/tools/decorrelation_CDFs_dumper/02_decorrelate.py +++ b/higgs_dna/tools/decorrelation_CDFs_dumper/02_decorrelate.py @@ -1,4 +1,5 @@ import higgs_dna.tools.decorrelator as decorr +from higgs_dna.tools.decorrelator import cdfCalc import awkward as ak import argparse import os @@ -33,50 +34,60 @@ def getArrayBranchName(branchname, fieldname, index): def main(options): - if options.infile == options.outFile: - raise RuntimeError('Outfile will be recreated, cannot be the same as infile') - - if os.path.exists(options.outFile): - print("WARNING: outfile exists.") - + # loading cdf file dummyDf = 
pd.DataFrame({'{}'.format(options.var): [0], '{}'.format(options.dVar): [0]}) decl = decorr.decorrelator(dummyDf, options.var, options.dVar, np.linspace(100., 180., 161)) - decl.loadCdfs(options.cdfFile) + decl.loadCdfs(str(options.cdfsFile) + ".pkl.gz") - files = glob.glob(str(options.infile) + "*.parquet") + # loading parquet files + files = glob.glob(str(options.infilepath) + "*.parquet") data = [pd.read_parquet(f) for f in files] events = pd.concat(data,ignore_index=True) + # creating dataframe for decorrelation df = pd.DataFrame() - df["sigma_m_over_m"] = events.sigma_m_over_m_Smeared.to_numpy() - df["mass"] = events.mass.to_numpy() + df[options.var] = events[options.var].to_numpy() + df[options.dVar] = events[options.dVar].to_numpy() df["weight"] = events.weight.to_numpy() + #printing variables that will be decorrelated print("var, dVar:", options.var, options.dVar) + # giving variables to dataframe and resetting index decl.df = df.loc[:, [options.var, options.dVar]] decl.df.reset_index(inplace=True) + # doing actually the decorr df['{}_decorr'.format(options.var)] = decl.doDecorr(options.ref) - - events["sigma_m_over_m_decorr"] = decl.doDecorr(options.ref) - - events.to_parquet(options.outFile) - + events['{}_decorr'.format(options.var)] = decl.doDecorr(options.ref) + + #create pkl.gz output file and remove the old sigma_m_over_m variable + df = df.drop(columns=options.var) + calc = cdfCalc(df, '{}_decorr'.format(options.var), options.dVar, np.linspace(100, 180, 161)) + if options.outFile: + calc.dumpCdfs(str(options.outFile) + ".pkl.gz") + else: + calc.dumpCdfs(str(options.var) + "_decorr" + ".pkl.gz") + + if options.parquetGeneration == "True": + print("parquet file will be created") + events['{}_decorr'.format(options.var)] = decl.doDecorr(options.ref) + events.to_parquet(options.outFile + ".parquet") if __name__ == '__main__': parser = argparse.ArgumentParser() requiredArgs = parser.add_argument_group('Required Arguements') - 
requiredArgs.add_argument('-t','--tree', nargs='+', required=True) - requiredArgs.add_argument('-i', '--infile', action='store', type=str, required=True) - requiredArgs.add_argument('-c','--cdfFile', action='store', type=str, required=True) - requiredArgs.add_argument('-v','--var', action='store', type=str, required=True) - requiredArgs.add_argument('-d','--dVar', action='store', type=str, required=True) - requiredArgs.add_argument('-o','--outFile', action='store', type=str, required=True) + parser.add_argument('-t', '--tree', default='sigma_m_over_m', action='store', type=str, help="Tree name (default: sigma_m_over_m)") + requiredArgs.add_argument('-i', '--infilepath', action='store', type=str, required=True, help="Input file path, e.g. /net/.../samples/") + requiredArgs.add_argument('-c', '--cdfsFile', default='output_01_dumpCDFs_decorr', action='store', type=str, help="CDFs output file in pkl.gz format, is the output from 01_dumpCdfs_decorr.py") + requiredArgs.add_argument('-v','--var', default='sigma_m_over_m', action='store', type=str, help="variable you want to decorrelate (default: sigma_m_over_m)") + requiredArgs.add_argument('-d','--dVar', default='mass', action='store', type=str, help="variable you want to correlate against, most likely mass, (default: mass)") + requiredArgs.add_argument('-o','--outFile', action='store', type=str, help ="filename and path to the decorrelated files, default: relative to var") optArgs = parser.add_argument_group('Optional Arguments') optArgs.add_argument('-r', '--ref', action='store', type=float, default=125.) 
optArgs.add_argument('--columns', nargs='+') optArgs.add_argument('--nomColumns', nargs='+') optArgs.add_argument('--vecColumns', nargs='+') + optArgs.add_argument('-p','--parquetGeneration', default='True', action='store', type=str, help="choose True or False to generate parquet files aswell, (default: mass)") options = parser.parse_args() main(options) -- GitLab From b13c5ae6d165bd15cc6cd4181cdc0c5a9693ca89 Mon Sep 17 00:00:00 2001 From: Maximilian <maximilianwrabetz@rwth-aachen.de> Date: Wed, 11 Dec 2024 13:40:05 +0100 Subject: [PATCH 14/17] first working fusion of both scripts, no intermediate cdf file needed + small improvements --- .../01_dumpCdfs_decorr.py | 46 ------------------- .../{02_decorrelate.py => CDFdecorrelator.py} | 43 +++++++++++------ 2 files changed, 28 insertions(+), 61 deletions(-) delete mode 100644 higgs_dna/tools/decorrelation_CDFs_dumper/01_dumpCdfs_decorr.py rename higgs_dna/tools/decorrelation_CDFs_dumper/{02_decorrelate.py => CDFdecorrelator.py} (78%) diff --git a/higgs_dna/tools/decorrelation_CDFs_dumper/01_dumpCdfs_decorr.py b/higgs_dna/tools/decorrelation_CDFs_dumper/01_dumpCdfs_decorr.py deleted file mode 100644 index 36a63c6b..00000000 --- a/higgs_dna/tools/decorrelation_CDFs_dumper/01_dumpCdfs_decorr.py +++ /dev/null @@ -1,46 +0,0 @@ -import numpy as np -import pandas as pd -from higgs_dna.tools.decorrelator import cdfCalc -import argparse -import glob -import concurrent.futures - - -def read_parquet_file(file_path): - return pd.read_parquet(file_path) - - -def main(options): - - df = pd.DataFrame() - - files = glob.glob(str(options.infilepath) + "*.parquet") - with concurrent.futures.ThreadPoolExecutor() as executor: - data = list(executor.map(read_parquet_file, files)) - events = pd.concat(data, ignore_index=True) - if options.tree in events: - df[options.tree] = events[options.tree].to_numpy() - else: - print(f"ERROR: tree not in columns of parquet files from {options.infilepath}") - print(f"Please choose from {[col for col 
in events.columns if col.startswith('sigma')]}") - exit() - - df["mass"] = events.mass.to_numpy() - df["weight"] = events.weight.to_numpy() - - print(f"INFO: found {len(events)} events") - print(f"INFO: output CDF file contains: {df.columns.tolist()}") - # Evaluating and dumping the CDFs in bins of mass - calc = cdfCalc(df, options.tree,'mass',np.linspace(100, 180, 161)) - calc.dumpCdfs(str(options.cdfsFile) + ".pkl.gz") - - -if __name__ == "__main__": - - parser = argparse.ArgumentParser() - requiredArgs = parser.add_argument_group() - requiredArgs.add_argument('-i', '--infilepath', action='store', type=str, required=True, help="Input file path, e.g. /net/.../samples/") - requiredArgs.add_argument('-c', '--cdfsFile', default='output_01_dumpCDFs_decorr', action='store', type=str, help="CDFs output file in pkl.gz format, is cdf_input for 02_decorrelate.py") - parser.add_argument('-t', '--tree', default='sigma_m_over_m', action='store', type=str, help="Tree name (default: sigma_m_over_m)") - options = parser.parse_args() - main(options) diff --git a/higgs_dna/tools/decorrelation_CDFs_dumper/02_decorrelate.py b/higgs_dna/tools/decorrelation_CDFs_dumper/CDFdecorrelator.py similarity index 78% rename from higgs_dna/tools/decorrelation_CDFs_dumper/02_decorrelate.py rename to higgs_dna/tools/decorrelation_CDFs_dumper/CDFdecorrelator.py index e3c182fb..6bb1f9e1 100644 --- a/higgs_dna/tools/decorrelation_CDFs_dumper/02_decorrelate.py +++ b/higgs_dna/tools/decorrelation_CDFs_dumper/CDFdecorrelator.py @@ -6,7 +6,10 @@ import os import pandas as pd import numpy as np import glob +import concurrent.futures +def read_parquet_file(file_path): + return pd.read_parquet(file_path) def printProgressBar(iteration,total,prefix='',suffix='',decimals=1,length=100,fill=chr(9608),printEnd="\r"): @@ -17,7 +20,6 @@ def printProgressBar(iteration,total,prefix='',suffix='',decimals=1,length=100,f if iteration == total: print() - def diphoton_ak_array(diphotons: ak.Array) -> ak.Array: output = 
{} @@ -25,7 +27,6 @@ def diphoton_ak_array(diphotons: ak.Array) -> ak.Array: output[field] = diphotons[field] return ak.Array(output) - def getArrayBranchName(branchname, fieldname, index): if index != (): return '{}{}'.format(branchname, index[0]) @@ -33,24 +34,33 @@ def getArrayBranchName(branchname, fieldname, index): def main(options): + + df = pd.DataFrame() - # loading cdf file - dummyDf = pd.DataFrame({'{}'.format(options.var): [0], '{}'.format(options.dVar): [0]}) - decl = decorr.decorrelator(dummyDf, options.var, options.dVar, np.linspace(100., 180., 161)) - decl.loadCdfs(str(options.cdfsFile) + ".pkl.gz") - - # loading parquet files files = glob.glob(str(options.infilepath) + "*.parquet") - data = [pd.read_parquet(f) for f in files] - events = pd.concat(data,ignore_index=True) + with concurrent.futures.ThreadPoolExecutor() as executor: + data = list(executor.map(read_parquet_file, files)) + events = pd.concat(data, ignore_index=True) + if options.tree in events: + df[options.tree] = events[options.tree].to_numpy() + else: + print(f"ERROR: tree not in columns of parquet files from {options.infilepath}") + print(f"Please choose from {[col for col in events.columns if col.startswith('sigma')]}") + exit() - # creating dataframe for decorrelation - df = pd.DataFrame() - df[options.var] = events[options.var].to_numpy() - df[options.dVar] = events[options.dVar].to_numpy() + df["mass"] = events.mass.to_numpy() df["weight"] = events.weight.to_numpy() - #printing variables that will be decorrelated + print(f"INFO: found {len(events)} events") + print(f"INFO: output CDF file contains: {df.columns.tolist()}") + + calc = cdfCalc(df, options.tree, 'mass', np.linspace(100, 180, 161)) + calc.calcCdfs() + cdfs = calc.cdfs + dummyDf = pd.DataFrame({'{}'.format(options.var): [0], '{}'.format(options.dVar): [0]}) + decl = decorr.decorrelator(dummyDf, options.var, options.dVar, np.linspace(100., 180., 161)) + decl.cdfs = cdfs + print("var, dVar:", options.var, options.dVar) # 
giving variables to dataframe and resetting index decl.df = df.loc[:, [options.var, options.dVar]] @@ -63,11 +73,14 @@ def main(options): #create pkl.gz output file and remove the old sigma_m_over_m variable df = df.drop(columns=options.var) calc = cdfCalc(df, '{}_decorr'.format(options.var), options.dVar, np.linspace(100, 180, 161)) + + print(f"INFO: decorrelated CDF file contains: {df.columns.tolist()}") if options.outFile: calc.dumpCdfs(str(options.outFile) + ".pkl.gz") else: calc.dumpCdfs(str(options.var) + "_decorr" + ".pkl.gz") + # a parquet file with the decorrelated variable will be created by default but takes time if options.parquetGeneration == "True": print("parquet file will be created") events['{}_decorr'.format(options.var)] = decl.doDecorr(options.ref) -- GitLab From 5e16df58edc7d7ef32ee3585505e461311e8700e Mon Sep 17 00:00:00 2001 From: Maximilian <maximilianwrabetz@rwth-aachen.de> Date: Wed, 11 Dec 2024 15:05:01 +0100 Subject: [PATCH 15/17] a lot of improvements, comments and helpful messages in case of an error. 
flake8 happy aswell --- .../CDFdecorrelator.py | 83 +++++++++++-------- 1 file changed, 48 insertions(+), 35 deletions(-) diff --git a/higgs_dna/tools/decorrelation_CDFs_dumper/CDFdecorrelator.py b/higgs_dna/tools/decorrelation_CDFs_dumper/CDFdecorrelator.py index 6bb1f9e1..d4b147fb 100644 --- a/higgs_dna/tools/decorrelation_CDFs_dumper/CDFdecorrelator.py +++ b/higgs_dna/tools/decorrelation_CDFs_dumper/CDFdecorrelator.py @@ -1,16 +1,18 @@ -import higgs_dna.tools.decorrelator as decorr + from higgs_dna.tools.decorrelator import cdfCalc +import higgs_dna.tools.decorrelator as decorr +import concurrent.futures import awkward as ak -import argparse -import os import pandas as pd import numpy as np +import argparse import glob -import concurrent.futures + def read_parquet_file(file_path): return pd.read_parquet(file_path) + def printProgressBar(iteration,total,prefix='',suffix='',decimals=1,length=100,fill=chr(9608),printEnd="\r"): percent = ("{0:." + str(decimals) + "f}").format(100 * (iteration / float(total))) @@ -20,6 +22,7 @@ def printProgressBar(iteration,total,prefix='',suffix='',decimals=1,length=100,f if iteration == total: print() + def diphoton_ak_array(diphotons: ak.Array) -> ak.Array: output = {} @@ -27,6 +30,7 @@ def diphoton_ak_array(diphotons: ak.Array) -> ak.Array: output[field] = diphotons[field] return ak.Array(output) + def getArrayBranchName(branchname, fieldname, index): if index != (): return '{}{}'.format(branchname, index[0]) @@ -34,73 +38,82 @@ def getArrayBranchName(branchname, fieldname, index): def main(options): - + # reading the parquet files df = pd.DataFrame() - files = glob.glob(str(options.infilepath) + "*.parquet") with concurrent.futures.ThreadPoolExecutor() as executor: data = list(executor.map(read_parquet_file, files)) events = pd.concat(data, ignore_index=True) - if options.tree in events: - df[options.tree] = events[options.tree].to_numpy() + print(f"INFO: found {len(events)} events") + + # helper and parquet column read in + if 
options.var in events: + df[options.var] = events[options.var].to_numpy() else: - print(f"ERROR: tree not in columns of parquet files from {options.infilepath}") + print(f"ERROR: var not in columns of parquet files from {options.infilepath}") print(f"Please choose from {[col for col in events.columns if col.startswith('sigma')]}") exit() - - df["mass"] = events.mass.to_numpy() + df[options.dVar] = events[options.dVar].to_numpy() df["weight"] = events.weight.to_numpy() - print(f"INFO: found {len(events)} events") - print(f"INFO: output CDF file contains: {df.columns.tolist()}") - - calc = cdfCalc(df, options.tree, 'mass', np.linspace(100, 180, 161)) + # handling of decorrelator + calc = cdfCalc(df, options.var, options.dVar, np.linspace(100, 180, 161)) calc.calcCdfs() cdfs = calc.cdfs dummyDf = pd.DataFrame({'{}'.format(options.var): [0], '{}'.format(options.dVar): [0]}) decl = decorr.decorrelator(dummyDf, options.var, options.dVar, np.linspace(100., 180., 161)) decl.cdfs = cdfs + print("var, dVar:", options.var,", ", options.dVar) - print("var, dVar:", options.var, options.dVar) # giving variables to dataframe and resetting index decl.df = df.loc[:, [options.var, options.dVar]] decl.df.reset_index(inplace=True) - # doing actually the decorr + # doing actually the decorr for pickle df['{}_decorr'.format(options.var)] = decl.doDecorr(options.ref) - events['{}_decorr'.format(options.var)] = decl.doDecorr(options.ref) - - #create pkl.gz output file and remove the old sigma_m_over_m variable + + # create pickle file and remove the old sigma_m_over_m variable df = df.drop(columns=options.var) calc = cdfCalc(df, '{}_decorr'.format(options.var), options.dVar, np.linspace(100, 180, 161)) - - print(f"INFO: decorrelated CDF file contains: {df.columns.tolist()}") - if options.outFile: - calc.dumpCdfs(str(options.outFile) + ".pkl.gz") + print(f"INFO: decorrelated CDF contains: {df.columns.tolist()}") + if options.era: + if options.outFile: + calc.dumpCdfs(str(options.outFile) + 
"_" + str(options.era) + ".pkl.gz") + else: + calc.dumpCdfs(str(options.var) + "_" + str(options.era) + "_CDFs" + ".pkl.gz") else: - calc.dumpCdfs(str(options.var) + "_decorr" + ".pkl.gz") - - # a parquet file with the decorrelated variable will be created by default but takes time + if options.outFile: + calc.dumpCdfs(str(options.outFile) + ".pkl.gz") + else: + calc.dumpCdfs(str(options.var) + "_CDFs" + ".pkl.gz") + + # a parquet file with the decorrelated variable will be created by default but takes more time if options.parquetGeneration == "True": - print("parquet file will be created") - events['{}_decorr'.format(options.var)] = decl.doDecorr(options.ref) - events.to_parquet(options.outFile + ".parquet") + print("INFO: new parquet and pickle files will be created!") + if options.era: + events['{}_decorr'.format(options.var)] = decl.doDecorr(options.ref) + events.to_parquet(options.outFile + "_" + str(options.era) + ".parquet") + else: + events['{}_decorr'.format(options.var)] = decl.doDecorr(options.ref) + events.to_parquet(options.outFile + ".parquet") + else: + print("INFO: new parquet file will not be created, only pickle!") + if __name__ == '__main__': parser = argparse.ArgumentParser() requiredArgs = parser.add_argument_group('Required Arguements') - parser.add_argument('-t', '--tree', default='sigma_m_over_m', action='store', type=str, help="Tree name (default: sigma_m_over_m)") requiredArgs.add_argument('-i', '--infilepath', action='store', type=str, required=True, help="Input file path, e.g. 
/net/.../samples/") - requiredArgs.add_argument('-c', '--cdfsFile', default='output_01_dumpCDFs_decorr', action='store', type=str, help="CDFs output file in pkl.gz format, is the output from 01_dumpCdfs_decorr.py") requiredArgs.add_argument('-v','--var', default='sigma_m_over_m', action='store', type=str, help="variable you want to decorrelate (default: sigma_m_over_m)") requiredArgs.add_argument('-d','--dVar', default='mass', action='store', type=str, help="variable you want to correlate against, most likely mass, (default: mass)") - requiredArgs.add_argument('-o','--outFile', action='store', type=str, help ="filename and path to the decorrelated files, default: relative to var") + requiredArgs.add_argument('-o','--outFile', action='store', type=str, help="filename and path to the decorrelated files, default .<var>(_era)_CDFs") optArgs = parser.add_argument_group('Optional Arguments') - optArgs.add_argument('-r', '--ref', action='store', type=float, default=125.) + optArgs.add_argument('-r', '--ref', action='store', type=float, default=125., help="reference mass for decorrelation") optArgs.add_argument('--columns', nargs='+') optArgs.add_argument('--nomColumns', nargs='+') optArgs.add_argument('--vecColumns', nargs='+') - optArgs.add_argument('-p','--parquetGeneration', default='True', action='store', type=str, help="choose True or False to generate parquet files aswell, (default: mass)") + optArgs.add_argument('-p','--parquetGeneration', default='True', action='store', type=str, help="choose True or False to generate parquet files aswell, (default: True)") + optArgs.add_argument('-e','--era', action='store', type=str, help="optional: choose era to give extra information in file name") options = parser.parse_args() main(options) -- GitLab From f908ef6cfe85387c126cd6c949f516e178b87e83 Mon Sep 17 00:00:00 2001 From: Maximilian <maximilianwrabetz@rwth-aachen.de> Date: Wed, 11 Dec 2024 15:41:08 +0100 Subject: [PATCH 16/17] even more help messages now, made flake8 
happy --- .../CDFdecorrelator.py | 30 ++++++++++++++----- 1 file changed, 22 insertions(+), 8 deletions(-) diff --git a/higgs_dna/tools/decorrelation_CDFs_dumper/CDFdecorrelator.py b/higgs_dna/tools/decorrelation_CDFs_dumper/CDFdecorrelator.py index d4b147fb..02de331e 100644 --- a/higgs_dna/tools/decorrelation_CDFs_dumper/CDFdecorrelator.py +++ b/higgs_dna/tools/decorrelation_CDFs_dumper/CDFdecorrelator.py @@ -39,6 +39,11 @@ def getArrayBranchName(branchname, fieldname, index): def main(options): # reading the parquet files + if not options.infilepath.endswith('/'): + print(f"WARNING: Please make sure that {options.infilepath} is a path to .parquet files and ends with /") + options.infilepath = options.infilepath + "/" + print(f"INFO: To help you out, the path is changed to: {options.infilepath}") + df = pd.DataFrame() files = glob.glob(str(options.infilepath) + "*.parquet") with concurrent.futures.ThreadPoolExecutor() as executor: @@ -51,7 +56,7 @@ def main(options): df[options.var] = events[options.var].to_numpy() else: print(f"ERROR: var not in columns of parquet files from {options.infilepath}") - print(f"Please choose from {[col for col in events.columns if col.startswith('sigma')]}") + print(f"Please choose e.g. 
from {[col for col in events.columns if col.startswith('sigma')]} or inspect columns for more information!") exit() df[options.dVar] = events[options.dVar].to_numpy() df["weight"] = events.weight.to_numpy() @@ -69,8 +74,9 @@ def main(options): decl.df = df.loc[:, [options.var, options.dVar]] decl.df.reset_index(inplace=True) - # doing actually the decorr for pickle - df['{}_decorr'.format(options.var)] = decl.doDecorr(options.ref) + # doing the decorr + decorrelated_var = decl.doDecorr(options.ref) + df['{}_decorr'.format(options.var)] = decorrelated_var # create pickle file and remove the old sigma_m_over_m variable df = df.drop(columns=options.var) @@ -86,16 +92,24 @@ def main(options): calc.dumpCdfs(str(options.outFile) + ".pkl.gz") else: calc.dumpCdfs(str(options.var) + "_CDFs" + ".pkl.gz") + print("Created pickle file!") # a parquet file with the decorrelated variable will be created by default but takes more time if options.parquetGeneration == "True": - print("INFO: new parquet and pickle files will be created!") + print("INFO: new parquet will be created, this can take some time!") + events['{}_decorr'.format(options.var)] = decorrelated_var if options.era: - events['{}_decorr'.format(options.var)] = decl.doDecorr(options.ref) - events.to_parquet(options.outFile + "_" + str(options.era) + ".parquet") + if options.outFile: + events.to_parquet(options.outFile + "_" + str(options.era) + ".parquet") + else: + events.to_parquet(str(options.var) + "_decorr_" + str(options.era) + ".parquet") else: - events['{}_decorr'.format(options.var)] = decl.doDecorr(options.ref) - events.to_parquet(options.outFile + ".parquet") + if options.outFile: + events.to_parquet(options.outFile + ".parquet") + else: + events.to_parquet(str(options.var) + "_decorr" + ".parquet") + + print("Created parquet file!") else: print("INFO: new parquet file will not be created, only pickle!") -- GitLab From 98be24e127940ea4c25c188d214f2b0ea3a83712 Mon Sep 17 00:00:00 2001 From: Maximilian 
<maximilianwrabetz@rwth-aachen.de> Date: Thu, 12 Dec 2024 13:23:27 +0100 Subject: [PATCH 17/17] flag to avoid parquet generation is improved, flake8 happy --- higgs_dna/tools/decorrelation_CDFs_dumper/CDFdecorrelator.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/higgs_dna/tools/decorrelation_CDFs_dumper/CDFdecorrelator.py b/higgs_dna/tools/decorrelation_CDFs_dumper/CDFdecorrelator.py index 02de331e..189daf8a 100644 --- a/higgs_dna/tools/decorrelation_CDFs_dumper/CDFdecorrelator.py +++ b/higgs_dna/tools/decorrelation_CDFs_dumper/CDFdecorrelator.py @@ -95,7 +95,7 @@ def main(options): print("Created pickle file!") # a parquet file with the decorrelated variable will be created by default but takes more time - if options.parquetGeneration == "True": + if not options.parquetGenerationOff: print("INFO: new parquet will be created, this can take some time!") events['{}_decorr'.format(options.var)] = decorrelated_var if options.era: @@ -127,7 +127,7 @@ if __name__ == '__main__': optArgs.add_argument('--columns', nargs='+') optArgs.add_argument('--nomColumns', nargs='+') optArgs.add_argument('--vecColumns', nargs='+') - optArgs.add_argument('-p','--parquetGeneration', default='True', action='store', type=str, help="choose True or False to generate parquet files aswell, (default: True)") + optArgs.add_argument('-p', '--parquetGenerationOff', action='store_true', help="Set this flag to avoid generating parquet files (by default parquet files are created)") optArgs.add_argument('-e','--era', action='store', type=str, help="optional: choose era to give extra information in file name") options = parser.parse_args() main(options) -- GitLab