From af6291e96a23918a4233fc4128a47f785b4c0846 Mon Sep 17 00:00:00 2001
From: Maximilian <maximilianwrabetz@rwth-aachen.de>
Date: Tue, 10 Dec 2024 14:51:07 +0100
Subject: [PATCH 01/17] corrected read in of decorrelator.py

---
 higgs_dna/tools/decorrelation_CDFs_dumper/01_dumpCdfs_decorr.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/higgs_dna/tools/decorrelation_CDFs_dumper/01_dumpCdfs_decorr.py b/higgs_dna/tools/decorrelation_CDFs_dumper/01_dumpCdfs_decorr.py
index 84e67482..65ec7172 100644
--- a/higgs_dna/tools/decorrelation_CDFs_dumper/01_dumpCdfs_decorr.py
+++ b/higgs_dna/tools/decorrelation_CDFs_dumper/01_dumpCdfs_decorr.py
@@ -1,7 +1,7 @@
 # import root_pandas
 import numpy as np
 import pandas as pd
-from decorrelator import cdfCalc
+from higgs_dna.tools.decorrelator import cdfCalc
 import argparse
 import glob
 
-- 
GitLab


From 3e26d367580c653bc8f25ba01869f698f4cafd7a Mon Sep 17 00:00:00 2001
From: Maximilian <maximilianwrabetz@rwth-aachen.de>
Date: Tue, 10 Dec 2024 14:52:18 +0100
Subject: [PATCH 02/17] removed unnecessary flag

---
 higgs_dna/tools/decorrelation_CDFs_dumper/01_dumpCdfs_decorr.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/higgs_dna/tools/decorrelation_CDFs_dumper/01_dumpCdfs_decorr.py b/higgs_dna/tools/decorrelation_CDFs_dumper/01_dumpCdfs_decorr.py
index 65ec7172..6079efa6 100644
--- a/higgs_dna/tools/decorrelation_CDFs_dumper/01_dumpCdfs_decorr.py
+++ b/higgs_dna/tools/decorrelation_CDFs_dumper/01_dumpCdfs_decorr.py
@@ -35,7 +35,6 @@ if __name__ == "__main__":
     parser = argparse.ArgumentParser()
     requiredArgs = parser.add_argument_group()
     requiredArgs.add_argument('-i', '--infile', action='store', type=str, required=True)
-    requiredArgs.add_argument('-i', '--infile', action='store', type=str, required=True)
     requiredArgs.add_argument('-c', '--cdfsFile', action='store', type=str, required=True)
     requiredArgs.add_argument('-t', '--tree', action='store', type=str, required=True)
     options = parser.parse_args()
-- 
GitLab


From 7262b4a86c5c030f797c144201f2797fc2691118 Mon Sep 17 00:00:00 2001
From: Maximilian <maximilianwrabetz@rwth-aachen.de>
Date: Tue, 10 Dec 2024 14:53:33 +0100
Subject: [PATCH 03/17] removed repeating code

---
 .../tools/decorrelation_CDFs_dumper/01_dumpCdfs_decorr.py    | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/higgs_dna/tools/decorrelation_CDFs_dumper/01_dumpCdfs_decorr.py b/higgs_dna/tools/decorrelation_CDFs_dumper/01_dumpCdfs_decorr.py
index 6079efa6..2cd78fe2 100644
--- a/higgs_dna/tools/decorrelation_CDFs_dumper/01_dumpCdfs_decorr.py
+++ b/higgs_dna/tools/decorrelation_CDFs_dumper/01_dumpCdfs_decorr.py
@@ -20,11 +20,6 @@ def main(options):
 
     print(f"INFO: found {len(events)} events")
 
-    df["sigma_m_over_m"] = events[options].to_numpy()
-
-    df["mass"] = events.mass.to_numpy()
-    df["weight"] = events.weight.to_numpy()
-
     # Evaluating and dumping the CDFs in bins of mass
     calc = cdfCalc(df, options.tree,'mass',np.linspace(100, 180, 161))
     calc.dumpCdfs(options.cdfsFile)
-- 
GitLab


From 696aa85555c10cf10702a1d484b32c1caa333fe3 Mon Sep 17 00:00:00 2001
From: Maximilian <maximilianwrabetz@rwth-aachen.de>
Date: Tue, 10 Dec 2024 15:15:48 +0100
Subject: [PATCH 04/17] flag was not read in correctly

---
 higgs_dna/tools/decorrelation_CDFs_dumper/01_dumpCdfs_decorr.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/higgs_dna/tools/decorrelation_CDFs_dumper/01_dumpCdfs_decorr.py b/higgs_dna/tools/decorrelation_CDFs_dumper/01_dumpCdfs_decorr.py
index 2cd78fe2..84168f8b 100644
--- a/higgs_dna/tools/decorrelation_CDFs_dumper/01_dumpCdfs_decorr.py
+++ b/higgs_dna/tools/decorrelation_CDFs_dumper/01_dumpCdfs_decorr.py
@@ -14,7 +14,7 @@ def main(options):
     data = [pd.read_parquet(f) for f in files]
     events = pd.concat(data,ignore_index=True)
 
-    df["sigma_m_over_m"] = events[options].to_numpy()
+    df["sigma_m_over_m"] = events[options.tree].to_numpy()
     df["mass"] = events.mass.to_numpy()
     df["weight"] = events.weight.to_numpy()
 
-- 
GitLab


From 7dcb35c901495c8f8322e0c6aaa386630f5cad2f Mon Sep 17 00:00:00 2001
From: Maximilian <maximilianwrabetz@rwth-aachen.de>
Date: Tue, 10 Dec 2024 15:24:09 +0100
Subject: [PATCH 05/17] for consistency and comfort, the infile flag is now
 infilepath, consistent with 02_decorrelate

---
 .../tools/decorrelation_CDFs_dumper/01_dumpCdfs_decorr.py     | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/higgs_dna/tools/decorrelation_CDFs_dumper/01_dumpCdfs_decorr.py b/higgs_dna/tools/decorrelation_CDFs_dumper/01_dumpCdfs_decorr.py
index 84168f8b..0da3fd36 100644
--- a/higgs_dna/tools/decorrelation_CDFs_dumper/01_dumpCdfs_decorr.py
+++ b/higgs_dna/tools/decorrelation_CDFs_dumper/01_dumpCdfs_decorr.py
@@ -10,7 +10,7 @@ def main(options):
 
     df = pd.DataFrame()
 
-    files = glob.glob(options.infile)
+    files = glob.glob(str(options.infilepath) + "*.parquet")
     data = [pd.read_parquet(f) for f in files]
     events = pd.concat(data,ignore_index=True)
 
@@ -29,7 +29,7 @@ if __name__ == "__main__":
 
     parser = argparse.ArgumentParser()
     requiredArgs = parser.add_argument_group()
-    requiredArgs.add_argument('-i', '--infile', action='store', type=str, required=True)
+    requiredArgs.add_argument('-i', '--infilepath', action='store', type=str, required=True)
     requiredArgs.add_argument('-c', '--cdfsFile', action='store', type=str, required=True)
     requiredArgs.add_argument('-t', '--tree', action='store', type=str, required=True)
     options = parser.parse_args()
-- 
GitLab


From c4fc65ba24b305707fde0954250dcc4c0799c1d0 Mon Sep 17 00:00:00 2001
From: Maximilian <maximilianwrabetz@rwth-aachen.de>
Date: Tue, 10 Dec 2024 15:57:37 +0100
Subject: [PATCH 06/17] column names are now correct and the flags are improved

---
 .../tools/decorrelation_CDFs_dumper/01_dumpCdfs_decorr.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/higgs_dna/tools/decorrelation_CDFs_dumper/01_dumpCdfs_decorr.py b/higgs_dna/tools/decorrelation_CDFs_dumper/01_dumpCdfs_decorr.py
index 0da3fd36..7d393288 100644
--- a/higgs_dna/tools/decorrelation_CDFs_dumper/01_dumpCdfs_decorr.py
+++ b/higgs_dna/tools/decorrelation_CDFs_dumper/01_dumpCdfs_decorr.py
@@ -14,7 +14,7 @@ def main(options):
     data = [pd.read_parquet(f) for f in files]
     events = pd.concat(data,ignore_index=True)
 
-    df["sigma_m_over_m"] = events[options.tree].to_numpy()
+    df[options.tree] = events[options.tree].to_numpy()
     df["mass"] = events.mass.to_numpy()
     df["weight"] = events.weight.to_numpy()
 
@@ -29,8 +29,8 @@ if __name__ == "__main__":
 
     parser = argparse.ArgumentParser()
     requiredArgs = parser.add_argument_group()
-    requiredArgs.add_argument('-i', '--infilepath', action='store', type=str, required=True)
-    requiredArgs.add_argument('-c', '--cdfsFile', action='store', type=str, required=True)
-    requiredArgs.add_argument('-t', '--tree', action='store', type=str, required=True)
+    requiredArgs.add_argument('-i', '--infilepath', action='store', type=str, required=True, help="Input file path")
+    requiredArgs.add_argument('-c', '--cdfsFile', action='store', type=str, required=True, help="CDFs output file")
+    parser.add_argument('-t', '--tree', default='sigma_m_over_m', action='store', type=str, help="Tree name (default: sigma_m_over_m)")
     options = parser.parse_args()
     main(options)
-- 
GitLab


From 678dca0b1fa339af59c3c00924400c9c809acbc6 Mon Sep 17 00:00:00 2001
From: Maximilian <maximilianwrabetz@rwth-aachen.de>
Date: Tue, 10 Dec 2024 16:16:58 +0100
Subject: [PATCH 07/17] made it process faster

---
 .../decorrelation_CDFs_dumper/01_dumpCdfs_decorr.py   | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/higgs_dna/tools/decorrelation_CDFs_dumper/01_dumpCdfs_decorr.py b/higgs_dna/tools/decorrelation_CDFs_dumper/01_dumpCdfs_decorr.py
index 7d393288..6c725e59 100644
--- a/higgs_dna/tools/decorrelation_CDFs_dumper/01_dumpCdfs_decorr.py
+++ b/higgs_dna/tools/decorrelation_CDFs_dumper/01_dumpCdfs_decorr.py
@@ -4,15 +4,22 @@ import pandas as pd
 from higgs_dna.tools.decorrelator import cdfCalc
 import argparse
 import glob
+import pyarrow.parquet as pq
+import concurrent.futures
 
+def read_parquet_file(file_path):
+    return pd.read_parquet(file_path)
 
 def main(options):
 
     df = pd.DataFrame()
 
     files = glob.glob(str(options.infilepath) + "*.parquet")
-    data = [pd.read_parquet(f) for f in files]
-    events = pd.concat(data,ignore_index=True)
+    
+    with concurrent.futures.ThreadPoolExecutor() as executor:
+        data = list(executor.map(read_parquet_file, files))
+    
+    events = pd.concat(data, ignore_index=True)
 
     df[options.tree] = events[options.tree].to_numpy()
     df["mass"] = events.mass.to_numpy()
-- 
GitLab


From 606486907aaed5245584c20f3df28ad0c01d3fb2 Mon Sep 17 00:00:00 2001
From: Maximilian <maximilianwrabetz@rwth-aachen.de>
Date: Tue, 10 Dec 2024 16:21:51 +0100
Subject: [PATCH 08/17] some more explanations for the trees

---
 .../tools/decorrelation_CDFs_dumper/01_dumpCdfs_decorr.py     | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/higgs_dna/tools/decorrelation_CDFs_dumper/01_dumpCdfs_decorr.py b/higgs_dna/tools/decorrelation_CDFs_dumper/01_dumpCdfs_decorr.py
index 6c725e59..a66ea8fd 100644
--- a/higgs_dna/tools/decorrelation_CDFs_dumper/01_dumpCdfs_decorr.py
+++ b/higgs_dna/tools/decorrelation_CDFs_dumper/01_dumpCdfs_decorr.py
@@ -36,8 +36,8 @@ if __name__ == "__main__":
 
     parser = argparse.ArgumentParser()
     requiredArgs = parser.add_argument_group()
-    requiredArgs.add_argument('-i', '--infilepath', action='store', type=str, required=True, help="Input file path")
-    requiredArgs.add_argument('-c', '--cdfsFile', action='store', type=str, required=True, help="CDFs output file")
+    requiredArgs.add_argument('-i', '--infilepath', action='store', type=str, required=True, help="Input file path, e.g. /net/.../samples/")
+    requiredArgs.add_argument('-c', '--cdfsFile', action='store', type=str, required=True, help="CDFs output file, is cdf_input for 02_decorrelate.py")
     parser.add_argument('-t', '--tree', default='sigma_m_over_m', action='store', type=str, help="Tree name (default: sigma_m_over_m)")
     options = parser.parse_args()
     main(options)
-- 
GitLab


From 2f70f2bc3227d74d3dd0e2a659ea413d4f673817 Mon Sep 17 00:00:00 2001
From: Maximilian <maximilianwrabetz@rwth-aachen.de>
Date: Tue, 10 Dec 2024 16:34:19 +0100
Subject: [PATCH 09/17] created default output file, only the input path
 remains necessary

---
 .../tools/decorrelation_CDFs_dumper/01_dumpCdfs_decorr.py     | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/higgs_dna/tools/decorrelation_CDFs_dumper/01_dumpCdfs_decorr.py b/higgs_dna/tools/decorrelation_CDFs_dumper/01_dumpCdfs_decorr.py
index a66ea8fd..583b79c5 100644
--- a/higgs_dna/tools/decorrelation_CDFs_dumper/01_dumpCdfs_decorr.py
+++ b/higgs_dna/tools/decorrelation_CDFs_dumper/01_dumpCdfs_decorr.py
@@ -29,7 +29,7 @@ def main(options):
 
     # Evaluating and dumping the CDFs in bins of mass
     calc = cdfCalc(df, options.tree,'mass',np.linspace(100, 180, 161))
-    calc.dumpCdfs(options.cdfsFile)
+    calc.dumpCdfs(str(options.cdfsFile) + ".pkl.gz")
 
 
 if __name__ == "__main__":
@@ -37,7 +37,7 @@ if __name__ == "__main__":
     parser = argparse.ArgumentParser()
     requiredArgs = parser.add_argument_group()
     requiredArgs.add_argument('-i', '--infilepath', action='store', type=str, required=True, help="Input file path, e.g. /net/.../samples/")
-    requiredArgs.add_argument('-c', '--cdfsFile', action='store', type=str, required=True, help="CDFs output file, is cdf_input for 02_decorrelate.py")
+    requiredArgs.add_argument('-c', '--cdfsFile', default='output_01_dumpCDFs_decorr', action='store', type=str, help="CDFs output file in pkl.gz format, is cdf_input for 02_decorrelate.py")
     parser.add_argument('-t', '--tree', default='sigma_m_over_m', action='store', type=str, help="Tree name (default: sigma_m_over_m)")
     options = parser.parse_args()
     main(options)
-- 
GitLab


From a8eb12566994669b24e06a87f21ef7c7041ece99 Mon Sep 17 00:00:00 2001
From: Maximilian <maximilianwrabetz@rwth-aachen.de>
Date: Tue, 10 Dec 2024 16:45:15 +0100
Subject: [PATCH 10/17] added some error messages and output for help

---
 .../decorrelation_CDFs_dumper/01_dumpCdfs_decorr.py    | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/higgs_dna/tools/decorrelation_CDFs_dumper/01_dumpCdfs_decorr.py b/higgs_dna/tools/decorrelation_CDFs_dumper/01_dumpCdfs_decorr.py
index 583b79c5..28027165 100644
--- a/higgs_dna/tools/decorrelation_CDFs_dumper/01_dumpCdfs_decorr.py
+++ b/higgs_dna/tools/decorrelation_CDFs_dumper/01_dumpCdfs_decorr.py
@@ -21,12 +21,18 @@ def main(options):
     
     events = pd.concat(data, ignore_index=True)
 
-    df[options.tree] = events[options.tree].to_numpy()
+    if options.tree in events:
+        df[options.tree] = events[options.tree].to_numpy()
+    else:
+        print(f"ERROR: tree not in columns of parquet files from {options.infilepath}")
+        print(f"Please choose from {[col for col in events.columns if col.startswith('sigma')]}")
+        exit()
+
     df["mass"] = events.mass.to_numpy()
     df["weight"] = events.weight.to_numpy()
 
     print(f"INFO: found {len(events)} events")
-
+    print(f"INFO: output CDF file contains: {df.columns.tolist()}")
     # Evaluating and dumping the CDFs in bins of mass
     calc = cdfCalc(df, options.tree,'mass',np.linspace(100, 180, 161))
     calc.dumpCdfs(str(options.cdfsFile) + ".pkl.gz")
-- 
GitLab


From d53694db97ae87bece4cf96e1d97dff5a21ca38b Mon Sep 17 00:00:00 2001
From: Maximilian <maximilianwrabetz@rwth-aachen.de>
Date: Tue, 10 Dec 2024 16:51:00 +0100
Subject: [PATCH 11/17] made flake8 happy

---
 .../tools/decorrelation_CDFs_dumper/01_dumpCdfs_decorr.py  | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/higgs_dna/tools/decorrelation_CDFs_dumper/01_dumpCdfs_decorr.py b/higgs_dna/tools/decorrelation_CDFs_dumper/01_dumpCdfs_decorr.py
index 28027165..36a63c6b 100644
--- a/higgs_dna/tools/decorrelation_CDFs_dumper/01_dumpCdfs_decorr.py
+++ b/higgs_dna/tools/decorrelation_CDFs_dumper/01_dumpCdfs_decorr.py
@@ -1,26 +1,23 @@
-# import root_pandas
 import numpy as np
 import pandas as pd
 from higgs_dna.tools.decorrelator import cdfCalc
 import argparse
 import glob
-import pyarrow.parquet as pq
 import concurrent.futures
 
+
 def read_parquet_file(file_path):
     return pd.read_parquet(file_path)
 
+
 def main(options):
 
     df = pd.DataFrame()
 
     files = glob.glob(str(options.infilepath) + "*.parquet")
-    
     with concurrent.futures.ThreadPoolExecutor() as executor:
         data = list(executor.map(read_parquet_file, files))
-    
     events = pd.concat(data, ignore_index=True)
-
     if options.tree in events:
         df[options.tree] = events[options.tree].to_numpy()
     else:
-- 
GitLab


From 11ea4feb5aae3c676a46023285e9b42f98425503 Mon Sep 17 00:00:00 2001
From: Maximilian <maximilianwrabetz@rwth-aachen.de>
Date: Wed, 11 Dec 2024 10:25:40 +0100
Subject: [PATCH 12/17] updated the read in of decorr and removed personal
 lines

---
 .../decorrelation_CDFs_dumper/02_decorrelate.py      | 12 +-----------
 1 file changed, 1 insertion(+), 11 deletions(-)

diff --git a/higgs_dna/tools/decorrelation_CDFs_dumper/02_decorrelate.py b/higgs_dna/tools/decorrelation_CDFs_dumper/02_decorrelate.py
index d2446aa1..04c6dc50 100644
--- a/higgs_dna/tools/decorrelation_CDFs_dumper/02_decorrelate.py
+++ b/higgs_dna/tools/decorrelation_CDFs_dumper/02_decorrelate.py
@@ -1,4 +1,4 @@
-import decorrelator as decorr
+import higgs_dna.tools.decorrelator as decorr
 import awkward as ak
 import argparse
 import os
@@ -45,7 +45,6 @@ def main(options):
 
     files = glob.glob(str(options.infile) + "*.parquet")
     data = [pd.read_parquet(f) for f in files]
-    # data = pd.read_parquet("/net/scratch_cms3a/daumann/massresdecorrhiggsdna/big_bkg/Diphoton.parquet")
     events = pd.concat(data,ignore_index=True)
 
     df = pd.DataFrame()
@@ -59,15 +58,6 @@ def main(options):
 
     df['{}_decorr'.format(options.var)] = decl.doDecorr(options.ref)
 
-    if 'sigmaMoM_decorr' in df.columns:
-        df['sigmaMoM_decorrOld'] = df['sigmaMoM_decorr']
-
-    if options.var == 'sigmarv':
-        df['sigmaMoM_decorr'] = df['sigmarv_decorr']
-
-    if options.var == 'sigmaRV':
-        df['sigmaMoM_decorr'] = df['sigmaRV_decorr']
-
     events["sigma_m_over_m_decorr"] = decl.doDecorr(options.ref)
 
     events.to_parquet(options.outFile)
-- 
GitLab


From 08c8782195108fb5638cf8e94340a415baca400a Mon Sep 17 00:00:00 2001
From: Maximilian <maximilianwrabetz@rwth-aachen.de>
Date: Wed, 11 Dec 2024 13:10:28 +0100
Subject: [PATCH 13/17] script can now directly plot the pkl.gz file + other
 improvements

---
 .../02_decorrelate.py                         | 53 +++++++++++--------
 1 file changed, 32 insertions(+), 21 deletions(-)

diff --git a/higgs_dna/tools/decorrelation_CDFs_dumper/02_decorrelate.py b/higgs_dna/tools/decorrelation_CDFs_dumper/02_decorrelate.py
index 04c6dc50..e3c182fb 100644
--- a/higgs_dna/tools/decorrelation_CDFs_dumper/02_decorrelate.py
+++ b/higgs_dna/tools/decorrelation_CDFs_dumper/02_decorrelate.py
@@ -1,4 +1,5 @@
 import higgs_dna.tools.decorrelator as decorr
+from higgs_dna.tools.decorrelator import cdfCalc
 import awkward as ak
 import argparse
 import os
@@ -33,50 +34,60 @@ def getArrayBranchName(branchname, fieldname, index):
 
 def main(options):
 
-    if options.infile == options.outFile:
-        raise RuntimeError('Outfile will be recreated, cannot be the same as infile')
-
-    if os.path.exists(options.outFile):
-        print("WARNING: outfile exists.")
-
+    # loading cdf file
     dummyDf = pd.DataFrame({'{}'.format(options.var): [0], '{}'.format(options.dVar): [0]})
     decl = decorr.decorrelator(dummyDf, options.var, options.dVar, np.linspace(100., 180., 161))
-    decl.loadCdfs(options.cdfFile)
+    decl.loadCdfs(str(options.cdfsFile) + ".pkl.gz")
 
-    files = glob.glob(str(options.infile) + "*.parquet")
+    # loading parquet files
+    files = glob.glob(str(options.infilepath) + "*.parquet")
     data = [pd.read_parquet(f) for f in files]
     events = pd.concat(data,ignore_index=True)
 
+    # creating dataframe for decorrelation
     df = pd.DataFrame()
-    df["sigma_m_over_m"] = events.sigma_m_over_m_Smeared.to_numpy()
-    df["mass"] = events.mass.to_numpy()
+    df[options.var] = events[options.var].to_numpy()
+    df[options.dVar] = events[options.dVar].to_numpy()
     df["weight"] = events.weight.to_numpy()
 
+    #printing variables that will be decorrelated
     print("var, dVar:", options.var, options.dVar)
+    # giving variables to dataframe and resetting index
     decl.df = df.loc[:, [options.var, options.dVar]]
     decl.df.reset_index(inplace=True)
 
+    # doing actually the decorr
     df['{}_decorr'.format(options.var)] = decl.doDecorr(options.ref)
-
-    events["sigma_m_over_m_decorr"] = decl.doDecorr(options.ref)
-
-    events.to_parquet(options.outFile)
-
+    events['{}_decorr'.format(options.var)] = decl.doDecorr(options.ref)
+    
+    #create pkl.gz output file and remove the old sigma_m_over_m variable
+    df = df.drop(columns=options.var)
+    calc = cdfCalc(df, '{}_decorr'.format(options.var), options.dVar, np.linspace(100, 180, 161))
+    if options.outFile:
+        calc.dumpCdfs(str(options.outFile) + ".pkl.gz")
+    else:
+        calc.dumpCdfs(str(options.var) + "_decorr" + ".pkl.gz")
+    
+    if options.parquetGeneration == "True":
+        print("parquet file will be created")
+        events['{}_decorr'.format(options.var)] = decl.doDecorr(options.ref)
+        events.to_parquet(options.outFile + ".parquet")
 
 if __name__ == '__main__':
 
     parser = argparse.ArgumentParser()
     requiredArgs = parser.add_argument_group('Required Arguements')
-    requiredArgs.add_argument('-t','--tree', nargs='+', required=True)
-    requiredArgs.add_argument('-i', '--infile', action='store', type=str, required=True)
-    requiredArgs.add_argument('-c','--cdfFile', action='store', type=str, required=True)
-    requiredArgs.add_argument('-v','--var', action='store', type=str, required=True)
-    requiredArgs.add_argument('-d','--dVar', action='store', type=str, required=True)
-    requiredArgs.add_argument('-o','--outFile', action='store', type=str, required=True)
+    parser.add_argument('-t', '--tree', default='sigma_m_over_m', action='store', type=str, help="Tree name (default: sigma_m_over_m)")
+    requiredArgs.add_argument('-i', '--infilepath', action='store', type=str, required=True, help="Input file path, e.g. /net/.../samples/")
+    requiredArgs.add_argument('-c', '--cdfsFile', default='output_01_dumpCDFs_decorr', action='store', type=str, help="CDFs output file in pkl.gz format, is the output from 01_dumpCdfs_decorr.py")
+    requiredArgs.add_argument('-v','--var', default='sigma_m_over_m', action='store', type=str, help="variable you want to decorrelate (default: sigma_m_over_m)")
+    requiredArgs.add_argument('-d','--dVar', default='mass', action='store', type=str, help="variable you want to correlate against, most likely mass, (default: mass)")
+    requiredArgs.add_argument('-o','--outFile', action='store', type=str, help ="filename and path to the decorrelated files, default: relative to var")
     optArgs = parser.add_argument_group('Optional Arguments')
     optArgs.add_argument('-r', '--ref', action='store', type=float, default=125.)
     optArgs.add_argument('--columns', nargs='+')
     optArgs.add_argument('--nomColumns', nargs='+')
     optArgs.add_argument('--vecColumns', nargs='+')
+    optArgs.add_argument('-p','--parquetGeneration', default='True', action='store', type=str, help="choose True or False to generate parquet files aswell, (default: mass)")
     options = parser.parse_args()
     main(options)
-- 
GitLab


From b13c5ae6d165bd15cc6cd4181cdc0c5a9693ca89 Mon Sep 17 00:00:00 2001
From: Maximilian <maximilianwrabetz@rwth-aachen.de>
Date: Wed, 11 Dec 2024 13:40:05 +0100
Subject: [PATCH 14/17] first working fusion of both scripts, no intermediate
 cdf file needed + small improvements

---
 .../01_dumpCdfs_decorr.py                     | 46 -------------------
 .../{02_decorrelate.py => CDFdecorrelator.py} | 43 +++++++++++------
 2 files changed, 28 insertions(+), 61 deletions(-)
 delete mode 100644 higgs_dna/tools/decorrelation_CDFs_dumper/01_dumpCdfs_decorr.py
 rename higgs_dna/tools/decorrelation_CDFs_dumper/{02_decorrelate.py => CDFdecorrelator.py} (78%)

diff --git a/higgs_dna/tools/decorrelation_CDFs_dumper/01_dumpCdfs_decorr.py b/higgs_dna/tools/decorrelation_CDFs_dumper/01_dumpCdfs_decorr.py
deleted file mode 100644
index 36a63c6b..00000000
--- a/higgs_dna/tools/decorrelation_CDFs_dumper/01_dumpCdfs_decorr.py
+++ /dev/null
@@ -1,46 +0,0 @@
-import numpy as np
-import pandas as pd
-from higgs_dna.tools.decorrelator import cdfCalc
-import argparse
-import glob
-import concurrent.futures
-
-
-def read_parquet_file(file_path):
-    return pd.read_parquet(file_path)
-
-
-def main(options):
-
-    df = pd.DataFrame()
-
-    files = glob.glob(str(options.infilepath) + "*.parquet")
-    with concurrent.futures.ThreadPoolExecutor() as executor:
-        data = list(executor.map(read_parquet_file, files))
-    events = pd.concat(data, ignore_index=True)
-    if options.tree in events:
-        df[options.tree] = events[options.tree].to_numpy()
-    else:
-        print(f"ERROR: tree not in columns of parquet files from {options.infilepath}")
-        print(f"Please choose from {[col for col in events.columns if col.startswith('sigma')]}")
-        exit()
-
-    df["mass"] = events.mass.to_numpy()
-    df["weight"] = events.weight.to_numpy()
-
-    print(f"INFO: found {len(events)} events")
-    print(f"INFO: output CDF file contains: {df.columns.tolist()}")
-    # Evaluating and dumping the CDFs in bins of mass
-    calc = cdfCalc(df, options.tree,'mass',np.linspace(100, 180, 161))
-    calc.dumpCdfs(str(options.cdfsFile) + ".pkl.gz")
-
-
-if __name__ == "__main__":
-
-    parser = argparse.ArgumentParser()
-    requiredArgs = parser.add_argument_group()
-    requiredArgs.add_argument('-i', '--infilepath', action='store', type=str, required=True, help="Input file path, e.g. /net/.../samples/")
-    requiredArgs.add_argument('-c', '--cdfsFile', default='output_01_dumpCDFs_decorr', action='store', type=str, help="CDFs output file in pkl.gz format, is cdf_input for 02_decorrelate.py")
-    parser.add_argument('-t', '--tree', default='sigma_m_over_m', action='store', type=str, help="Tree name (default: sigma_m_over_m)")
-    options = parser.parse_args()
-    main(options)
diff --git a/higgs_dna/tools/decorrelation_CDFs_dumper/02_decorrelate.py b/higgs_dna/tools/decorrelation_CDFs_dumper/CDFdecorrelator.py
similarity index 78%
rename from higgs_dna/tools/decorrelation_CDFs_dumper/02_decorrelate.py
rename to higgs_dna/tools/decorrelation_CDFs_dumper/CDFdecorrelator.py
index e3c182fb..6bb1f9e1 100644
--- a/higgs_dna/tools/decorrelation_CDFs_dumper/02_decorrelate.py
+++ b/higgs_dna/tools/decorrelation_CDFs_dumper/CDFdecorrelator.py
@@ -6,7 +6,10 @@ import os
 import pandas as pd
 import numpy as np
 import glob
+import concurrent.futures
 
+def read_parquet_file(file_path):
+    return pd.read_parquet(file_path)
 
 def printProgressBar(iteration,total,prefix='',suffix='',decimals=1,length=100,fill=chr(9608),printEnd="\r"):
 
@@ -17,7 +20,6 @@ def printProgressBar(iteration,total,prefix='',suffix='',decimals=1,length=100,f
     if iteration == total:
         print()
 
-
 def diphoton_ak_array(diphotons: ak.Array) -> ak.Array:
 
     output = {}
@@ -25,7 +27,6 @@ def diphoton_ak_array(diphotons: ak.Array) -> ak.Array:
         output[field] = diphotons[field]
     return ak.Array(output)
 
-
 def getArrayBranchName(branchname, fieldname, index):
     if index != ():
         return '{}{}'.format(branchname, index[0])
@@ -33,24 +34,33 @@ def getArrayBranchName(branchname, fieldname, index):
 
 
 def main(options):
+    
+    df = pd.DataFrame()
 
-    # loading cdf file
-    dummyDf = pd.DataFrame({'{}'.format(options.var): [0], '{}'.format(options.dVar): [0]})
-    decl = decorr.decorrelator(dummyDf, options.var, options.dVar, np.linspace(100., 180., 161))
-    decl.loadCdfs(str(options.cdfsFile) + ".pkl.gz")
-
-    # loading parquet files
     files = glob.glob(str(options.infilepath) + "*.parquet")
-    data = [pd.read_parquet(f) for f in files]
-    events = pd.concat(data,ignore_index=True)
+    with concurrent.futures.ThreadPoolExecutor() as executor:
+        data = list(executor.map(read_parquet_file, files))
+    events = pd.concat(data, ignore_index=True)
+    if options.tree in events:
+        df[options.tree] = events[options.tree].to_numpy()
+    else:
+        print(f"ERROR: tree not in columns of parquet files from {options.infilepath}")
+        print(f"Please choose from {[col for col in events.columns if col.startswith('sigma')]}")
+        exit()
 
-    # creating dataframe for decorrelation
-    df = pd.DataFrame()
-    df[options.var] = events[options.var].to_numpy()
-    df[options.dVar] = events[options.dVar].to_numpy()
+    df["mass"] = events.mass.to_numpy()
     df["weight"] = events.weight.to_numpy()
 
-    #printing variables that will be decorrelated
+    print(f"INFO: found {len(events)} events")
+    print(f"INFO: output CDF file contains: {df.columns.tolist()}")
+
+    calc = cdfCalc(df, options.tree, 'mass', np.linspace(100, 180, 161))
+    calc.calcCdfs()
+    cdfs = calc.cdfs
+    dummyDf = pd.DataFrame({'{}'.format(options.var): [0], '{}'.format(options.dVar): [0]})
+    decl = decorr.decorrelator(dummyDf, options.var, options.dVar, np.linspace(100., 180., 161))
+    decl.cdfs = cdfs
+
     print("var, dVar:", options.var, options.dVar)
     # giving variables to dataframe and resetting index
     decl.df = df.loc[:, [options.var, options.dVar]]
@@ -63,11 +73,14 @@ def main(options):
     #create pkl.gz output file and remove the old sigma_m_over_m variable
     df = df.drop(columns=options.var)
     calc = cdfCalc(df, '{}_decorr'.format(options.var), options.dVar, np.linspace(100, 180, 161))
+    
+    print(f"INFO: decorrelated CDF file contains: {df.columns.tolist()}")
     if options.outFile:
         calc.dumpCdfs(str(options.outFile) + ".pkl.gz")
     else:
         calc.dumpCdfs(str(options.var) + "_decorr" + ".pkl.gz")
     
+    # a parquet file with the decorrelated variable will be created by default but takes time
     if options.parquetGeneration == "True":
         print("parquet file will be created")
         events['{}_decorr'.format(options.var)] = decl.doDecorr(options.ref)
-- 
GitLab


From 5e16df58edc7d7ef32ee3585505e461311e8700e Mon Sep 17 00:00:00 2001
From: Maximilian <maximilianwrabetz@rwth-aachen.de>
Date: Wed, 11 Dec 2024 15:05:01 +0100
Subject: [PATCH 15/17] a lot of improvements, comments and helpful messages in
 case of an error. flake8 happy as well

---
 .../CDFdecorrelator.py                        | 83 +++++++++++--------
 1 file changed, 48 insertions(+), 35 deletions(-)

diff --git a/higgs_dna/tools/decorrelation_CDFs_dumper/CDFdecorrelator.py b/higgs_dna/tools/decorrelation_CDFs_dumper/CDFdecorrelator.py
index 6bb1f9e1..d4b147fb 100644
--- a/higgs_dna/tools/decorrelation_CDFs_dumper/CDFdecorrelator.py
+++ b/higgs_dna/tools/decorrelation_CDFs_dumper/CDFdecorrelator.py
@@ -1,16 +1,18 @@
-import higgs_dna.tools.decorrelator as decorr
+
 from higgs_dna.tools.decorrelator import cdfCalc
+import higgs_dna.tools.decorrelator as decorr
+import concurrent.futures
 import awkward as ak
-import argparse
-import os
 import pandas as pd
 import numpy as np
+import argparse
 import glob
-import concurrent.futures
+
 
 def read_parquet_file(file_path):
     return pd.read_parquet(file_path)
 
+
 def printProgressBar(iteration,total,prefix='',suffix='',decimals=1,length=100,fill=chr(9608),printEnd="\r"):
 
     percent = ("{0:." + str(decimals) + "f}").format(100 * (iteration / float(total)))
@@ -20,6 +22,7 @@ def printProgressBar(iteration,total,prefix='',suffix='',decimals=1,length=100,f
     if iteration == total:
         print()
 
+
 def diphoton_ak_array(diphotons: ak.Array) -> ak.Array:
 
     output = {}
@@ -27,6 +30,7 @@ def diphoton_ak_array(diphotons: ak.Array) -> ak.Array:
         output[field] = diphotons[field]
     return ak.Array(output)
 
+
 def getArrayBranchName(branchname, fieldname, index):
     if index != ():
         return '{}{}'.format(branchname, index[0])
@@ -34,73 +38,82 @@ def getArrayBranchName(branchname, fieldname, index):
 
 
 def main(options):
-    
+    # reading the parquet files
     df = pd.DataFrame()
-
     files = glob.glob(str(options.infilepath) + "*.parquet")
     with concurrent.futures.ThreadPoolExecutor() as executor:
         data = list(executor.map(read_parquet_file, files))
     events = pd.concat(data, ignore_index=True)
-    if options.tree in events:
-        df[options.tree] = events[options.tree].to_numpy()
+    print(f"INFO: found {len(events)} events")
+
+    # helper and parquet column read in
+    if options.var in events:
+        df[options.var] = events[options.var].to_numpy()
     else:
-        print(f"ERROR: tree not in columns of parquet files from {options.infilepath}")
+        print(f"ERROR: var not in columns of parquet files from {options.infilepath}")
         print(f"Please choose from {[col for col in events.columns if col.startswith('sigma')]}")
         exit()
-
-    df["mass"] = events.mass.to_numpy()
+    df[options.dVar] = events[options.dVar].to_numpy()
     df["weight"] = events.weight.to_numpy()
 
-    print(f"INFO: found {len(events)} events")
-    print(f"INFO: output CDF file contains: {df.columns.tolist()}")
-
-    calc = cdfCalc(df, options.tree, 'mass', np.linspace(100, 180, 161))
+    # handling of decorrelator
+    calc = cdfCalc(df, options.var, options.dVar, np.linspace(100, 180, 161))
     calc.calcCdfs()
     cdfs = calc.cdfs
     dummyDf = pd.DataFrame({'{}'.format(options.var): [0], '{}'.format(options.dVar): [0]})
     decl = decorr.decorrelator(dummyDf, options.var, options.dVar, np.linspace(100., 180., 161))
     decl.cdfs = cdfs
+    print("var, dVar:", options.var,", ", options.dVar)
 
-    print("var, dVar:", options.var, options.dVar)
     # giving variables to dataframe and resetting index
     decl.df = df.loc[:, [options.var, options.dVar]]
     decl.df.reset_index(inplace=True)
 
-    # doing actually the decorr
+    # doing actually the decorr for pickle
     df['{}_decorr'.format(options.var)] = decl.doDecorr(options.ref)
-    events['{}_decorr'.format(options.var)] = decl.doDecorr(options.ref)
-    
-    #create pkl.gz output file and remove the old sigma_m_over_m variable
+
+    # create pickle file and remove the old sigma_m_over_m variable
     df = df.drop(columns=options.var)
     calc = cdfCalc(df, '{}_decorr'.format(options.var), options.dVar, np.linspace(100, 180, 161))
-    
-    print(f"INFO: decorrelated CDF file contains: {df.columns.tolist()}")
-    if options.outFile:
-        calc.dumpCdfs(str(options.outFile) + ".pkl.gz")
+    print(f"INFO: decorrelated CDF contains: {df.columns.tolist()}")
+    if options.era:
+        if options.outFile:
+            calc.dumpCdfs(str(options.outFile) + "_" + str(options.era) + ".pkl.gz")
+        else:
+            calc.dumpCdfs(str(options.var) + "_" + str(options.era) + "_CDFs" + ".pkl.gz")
     else:
-        calc.dumpCdfs(str(options.var) + "_decorr" + ".pkl.gz")
-    
-    # a parquet file with the decorrelated variable will be created by default but takes time
+        if options.outFile:
+            calc.dumpCdfs(str(options.outFile) + ".pkl.gz")
+        else:
+            calc.dumpCdfs(str(options.var) + "_CDFs" + ".pkl.gz")
+
+    # a parquet file with the decorrelated variable will be created by default but takes more time
     if options.parquetGeneration == "True":
-        print("parquet file will be created")
-        events['{}_decorr'.format(options.var)] = decl.doDecorr(options.ref)
-        events.to_parquet(options.outFile + ".parquet")
+        print("INFO: new parquet and pickle files will be created!")
+        if options.era:
+            events['{}_decorr'.format(options.var)] = decl.doDecorr(options.ref)
+            events.to_parquet(options.outFile + "_" + str(options.era) + ".parquet")
+        else:
+            events['{}_decorr'.format(options.var)] = decl.doDecorr(options.ref)
+            events.to_parquet(options.outFile + ".parquet")
+    else:
+        print("INFO: new parquet file will not be created, only pickle!")
+
 
 if __name__ == '__main__':
 
     parser = argparse.ArgumentParser()
     requiredArgs = parser.add_argument_group('Required Arguements')
-    parser.add_argument('-t', '--tree', default='sigma_m_over_m', action='store', type=str, help="Tree name (default: sigma_m_over_m)")
     requiredArgs.add_argument('-i', '--infilepath', action='store', type=str, required=True, help="Input file path, e.g. /net/.../samples/")
-    requiredArgs.add_argument('-c', '--cdfsFile', default='output_01_dumpCDFs_decorr', action='store', type=str, help="CDFs output file in pkl.gz format, is the output from 01_dumpCdfs_decorr.py")
     requiredArgs.add_argument('-v','--var', default='sigma_m_over_m', action='store', type=str, help="variable you want to decorrelate (default: sigma_m_over_m)")
     requiredArgs.add_argument('-d','--dVar', default='mass', action='store', type=str, help="variable you want to correlate against, most likely mass, (default: mass)")
-    requiredArgs.add_argument('-o','--outFile', action='store', type=str, help ="filename and path to the decorrelated files, default: relative to var")
+    requiredArgs.add_argument('-o','--outFile', action='store', type=str, help="filename and path to the decorrelated files, default .<var>(_era)_CDFs")
     optArgs = parser.add_argument_group('Optional Arguments')
-    optArgs.add_argument('-r', '--ref', action='store', type=float, default=125.)
+    optArgs.add_argument('-r', '--ref', action='store', type=float, default=125., help="reference mass for decorrelation")
     optArgs.add_argument('--columns', nargs='+')
     optArgs.add_argument('--nomColumns', nargs='+')
     optArgs.add_argument('--vecColumns', nargs='+')
-    optArgs.add_argument('-p','--parquetGeneration', default='True', action='store', type=str, help="choose True or False to generate parquet files aswell, (default: mass)")
+    optArgs.add_argument('-p','--parquetGeneration', default='True', action='store', type=str, help="choose True or False to generate parquet files aswell, (default: True)")
+    optArgs.add_argument('-e','--era', action='store', type=str, help="optional: choose era to give extra information in file name")
     options = parser.parse_args()
     main(options)
-- 
GitLab


From f908ef6cfe85387c126cd6c949f516e178b87e83 Mon Sep 17 00:00:00 2001
From: Maximilian <maximilianwrabetz@rwth-aachen.de>
Date: Wed, 11 Dec 2024 15:41:08 +0100
Subject: [PATCH 16/17] even more help messages now, made flake8 happy

---
 .../CDFdecorrelator.py                        | 30 ++++++++++++++-----
 1 file changed, 22 insertions(+), 8 deletions(-)

diff --git a/higgs_dna/tools/decorrelation_CDFs_dumper/CDFdecorrelator.py b/higgs_dna/tools/decorrelation_CDFs_dumper/CDFdecorrelator.py
index d4b147fb..02de331e 100644
--- a/higgs_dna/tools/decorrelation_CDFs_dumper/CDFdecorrelator.py
+++ b/higgs_dna/tools/decorrelation_CDFs_dumper/CDFdecorrelator.py
@@ -39,6 +39,11 @@ def getArrayBranchName(branchname, fieldname, index):
 
 def main(options):
     # reading the parquet files
+    if not options.infilepath.endswith('/'):
+        print(f"WARNING: Please make sure that {options.infilepath} is a path to .parquet files and ends with /")
+        options.infilepath = options.infilepath + "/"
+        print(f"INFO: To help you out, the path is changed to: {options.infilepath}")
+
     df = pd.DataFrame()
     files = glob.glob(str(options.infilepath) + "*.parquet")
     with concurrent.futures.ThreadPoolExecutor() as executor:
@@ -51,7 +56,7 @@ def main(options):
         df[options.var] = events[options.var].to_numpy()
     else:
         print(f"ERROR: var not in columns of parquet files from {options.infilepath}")
-        print(f"Please choose from {[col for col in events.columns if col.startswith('sigma')]}")
+        print(f"Please choose e.g. from {[col for col in events.columns if col.startswith('sigma')]} or inspect columns for more information!")
         exit()
     df[options.dVar] = events[options.dVar].to_numpy()
     df["weight"] = events.weight.to_numpy()
@@ -69,8 +74,9 @@ def main(options):
     decl.df = df.loc[:, [options.var, options.dVar]]
     decl.df.reset_index(inplace=True)
 
-    # doing actually the decorr for pickle
-    df['{}_decorr'.format(options.var)] = decl.doDecorr(options.ref)
+    # doing the decorr
+    decorrelated_var = decl.doDecorr(options.ref)
+    df['{}_decorr'.format(options.var)] = decorrelated_var
 
     # create pickle file and remove the old sigma_m_over_m variable
     df = df.drop(columns=options.var)
@@ -86,16 +92,24 @@ def main(options):
             calc.dumpCdfs(str(options.outFile) + ".pkl.gz")
         else:
             calc.dumpCdfs(str(options.var) + "_CDFs" + ".pkl.gz")
+    print("Created pickle file!")
 
     # a parquet file with the decorrelated variable will be created by default but takes more time
     if options.parquetGeneration == "True":
-        print("INFO: new parquet and pickle files will be created!")
+        print("INFO: new parquet will be created, this can take some time!")
+        events['{}_decorr'.format(options.var)] = decorrelated_var
         if options.era:
-            events['{}_decorr'.format(options.var)] = decl.doDecorr(options.ref)
-            events.to_parquet(options.outFile + "_" + str(options.era) + ".parquet")
+            if options.outFile:
+                events.to_parquet(options.outFile + "_" + str(options.era) + ".parquet")
+            else:
+                events.to_parquet(str(options.var) + "_decorr_" + str(options.era) + ".parquet")
         else:
-            events['{}_decorr'.format(options.var)] = decl.doDecorr(options.ref)
-            events.to_parquet(options.outFile + ".parquet")
+            if options.outFile:
+                events.to_parquet(options.outFile + ".parquet")
+            else:
+                events.to_parquet(str(options.var) + "_decorr" + ".parquet")
+
+        print("Created parquet file!")
     else:
         print("INFO: new parquet file will not be created, only pickle!")
 
-- 
GitLab


From 98be24e127940ea4c25c188d214f2b0ea3a83712 Mon Sep 17 00:00:00 2001
From: Maximilian <maximilianwrabetz@rwth-aachen.de>
Date: Thu, 12 Dec 2024 13:23:27 +0100
Subject: [PATCH 17/17] flag to avoid parquet generation is improved, flake8
 happy

---
 higgs_dna/tools/decorrelation_CDFs_dumper/CDFdecorrelator.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/higgs_dna/tools/decorrelation_CDFs_dumper/CDFdecorrelator.py b/higgs_dna/tools/decorrelation_CDFs_dumper/CDFdecorrelator.py
index 02de331e..189daf8a 100644
--- a/higgs_dna/tools/decorrelation_CDFs_dumper/CDFdecorrelator.py
+++ b/higgs_dna/tools/decorrelation_CDFs_dumper/CDFdecorrelator.py
@@ -95,7 +95,7 @@ def main(options):
     print("Created pickle file!")
 
     # a parquet file with the decorrelated variable will be created by default but takes more time
-    if options.parquetGeneration == "True":
+    if not options.parquetGenerationOff:
         print("INFO: new parquet will be created, this can take some time!")
         events['{}_decorr'.format(options.var)] = decorrelated_var
         if options.era:
@@ -127,7 +127,7 @@ if __name__ == '__main__':
     optArgs.add_argument('--columns', nargs='+')
     optArgs.add_argument('--nomColumns', nargs='+')
     optArgs.add_argument('--vecColumns', nargs='+')
-    optArgs.add_argument('-p','--parquetGeneration', default='True', action='store', type=str, help="choose True or False to generate parquet files aswell, (default: True)")
+    optArgs.add_argument('-p', '--parquetGenerationOff', action='store_true', help="Set this flag to avoid generating parquet files (by default parquet files are created)")
     optArgs.add_argument('-e','--era', action='store', type=str, help="optional: choose era to give extra information in file name")
     options = parser.parse_args()
     main(options)
-- 
GitLab