From c284851f37fb1921688243bf5bd180b454db3dd2 Mon Sep 17 00:00:00 2001
From: alfroch <alexander.froch@cern.ch>
Date: Mon, 31 Jan 2022 15:27:35 +0100
Subject: [PATCH 01/10] Adding new path for PDF_sampling files

---
 umami/preprocessing_tools/Resampling.py | 39 +++++++++++++++----------
 1 file changed, 23 insertions(+), 16 deletions(-)

diff --git a/umami/preprocessing_tools/Resampling.py b/umami/preprocessing_tools/Resampling.py
index 88c5147b0..048333f72 100644
--- a/umami/preprocessing_tools/Resampling.py
+++ b/umami/preprocessing_tools/Resampling.py
@@ -339,10 +339,17 @@ class Resampling:
 
         self.outfile_name = self.config.GetFileName(option="resampled")
         self.outfile_path = self.config.config["parameters"]["sample_path"]
+        self.resampled_path = self.config.config["parameters"]["file_path"]
 
+        # Check if the directory for the outfile exists
         if os.path.dirname(self.outfile_name):
             os.makedirs(os.path.dirname(self.outfile_name), exist_ok=True)
 
+        # Check if the directory for the resampled, scaled files
+        # (normally preprocessed/) exists
+        if os.path.dirname(self.resampled_path):
+            os.makedirs(os.path.dirname(self.resampled_path), exist_ok=True)
+
         # Get class labels from sampling/preparation.
         # Try/Except here for backward compatibility
         try:
@@ -1296,7 +1303,7 @@ class PDFSampling(Resampling):  # pylint: disable=too-many-public-methods
             ],
         )
         save_name = os.path.join(
-            self.outfile_path,
+            self.resampled_path,
             "PDF_sampling",
             f"inter_func_{store_key}",
         )
@@ -1429,7 +1436,7 @@ class PDFSampling(Resampling):  # pylint: disable=too-many-public-methods
         """Get unnormalised PDF weight."""
         # Get the inter_func
         load_name = os.path.join(
-            self.outfile_path,
+            self.resampled_path,
             "PDF_sampling",
             f"inter_func_{store_key}",
         )
@@ -1469,7 +1476,7 @@ class PDFSampling(Resampling):  # pylint: disable=too-many-public-methods
 
         # Load number to sample
         load_name = os.path.join(
-            self.outfile_path,
+            self.resampled_path,
             "PDF_sampling",
             "target_data.json",
         )
@@ -1591,7 +1598,7 @@ class PDFSampling(Resampling):  # pylint: disable=too-many-public-methods
         create_file = True
         chunk_counter = 0
         save_name = os.path.join(
-            self.outfile_path,
+            self.resampled_path,
             "PDF_sampling",
             self.options["samples"][sample_category][sample_id] + "_selected.h5",
         )
@@ -1684,7 +1691,7 @@ class PDFSampling(Resampling):  # pylint: disable=too-many-public-methods
 
         # Load number to sample
         load_name = os.path.join(
-            self.outfile_path,
+            self.resampled_path,
             "PDF_sampling",
             "target_data.json",
         )
@@ -1693,7 +1700,7 @@ class PDFSampling(Resampling):  # pylint: disable=too-many-public-methods
         number_to_sample = target_data["number_to_sample"][sample_name]
 
         index_file = os.path.join(
-            self.outfile_path,
+            self.resampled_path,
             "PDF_sampling",
             self.options["samples"][sample_category][sample_id] + "_indices.h5",
         )
@@ -1704,7 +1711,7 @@ class PDFSampling(Resampling):  # pylint: disable=too-many-public-methods
         duplicate = True
 
         save_name = os.path.join(
-            self.outfile_path,
+            self.resampled_path,
             "PDF_sampling",
             self.options["samples"][sample_category][sample_id] + "_selected.h5",
         )
@@ -1879,12 +1886,12 @@ class PDFSampling(Resampling):  # pylint: disable=too-many-public-methods
             "target_fraction": self.target_fractions,
         }
         save_name = os.path.join(
-            self.outfile_path,
+            self.resampled_path,
             "PDF_sampling",
             "target_data.json",
         )
-        if not os.path.exists(os.path.join(self.outfile_path, "PDF_sampling")):
-            os.mkdir(os.path.join(self.outfile_path, "PDF_sampling"))
+        if not os.path.exists(os.path.join(self.resampled_path, "PDF_sampling")):
+            os.mkdir(os.path.join(self.resampled_path, "PDF_sampling"))
         with open(save_name, "w") as write_file:
             json.dump(save_data, write_file, cls=JsonNumpyEncoder)
 
@@ -1899,7 +1906,7 @@ class PDFSampling(Resampling):  # pylint: disable=too-many-public-methods
         """
 
         load_name = os.path.join(
-            self.outfile_path,
+            self.resampled_path,
             "PDF_sampling",
             "target_data.json",
         )
@@ -1998,7 +2005,7 @@ class PDFSampling(Resampling):  # pylint: disable=too-many-public-methods
 
         # Load the target data
         load_name = os.path.join(
-            self.outfile_path,
+            self.resampled_path,
             "PDF_sampling",
             "target_data.json",
         )
@@ -2060,14 +2067,14 @@ class PDFSampling(Resampling):  # pylint: disable=too-many-public-methods
 
         sample_name = self.options["samples"][sample_category][sample_id]
         save_name = os.path.join(
-            self.outfile_path,
+            self.resampled_path,
             "PDF_sampling",
             self.options["samples"][sample_category][sample_id] + "_indices.h5",
         )
 
         # Load number to sample
         load_name = os.path.join(
-            self.outfile_path,
+            self.resampled_path,
             "PDF_sampling",
             "target_data.json",
         )
@@ -2180,7 +2187,7 @@ class PDFSampling(Resampling):  # pylint: disable=too-many-public-methods
         ):
             for _, sample_category in enumerate(self.options["samples"]):
                 load_name = os.path.join(
-                    self.outfile_path,
+                    self.resampled_path,
                     "PDF_sampling",
                     self.options["samples"][sample_category][sample_id]
                     + "_selected.h5",
@@ -2362,7 +2369,7 @@ class PDFSampling(Resampling):  # pylint: disable=too-many-public-methods
         ):
             for cat_ind, sample_category in enumerate(self.options["samples"]):
                 load_name = os.path.join(
-                    self.outfile_path,
+                    self.resampled_path,
                     "PDF_sampling",
                     self.options["samples"][sample_category][sample_id]
                     + "_selected.h5",
-- 
GitLab


From 3da3f8a784ff69adac850f7a5b46377028892eb6 Mon Sep 17 00:00:00 2001
From: alfroch <alexander.froch@cern.ch>
Date: Mon, 31 Jan 2022 16:58:49 +0100
Subject: [PATCH 02/10] Adding new path for resampling plots

---
 umami/preprocessing_tools/Resampling.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/umami/preprocessing_tools/Resampling.py b/umami/preprocessing_tools/Resampling.py
index 048333f72..8457273f7 100644
--- a/umami/preprocessing_tools/Resampling.py
+++ b/umami/preprocessing_tools/Resampling.py
@@ -2340,7 +2340,7 @@ class PDFSampling(Resampling):  # pylint: disable=too-many-public-methods
 
         # Check if the directory for the plots exists
         plot_dir_path = os.path.join(
-            self.config.config["parameters"]["sample_path"],
+            self.resampled_path,
             "plots/",
         )
         os.makedirs(plot_dir_path, exist_ok=True)
@@ -2566,7 +2566,7 @@ class Weighting(ResamplingTools):
 
         # Check if the directory for the plots exists
         plot_dir_path = os.path.join(
-            self.config.config["parameters"]["sample_path"],
+            self.resampled_path,
             "plots/",
         )
         os.makedirs(plot_dir_path, exist_ok=True)
@@ -2820,7 +2820,7 @@ class UnderSampling(ResamplingTools):
 
         # Check if the directory for the plots exists
         plot_dir_path = os.path.join(
-            self.config.config["parameters"]["sample_path"],
+            self.resampled_path,
             "plots/",
         )
         os.makedirs(plot_dir_path, exist_ok=True)
-- 
GitLab


From 98612fb503b01d55748b9b32efad5c81b6ad582d Mon Sep 17 00:00:00 2001
From: alfroch <alexander.froch@cern.ch>
Date: Tue, 1 Feb 2022 09:08:36 +0100
Subject: [PATCH 03/10] Fixing os makedirs

---
 umami/preprocessing_tools/Resampling.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/umami/preprocessing_tools/Resampling.py b/umami/preprocessing_tools/Resampling.py
index 8457273f7..2812c3bf5 100644
--- a/umami/preprocessing_tools/Resampling.py
+++ b/umami/preprocessing_tools/Resampling.py
@@ -1890,8 +1890,10 @@ class PDFSampling(Resampling):  # pylint: disable=too-many-public-methods
             "PDF_sampling",
             "target_data.json",
         )
-        if not os.path.exists(os.path.join(self.resampled_path, "PDF_sampling")):
-            os.mkdir(os.path.join(self.resampled_path, "PDF_sampling"))
+
+        # Ensure the output path exists
+        os.makedirs(os.path.join(self.resampled_path, "PDF_sampling"), exist_ok=True)
+
         with open(save_name, "w") as write_file:
             json.dump(save_data, write_file, cls=JsonNumpyEncoder)
 
-- 
GitLab


From ad3f8c33c64ef7cac88c416ae051772854130b27 Mon Sep 17 00:00:00 2001
From: alfroch <alexander.froch@cern.ch>
Date: Tue, 1 Feb 2022 13:18:29 +0100
Subject: [PATCH 04/10] Adding overhauled preprocessing integration tests

---
 umami/tests/integration/test_preprocessing.py | 537 ++++++++----------
 1 file changed, 242 insertions(+), 295 deletions(-)

diff --git a/umami/tests/integration/test_preprocessing.py b/umami/tests/integration/test_preprocessing.py
index ce710b252..73e215164 100644
--- a/umami/tests/integration/test_preprocessing.py
+++ b/umami/tests/integration/test_preprocessing.py
@@ -5,14 +5,15 @@ This script integration tests the preprocessing methods.
 """
 
 import os
+import tempfile
 import unittest
-from shutil import copyfile
+from shutil import copyfile, copytree
 from subprocess import CalledProcessError, run
 
 import yaml
 
 from umami.configuration import logger
-from umami.tools import replaceLineInFile, yaml_loader
+from umami.tools import YAML, replaceLineInFile, yaml_loader
 
 
 def getConfiguration():
@@ -28,7 +29,161 @@ def getConfiguration():
     return conf_setup
 
 
-def runPreprocessing(config: dict, tagger: str, method: str) -> bool:
+def preparePreprocessingConfig(
+    test_dir: str,
+    tagger: str,
+    sampling_method: str,
+) -> str:
+    """Prepare the preprocessing configs for the different tagger/methods.
+
+    Parameters
+    ----------
+    test_dir : str
+        Path to test directory
+    tagger : str
+        Name of the tagger which the preprocessing is for.
+    sampling_method : str
+        Sampling method that will be used.
+
+    Returns
+    -------
+    str
+        Path to the prepared preprocessing config.
+    """
+    # Get test configuration
+    data = getConfiguration()
+
+    # Make ttbar and zprime directories
+    os.makedirs(os.path.join(test_dir, "ttbar"), exist_ok=True)
+    os.makedirs(os.path.join(test_dir, "zpext"), exist_ok=True)
+
+    # inputs for test will be located in test_dir
+    config_source = os.path.join(os.getcwd(), data["test_preprocessing"]["config"])
+    config_paths_source = os.path.join(
+        os.getcwd(), data["test_preprocessing"]["config_paths"]
+    )
+    var_dict_source = os.path.join(
+        os.getcwd(), data["test_preprocessing"][f"var_dict_{tagger}"]
+    )
+    config = os.path.join(test_dir, os.path.basename(config_source))
+    config_paths = os.path.join(test_dir, os.path.basename(config_paths_source))
+    var_dict = os.path.join(test_dir, os.path.basename(var_dict_source))
+    scale_dict = os.path.join(test_dir, "PFlow-scale_dict.json")
+    output = os.path.join(test_dir, "PFlow-hybrid_70-test.h5")
+
+    logger.info(f"Preparing config file based on {config_source} in {config}...")
+    copyfile(config_source, config)
+    copyfile(config_paths_source, config_paths)
+    copyfile(var_dict_source, var_dict)
+
+    # modify copy of preprocessing config file for test
+    replaceLineInFile(
+        config_paths,
+        "ntuple_path:",
+        f"ntuple_path: &ntuple_path {test_dir}",
+    )
+    replaceLineInFile(
+        config_paths,
+        "sample_path:",
+        f"sample_path: &sample_path {test_dir}",
+    )
+    replaceLineInFile(
+        config_paths,
+        "file_path:",
+        f"file_path: &file_path {test_dir}",
+    )
+    replaceLineInFile(
+        config_paths,
+        ".outfile_name:",
+        f".outfile_name: &outfile_name {output}",
+    )
+    replaceLineInFile(
+        config_paths,
+        ".dict_file:",
+        f".dict_file: &dict_file {scale_dict}",
+    )
+    replaceLineInFile(
+        config_paths,
+        ".intermediate_index_file:",
+        ".intermediate_index_file: &intermediate_index_file indices.h5",
+    )
+    replaceLineInFile(
+        config_paths,
+        ".var_file:",
+        f".var_file: &var_file {var_dict}",
+    )
+
+    # Load the preprocessing file
+    umami_yaml = YAML(typ="safe", pure=True)
+    with open(config, "r") as conf:
+        config_file = umami_yaml.load(conf)
+
+    config_file["preparation"]["ntuples"]["ttbar"]["file_pattern"] = "ttbar/*.h5"
+    config_file["preparation"]["ntuples"]["zprime"]["file_pattern"] = "zpext/*.h5"
+
+    if tagger.casefold() == "dl1r":
+        config_file["sampling"]["options"]["save_tracks"] = False
+        config_file["sampling"]["options"]["tracks_names"] = None
+
+    else:
+        config_file["sampling"]["options"]["save_tracks"] = True
+        config_file["sampling"]["options"]["tracks_names"] = ["tracks", "tracks_loose"]
+
+    if sampling_method.casefold() == "pdf":
+        # Change the method to pdf and adapt options
+        config_file["sampling"]["method"] = "pdf"
+
+        # Set correct binning for pdf method
+        config_file["sampling"]["options"]["sampling_variables"] = [
+            {"pt_btagJes": {"bins": [[0, 25e4, 100], [25e4, 6e6, 100]]}},
+            {"absEta_btagJes": {"bins": [[0, 2.5, 10], [0, 2.5, 10]]}},
+        ]
+
+        # Set number of jets to maximum
+        config_file["sampling"]["options"]["njets"] = -1
+
+        # Remove custom njets initial
+        config_file["sampling"]["options"]["custom_njets_initial"] = None
+
+    elif sampling_method.casefold() == "weighting":
+        config_file["sampling"]["method"] = "weighting"
+        config_file["sampling"]["options"]["bool_attach_sample_weights"] = True
+
+    # save the preprocessing config file for test
+    with open(config, "w") as con:
+        umami_yaml.dump(config_file, con)
+
+    logger.info("Downloading test data...")
+    for file in data["test_preprocessing"]["files"]:
+        path = os.path.join(
+            data["data_url"],
+            data["test_preprocessing"]["data_subfolder"],
+            file,
+        )
+        logger.info(f"Retrieving file from path {path}")
+        run(["wget", path, "--directory-prefix", test_dir], check=True)
+
+    run(
+        [
+            "mv",
+            os.path.join(test_dir, "ci_ttbar_basefile.h5"),
+            os.path.join(test_dir, "ttbar", "ci_ttbar_basefile.h5"),
+        ],
+        check=True,
+    )
+    run(
+        [
+            "mv",
+            os.path.join(test_dir, "ci_zpext_basefile.h5"),
+            os.path.join(test_dir, "zpext", "ci_zpext_basefile.h5"),
+        ],
+        check=True,
+    )
+
+    return config
+
+
+def runPreprocessing(config: dict, tagger: str, method: str, test_dir: str) -> bool:
     """
     Call all steps of the preprocessing for a certain configuration and variable dict
     input.
@@ -189,59 +344,7 @@ def runPreprocessing(config: dict, tagger: str, method: str) -> bool:
             ) from Error
 
     tagger_path = f"./test_preprocessing_{tagger}/"
-    if not os.path.isdir(tagger_path):
-        run(["mkdir", tagger_path], check=True)
-
-    run(
-        [
-            "cp",
-            "-r",
-            "/tmp/umami/preprocessing/",
-            tagger_path,
-        ],
-        check=True,
-    )
-
-    # Get the path of the not needed configs
-    unused_configs = os.path.join(
-        tagger_path, "preprocessing/", "PFlow-Preprocessing_*.yaml"
-    )
-
-    # Rename the needed config to PFlow-Preprocessing.yaml and erase the unused
-    # TODO change in python 3.10
-    if method == "count":
-        run(
-            [f"rm -rfv {unused_configs}"],
-            shell=True,
-            check=True,
-        )
-
-    elif method == "pdf":
-        copyfile(
-            os.path.join(tagger_path, "preprocessing/", "PFlow-Preprocessing_pdf.yaml"),
-            os.path.join(tagger_path, "preprocessing/", "PFlow-Preprocessing.yaml"),
-        )
-        run(
-            [f"rm -rfv {unused_configs}"],
-            shell=True,
-            check=True,
-        )
-
-    elif method == "weighting":
-        copyfile(
-            os.path.join(
-                tagger_path, "preprocessing/", "PFlow-Preprocessing_weighting.yaml"
-            ),
-            os.path.join(tagger_path, "preprocessing/", "PFlow-Preprocessing.yaml"),
-        )
-        run(
-            [f"rm -rfv {unused_configs}"],
-            shell=True,
-            check=True,
-        )
-
-    else:
-        raise KeyError(f"Method {method} is not supported by the integration test!")
+    copytree(test_dir, tagger_path)
 
     return True
 
@@ -260,288 +363,132 @@ class TestPreprocessing(unittest.TestCase):
         # Get test configuration
         self.data = getConfiguration()
 
-        test_dir = os.path.join(self.data["test_preprocessing"]["testdir"])
-        logger.info(f"Creating test directory in {test_dir}")
-        # clean up, hopefully this causes no "uh oh...""
-        if test_dir.startswith("/tmp"):
-            run(["rm", "-rf", test_dir], check=True)
-        run(["mkdir", "-p", test_dir], check=True)
-
-        # Make filepaths for basefiles
-        run(["mkdir", "-p", os.path.join(test_dir, "ttbar")], check=True)
-        run(["mkdir", "-p", os.path.join(test_dir, "zpext")], check=True)
-
-        # inputs for test will be located in test_dir
-        config_source = os.path.join(
-            os.getcwd(), self.data["test_preprocessing"]["config"]
-        )
-        config_paths_source = os.path.join(
-            os.getcwd(), self.data["test_preprocessing"]["config_paths"]
-        )
-        var_dict_umami_source = os.path.join(
-            os.getcwd(), self.data["test_preprocessing"]["var_dict_umami"]
-        )
-        var_dict_dips_source = os.path.join(
-            os.getcwd(), self.data["test_preprocessing"]["var_dict_dips"]
-        )
-        var_dict_dl1r_source = os.path.join(
-            os.getcwd(), self.data["test_preprocessing"]["var_dict_dl1r"]
-        )
-        self.config = os.path.join(test_dir, os.path.basename(config_source))
-        self.config_paths = os.path.join(
-            test_dir, os.path.basename(config_paths_source)
-        )
-        self.var_dict_umami = os.path.join(
-            test_dir, os.path.basename(var_dict_umami_source)
-        )
-        self.var_dict_dips = os.path.join(
-            test_dir, os.path.basename(var_dict_dips_source)
-        )
-        self.var_dict_dl1r = os.path.join(
-            test_dir, os.path.basename(var_dict_dl1r_source)
-        )
-        self.scale_dict = os.path.join(test_dir, "PFlow-scale_dict.json")
-        self.output = os.path.join(test_dir, "PFlow-hybrid_70-test.h5")
-
-        logger.info(
-            f"Preparing config file based on {config_source} in {self.config}..."
-        )
-        copyfile(config_source, self.config)
-        copyfile(config_paths_source, self.config_paths)
-        copyfile(var_dict_umami_source, self.var_dict_umami)
-        copyfile(var_dict_dips_source, self.var_dict_dips)
-        copyfile(var_dict_dl1r_source, self.var_dict_dl1r)
-
-        # modify copy of preprocessing config file for test
-        replaceLineInFile(
-            self.config_paths,
-            "ntuple_path:",
-            f"ntuple_path: &ntuple_path {test_dir}",
-        )
-        replaceLineInFile(
-            self.config_paths,
-            "sample_path:",
-            f"sample_path: &sample_path {test_dir}",
-        )
-        replaceLineInFile(
-            self.config_paths,
-            "file_path:",
-            f"file_path: &file_path {test_dir}",
-        )
-        replaceLineInFile(
-            self.config_paths,
-            ".outfile_name:",
-            f".outfile_name: &outfile_name {self.output}",
-        )
-        replaceLineInFile(
-            self.config_paths,
-            ".dict_file:",
-            f".dict_file: &dict_file {self.scale_dict}",
-        )
-        replaceLineInFile(
-            self.config_paths,
-            ".intermediate_index_file:",
-            ".intermediate_index_file: &intermediate_index_file indices.h5",
-        )
-        replaceLineInFile(
-            self.config,
-            "      file_pattern: user.alfroch.410470",
-            "      file_pattern: ttbar/*.h5",
-        )
-        replaceLineInFile(
-            self.config,
-            "      file_pattern: user.alfroch.427081",
-            "      file_pattern: zpext/*.h5",
-        )
-        replaceLineInFile(
-            self.config,
-            "    tracks_names:",
-            "    tracks_names: ['tracks','tracks_loose']",
-        )
-
-        # copy config file and change name to pdf for pdf preprocessing config
-        self.pdf_config = self.config[:].replace(".yaml", "") + "_pdf.yaml"
-        copyfile(self.config, self.pdf_config)
+        self.test_dir_path = tempfile.TemporaryDirectory()  # pylint: disable=R1732
+        self.test_dir = f"{self.test_dir_path.name}"
+        logger.info(f"Creating test directory in {self.test_dir}")
 
-        # Change the method to pdf and adapt options
-        replaceLineInFile(self.pdf_config, "  method: count", "  method: pdf")
-        replaceLineInFile(
-            self.pdf_config,
-            "          bins: [[0, 600000, 351], [650000, 6000000, 84]]",
-            "          bins: [[0, 25e4, 100], [25e4, 6e6, 100]]",
-        )
-        replaceLineInFile(
-            self.pdf_config,
-            "          bins: [0, 2.5, 10]",
-            "          bins: [[0, 2.5, 10], [0, 2.5, 10]]",
-        )
-        replaceLineInFile(
-            self.pdf_config,
-            "    njets: 25e6",
-            "    njets: -1",
-        )
-        replaceLineInFile(
-            self.pdf_config,
-            "      training_ttbar_bjets: 5.5e6",
-            "",
-        )
-        replaceLineInFile(
-            self.pdf_config,
-            "      training_ttbar_cjets: 11.5e6",
-            "",
-        )
-        replaceLineInFile(
-            self.pdf_config,
-            "      training_ttbar_ujets: 13.5e6",
-            "",
-        )
-
-        # copy config file and change name to pdf for pdf preprocessing config
-        self.weight_config = self.config[:].replace(".yaml", "") + "_weighting.yaml"
-        copyfile(self.config, self.weight_config)
+    def test_preprocessing_umami_count(self):
+        """Integration test of preprocessing.py script using Umami variables."""
+        tagger = "umami"
+        method = "count"
 
-        replaceLineInFile(self.weight_config, "  method: count", "  method: weighting")
-        replaceLineInFile(
-            self.weight_config,
-            "    bool_attach_sample_weights: False",
-            "    bool_attach_sample_weights: True",
+        config = preparePreprocessingConfig(
+            tagger=tagger, test_dir=self.test_dir, sampling_method=method
         )
-
-        logger.info("Downloading test data...")
-        for file in self.data["test_preprocessing"]["files"]:
-            path = os.path.join(
-                self.data["data_url"],
-                self.data["test_preprocessing"]["data_subfolder"],
-                file,
+        self.assertTrue(
+            runPreprocessing(
+                config=config, test_dir=self.test_dir, tagger=tagger, method=method
             )
-            logger.info(f"Retrieving file from path {path}")
-            run(["wget", path, "--directory-prefix", test_dir], check=True)
-
-        run(
-            [
-                "mv",
-                os.path.join(test_dir, "ci_ttbar_basefile.h5"),
-                os.path.join(test_dir, "ttbar", "ci_ttbar_basefile.h5"),
-            ],
-            check=True,
-        )
-        run(
-            [
-                "mv",
-                os.path.join(test_dir, "ci_zpext_basefile.h5"),
-                os.path.join(test_dir, "zpext", "ci_zpext_basefile.h5"),
-            ],
-            check=True,
         )
 
-    def test_preprocessing_umami_count(self):
-        """Integration test of preprocessing.py script using Umami variables."""
-        replaceLineInFile(
-            self.config_paths,
-            ".var_file:",
-            f".var_file: &var_file {self.var_dict_umami}",
-        )
-
-        self.assertTrue(runPreprocessing(self.config, tagger="umami", method="count"))
-
     def test_preprocessing_dips_count(self):
         """Integration test of preprocessing.py script using DIPS variables."""
-        replaceLineInFile(
-            self.config_paths,
-            ".var_file:",
-            f".var_file: &var_file {self.var_dict_dips}",
+        tagger = "dips"
+        method = "count"
+
+        config = preparePreprocessingConfig(
+            tagger=tagger, test_dir=self.test_dir, sampling_method=method
+        )
+        self.assertTrue(
+            runPreprocessing(
+                config=config, test_dir=self.test_dir, tagger=tagger, method=method
+            )
         )
-        self.assertTrue(runPreprocessing(self.config, tagger="dips", method="count"))
 
     def test_preprocessing_dl1r_count(self):
         """Integration test of preprocessing.py script using DL1r variables."""
-        replaceLineInFile(
-            self.config,
-            "    save_tracks:",
-            "    save_tracks: False",
-        )
+        tagger = "dl1r"
+        method = "count"
 
-        replaceLineInFile(
-            self.config_paths,
-            ".var_file:",
-            f".var_file: &var_file {self.var_dict_dl1r}",
+        config = preparePreprocessingConfig(
+            tagger=tagger, test_dir=self.test_dir, sampling_method=method
+        )
+        self.assertTrue(
+            runPreprocessing(
+                config=config, test_dir=self.test_dir, tagger=tagger, method=method
+            )
         )
-
-        self.assertTrue(runPreprocessing(self.config, tagger="dl1r", method="count"))
 
     def test_preprocessing_umami_pdf(self):
         """Integration test of preprocessing.py script using Umami variables."""
-        replaceLineInFile(
-            self.config_paths,
-            ".var_file:",
-            f".var_file: &var_file {self.var_dict_umami}",
-        )
+        tagger = "umami"
+        method = "pdf"
 
-        self.assertTrue(runPreprocessing(self.pdf_config, tagger="umami", method="pdf"))
+        config = preparePreprocessingConfig(
+            tagger=tagger, test_dir=self.test_dir, sampling_method=method
+        )
+        self.assertTrue(
+            runPreprocessing(
+                config=config, test_dir=self.test_dir, tagger=tagger, method=method
+            )
+        )
 
     def test_preprocessing_dips_pdf(self):
         """Integration test of preprocessing.py script using DIPS variables."""
-        replaceLineInFile(
-            self.config_paths,
-            ".var_file:",
-            f".var_file: &var_file {self.var_dict_dips}",
-        )
+        tagger = "dips"
+        method = "pdf"
 
-        self.assertTrue(runPreprocessing(self.pdf_config, tagger="dips", method="pdf"))
+        config = preparePreprocessingConfig(
+            tagger=tagger, test_dir=self.test_dir, sampling_method=method
+        )
+        self.assertTrue(
+            runPreprocessing(
+                config=config, test_dir=self.test_dir, tagger=tagger, method=method
+            )
+        )
 
     def test_preprocessing_dl1r_pdf(self):
         """Integration test of preprocessing.py script using DL1r variables."""
-        replaceLineInFile(
-            self.pdf_config,
-            "    save_tracks:",
-            "    save_tracks: False",
-        )
+        tagger = "dl1r"
+        method = "pdf"
 
-        replaceLineInFile(
-            self.config_paths,
-            ".var_file:",
-            f".var_file: &var_file {self.var_dict_dl1r}",
+        config = preparePreprocessingConfig(
+            tagger=tagger, test_dir=self.test_dir, sampling_method=method
+        )
+        self.assertTrue(
+            runPreprocessing(
+                config=config, test_dir=self.test_dir, tagger=tagger, method=method
+            )
         )
-
-        self.assertTrue(runPreprocessing(self.pdf_config, tagger="dl1r", method="pdf"))
 
     def test_preprocessing_umami_weighting(self):
         """Integration test of preprocessing.py script using Umami variables."""
-        replaceLineInFile(
-            self.config_paths,
-            ".var_file:",
-            f".var_file: &var_file {self.var_dict_umami}",
-        )
+        tagger = "umami"
+        method = "weighting"
 
+        config = preparePreprocessingConfig(
+            tagger=tagger, test_dir=self.test_dir, sampling_method=method
+        )
         self.assertTrue(
-            runPreprocessing(self.weight_config, tagger="umami", method="weighting")
+            runPreprocessing(
+                config=config, test_dir=self.test_dir, tagger=tagger, method=method
+            )
         )
 
     def test_preprocessing_dips_weighting(self):
         """Integration test of preprocessing.py script using DIPS variables."""
-        replaceLineInFile(
-            self.config_paths,
-            ".var_file:",
-            f".var_file: &var_file {self.var_dict_dips}",
+        tagger = "dips"
+        method = "weighting"
+
+        config = preparePreprocessingConfig(
+            tagger=tagger, test_dir=self.test_dir, sampling_method=method
         )
         self.assertTrue(
-            runPreprocessing(self.weight_config, tagger="dips", method="weighting")
+            runPreprocessing(
+                config=config, test_dir=self.test_dir, tagger=tagger, method=method
+            )
         )
 
     def test_preprocessing_dl1r_weighting(self):
         """Integration test of preprocessing.py script using DL1r variables."""
-        replaceLineInFile(
-            self.weight_config,
-            "    save_tracks:",
-            "    save_tracks: False",
-        )
+        tagger = "dl1r"
+        method = "weighting"
 
-        replaceLineInFile(
-            self.config_paths,
-            ".var_file:",
-            f".var_file: &var_file {self.var_dict_dl1r}",
+        config = preparePreprocessingConfig(
+            tagger=tagger, test_dir=self.test_dir, sampling_method=method
         )
-
         self.assertTrue(
-            runPreprocessing(self.weight_config, tagger="dl1r", method="weighting")
+            runPreprocessing(
+                config=config, test_dir=self.test_dir, tagger=tagger, method=method
+            )
         )
-- 
GitLab


From 0172eebdbd429117752db3dde5218538e67d19de Mon Sep 17 00:00:00 2001
From: alfroch <alexander.froch@cern.ch>
Date: Tue, 1 Feb 2022 13:39:08 +0100
Subject: [PATCH 05/10] Fixing pipeline

---
 umami/preprocessing_tools/Scaling.py          |  4 ++--
 umami/tests/integration/test_preprocessing.py |  6 ++++++
 umami/tests/integration/test_train.py         | 12 ++++++------
 3 files changed, 14 insertions(+), 8 deletions(-)

diff --git a/umami/preprocessing_tools/Scaling.py b/umami/preprocessing_tools/Scaling.py
index a4a744f9e..12fbe98cd 100644
--- a/umami/preprocessing_tools/Scaling.py
+++ b/umami/preprocessing_tools/Scaling.py
@@ -425,8 +425,8 @@ class Scaling:
                 # Loop over chunks
                 for chunk_counter in range(n_chunks):
                     logger.info(
-                        f"Calculating track scales for chunk {chunk_counter+1} of"
-                        f" {n_chunks}"
+                        f"Calculating track scales for {tracks_name} for chunk"
+                        f" {chunk_counter+1} of {n_chunks}"
                     )
                     # Check if this is the first time loading from the generator
                     if chunk_counter == 0:
diff --git a/umami/tests/integration/test_preprocessing.py b/umami/tests/integration/test_preprocessing.py
index 73e215164..f3c3ee7d0 100644
--- a/umami/tests/integration/test_preprocessing.py
+++ b/umami/tests/integration/test_preprocessing.py
@@ -343,7 +343,13 @@ def runPreprocessing(config: dict, tagger: str, method: str, test_dir: str) -> b
                 "Test failed: preprocessing.py --to_records."
             ) from Error
 
+    # Define path where the results are copied to
     tagger_path = f"./test_preprocessing_{tagger}/"
+
+    # Remove any artifacts that might be locally there
+    run(["rm", "-rfv", f"{tagger_path}"])
+
+    # Copy the results to given path
     copytree(test_dir, tagger_path)
 
     return True
diff --git a/umami/tests/integration/test_train.py b/umami/tests/integration/test_train.py
index 75e030318..42319e4aa 100644
--- a/umami/tests/integration/test_train.py
+++ b/umami/tests/integration/test_train.py
@@ -72,11 +72,11 @@ def prepareConfig(
     config = os.path.join(test_dir, os.path.basename(config_source))
 
     preprocessing_config_source = os.path.join(
-        f"./test_preprocessing_{preprocess_files}/preprocessing/",
+        f"./test_preprocessing_{preprocess_files}/",
         os.path.basename(data["test_preprocessing"]["config"]),
     )
     preprocessing_config_paths_source = os.path.join(
-        f"./test_preprocessing_{preprocess_files}/preprocessing/",
+        f"./test_preprocessing_{preprocess_files}/",
         os.path.basename(data["test_preprocessing"]["config_paths"]),
     )
     preprocessing_config = os.path.join(
@@ -87,7 +87,7 @@ def prepareConfig(
     )
 
     var_dict_source = os.path.join(
-        f"./test_preprocessing_{preprocess_files}/preprocessing/",
+        f"./test_preprocessing_{preprocess_files}/",
         os.path.basename(data["test_preprocessing"][f"var_dict_{preprocess_files}"]),
     )
     var_dict = os.path.join(test_dir, os.path.basename(var_dict_source))
@@ -96,13 +96,13 @@ def prepareConfig(
     logger.info("Retrieving files from preprocessing...")
 
     train_file = os.path.join(
-        f"./test_preprocessing_{preprocess_files}/preprocessing/",
+        f"./test_preprocessing_{preprocess_files}/",
         "PFlow-hybrid_70-test-resampled_scaled_shuffled.h5",
     )
     test_file_ttbar = os.path.join(test_dir, "ci_ttbar_testing.h5")
     test_file_zprime = os.path.join(test_dir, "ci_zpext_testing.h5")
     scale_dict = os.path.join(
-        f"./test_preprocessing_{preprocess_files}/preprocessing/",
+        f"./test_preprocessing_{preprocess_files}/",
         "PFlow-scale_dict.json",
     )
 
@@ -184,7 +184,7 @@ def prepareConfig(
 
     if useTFRecords is True:
         config_file["train_file"] = os.path.join(
-            f"./test_preprocessing_{preprocess_files}/preprocessing/",
+            f"./test_preprocessing_{preprocess_files}/",
             "PFlow-hybrid_70-test-resampled_scaled_shuffled",
         )
         config_file["model_name"] = data["test_dips"]["model_name"] + "_tfrecords"
-- 
GitLab


From 5596c5ace5dae33987a12ceee58c880480b05e77 Mon Sep 17 00:00:00 2001
From: alfroch <alexander.froch@cern.ch>
Date: Tue, 1 Feb 2022 13:43:05 +0100
Subject: [PATCH 06/10] Pylint

---
 umami/tests/integration/test_preprocessing.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/umami/tests/integration/test_preprocessing.py b/umami/tests/integration/test_preprocessing.py
index f3c3ee7d0..ba11e24cc 100644
--- a/umami/tests/integration/test_preprocessing.py
+++ b/umami/tests/integration/test_preprocessing.py
@@ -347,7 +347,7 @@ def runPreprocessing(config: dict, tagger: str, method: str, test_dir: str) -> b
     tagger_path = f"./test_preprocessing_{tagger}/"
 
     # Remove any artifacts that might be locally there
-    run(["rm", "-rfv", f"{tagger_path}"])
+    run(["rm", "-rfv", f"{tagger_path}"], check=True)
 
     # Copy the results to given path
     copytree(test_dir, tagger_path)
-- 
GitLab


From 5aadccab29bdce0746aaf22b25daca1f10671cf9 Mon Sep 17 00:00:00 2001
From: alfroch <alexander.froch@cern.ch>
Date: Tue, 1 Feb 2022 13:55:51 +0100
Subject: [PATCH 07/10] Fixing pipeline

---
 umami/preprocessing_tools/Configuration.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/umami/preprocessing_tools/Configuration.py b/umami/preprocessing_tools/Configuration.py
index b6311b747..627fe676d 100644
--- a/umami/preprocessing_tools/Configuration.py
+++ b/umami/preprocessing_tools/Configuration.py
@@ -31,9 +31,9 @@ class Configuration:
             first_line = conf.readline()
         first_line = first_line.split("!include ")
         if first_line[0] != "parameters: ":
-            raise ValueError(
-                "Please specify in the first line of the preprocessing config"
-                " the 'parameters' with the !include option."
+            logger.warning(
+                "`parameters` is not defined in the first line with !include. "
+                "This can cause issues further downstream. "
             )
 
         preprocess_parameters_path = os.path.join(
-- 
GitLab


From 9a2298e72b6bfe0d8040794f4d49005567983ebd Mon Sep 17 00:00:00 2001
From: alfroch <alexander.froch@cern.ch>
Date: Tue, 1 Feb 2022 14:03:58 +0100
Subject: [PATCH 08/10] Fixing pipeline

---
 umami/preprocessing_tools/Configuration.py | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/umami/preprocessing_tools/Configuration.py b/umami/preprocessing_tools/Configuration.py
index 627fe676d..a55daf8d0 100644
--- a/umami/preprocessing_tools/Configuration.py
+++ b/umami/preprocessing_tools/Configuration.py
@@ -35,12 +35,14 @@ class Configuration:
                 "`parameters` is not defined in the first line with !include. "
                 "This can cause issues further downstream. "
             )
+            return None
 
-        preprocess_parameters_path = os.path.join(
-            os.path.dirname(self.ConfigPath),
-            first_line[1].strip(),
-        )
-        return preprocess_parameters_path
+        else:
+            preprocess_parameters_path = os.path.join(
+                os.path.dirname(self.ConfigPath),
+                first_line[1].strip(),
+            )
+            return preprocess_parameters_path
 
     def LoadConfigFiles(self):
         """Load config file from disk."""
-- 
GitLab


From c4615a8cbac8cf97c48b22cf8ce1ed7d2c31b1c8 Mon Sep 17 00:00:00 2001
From: alfroch <alexander.froch@cern.ch>
Date: Tue, 1 Feb 2022 14:11:31 +0100
Subject: [PATCH 09/10] Fixing pylint

---
 umami/preprocessing_tools/Configuration.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/umami/preprocessing_tools/Configuration.py b/umami/preprocessing_tools/Configuration.py
index a55daf8d0..8c3623537 100644
--- a/umami/preprocessing_tools/Configuration.py
+++ b/umami/preprocessing_tools/Configuration.py
@@ -35,14 +35,15 @@ class Configuration:
                 "`parameters` is not defined in the first line with !include. "
                 "This can cause issues further downstream. "
             )
-            return None
+            preprocess_parameters_path = None
 
         else:
             preprocess_parameters_path = os.path.join(
                 os.path.dirname(self.ConfigPath),
                 first_line[1].strip(),
             )
-            return preprocess_parameters_path
+
+        return preprocess_parameters_path
 
     def LoadConfigFiles(self):
         """Load config file from disk."""
-- 
GitLab


From e0c9b245c1cdb8a96953b7aa874dc58c154d8596 Mon Sep 17 00:00:00 2001
From: alfroch <alexander.froch@cern.ch>
Date: Tue, 1 Feb 2022 14:32:55 +0100
Subject: [PATCH 10/10] Undo preprocessing changes

---
 umami/preprocessing_tools/Configuration.py    |  17 +-
 umami/tests/integration/test_preprocessing.py | 539 ++++++++++--------
 umami/tests/integration/test_train.py         |  12 +-
 3 files changed, 306 insertions(+), 262 deletions(-)

diff --git a/umami/preprocessing_tools/Configuration.py b/umami/preprocessing_tools/Configuration.py
index 8c3623537..b6311b747 100644
--- a/umami/preprocessing_tools/Configuration.py
+++ b/umami/preprocessing_tools/Configuration.py
@@ -31,18 +31,15 @@ class Configuration:
             first_line = conf.readline()
         first_line = first_line.split("!include ")
         if first_line[0] != "parameters: ":
-            logger.warning(
-                "`parameters` is not defined in the first line with !include. "
-                "This can cause issues further downstream. "
-            )
-            preprocess_parameters_path = None
-
-        else:
-            preprocess_parameters_path = os.path.join(
-                os.path.dirname(self.ConfigPath),
-                first_line[1].strip(),
+            raise ValueError(
+                "Please specify in the first line of the preprocessing config"
+                " the 'parameters' with the !include option."
             )
 
+        preprocess_parameters_path = os.path.join(
+            os.path.dirname(self.ConfigPath),
+            first_line[1].strip(),
+        )
         return preprocess_parameters_path
 
     def LoadConfigFiles(self):
diff --git a/umami/tests/integration/test_preprocessing.py b/umami/tests/integration/test_preprocessing.py
index ba11e24cc..ce710b252 100644
--- a/umami/tests/integration/test_preprocessing.py
+++ b/umami/tests/integration/test_preprocessing.py
@@ -5,15 +5,14 @@ This script integration tests the preprocessing methods.
 """
 
 import os
-import tempfile
 import unittest
-from shutil import copyfile, copytree
+from shutil import copyfile
 from subprocess import CalledProcessError, run
 
 import yaml
 
 from umami.configuration import logger
-from umami.tools import YAML, replaceLineInFile, yaml_loader
+from umami.tools import replaceLineInFile, yaml_loader
 
 
 def getConfiguration():
@@ -29,161 +28,7 @@ def getConfiguration():
     return conf_setup
 
 
-def preparePreprocessingConfig(
-    test_dir: str,
-    tagger: str,
-    sampling_method: str,
-) -> str:
-    """Prepare the preprocessing configs for the different tagger/methods.
-
-    Parameters
-    ----------
-    test_dir : str
-        Path to test directory
-    tagger : str
-        Name of the tagger which the preprocessing is for.
-    sampling_method : str
-        Sampling method that will be used.
-
-    Returns
-    -------
-    str
-        Path to the prepared preprocessing config.
-    """
-    # Get test configuration
-    data = getConfiguration()
-
-    # Make ttbar und zprime directory
-    os.makedirs(os.path.join(test_dir, "ttbar"), exist_ok=True)
-    os.makedirs(os.path.join(test_dir, "zpext"), exist_ok=True)
-
-    # inputs for test will be located in test_dir
-    config_source = os.path.join(os.getcwd(), data["test_preprocessing"]["config"])
-    config_paths_source = os.path.join(
-        os.getcwd(), data["test_preprocessing"]["config_paths"]
-    )
-    var_dict_source = os.path.join(
-        os.getcwd(), data["test_preprocessing"][f"var_dict_{tagger}"]
-    )
-    config = os.path.join(test_dir, os.path.basename(config_source))
-    config_paths = os.path.join(test_dir, os.path.basename(config_paths_source))
-    var_dict = os.path.join(test_dir, os.path.basename(var_dict_source))
-    scale_dict = os.path.join(test_dir, "PFlow-scale_dict.json")
-    output = os.path.join(test_dir, "PFlow-hybrid_70-test.h5")
-
-    logger.info(f"Preparing config file based on {config_source} in {config}...")
-    copyfile(config_source, config)
-    copyfile(config_paths_source, config_paths)
-    copyfile(var_dict_source, var_dict)
-
-    # modify copy of preprocessing config file for test
-    replaceLineInFile(
-        config_paths,
-        "ntuple_path:",
-        f"ntuple_path: &ntuple_path {test_dir}",
-    )
-    replaceLineInFile(
-        config_paths,
-        "sample_path:",
-        f"sample_path: &sample_path {test_dir}",
-    )
-    replaceLineInFile(
-        config_paths,
-        "file_path:",
-        f"file_path: &file_path {test_dir}",
-    )
-    replaceLineInFile(
-        config_paths,
-        ".outfile_name:",
-        f".outfile_name: &outfile_name {output}",
-    )
-    replaceLineInFile(
-        config_paths,
-        ".dict_file:",
-        f".dict_file: &dict_file {scale_dict}",
-    )
-    replaceLineInFile(
-        config_paths,
-        ".intermediate_index_file:",
-        ".intermediate_index_file: &intermediate_index_file indices.h5",
-    )
-    replaceLineInFile(
-        config_paths,
-        ".var_file:",
-        f".var_file: &var_file {var_dict}",
-    )
-
-    # Load the preprocessing file
-    umami_yaml = YAML(typ="safe", pure=True)
-    with open(config, "r") as conf:
-        config_file = umami_yaml.load(conf)
-
-    config_file["preparation"]["ntuples"]["ttbar"]["file_pattern"] = "ttbar/*.h5"
-    config_file["preparation"]["ntuples"]["zprime"]["file_pattern"] = "zpext/*.h5"
-
-    if tagger.casefold() == "dl1r":
-        config_file["sampling"]["options"]["save_tracks"] = False
-        config_file["sampling"]["options"]["tracks_names"] = None
-
-    else:
-        config_file["sampling"]["options"]["save_tracks"] = True
-        config_file["sampling"]["options"]["tracks_names"] = ["tracks", "tracks_loose"]
-
-    if sampling_method.casefold() == "pdf":
-        # Change the method to pdf and adapt options
-        config_file["sampling"]["method"] = "pdf"
-
-        # Set correct binning for pdf method
-        config_file["sampling"]["options"]["sampling_variables"] = [
-            {"pt_btagJes": {"bins": [[0, 25e4, 100], [25e4, 6e6, 100]]}},
-            {"absEta_btagJes": {"bins": [[0, 2.5, 10], [0, 2.5, 10]]}},
-        ]
-
-        # Set number of jets to maximum
-        config_file["sampling"]["options"]["njets"] = -1
-
-        # Remove custom njets initial
-        config_file["sampling"]["options"]["custom_njets_initial"] = None
-
-    elif sampling_method.casefold() == "weighting":
-        config_file["sampling"]["method"] = "weighting"
-        config_file["sampling"]["options"]["bool_attach_sample_weights"] = True
-
-    # save the preprocessing config file for test
-    with open(config, "w") as con:
-        umami_yaml.dump(config_file, con)
-
-    logger.info("Downloading test data...")
-    for file in data["test_preprocessing"]["files"]:
-        path = os.path.join(
-            data["data_url"],
-            data["test_preprocessing"]["data_subfolder"],
-            file,
-        )
-        logger.info(f"Retrieving file from path {path}")
-        run(["wget", path, "--directory-prefix", test_dir], check=True)
-
-    run(
-        [
-            "mv",
-            os.path.join(test_dir, "ci_ttbar_basefile.h5"),
-            os.path.join(test_dir, "ttbar", "ci_ttbar_basefile.h5"),
-        ],
-        check=True,
-    )
-    run(
-        [
-            "mv",
-            os.path.join(test_dir, "ci_zpext_basefile.h5"),
-            os.path.join(test_dir, "zpext", "ci_zpext_basefile.h5"),
-        ],
-        check=True,
-    )
-
-    return config
-
-
-def runPreprocessing(config: dict, tagger: str, method: str, test_dir: str) -> bool:
+def runPreprocessing(config: dict, tagger: str, method: str) -> bool:
     """
     Call all steps of the preprocessing for a certain configuration and variable dict
     input.
@@ -343,14 +188,60 @@ def runPreprocessing(config: dict, tagger: str, method: str, test_dir: str) -> b
                 "Test failed: preprocessing.py --to_records."
             ) from Error
 
-    # Define path where the results are copied to
     tagger_path = f"./test_preprocessing_{tagger}/"
+    if not os.path.isdir(tagger_path):
+        run(["mkdir", tagger_path], check=True)
+
+    run(
+        [
+            "cp",
+            "-r",
+            "/tmp/umami/preprocessing/",
+            tagger_path,
+        ],
+        check=True,
+    )
+
+    # Get the glob pattern matching the configs that are no longer needed
+    unused_configs = os.path.join(
+        tagger_path, "preprocessing/", "PFlow-Preprocessing_*.yaml"
+    )
 
-    # Remove any artifacts that might be locally there
-    run(["rm", "-rfv", f"{tagger_path}"], check=True)
+    # Rename the needed config to PFlow-Preprocessing.yaml and erase the unused ones
+    # TODO: simplify this if/elif chain with match/case once python 3.10 is required
+    if method == "count":
+        run(
+            [f"rm -rfv {unused_configs}"],
+            shell=True,
+            check=True,
+        )
+
+    elif method == "pdf":
+        copyfile(
+            os.path.join(tagger_path, "preprocessing/", "PFlow-Preprocessing_pdf.yaml"),
+            os.path.join(tagger_path, "preprocessing/", "PFlow-Preprocessing.yaml"),
+        )
+        run(
+            [f"rm -rfv {unused_configs}"],
+            shell=True,
+            check=True,
+        )
+
+    elif method == "weighting":
+        copyfile(
+            os.path.join(
+                tagger_path, "preprocessing/", "PFlow-Preprocessing_weighting.yaml"
+            ),
+            os.path.join(tagger_path, "preprocessing/", "PFlow-Preprocessing.yaml"),
+        )
+        run(
+            [f"rm -rfv {unused_configs}"],
+            shell=True,
+            check=True,
+        )
 
-    # Copy the results to given path
-    copytree(test_dir, tagger_path)
+    else:
+        raise KeyError(f"Method {method} is not supported by the integration test!")
 
     return True
 
@@ -369,132 +260,288 @@ class TestPreprocessing(unittest.TestCase):
         # Get test configuration
         self.data = getConfiguration()
 
-        self.test_dir_path = tempfile.TemporaryDirectory()  # pylint: disable=R1732
-        self.test_dir = f"{self.test_dir_path.name}"
-        logger.info(f"Creating test directory in {self.test_dir}")
+        test_dir = os.path.join(self.data["test_preprocessing"]["testdir"])
+        logger.info(f"Creating test directory in {test_dir}")
+        # clean up any leftover test directory; only delete paths under /tmp for safety
+        if test_dir.startswith("/tmp"):
+            run(["rm", "-rf", test_dir], check=True)
+        run(["mkdir", "-p", test_dir], check=True)
 
-    def test_preprocessing_umami_count(self):
-        """Integration test of preprocessing.py script using Umami variables."""
-        tagger = "umami"
-        method = "count"
+        # Make directories for the ttbar and zpext basefiles
+        run(["mkdir", "-p", os.path.join(test_dir, "ttbar")], check=True)
+        run(["mkdir", "-p", os.path.join(test_dir, "zpext")], check=True)
 
-        config = preparePreprocessingConfig(
-            tagger=tagger, test_dir=self.test_dir, sampling_method=method
+        # inputs for test will be located in test_dir
+        config_source = os.path.join(
+            os.getcwd(), self.data["test_preprocessing"]["config"]
         )
-        self.assertTrue(
-            runPreprocessing(
-                config=config, test_dir=self.test_dir, tagger=tagger, method=method
-            )
+        config_paths_source = os.path.join(
+            os.getcwd(), self.data["test_preprocessing"]["config_paths"]
+        )
+        var_dict_umami_source = os.path.join(
+            os.getcwd(), self.data["test_preprocessing"]["var_dict_umami"]
+        )
+        var_dict_dips_source = os.path.join(
+            os.getcwd(), self.data["test_preprocessing"]["var_dict_dips"]
+        )
+        var_dict_dl1r_source = os.path.join(
+            os.getcwd(), self.data["test_preprocessing"]["var_dict_dl1r"]
         )
+        self.config = os.path.join(test_dir, os.path.basename(config_source))
+        self.config_paths = os.path.join(
+            test_dir, os.path.basename(config_paths_source)
+        )
+        self.var_dict_umami = os.path.join(
+            test_dir, os.path.basename(var_dict_umami_source)
+        )
+        self.var_dict_dips = os.path.join(
+            test_dir, os.path.basename(var_dict_dips_source)
+        )
+        self.var_dict_dl1r = os.path.join(
+            test_dir, os.path.basename(var_dict_dl1r_source)
+        )
+        self.scale_dict = os.path.join(test_dir, "PFlow-scale_dict.json")
+        self.output = os.path.join(test_dir, "PFlow-hybrid_70-test.h5")
 
-    def test_preprocessing_dips_count(self):
-        """Integration test of preprocessing.py script using DIPS variables."""
-        tagger = "dips"
-        method = "count"
+        logger.info(
+            f"Preparing config file based on {config_source} in {self.config}..."
+        )
+        copyfile(config_source, self.config)
+        copyfile(config_paths_source, self.config_paths)
+        copyfile(var_dict_umami_source, self.var_dict_umami)
+        copyfile(var_dict_dips_source, self.var_dict_dips)
+        copyfile(var_dict_dl1r_source, self.var_dict_dl1r)
+
+        # modify copy of preprocessing config file for test
+        replaceLineInFile(
+            self.config_paths,
+            "ntuple_path:",
+            f"ntuple_path: &ntuple_path {test_dir}",
+        )
+        replaceLineInFile(
+            self.config_paths,
+            "sample_path:",
+            f"sample_path: &sample_path {test_dir}",
+        )
+        replaceLineInFile(
+            self.config_paths,
+            "file_path:",
+            f"file_path: &file_path {test_dir}",
+        )
+        replaceLineInFile(
+            self.config_paths,
+            ".outfile_name:",
+            f".outfile_name: &outfile_name {self.output}",
+        )
+        replaceLineInFile(
+            self.config_paths,
+            ".dict_file:",
+            f".dict_file: &dict_file {self.scale_dict}",
+        )
+        replaceLineInFile(
+            self.config_paths,
+            ".intermediate_index_file:",
+            ".intermediate_index_file: &intermediate_index_file indices.h5",
+        )
+        replaceLineInFile(
+            self.config,
+            "      file_pattern: user.alfroch.410470",
+            "      file_pattern: ttbar/*.h5",
+        )
+        replaceLineInFile(
+            self.config,
+            "      file_pattern: user.alfroch.427081",
+            "      file_pattern: zpext/*.h5",
+        )
+        replaceLineInFile(
+            self.config,
+            "    tracks_names:",
+            "    tracks_names: ['tracks','tracks_loose']",
+        )
+
+        # copy config file and change name to pdf for pdf preprocessing config
+        self.pdf_config = self.config[:].replace(".yaml", "") + "_pdf.yaml"
+        copyfile(self.config, self.pdf_config)
 
-        config = preparePreprocessingConfig(
-            tagger=tagger, test_dir=self.test_dir, sampling_method=method
+        # Change the method to pdf and adapt options
+        replaceLineInFile(self.pdf_config, "  method: count", "  method: pdf")
+        replaceLineInFile(
+            self.pdf_config,
+            "          bins: [[0, 600000, 351], [650000, 6000000, 84]]",
+            "          bins: [[0, 25e4, 100], [25e4, 6e6, 100]]",
         )
-        self.assertTrue(
-            runPreprocessing(
-                config=config, test_dir=self.test_dir, tagger=tagger, method=method
-            )
+        replaceLineInFile(
+            self.pdf_config,
+            "          bins: [0, 2.5, 10]",
+            "          bins: [[0, 2.5, 10], [0, 2.5, 10]]",
+        )
+        replaceLineInFile(
+            self.pdf_config,
+            "    njets: 25e6",
+            "    njets: -1",
+        )
+        replaceLineInFile(
+            self.pdf_config,
+            "      training_ttbar_bjets: 5.5e6",
+            "",
+        )
+        replaceLineInFile(
+            self.pdf_config,
+            "      training_ttbar_cjets: 11.5e6",
+            "",
+        )
+        replaceLineInFile(
+            self.pdf_config,
+            "      training_ttbar_ujets: 13.5e6",
+            "",
         )
 
-    def test_preprocessing_dl1r_count(self):
-        """Integration test of preprocessing.py script using DL1r variables."""
-        tagger = "dl1r"
-        method = "count"
+        # copy config file and change name to weighting for weighting preprocessing config
+        self.weight_config = self.config[:].replace(".yaml", "") + "_weighting.yaml"
+        copyfile(self.config, self.weight_config)
 
-        config = preparePreprocessingConfig(
-            tagger=tagger, test_dir=self.test_dir, sampling_method=method
+        replaceLineInFile(self.weight_config, "  method: count", "  method: weighting")
+        replaceLineInFile(
+            self.weight_config,
+            "    bool_attach_sample_weights: False",
+            "    bool_attach_sample_weights: True",
         )
-        self.assertTrue(
-            runPreprocessing(
-                config=config, test_dir=self.test_dir, tagger=tagger, method=method
+
+        logger.info("Downloading test data...")
+        for file in self.data["test_preprocessing"]["files"]:
+            path = os.path.join(
+                self.data["data_url"],
+                self.data["test_preprocessing"]["data_subfolder"],
+                file,
             )
+            logger.info(f"Retrieving file from path {path}")
+            run(["wget", path, "--directory-prefix", test_dir], check=True)
+
+        run(
+            [
+                "mv",
+                os.path.join(test_dir, "ci_ttbar_basefile.h5"),
+                os.path.join(test_dir, "ttbar", "ci_ttbar_basefile.h5"),
+            ],
+            check=True,
+        )
+        run(
+            [
+                "mv",
+                os.path.join(test_dir, "ci_zpext_basefile.h5"),
+                os.path.join(test_dir, "zpext", "ci_zpext_basefile.h5"),
+            ],
+            check=True,
         )
 
-    def test_preprocessing_umami_pdf(self):
+    def test_preprocessing_umami_count(self):
         """Integration test of preprocessing.py script using Umami variables."""
-        tagger = "umami"
-        method = "pdf"
+        replaceLineInFile(
+            self.config_paths,
+            ".var_file:",
+            f".var_file: &var_file {self.var_dict_umami}",
+        )
+
+        self.assertTrue(runPreprocessing(self.config, tagger="umami", method="count"))
 
-        config = preparePreprocessingConfig(
-            tagger=tagger, test_dir=self.test_dir, sampling_method=method
+    def test_preprocessing_dips_count(self):
+        """Integration test of preprocessing.py script using DIPS variables."""
+        replaceLineInFile(
+            self.config_paths,
+            ".var_file:",
+            f".var_file: &var_file {self.var_dict_dips}",
         )
-        self.assertTrue(
-            runPreprocessing(
-                config=config, test_dir=self.test_dir, tagger=tagger, method=method
-            )
+        self.assertTrue(runPreprocessing(self.config, tagger="dips", method="count"))
+
+    def test_preprocessing_dl1r_count(self):
+        """Integration test of preprocessing.py script using DL1r variables."""
+        replaceLineInFile(
+            self.config,
+            "    save_tracks:",
+            "    save_tracks: False",
         )
 
-    def test_preprocessing_dips_pdf(self):
-        """Integration test of preprocessing.py script using DIPS variables."""
-        tagger = "dips"
-        method = "pdf"
+        replaceLineInFile(
+            self.config_paths,
+            ".var_file:",
+            f".var_file: &var_file {self.var_dict_dl1r}",
+        )
+
+        self.assertTrue(runPreprocessing(self.config, tagger="dl1r", method="count"))
 
-        config = preparePreprocessingConfig(
-            tagger=tagger, test_dir=self.test_dir, sampling_method=method
+    def test_preprocessing_umami_pdf(self):
+        """Integration test of preprocessing.py script using Umami variables."""
+        replaceLineInFile(
+            self.config_paths,
+            ".var_file:",
+            f".var_file: &var_file {self.var_dict_umami}",
         )
-        self.assertTrue(
-            runPreprocessing(
-                config=config, test_dir=self.test_dir, tagger=tagger, method=method
-            )
+
+        self.assertTrue(runPreprocessing(self.pdf_config, tagger="umami", method="pdf"))
+
+    def test_preprocessing_dips_pdf(self):
+        """Integration test of preprocessing.py script using DIPS variables."""
+        replaceLineInFile(
+            self.config_paths,
+            ".var_file:",
+            f".var_file: &var_file {self.var_dict_dips}",
         )
 
+        self.assertTrue(runPreprocessing(self.pdf_config, tagger="dips", method="pdf"))
+
     def test_preprocessing_dl1r_pdf(self):
         """Integration test of preprocessing.py script using DL1r variables."""
-        tagger = "dl1r"
-        method = "pdf"
-
-        config = preparePreprocessingConfig(
-            tagger=tagger, test_dir=self.test_dir, sampling_method=method
+        replaceLineInFile(
+            self.pdf_config,
+            "    save_tracks:",
+            "    save_tracks: False",
         )
-        self.assertTrue(
-            runPreprocessing(
-                config=config, test_dir=self.test_dir, tagger=tagger, method=method
-            )
+
+        replaceLineInFile(
+            self.config_paths,
+            ".var_file:",
+            f".var_file: &var_file {self.var_dict_dl1r}",
         )
 
+        self.assertTrue(runPreprocessing(self.pdf_config, tagger="dl1r", method="pdf"))
+
     def test_preprocessing_umami_weighting(self):
         """Integration test of preprocessing.py script using Umami variables."""
-        tagger = "umami"
-        method = "weighting"
-
-        config = preparePreprocessingConfig(
-            tagger=tagger, test_dir=self.test_dir, sampling_method=method
+        replaceLineInFile(
+            self.config_paths,
+            ".var_file:",
+            f".var_file: &var_file {self.var_dict_umami}",
         )
+
         self.assertTrue(
-            runPreprocessing(
-                config=config, test_dir=self.test_dir, tagger=tagger, method=method
-            )
+            runPreprocessing(self.weight_config, tagger="umami", method="weighting")
         )
 
     def test_preprocessing_dips_weighting(self):
         """Integration test of preprocessing.py script using DIPS variables."""
-        tagger = "dips"
-        method = "weighting"
-
-        config = preparePreprocessingConfig(
-            tagger=tagger, test_dir=self.test_dir, sampling_method=method
+        replaceLineInFile(
+            self.config_paths,
+            ".var_file:",
+            f".var_file: &var_file {self.var_dict_dips}",
         )
         self.assertTrue(
-            runPreprocessing(
-                config=config, test_dir=self.test_dir, tagger=tagger, method=method
-            )
+            runPreprocessing(self.weight_config, tagger="dips", method="weighting")
         )
 
     def test_preprocessing_dl1r_weighting(self):
         """Integration test of preprocessing.py script using DL1r variables."""
-        tagger = "dl1r"
-        method = "weighting"
+        replaceLineInFile(
+            self.weight_config,
+            "    save_tracks:",
+            "    save_tracks: False",
+        )
 
-        config = preparePreprocessingConfig(
-            tagger=tagger, test_dir=self.test_dir, sampling_method=method
+        replaceLineInFile(
+            self.config_paths,
+            ".var_file:",
+            f".var_file: &var_file {self.var_dict_dl1r}",
         )
+
         self.assertTrue(
-            runPreprocessing(
-                config=config, test_dir=self.test_dir, tagger=tagger, method=method
-            )
+            runPreprocessing(self.weight_config, tagger="dl1r", method="weighting")
         )
diff --git a/umami/tests/integration/test_train.py b/umami/tests/integration/test_train.py
index 42319e4aa..75e030318 100644
--- a/umami/tests/integration/test_train.py
+++ b/umami/tests/integration/test_train.py
@@ -72,11 +72,11 @@ def prepareConfig(
     config = os.path.join(test_dir, os.path.basename(config_source))
 
     preprocessing_config_source = os.path.join(
-        f"./test_preprocessing_{preprocess_files}/",
+        f"./test_preprocessing_{preprocess_files}/preprocessing/",
         os.path.basename(data["test_preprocessing"]["config"]),
     )
     preprocessing_config_paths_source = os.path.join(
-        f"./test_preprocessing_{preprocess_files}/",
+        f"./test_preprocessing_{preprocess_files}/preprocessing/",
         os.path.basename(data["test_preprocessing"]["config_paths"]),
     )
     preprocessing_config = os.path.join(
@@ -87,7 +87,7 @@ def prepareConfig(
     )
 
     var_dict_source = os.path.join(
-        f"./test_preprocessing_{preprocess_files}/",
+        f"./test_preprocessing_{preprocess_files}/preprocessing/",
         os.path.basename(data["test_preprocessing"][f"var_dict_{preprocess_files}"]),
     )
     var_dict = os.path.join(test_dir, os.path.basename(var_dict_source))
@@ -96,13 +96,13 @@ def prepareConfig(
     logger.info("Retrieving files from preprocessing...")
 
     train_file = os.path.join(
-        f"./test_preprocessing_{preprocess_files}/",
+        f"./test_preprocessing_{preprocess_files}/preprocessing/",
         "PFlow-hybrid_70-test-resampled_scaled_shuffled.h5",
     )
     test_file_ttbar = os.path.join(test_dir, "ci_ttbar_testing.h5")
     test_file_zprime = os.path.join(test_dir, "ci_zpext_testing.h5")
     scale_dict = os.path.join(
-        f"./test_preprocessing_{preprocess_files}/",
+        f"./test_preprocessing_{preprocess_files}/preprocessing/",
         "PFlow-scale_dict.json",
     )
 
@@ -184,7 +184,7 @@ def prepareConfig(
 
     if useTFRecords is True:
         config_file["train_file"] = os.path.join(
-            f"./test_preprocessing_{preprocess_files}/",
+            f"./test_preprocessing_{preprocess_files}/preprocessing/",
             "PFlow-hybrid_70-test-resampled_scaled_shuffled",
         )
         config_file["model_name"] = data["test_dips"]["model_name"] + "_tfrecords"
-- 
GitLab