From c284851f37fb1921688243bf5bd180b454db3dd2 Mon Sep 17 00:00:00 2001 From: alfroch <alexander.froch@cern.ch> Date: Mon, 31 Jan 2022 15:27:35 +0100 Subject: [PATCH 01/10] Adding new path for PDF_sampling files --- umami/preprocessing_tools/Resampling.py | 39 +++++++++++++++---------- 1 file changed, 23 insertions(+), 16 deletions(-) diff --git a/umami/preprocessing_tools/Resampling.py b/umami/preprocessing_tools/Resampling.py index 88c5147b0..048333f72 100644 --- a/umami/preprocessing_tools/Resampling.py +++ b/umami/preprocessing_tools/Resampling.py @@ -339,10 +339,17 @@ class Resampling: self.outfile_name = self.config.GetFileName(option="resampled") self.outfile_path = self.config.config["parameters"]["sample_path"] + self.resampled_path = self.config.config["parameters"]["file_path"] + # Check if the directory for the outfile is existing if os.path.dirname(self.outfile_name): os.makedirs(os.path.dirname(self.outfile_name), exist_ok=True) + # Check if the directory for the resampled, scaled files + # (normally preprocessed/) exists + if os.path.dirname(self.resampled_path): + os.makedirs(os.path.dirname(self.resampled_path), exist_ok=True) + # Get class labels from sampling/preparation. 
# Try/Except here for backward compatibility try: @@ -1296,7 +1303,7 @@ class PDFSampling(Resampling): # pylint: disable=too-many-public-methods ], ) save_name = os.path.join( - self.outfile_path, + self.resampled_path, "PDF_sampling", f"inter_func_{store_key}", ) @@ -1429,7 +1436,7 @@ class PDFSampling(Resampling): # pylint: disable=too-many-public-methods """Get unnormalised PDF weight.""" # Get the inter_func load_name = os.path.join( - self.outfile_path, + self.resampled_path, "PDF_sampling", f"inter_func_{store_key}", ) @@ -1469,7 +1476,7 @@ class PDFSampling(Resampling): # pylint: disable=too-many-public-methods # Load number to sample load_name = os.path.join( - self.outfile_path, + self.resampled_path, "PDF_sampling", "target_data.json", ) @@ -1591,7 +1598,7 @@ class PDFSampling(Resampling): # pylint: disable=too-many-public-methods create_file = True chunk_counter = 0 save_name = os.path.join( - self.outfile_path, + self.resampled_path, "PDF_sampling", self.options["samples"][sample_category][sample_id] + "_selected.h5", ) @@ -1684,7 +1691,7 @@ class PDFSampling(Resampling): # pylint: disable=too-many-public-methods # Load number to sample load_name = os.path.join( - self.outfile_path, + self.resampled_path, "PDF_sampling", "target_data.json", ) @@ -1693,7 +1700,7 @@ class PDFSampling(Resampling): # pylint: disable=too-many-public-methods number_to_sample = target_data["number_to_sample"][sample_name] index_file = os.path.join( - self.outfile_path, + self.resampled_path, "PDF_sampling", self.options["samples"][sample_category][sample_id] + "_indices.h5", ) @@ -1704,7 +1711,7 @@ class PDFSampling(Resampling): # pylint: disable=too-many-public-methods duplicate = True save_name = os.path.join( - self.outfile_path, + self.resampled_path, "PDF_sampling", self.options["samples"][sample_category][sample_id] + "_selected.h5", ) @@ -1879,12 +1886,12 @@ class PDFSampling(Resampling): # pylint: disable=too-many-public-methods "target_fraction": 
self.target_fractions, } save_name = os.path.join( - self.outfile_path, + self.resampled_path, "PDF_sampling", "target_data.json", ) - if not os.path.exists(os.path.join(self.outfile_path, "PDF_sampling")): - os.mkdir(os.path.join(self.outfile_path, "PDF_sampling")) + if not os.path.exists(os.path.join(self.resampled_path, "PDF_sampling")): + os.mkdir(os.path.join(self.resampled_path, "PDF_sampling")) with open(save_name, "w") as write_file: json.dump(save_data, write_file, cls=JsonNumpyEncoder) @@ -1899,7 +1906,7 @@ class PDFSampling(Resampling): # pylint: disable=too-many-public-methods """ load_name = os.path.join( - self.outfile_path, + self.resampled_path, "PDF_sampling", "target_data.json", ) @@ -1998,7 +2005,7 @@ class PDFSampling(Resampling): # pylint: disable=too-many-public-methods # Load the target data load_name = os.path.join( - self.outfile_path, + self.resampled_path, "PDF_sampling", "target_data.json", ) @@ -2060,14 +2067,14 @@ class PDFSampling(Resampling): # pylint: disable=too-many-public-methods sample_name = self.options["samples"][sample_category][sample_id] save_name = os.path.join( - self.outfile_path, + self.resampled_path, "PDF_sampling", self.options["samples"][sample_category][sample_id] + "_indices.h5", ) # Load number to sample load_name = os.path.join( - self.outfile_path, + self.resampled_path, "PDF_sampling", "target_data.json", ) @@ -2180,7 +2187,7 @@ class PDFSampling(Resampling): # pylint: disable=too-many-public-methods ): for _, sample_category in enumerate(self.options["samples"]): load_name = os.path.join( - self.outfile_path, + self.resampled_path, "PDF_sampling", self.options["samples"][sample_category][sample_id] + "_selected.h5", @@ -2362,7 +2369,7 @@ class PDFSampling(Resampling): # pylint: disable=too-many-public-methods ): for cat_ind, sample_category in enumerate(self.options["samples"]): load_name = os.path.join( - self.outfile_path, + self.resampled_path, "PDF_sampling", 
self.options["samples"][sample_category][sample_id] + "_selected.h5", -- GitLab From 3da3f8a784ff69adac850f7a5b46377028892eb6 Mon Sep 17 00:00:00 2001 From: alfroch <alexander.froch@cern.ch> Date: Mon, 31 Jan 2022 16:58:49 +0100 Subject: [PATCH 02/10] Adding new path for resampling plots --- umami/preprocessing_tools/Resampling.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/umami/preprocessing_tools/Resampling.py b/umami/preprocessing_tools/Resampling.py index 048333f72..8457273f7 100644 --- a/umami/preprocessing_tools/Resampling.py +++ b/umami/preprocessing_tools/Resampling.py @@ -2340,7 +2340,7 @@ class PDFSampling(Resampling): # pylint: disable=too-many-public-methods # Check if the directory for the plots exists plot_dir_path = os.path.join( - self.config.config["parameters"]["sample_path"], + self.resampled_path, "plots/", ) os.makedirs(plot_dir_path, exist_ok=True) @@ -2566,7 +2566,7 @@ class Weighting(ResamplingTools): # Check if the directory for the plots exists plot_dir_path = os.path.join( - self.config.config["parameters"]["sample_path"], + self.resampled_path, "plots/", ) os.makedirs(plot_dir_path, exist_ok=True) @@ -2820,7 +2820,7 @@ class UnderSampling(ResamplingTools): # Check if the directory for the plots exists plot_dir_path = os.path.join( - self.config.config["parameters"]["sample_path"], + self.resampled_path, "plots/", ) os.makedirs(plot_dir_path, exist_ok=True) -- GitLab From 98612fb503b01d55748b9b32efad5c81b6ad582d Mon Sep 17 00:00:00 2001 From: alfroch <alexander.froch@cern.ch> Date: Tue, 1 Feb 2022 09:08:36 +0100 Subject: [PATCH 03/10] Fixing os makedirs --- umami/preprocessing_tools/Resampling.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/umami/preprocessing_tools/Resampling.py b/umami/preprocessing_tools/Resampling.py index 8457273f7..2812c3bf5 100644 --- a/umami/preprocessing_tools/Resampling.py +++ b/umami/preprocessing_tools/Resampling.py @@ -1890,8 +1890,10 @@ class 
PDFSampling(Resampling): # pylint: disable=too-many-public-methods "PDF_sampling", "target_data.json", ) - if not os.path.exists(os.path.join(self.resampled_path, "PDF_sampling")): - os.mkdir(os.path.join(self.resampled_path, "PDF_sampling")) + + # Ensure the output path exists + os.makedirs(os.path.join(self.resampled_path, "PDF_sampling"), exist_ok=True) + with open(save_name, "w") as write_file: json.dump(save_data, write_file, cls=JsonNumpyEncoder) -- GitLab From ad3f8c33c64ef7cac88c416ae051772854130b27 Mon Sep 17 00:00:00 2001 From: alfroch <alexander.froch@cern.ch> Date: Tue, 1 Feb 2022 13:18:29 +0100 Subject: [PATCH 04/10] Adding overhauled preprocessing integration tests --- umami/tests/integration/test_preprocessing.py | 537 ++++++++---------- 1 file changed, 242 insertions(+), 295 deletions(-) diff --git a/umami/tests/integration/test_preprocessing.py b/umami/tests/integration/test_preprocessing.py index ce710b252..73e215164 100644 --- a/umami/tests/integration/test_preprocessing.py +++ b/umami/tests/integration/test_preprocessing.py @@ -5,14 +5,15 @@ This script integration tests the preprocessing methods. """ import os +import tempfile import unittest -from shutil import copyfile +from shutil import copyfile, copytree from subprocess import CalledProcessError, run import yaml from umami.configuration import logger -from umami.tools import replaceLineInFile, yaml_loader +from umami.tools import YAML, replaceLineInFile, yaml_loader def getConfiguration(): @@ -28,7 +29,161 @@ def getConfiguration(): return conf_setup -def runPreprocessing(config: dict, tagger: str, method: str) -> bool: +def preparePreprocessingConfig( + test_dir: str, + tagger: str, + sampling_method: str, +) -> str: + """Prepare the preprocessing configs for the different tagger/methods. + + Parameters + ---------- + test_dir : str + Path to test directory + tagger : str + Name of the tagger which the preprocessing is for. + sampling_method : str + Sampling method that will be used. 
+ + Returns + ------- + str + Path to the prepared preprocessing config. + """ + # Get test configuration + data = getConfiguration() + + # Make ttbar und zprime directory + os.makedirs(os.path.join(test_dir, "ttbar"), exist_ok=True) + os.makedirs(os.path.join(test_dir, "zpext"), exist_ok=True) + + # inputs for test will be located in test_dir + config_source = os.path.join(os.getcwd(), data["test_preprocessing"]["config"]) + config_paths_source = os.path.join( + os.getcwd(), data["test_preprocessing"]["config_paths"] + ) + var_dict_source = os.path.join( + os.getcwd(), data["test_preprocessing"][f"var_dict_{tagger}"] + ) + config = os.path.join(test_dir, os.path.basename(config_source)) + config_paths = os.path.join(test_dir, os.path.basename(config_paths_source)) + var_dict = os.path.join(test_dir, os.path.basename(var_dict_source)) + scale_dict = os.path.join(test_dir, "PFlow-scale_dict.json") + output = os.path.join(test_dir, "PFlow-hybrid_70-test.h5") + + logger.info(f"Preparing config file based on {config_source} in {config}...") + copyfile(config_source, config) + copyfile(config_paths_source, config_paths) + copyfile(var_dict_source, var_dict) + + # modify copy of preprocessing config file for test + replaceLineInFile( + config_paths, + "ntuple_path:", + f"ntuple_path: &ntuple_path {test_dir}", + ) + replaceLineInFile( + config_paths, + "sample_path:", + f"sample_path: &sample_path {test_dir}", + ) + replaceLineInFile( + config_paths, + "file_path:", + f"file_path: &file_path {test_dir}", + ) + replaceLineInFile( + config_paths, + ".outfile_name:", + f".outfile_name: &outfile_name {output}", + ) + replaceLineInFile( + config_paths, + ".dict_file:", + f".dict_file: &dict_file {scale_dict}", + ) + replaceLineInFile( + config_paths, + ".intermediate_index_file:", + ".intermediate_index_file: &intermediate_index_file indices.h5", + ) + replaceLineInFile( + config_paths, + ".var_file:", + f".var_file: &var_file {var_dict}", + ) + + # Load the preprocessing file 
+ umami_yaml = YAML(typ="safe", pure=True) + with open(config, "r") as conf: + config_file = umami_yaml.load(conf) + + config_file["preparation"]["ntuples"]["ttbar"]["file_pattern"] = "ttbar/*.h5" + config_file["preparation"]["ntuples"]["zprime"]["file_pattern"] = "zpext/*.h5" + + if tagger.casefold() == "dl1r": + config_file["sampling"]["options"]["save_tracks"] = False + config_file["sampling"]["options"]["tracks_names"] = None + + else: + config_file["sampling"]["options"]["save_tracks"] = True + config_file["sampling"]["options"]["tracks_names"] = ["tracks", "tracks_loose"] + + if sampling_method.casefold() == "pdf": + # Change the method to pdf and adapt options + config_file["sampling"]["method"] = "pdf" + + # Set correct binning for pdf method + config_file["sampling"]["options"]["sampling_variables"] = [ + {"pt_btagJes": {"bins": [[0, 25e4, 100], [25e4, 6e6, 100]]}}, + {"absEta_btagJes": {"bins": [[0, 2.5, 10], [0, 2.5, 10]]}}, + ] + + # Set number of jets to maximum + config_file["sampling"]["options"]["njets"] = -1 + + # Remove custom njets initial + config_file["sampling"]["options"]["custom_njets_initial"] = None + + elif sampling_method.casefold() == "weighting": + config_file["sampling"]["method"] = "weighting" + config_file["sampling"]["options"]["bool_attach_sample_weights"] = True + + # save the preprocessing config file for test + with open(config, "w") as con: + umami_yaml.dump(config_file, con) + + logger.info("Downloading test data...") + for file in data["test_preprocessing"]["files"]: + path = os.path.join( + data["data_url"], + data["test_preprocessing"]["data_subfolder"], + file, + ) + logger.info(f"Retrieving file from path {path}") + run(["wget", path, "--directory-prefix", test_dir], check=True) + + run( + [ + "mv", + os.path.join(test_dir, "ci_ttbar_basefile.h5"), + os.path.join(test_dir, "ttbar", "ci_ttbar_basefile.h5"), + ], + check=True, + ) + run( + [ + "mv", + os.path.join(test_dir, "ci_zpext_basefile.h5"), + os.path.join(test_dir, 
"zpext", "ci_zpext_basefile.h5"), + ], + check=True, + ) + + return config + + +def runPreprocessing(config: dict, tagger: str, method: str, test_dir: str) -> bool: """ Call all steps of the preprocessing for a certain configuration and variable dict input. @@ -189,59 +344,7 @@ def runPreprocessing(config: dict, tagger: str, method: str) -> bool: ) from Error tagger_path = f"./test_preprocessing_{tagger}/" - if not os.path.isdir(tagger_path): - run(["mkdir", tagger_path], check=True) - - run( - [ - "cp", - "-r", - "/tmp/umami/preprocessing/", - tagger_path, - ], - check=True, - ) - - # Get the path of the not needed configs - unused_configs = os.path.join( - tagger_path, "preprocessing/", "PFlow-Preprocessing_*.yaml" - ) - - # Rename the needed config to PFlow-Preprocessing.yaml and erase the unused - # TODO change in python 3.10 - if method == "count": - run( - [f"rm -rfv {unused_configs}"], - shell=True, - check=True, - ) - - elif method == "pdf": - copyfile( - os.path.join(tagger_path, "preprocessing/", "PFlow-Preprocessing_pdf.yaml"), - os.path.join(tagger_path, "preprocessing/", "PFlow-Preprocessing.yaml"), - ) - run( - [f"rm -rfv {unused_configs}"], - shell=True, - check=True, - ) - - elif method == "weighting": - copyfile( - os.path.join( - tagger_path, "preprocessing/", "PFlow-Preprocessing_weighting.yaml" - ), - os.path.join(tagger_path, "preprocessing/", "PFlow-Preprocessing.yaml"), - ) - run( - [f"rm -rfv {unused_configs}"], - shell=True, - check=True, - ) - - else: - raise KeyError(f"Method {method} is not supported by the integration test!") + copytree(test_dir, tagger_path) return True @@ -260,288 +363,132 @@ class TestPreprocessing(unittest.TestCase): # Get test configuration self.data = getConfiguration() - test_dir = os.path.join(self.data["test_preprocessing"]["testdir"]) - logger.info(f"Creating test directory in {test_dir}") - # clean up, hopefully this causes no "uh oh..."" - if test_dir.startswith("/tmp"): - run(["rm", "-rf", test_dir], 
check=True) - run(["mkdir", "-p", test_dir], check=True) - - # Make filepaths for basefiles - run(["mkdir", "-p", os.path.join(test_dir, "ttbar")], check=True) - run(["mkdir", "-p", os.path.join(test_dir, "zpext")], check=True) - - # inputs for test will be located in test_dir - config_source = os.path.join( - os.getcwd(), self.data["test_preprocessing"]["config"] - ) - config_paths_source = os.path.join( - os.getcwd(), self.data["test_preprocessing"]["config_paths"] - ) - var_dict_umami_source = os.path.join( - os.getcwd(), self.data["test_preprocessing"]["var_dict_umami"] - ) - var_dict_dips_source = os.path.join( - os.getcwd(), self.data["test_preprocessing"]["var_dict_dips"] - ) - var_dict_dl1r_source = os.path.join( - os.getcwd(), self.data["test_preprocessing"]["var_dict_dl1r"] - ) - self.config = os.path.join(test_dir, os.path.basename(config_source)) - self.config_paths = os.path.join( - test_dir, os.path.basename(config_paths_source) - ) - self.var_dict_umami = os.path.join( - test_dir, os.path.basename(var_dict_umami_source) - ) - self.var_dict_dips = os.path.join( - test_dir, os.path.basename(var_dict_dips_source) - ) - self.var_dict_dl1r = os.path.join( - test_dir, os.path.basename(var_dict_dl1r_source) - ) - self.scale_dict = os.path.join(test_dir, "PFlow-scale_dict.json") - self.output = os.path.join(test_dir, "PFlow-hybrid_70-test.h5") - - logger.info( - f"Preparing config file based on {config_source} in {self.config}..." 
- ) - copyfile(config_source, self.config) - copyfile(config_paths_source, self.config_paths) - copyfile(var_dict_umami_source, self.var_dict_umami) - copyfile(var_dict_dips_source, self.var_dict_dips) - copyfile(var_dict_dl1r_source, self.var_dict_dl1r) - - # modify copy of preprocessing config file for test - replaceLineInFile( - self.config_paths, - "ntuple_path:", - f"ntuple_path: &ntuple_path {test_dir}", - ) - replaceLineInFile( - self.config_paths, - "sample_path:", - f"sample_path: &sample_path {test_dir}", - ) - replaceLineInFile( - self.config_paths, - "file_path:", - f"file_path: &file_path {test_dir}", - ) - replaceLineInFile( - self.config_paths, - ".outfile_name:", - f".outfile_name: &outfile_name {self.output}", - ) - replaceLineInFile( - self.config_paths, - ".dict_file:", - f".dict_file: &dict_file {self.scale_dict}", - ) - replaceLineInFile( - self.config_paths, - ".intermediate_index_file:", - ".intermediate_index_file: &intermediate_index_file indices.h5", - ) - replaceLineInFile( - self.config, - " file_pattern: user.alfroch.410470", - " file_pattern: ttbar/*.h5", - ) - replaceLineInFile( - self.config, - " file_pattern: user.alfroch.427081", - " file_pattern: zpext/*.h5", - ) - replaceLineInFile( - self.config, - " tracks_names:", - " tracks_names: ['tracks','tracks_loose']", - ) - - # copy config file and change name to pdf for pdf preprocessing config - self.pdf_config = self.config[:].replace(".yaml", "") + "_pdf.yaml" - copyfile(self.config, self.pdf_config) + self.test_dir_path = tempfile.TemporaryDirectory() # pylint: disable=R1732 + self.test_dir = f"{self.test_dir_path.name}" + logger.info(f"Creating test directory in {self.test_dir}") - # Change the method to pdf and adapt options - replaceLineInFile(self.pdf_config, " method: count", " method: pdf") - replaceLineInFile( - self.pdf_config, - " bins: [[0, 600000, 351], [650000, 6000000, 84]]", - " bins: [[0, 25e4, 100], [25e4, 6e6, 100]]", - ) - replaceLineInFile( - self.pdf_config, - 
" bins: [0, 2.5, 10]", - " bins: [[0, 2.5, 10], [0, 2.5, 10]]", - ) - replaceLineInFile( - self.pdf_config, - " njets: 25e6", - " njets: -1", - ) - replaceLineInFile( - self.pdf_config, - " training_ttbar_bjets: 5.5e6", - "", - ) - replaceLineInFile( - self.pdf_config, - " training_ttbar_cjets: 11.5e6", - "", - ) - replaceLineInFile( - self.pdf_config, - " training_ttbar_ujets: 13.5e6", - "", - ) - - # copy config file and change name to pdf for pdf preprocessing config - self.weight_config = self.config[:].replace(".yaml", "") + "_weighting.yaml" - copyfile(self.config, self.weight_config) + def test_preprocessing_umami_count(self): + """Integration test of preprocessing.py script using Umami variables.""" + tagger = "umami" + method = "count" - replaceLineInFile(self.weight_config, " method: count", " method: weighting") - replaceLineInFile( - self.weight_config, - " bool_attach_sample_weights: False", - " bool_attach_sample_weights: True", + config = preparePreprocessingConfig( + tagger=tagger, test_dir=self.test_dir, sampling_method=method ) - - logger.info("Downloading test data...") - for file in self.data["test_preprocessing"]["files"]: - path = os.path.join( - self.data["data_url"], - self.data["test_preprocessing"]["data_subfolder"], - file, + self.assertTrue( + runPreprocessing( + config=config, test_dir=self.test_dir, tagger=tagger, method=method ) - logger.info(f"Retrieving file from path {path}") - run(["wget", path, "--directory-prefix", test_dir], check=True) - - run( - [ - "mv", - os.path.join(test_dir, "ci_ttbar_basefile.h5"), - os.path.join(test_dir, "ttbar", "ci_ttbar_basefile.h5"), - ], - check=True, - ) - run( - [ - "mv", - os.path.join(test_dir, "ci_zpext_basefile.h5"), - os.path.join(test_dir, "zpext", "ci_zpext_basefile.h5"), - ], - check=True, ) - def test_preprocessing_umami_count(self): - """Integration test of preprocessing.py script using Umami variables.""" - replaceLineInFile( - self.config_paths, - ".var_file:", - f".var_file: 
&var_file {self.var_dict_umami}", - ) - - self.assertTrue(runPreprocessing(self.config, tagger="umami", method="count")) - def test_preprocessing_dips_count(self): """Integration test of preprocessing.py script using DIPS variables.""" - replaceLineInFile( - self.config_paths, - ".var_file:", - f".var_file: &var_file {self.var_dict_dips}", + tagger = "dips" + method = "count" + + config = preparePreprocessingConfig( + tagger=tagger, test_dir=self.test_dir, sampling_method=method + ) + self.assertTrue( + runPreprocessing( + config=config, test_dir=self.test_dir, tagger=tagger, method=method + ) ) - self.assertTrue(runPreprocessing(self.config, tagger="dips", method="count")) def test_preprocessing_dl1r_count(self): """Integration test of preprocessing.py script using DL1r variables.""" - replaceLineInFile( - self.config, - " save_tracks:", - " save_tracks: False", - ) + tagger = "dl1r" + method = "count" - replaceLineInFile( - self.config_paths, - ".var_file:", - f".var_file: &var_file {self.var_dict_dl1r}", + config = preparePreprocessingConfig( + tagger=tagger, test_dir=self.test_dir, sampling_method=method + ) + self.assertTrue( + runPreprocessing( + config=config, test_dir=self.test_dir, tagger=tagger, method=method + ) ) - - self.assertTrue(runPreprocessing(self.config, tagger="dl1r", method="count")) def test_preprocessing_umami_pdf(self): """Integration test of preprocessing.py script using Umami variables.""" - replaceLineInFile( - self.config_paths, - ".var_file:", - f".var_file: &var_file {self.var_dict_umami}", - ) + tagger = "umami" + method = "pdf" - self.assertTrue(runPreprocessing(self.pdf_config, tagger="umami", method="pdf")) + config = preparePreprocessingConfig( + tagger=tagger, test_dir=self.test_dir, sampling_method=method + ) + self.assertTrue( + runPreprocessing( + config=config, test_dir=self.test_dir, tagger=tagger, method=method + ) + ) def test_preprocessing_dips_pdf(self): """Integration test of preprocessing.py script using DIPS 
variables.""" - replaceLineInFile( - self.config_paths, - ".var_file:", - f".var_file: &var_file {self.var_dict_dips}", - ) + tagger = "dips" + method = "pdf" - self.assertTrue(runPreprocessing(self.pdf_config, tagger="dips", method="pdf")) + config = preparePreprocessingConfig( + tagger=tagger, test_dir=self.test_dir, sampling_method=method + ) + self.assertTrue( + runPreprocessing( + config=config, test_dir=self.test_dir, tagger=tagger, method=method + ) + ) def test_preprocessing_dl1r_pdf(self): """Integration test of preprocessing.py script using DL1r variables.""" - replaceLineInFile( - self.pdf_config, - " save_tracks:", - " save_tracks: False", - ) + tagger = "dl1r" + method = "pdf" - replaceLineInFile( - self.config_paths, - ".var_file:", - f".var_file: &var_file {self.var_dict_dl1r}", + config = preparePreprocessingConfig( + tagger=tagger, test_dir=self.test_dir, sampling_method=method + ) + self.assertTrue( + runPreprocessing( + config=config, test_dir=self.test_dir, tagger=tagger, method=method + ) ) - - self.assertTrue(runPreprocessing(self.pdf_config, tagger="dl1r", method="pdf")) def test_preprocessing_umami_weighting(self): """Integration test of preprocessing.py script using Umami variables.""" - replaceLineInFile( - self.config_paths, - ".var_file:", - f".var_file: &var_file {self.var_dict_umami}", - ) + tagger = "umami" + method = "weighting" + config = preparePreprocessingConfig( + tagger=tagger, test_dir=self.test_dir, sampling_method=method + ) self.assertTrue( - runPreprocessing(self.weight_config, tagger="umami", method="weighting") + runPreprocessing( + config=config, test_dir=self.test_dir, tagger=tagger, method=method + ) ) def test_preprocessing_dips_weighting(self): """Integration test of preprocessing.py script using DIPS variables.""" - replaceLineInFile( - self.config_paths, - ".var_file:", - f".var_file: &var_file {self.var_dict_dips}", + tagger = "dips" + method = "weighting" + + config = preparePreprocessingConfig( + tagger=tagger, 
test_dir=self.test_dir, sampling_method=method ) self.assertTrue( - runPreprocessing(self.weight_config, tagger="dips", method="weighting") + runPreprocessing( + config=config, test_dir=self.test_dir, tagger=tagger, method=method + ) ) def test_preprocessing_dl1r_weighting(self): """Integration test of preprocessing.py script using DL1r variables.""" - replaceLineInFile( - self.weight_config, - " save_tracks:", - " save_tracks: False", - ) + tagger = "dl1r" + method = "weighting" - replaceLineInFile( - self.config_paths, - ".var_file:", - f".var_file: &var_file {self.var_dict_dl1r}", + config = preparePreprocessingConfig( + tagger=tagger, test_dir=self.test_dir, sampling_method=method ) - self.assertTrue( - runPreprocessing(self.weight_config, tagger="dl1r", method="weighting") + runPreprocessing( + config=config, test_dir=self.test_dir, tagger=tagger, method=method + ) ) -- GitLab From 0172eebdbd429117752db3dde5218538e67d19de Mon Sep 17 00:00:00 2001 From: alfroch <alexander.froch@cern.ch> Date: Tue, 1 Feb 2022 13:39:08 +0100 Subject: [PATCH 05/10] Fixing pipeline --- umami/preprocessing_tools/Scaling.py | 4 ++-- umami/tests/integration/test_preprocessing.py | 6 ++++++ umami/tests/integration/test_train.py | 12 ++++++------ 3 files changed, 14 insertions(+), 8 deletions(-) diff --git a/umami/preprocessing_tools/Scaling.py b/umami/preprocessing_tools/Scaling.py index a4a744f9e..12fbe98cd 100644 --- a/umami/preprocessing_tools/Scaling.py +++ b/umami/preprocessing_tools/Scaling.py @@ -425,8 +425,8 @@ class Scaling: # Loop over chunks for chunk_counter in range(n_chunks): logger.info( - f"Calculating track scales for chunk {chunk_counter+1} of" - f" {n_chunks}" + f"Calculating track scales for {tracks_name} for chunk" + f" {chunk_counter+1} of {n_chunks}" ) # Check if this is the first time loading from the generator if chunk_counter == 0: diff --git a/umami/tests/integration/test_preprocessing.py b/umami/tests/integration/test_preprocessing.py index 
73e215164..f3c3ee7d0 100644 --- a/umami/tests/integration/test_preprocessing.py +++ b/umami/tests/integration/test_preprocessing.py @@ -343,7 +343,13 @@ def runPreprocessing(config: dict, tagger: str, method: str, test_dir: str) -> b "Test failed: preprocessing.py --to_records." ) from Error + # Define path where the results are copied to tagger_path = f"./test_preprocessing_{tagger}/" + + # Remove any artifacts that might be locally there + run(["rm", "-rfv", f"{tagger_path}"]) + + # Copy the results to given path copytree(test_dir, tagger_path) return True diff --git a/umami/tests/integration/test_train.py b/umami/tests/integration/test_train.py index 75e030318..42319e4aa 100644 --- a/umami/tests/integration/test_train.py +++ b/umami/tests/integration/test_train.py @@ -72,11 +72,11 @@ def prepareConfig( config = os.path.join(test_dir, os.path.basename(config_source)) preprocessing_config_source = os.path.join( - f"./test_preprocessing_{preprocess_files}/preprocessing/", + f"./test_preprocessing_{preprocess_files}/", os.path.basename(data["test_preprocessing"]["config"]), ) preprocessing_config_paths_source = os.path.join( - f"./test_preprocessing_{preprocess_files}/preprocessing/", + f"./test_preprocessing_{preprocess_files}/", os.path.basename(data["test_preprocessing"]["config_paths"]), ) preprocessing_config = os.path.join( @@ -87,7 +87,7 @@ def prepareConfig( ) var_dict_source = os.path.join( - f"./test_preprocessing_{preprocess_files}/preprocessing/", + f"./test_preprocessing_{preprocess_files}/", os.path.basename(data["test_preprocessing"][f"var_dict_{preprocess_files}"]), ) var_dict = os.path.join(test_dir, os.path.basename(var_dict_source)) @@ -96,13 +96,13 @@ def prepareConfig( logger.info("Retrieving files from preprocessing...") train_file = os.path.join( - f"./test_preprocessing_{preprocess_files}/preprocessing/", + f"./test_preprocessing_{preprocess_files}/", "PFlow-hybrid_70-test-resampled_scaled_shuffled.h5", ) test_file_ttbar = 
os.path.join(test_dir, "ci_ttbar_testing.h5") test_file_zprime = os.path.join(test_dir, "ci_zpext_testing.h5") scale_dict = os.path.join( - f"./test_preprocessing_{preprocess_files}/preprocessing/", + f"./test_preprocessing_{preprocess_files}/", "PFlow-scale_dict.json", ) @@ -184,7 +184,7 @@ def prepareConfig( if useTFRecords is True: config_file["train_file"] = os.path.join( - f"./test_preprocessing_{preprocess_files}/preprocessing/", + f"./test_preprocessing_{preprocess_files}/", "PFlow-hybrid_70-test-resampled_scaled_shuffled", ) config_file["model_name"] = data["test_dips"]["model_name"] + "_tfrecords" -- GitLab From 5596c5ace5dae33987a12ceee58c880480b05e77 Mon Sep 17 00:00:00 2001 From: alfroch <alexander.froch@cern.ch> Date: Tue, 1 Feb 2022 13:43:05 +0100 Subject: [PATCH 06/10] Pylint --- umami/tests/integration/test_preprocessing.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/umami/tests/integration/test_preprocessing.py b/umami/tests/integration/test_preprocessing.py index f3c3ee7d0..ba11e24cc 100644 --- a/umami/tests/integration/test_preprocessing.py +++ b/umami/tests/integration/test_preprocessing.py @@ -347,7 +347,7 @@ def runPreprocessing(config: dict, tagger: str, method: str, test_dir: str) -> b tagger_path = f"./test_preprocessing_{tagger}/" # Remove any artifacts that might be locally there - run(["rm", "-rfv", f"{tagger_path}"]) + run(["rm", "-rfv", f"{tagger_path}"], check=True) # Copy the results to given path copytree(test_dir, tagger_path) -- GitLab From 5aadccab29bdce0746aaf22b25daca1f10671cf9 Mon Sep 17 00:00:00 2001 From: alfroch <alexander.froch@cern.ch> Date: Tue, 1 Feb 2022 13:55:51 +0100 Subject: [PATCH 07/10] Fixing pipeline --- umami/preprocessing_tools/Configuration.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/umami/preprocessing_tools/Configuration.py b/umami/preprocessing_tools/Configuration.py index b6311b747..627fe676d 100644 --- a/umami/preprocessing_tools/Configuration.py +++ 
b/umami/preprocessing_tools/Configuration.py @@ -31,9 +31,9 @@ class Configuration: first_line = conf.readline() first_line = first_line.split("!include ") if first_line[0] != "parameters: ": - raise ValueError( - "Please specify in the first line of the preprocessing config" - " the 'parameters' with the !include option." + logger.warning( + "`parameters` is not defined in the first line with !include. " + "This can cause issues further downstream. " ) preprocess_parameters_path = os.path.join( -- GitLab From 9a2298e72b6bfe0d8040794f4d49005567983ebd Mon Sep 17 00:00:00 2001 From: alfroch <alexander.froch@cern.ch> Date: Tue, 1 Feb 2022 14:03:58 +0100 Subject: [PATCH 08/10] Fixing pipeline --- umami/preprocessing_tools/Configuration.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/umami/preprocessing_tools/Configuration.py b/umami/preprocessing_tools/Configuration.py index 627fe676d..a55daf8d0 100644 --- a/umami/preprocessing_tools/Configuration.py +++ b/umami/preprocessing_tools/Configuration.py @@ -35,12 +35,14 @@ class Configuration: "`parameters` is not defined in the first line with !include. " "This can cause issues further downstream. 
" ) + return None - preprocess_parameters_path = os.path.join( - os.path.dirname(self.ConfigPath), - first_line[1].strip(), - ) - return preprocess_parameters_path + else: + preprocess_parameters_path = os.path.join( + os.path.dirname(self.ConfigPath), + first_line[1].strip(), + ) + return preprocess_parameters_path def LoadConfigFiles(self): """Load config file from disk.""" -- GitLab From c4615a8cbac8cf97c48b22cf8ce1ed7d2c31b1c8 Mon Sep 17 00:00:00 2001 From: alfroch <alexander.froch@cern.ch> Date: Tue, 1 Feb 2022 14:11:31 +0100 Subject: [PATCH 09/10] Fixing pylint --- umami/preprocessing_tools/Configuration.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/umami/preprocessing_tools/Configuration.py b/umami/preprocessing_tools/Configuration.py index a55daf8d0..8c3623537 100644 --- a/umami/preprocessing_tools/Configuration.py +++ b/umami/preprocessing_tools/Configuration.py @@ -35,14 +35,15 @@ class Configuration: "`parameters` is not defined in the first line with !include. " "This can cause issues further downstream. 
" ) - return None + preprocess_parameters_path = None else: preprocess_parameters_path = os.path.join( os.path.dirname(self.ConfigPath), first_line[1].strip(), ) - return preprocess_parameters_path + + return preprocess_parameters_path def LoadConfigFiles(self): """Load config file from disk.""" -- GitLab From e0c9b245c1cdb8a96953b7aa874dc58c154d8596 Mon Sep 17 00:00:00 2001 From: alfroch <alexander.froch@cern.ch> Date: Tue, 1 Feb 2022 14:32:55 +0100 Subject: [PATCH 10/10] Undo preprocessing changes --- umami/preprocessing_tools/Configuration.py | 17 +- umami/tests/integration/test_preprocessing.py | 539 ++++++++++-------- umami/tests/integration/test_train.py | 12 +- 3 files changed, 306 insertions(+), 262 deletions(-) diff --git a/umami/preprocessing_tools/Configuration.py b/umami/preprocessing_tools/Configuration.py index 8c3623537..b6311b747 100644 --- a/umami/preprocessing_tools/Configuration.py +++ b/umami/preprocessing_tools/Configuration.py @@ -31,18 +31,15 @@ class Configuration: first_line = conf.readline() first_line = first_line.split("!include ") if first_line[0] != "parameters: ": - logger.warning( - "`parameters` is not defined in the first line with !include. " - "This can cause issues further downstream. " - ) - preprocess_parameters_path = None - - else: - preprocess_parameters_path = os.path.join( - os.path.dirname(self.ConfigPath), - first_line[1].strip(), + raise ValueError( + "Please specify in the first line of the preprocessing config" + " the 'parameters' with the !include option." 
) + preprocess_parameters_path = os.path.join( + os.path.dirname(self.ConfigPath), + first_line[1].strip(), + ) return preprocess_parameters_path def LoadConfigFiles(self): diff --git a/umami/tests/integration/test_preprocessing.py b/umami/tests/integration/test_preprocessing.py index ba11e24cc..ce710b252 100644 --- a/umami/tests/integration/test_preprocessing.py +++ b/umami/tests/integration/test_preprocessing.py @@ -5,15 +5,14 @@ This script integration tests the preprocessing methods. """ import os -import tempfile import unittest -from shutil import copyfile, copytree +from shutil import copyfile from subprocess import CalledProcessError, run import yaml from umami.configuration import logger -from umami.tools import YAML, replaceLineInFile, yaml_loader +from umami.tools import replaceLineInFile, yaml_loader def getConfiguration(): @@ -29,161 +28,7 @@ def getConfiguration(): return conf_setup -def preparePreprocessingConfig( - test_dir: str, - tagger: str, - sampling_method: str, -) -> str: - """Prepare the preprocessing configs for the different tagger/methods. - - Parameters - ---------- - test_dir : str - Path to test directory - tagger : str - Name of the tagger which the preprocessing is for. - sampling_method : str - Sampling method that will be used. - - Returns - ------- - str - Path to the prepared preprocessing config. 
- """ - # Get test configuration - data = getConfiguration() - - # Make ttbar und zprime directory - os.makedirs(os.path.join(test_dir, "ttbar"), exist_ok=True) - os.makedirs(os.path.join(test_dir, "zpext"), exist_ok=True) - - # inputs for test will be located in test_dir - config_source = os.path.join(os.getcwd(), data["test_preprocessing"]["config"]) - config_paths_source = os.path.join( - os.getcwd(), data["test_preprocessing"]["config_paths"] - ) - var_dict_source = os.path.join( - os.getcwd(), data["test_preprocessing"][f"var_dict_{tagger}"] - ) - config = os.path.join(test_dir, os.path.basename(config_source)) - config_paths = os.path.join(test_dir, os.path.basename(config_paths_source)) - var_dict = os.path.join(test_dir, os.path.basename(var_dict_source)) - scale_dict = os.path.join(test_dir, "PFlow-scale_dict.json") - output = os.path.join(test_dir, "PFlow-hybrid_70-test.h5") - - logger.info(f"Preparing config file based on {config_source} in {config}...") - copyfile(config_source, config) - copyfile(config_paths_source, config_paths) - copyfile(var_dict_source, var_dict) - - # modify copy of preprocessing config file for test - replaceLineInFile( - config_paths, - "ntuple_path:", - f"ntuple_path: &ntuple_path {test_dir}", - ) - replaceLineInFile( - config_paths, - "sample_path:", - f"sample_path: &sample_path {test_dir}", - ) - replaceLineInFile( - config_paths, - "file_path:", - f"file_path: &file_path {test_dir}", - ) - replaceLineInFile( - config_paths, - ".outfile_name:", - f".outfile_name: &outfile_name {output}", - ) - replaceLineInFile( - config_paths, - ".dict_file:", - f".dict_file: &dict_file {scale_dict}", - ) - replaceLineInFile( - config_paths, - ".intermediate_index_file:", - ".intermediate_index_file: &intermediate_index_file indices.h5", - ) - replaceLineInFile( - config_paths, - ".var_file:", - f".var_file: &var_file {var_dict}", - ) - - # Load the preprocessing file - umami_yaml = YAML(typ="safe", pure=True) - with open(config, "r") as 
conf: - config_file = umami_yaml.load(conf) - - config_file["preparation"]["ntuples"]["ttbar"]["file_pattern"] = "ttbar/*.h5" - config_file["preparation"]["ntuples"]["zprime"]["file_pattern"] = "zpext/*.h5" - - if tagger.casefold() == "dl1r": - config_file["sampling"]["options"]["save_tracks"] = False - config_file["sampling"]["options"]["tracks_names"] = None - - else: - config_file["sampling"]["options"]["save_tracks"] = True - config_file["sampling"]["options"]["tracks_names"] = ["tracks", "tracks_loose"] - - if sampling_method.casefold() == "pdf": - # Change the method to pdf and adapt options - config_file["sampling"]["method"] = "pdf" - - # Set correct binning for pdf method - config_file["sampling"]["options"]["sampling_variables"] = [ - {"pt_btagJes": {"bins": [[0, 25e4, 100], [25e4, 6e6, 100]]}}, - {"absEta_btagJes": {"bins": [[0, 2.5, 10], [0, 2.5, 10]]}}, - ] - - # Set number of jets to maximum - config_file["sampling"]["options"]["njets"] = -1 - - # Remove custom njets initial - config_file["sampling"]["options"]["custom_njets_initial"] = None - - elif sampling_method.casefold() == "weighting": - config_file["sampling"]["method"] = "weighting" - config_file["sampling"]["options"]["bool_attach_sample_weights"] = True - - # save the preprocessing config file for test - with open(config, "w") as con: - umami_yaml.dump(config_file, con) - - logger.info("Downloading test data...") - for file in data["test_preprocessing"]["files"]: - path = os.path.join( - data["data_url"], - data["test_preprocessing"]["data_subfolder"], - file, - ) - logger.info(f"Retrieving file from path {path}") - run(["wget", path, "--directory-prefix", test_dir], check=True) - - run( - [ - "mv", - os.path.join(test_dir, "ci_ttbar_basefile.h5"), - os.path.join(test_dir, "ttbar", "ci_ttbar_basefile.h5"), - ], - check=True, - ) - run( - [ - "mv", - os.path.join(test_dir, "ci_zpext_basefile.h5"), - os.path.join(test_dir, "zpext", "ci_zpext_basefile.h5"), - ], - check=True, - ) - - return 
config - - -def runPreprocessing(config: dict, tagger: str, method: str, test_dir: str) -> bool: +def runPreprocessing(config: dict, tagger: str, method: str) -> bool: """ Call all steps of the preprocessing for a certain configuration and variable dict input. @@ -343,14 +188,60 @@ def runPreprocessing(config: dict, tagger: str, method: str, test_dir: str) -> b "Test failed: preprocessing.py --to_records." ) from Error - # Define path where the results are copied to tagger_path = f"./test_preprocessing_{tagger}/" + if not os.path.isdir(tagger_path): + run(["mkdir", tagger_path], check=True) + + run( + [ + "cp", + "-r", + "/tmp/umami/preprocessing/", + tagger_path, + ], + check=True, + ) + + # Get the path of the not needed configs + unused_configs = os.path.join( + tagger_path, "preprocessing/", "PFlow-Preprocessing_*.yaml" + ) - # Remove any artifacts that might be locally there - run(["rm", "-rfv", f"{tagger_path}"], check=True) + # Rename the needed config to PFlow-Preprocessing.yaml and erase the unused + # TODO change in python 3.10 + if method == "count": + run( + [f"rm -rfv {unused_configs}"], + shell=True, + check=True, + ) + + elif method == "pdf": + copyfile( + os.path.join(tagger_path, "preprocessing/", "PFlow-Preprocessing_pdf.yaml"), + os.path.join(tagger_path, "preprocessing/", "PFlow-Preprocessing.yaml"), + ) + run( + [f"rm -rfv {unused_configs}"], + shell=True, + check=True, + ) + + elif method == "weighting": + copyfile( + os.path.join( + tagger_path, "preprocessing/", "PFlow-Preprocessing_weighting.yaml" + ), + os.path.join(tagger_path, "preprocessing/", "PFlow-Preprocessing.yaml"), + ) + run( + [f"rm -rfv {unused_configs}"], + shell=True, + check=True, + ) - # Copy the results to given path - copytree(test_dir, tagger_path) + else: + raise KeyError(f"Method {method} is not supported by the integration test!") return True @@ -369,132 +260,288 @@ class TestPreprocessing(unittest.TestCase): # Get test configuration self.data = getConfiguration() - 
- self.test_dir_path = tempfile.TemporaryDirectory() # pylint: disable=R1732 - self.test_dir = f"{self.test_dir_path.name}" - logger.info(f"Creating test directory in {self.test_dir}") + test_dir = os.path.join(self.data["test_preprocessing"]["testdir"]) + logger.info(f"Creating test directory in {test_dir}") + # clean up, hopefully this causes no "uh oh..." + if test_dir.startswith("/tmp"): + run(["rm", "-rf", test_dir], check=True) + run(["mkdir", "-p", test_dir], check=True) - def test_preprocessing_umami_count(self): - """Integration test of preprocessing.py script using Umami variables.""" - tagger = "umami" - method = "count" + # Make filepaths for basefiles + run(["mkdir", "-p", os.path.join(test_dir, "ttbar")], check=True) + run(["mkdir", "-p", os.path.join(test_dir, "zpext")], check=True) - config = preparePreprocessingConfig( - tagger=tagger, test_dir=self.test_dir, sampling_method=method + # inputs for test will be located in test_dir + config_source = os.path.join( + os.getcwd(), self.data["test_preprocessing"]["config"] ) - self.assertTrue( - runPreprocessing( - config=config, test_dir=self.test_dir, tagger=tagger, method=method - ) + config_paths_source = os.path.join( + os.getcwd(), self.data["test_preprocessing"]["config_paths"] + ) + var_dict_umami_source = os.path.join( + os.getcwd(), self.data["test_preprocessing"]["var_dict_umami"] + ) + var_dict_dips_source = os.path.join( + os.getcwd(), self.data["test_preprocessing"]["var_dict_dips"] + ) + var_dict_dl1r_source = os.path.join( + os.getcwd(), self.data["test_preprocessing"]["var_dict_dl1r"] ) + self.config = os.path.join(test_dir, os.path.basename(config_source)) + self.config_paths = os.path.join( + test_dir, os.path.basename(config_paths_source) + ) + self.var_dict_umami = os.path.join( + test_dir, os.path.basename(var_dict_umami_source) + ) + self.var_dict_dips = os.path.join( + test_dir, os.path.basename(var_dict_dips_source) + ) + self.var_dict_dl1r = os.path.join( + test_dir, 
os.path.basename(var_dict_dl1r_source) + ) + self.scale_dict = os.path.join(test_dir, "PFlow-scale_dict.json") + self.output = os.path.join(test_dir, "PFlow-hybrid_70-test.h5") - def test_preprocessing_dips_count(self): - """Integration test of preprocessing.py script using DIPS variables.""" - tagger = "dips" - method = "count" + logger.info( + f"Preparing config file based on {config_source} in {self.config}..." + ) + copyfile(config_source, self.config) + copyfile(config_paths_source, self.config_paths) + copyfile(var_dict_umami_source, self.var_dict_umami) + copyfile(var_dict_dips_source, self.var_dict_dips) + copyfile(var_dict_dl1r_source, self.var_dict_dl1r) + + # modify copy of preprocessing config file for test + replaceLineInFile( + self.config_paths, + "ntuple_path:", + f"ntuple_path: &ntuple_path {test_dir}", + ) + replaceLineInFile( + self.config_paths, + "sample_path:", + f"sample_path: &sample_path {test_dir}", + ) + replaceLineInFile( + self.config_paths, + "file_path:", + f"file_path: &file_path {test_dir}", + ) + replaceLineInFile( + self.config_paths, + ".outfile_name:", + f".outfile_name: &outfile_name {self.output}", + ) + replaceLineInFile( + self.config_paths, + ".dict_file:", + f".dict_file: &dict_file {self.scale_dict}", + ) + replaceLineInFile( + self.config_paths, + ".intermediate_index_file:", + ".intermediate_index_file: &intermediate_index_file indices.h5", + ) + replaceLineInFile( + self.config, + " file_pattern: user.alfroch.410470", + " file_pattern: ttbar/*.h5", + ) + replaceLineInFile( + self.config, + " file_pattern: user.alfroch.427081", + " file_pattern: zpext/*.h5", + ) + replaceLineInFile( + self.config, + " tracks_names:", + " tracks_names: ['tracks','tracks_loose']", + ) + + # copy config file and change name to pdf for pdf preprocessing config + self.pdf_config = self.config[:].replace(".yaml", "") + "_pdf.yaml" + copyfile(self.config, self.pdf_config) - config = preparePreprocessingConfig( - tagger=tagger, 
test_dir=self.test_dir, sampling_method=method + # Change the method to pdf and adapt options + replaceLineInFile(self.pdf_config, " method: count", " method: pdf") + replaceLineInFile( + self.pdf_config, + " bins: [[0, 600000, 351], [650000, 6000000, 84]]", + " bins: [[0, 25e4, 100], [25e4, 6e6, 100]]", ) - self.assertTrue( - runPreprocessing( - config=config, test_dir=self.test_dir, tagger=tagger, method=method - ) + replaceLineInFile( + self.pdf_config, + " bins: [0, 2.5, 10]", + " bins: [[0, 2.5, 10], [0, 2.5, 10]]", + ) + replaceLineInFile( + self.pdf_config, + " njets: 25e6", + " njets: -1", + ) + replaceLineInFile( + self.pdf_config, + " training_ttbar_bjets: 5.5e6", + "", + ) + replaceLineInFile( + self.pdf_config, + " training_ttbar_cjets: 11.5e6", + "", + ) + replaceLineInFile( + self.pdf_config, + " training_ttbar_ujets: 13.5e6", + "", ) - def test_preprocessing_dl1r_count(self): - """Integration test of preprocessing.py script using DL1r variables.""" - tagger = "dl1r" - method = "count" + # copy config file and change name to weighting for weighting preprocessing config + self.weight_config = self.config[:].replace(".yaml", "") + "_weighting.yaml" + copyfile(self.config, self.weight_config) - config = preparePreprocessingConfig( - tagger=tagger, test_dir=self.test_dir, sampling_method=method + replaceLineInFile(self.weight_config, " method: count", " method: weighting") + replaceLineInFile( + self.weight_config, + " bool_attach_sample_weights: False", + " bool_attach_sample_weights: True", ) - self.assertTrue( - runPreprocessing( - config=config, test_dir=self.test_dir, tagger=tagger, method=method + + logger.info("Downloading test data...") + for file in self.data["test_preprocessing"]["files"]: + path = os.path.join( + self.data["data_url"], + self.data["test_preprocessing"]["data_subfolder"], + file, ) + logger.info(f"Retrieving file from path {path}") + run(["wget", path, "--directory-prefix", test_dir], check=True) + + run( + [ + "mv", + 
os.path.join(test_dir, "ci_ttbar_basefile.h5"), + os.path.join(test_dir, "ttbar", "ci_ttbar_basefile.h5"), + ], + check=True, + ) + run( + [ + "mv", + os.path.join(test_dir, "ci_zpext_basefile.h5"), + os.path.join(test_dir, "zpext", "ci_zpext_basefile.h5"), + ], + check=True, ) - def test_preprocessing_umami_pdf(self): + def test_preprocessing_umami_count(self): """Integration test of preprocessing.py script using Umami variables.""" - tagger = "umami" - method = "pdf" + replaceLineInFile( + self.config_paths, + ".var_file:", + f".var_file: &var_file {self.var_dict_umami}", + ) + + self.assertTrue(runPreprocessing(self.config, tagger="umami", method="count")) - config = preparePreprocessingConfig( - tagger=tagger, test_dir=self.test_dir, sampling_method=method + def test_preprocessing_dips_count(self): + """Integration test of preprocessing.py script using DIPS variables.""" + replaceLineInFile( + self.config_paths, + ".var_file:", + f".var_file: &var_file {self.var_dict_dips}", ) - self.assertTrue( - runPreprocessing( - config=config, test_dir=self.test_dir, tagger=tagger, method=method - ) + self.assertTrue(runPreprocessing(self.config, tagger="dips", method="count")) + + def test_preprocessing_dl1r_count(self): + """Integration test of preprocessing.py script using DL1r variables.""" + replaceLineInFile( + self.config, + " save_tracks:", + " save_tracks: False", ) - def test_preprocessing_dips_pdf(self): - """Integration test of preprocessing.py script using DIPS variables.""" - tagger = "dips" - method = "pdf" + replaceLineInFile( + self.config_paths, + ".var_file:", + f".var_file: &var_file {self.var_dict_dl1r}", + ) + + self.assertTrue(runPreprocessing(self.config, tagger="dl1r", method="count")) - config = preparePreprocessingConfig( - tagger=tagger, test_dir=self.test_dir, sampling_method=method + def test_preprocessing_umami_pdf(self): + """Integration test of preprocessing.py script using Umami variables.""" + replaceLineInFile( + self.config_paths, + 
".var_file:", + f".var_file: &var_file {self.var_dict_umami}", ) - self.assertTrue( - runPreprocessing( - config=config, test_dir=self.test_dir, tagger=tagger, method=method - ) + + self.assertTrue(runPreprocessing(self.pdf_config, tagger="umami", method="pdf")) + + def test_preprocessing_dips_pdf(self): + """Integration test of preprocessing.py script using DIPS variables.""" + replaceLineInFile( + self.config_paths, + ".var_file:", + f".var_file: &var_file {self.var_dict_dips}", ) + self.assertTrue(runPreprocessing(self.pdf_config, tagger="dips", method="pdf")) + def test_preprocessing_dl1r_pdf(self): """Integration test of preprocessing.py script using DL1r variables.""" - tagger = "dl1r" - method = "pdf" - - config = preparePreprocessingConfig( - tagger=tagger, test_dir=self.test_dir, sampling_method=method + replaceLineInFile( + self.pdf_config, + " save_tracks:", + " save_tracks: False", ) - self.assertTrue( - runPreprocessing( - config=config, test_dir=self.test_dir, tagger=tagger, method=method - ) + + replaceLineInFile( + self.config_paths, + ".var_file:", + f".var_file: &var_file {self.var_dict_dl1r}", ) + self.assertTrue(runPreprocessing(self.pdf_config, tagger="dl1r", method="pdf")) + def test_preprocessing_umami_weighting(self): """Integration test of preprocessing.py script using Umami variables.""" - tagger = "umami" - method = "weighting" - - config = preparePreprocessingConfig( - tagger=tagger, test_dir=self.test_dir, sampling_method=method + replaceLineInFile( + self.config_paths, + ".var_file:", + f".var_file: &var_file {self.var_dict_umami}", ) + self.assertTrue( - runPreprocessing( - config=config, test_dir=self.test_dir, tagger=tagger, method=method - ) + runPreprocessing(self.weight_config, tagger="umami", method="weighting") ) def test_preprocessing_dips_weighting(self): """Integration test of preprocessing.py script using DIPS variables.""" - tagger = "dips" - method = "weighting" - - config = preparePreprocessingConfig( - tagger=tagger, 
test_dir=self.test_dir, sampling_method=method + replaceLineInFile( + self.config_paths, + ".var_file:", + f".var_file: &var_file {self.var_dict_dips}", ) self.assertTrue( - runPreprocessing( - config=config, test_dir=self.test_dir, tagger=tagger, method=method - ) + runPreprocessing(self.weight_config, tagger="dips", method="weighting") ) def test_preprocessing_dl1r_weighting(self): """Integration test of preprocessing.py script using DL1r variables.""" - tagger = "dl1r" - method = "weighting" + replaceLineInFile( + self.weight_config, + " save_tracks:", + " save_tracks: False", + ) - config = preparePreprocessingConfig( - tagger=tagger, test_dir=self.test_dir, sampling_method=method + replaceLineInFile( + self.config_paths, + ".var_file:", + f".var_file: &var_file {self.var_dict_dl1r}", ) + self.assertTrue( - runPreprocessing( - config=config, test_dir=self.test_dir, tagger=tagger, method=method - ) + runPreprocessing(self.weight_config, tagger="dl1r", method="weighting") ) diff --git a/umami/tests/integration/test_train.py b/umami/tests/integration/test_train.py index 42319e4aa..75e030318 100644 --- a/umami/tests/integration/test_train.py +++ b/umami/tests/integration/test_train.py @@ -72,11 +72,11 @@ def prepareConfig( config = os.path.join(test_dir, os.path.basename(config_source)) preprocessing_config_source = os.path.join( - f"./test_preprocessing_{preprocess_files}/", + f"./test_preprocessing_{preprocess_files}/preprocessing/", os.path.basename(data["test_preprocessing"]["config"]), ) preprocessing_config_paths_source = os.path.join( - f"./test_preprocessing_{preprocess_files}/", + f"./test_preprocessing_{preprocess_files}/preprocessing/", os.path.basename(data["test_preprocessing"]["config_paths"]), ) preprocessing_config = os.path.join( @@ -87,7 +87,7 @@ def prepareConfig( ) var_dict_source = os.path.join( - f"./test_preprocessing_{preprocess_files}/", + f"./test_preprocessing_{preprocess_files}/preprocessing/", 
os.path.basename(data["test_preprocessing"][f"var_dict_{preprocess_files}"]), ) var_dict = os.path.join(test_dir, os.path.basename(var_dict_source)) @@ -96,13 +96,13 @@ def prepareConfig( logger.info("Retrieving files from preprocessing...") train_file = os.path.join( - f"./test_preprocessing_{preprocess_files}/", + f"./test_preprocessing_{preprocess_files}/preprocessing/", "PFlow-hybrid_70-test-resampled_scaled_shuffled.h5", ) test_file_ttbar = os.path.join(test_dir, "ci_ttbar_testing.h5") test_file_zprime = os.path.join(test_dir, "ci_zpext_testing.h5") scale_dict = os.path.join( - f"./test_preprocessing_{preprocess_files}/", + f"./test_preprocessing_{preprocess_files}/preprocessing/", "PFlow-scale_dict.json", ) @@ -184,7 +184,7 @@ def prepareConfig( if useTFRecords is True: config_file["train_file"] = os.path.join( - f"./test_preprocessing_{preprocess_files}/", + f"./test_preprocessing_{preprocess_files}/preprocessing/", "PFlow-hybrid_70-test-resampled_scaled_shuffled", ) config_file["model_name"] = data["test_dips"]["model_name"] + "_tfrecords" -- GitLab