From b7658bf28a2d625c3e636bdd71310f01336dfed4 Mon Sep 17 00:00:00 2001 From: Manuel Guth <manuel.guth@cern.ch> Date: Thu, 30 Jan 2020 12:45:37 +0000 Subject: [PATCH 1/2] adding tests for GetScales function --- umami/configs/DL1r_Variables.yaml | 24 ++++++------- umami/preprocessing.py | 40 ++++++++------------- umami/preprocessing_tools/Configuration.py | 3 +- umami/preprocessing_tools/Resampling.py | 6 ++-- umami/preprocessing_tools/__init__.py | 2 +- umami/tests/test_preprocessing_tools.py | 41 +++++++++++++++++++++- 6 files changed, 72 insertions(+), 44 deletions(-) diff --git a/umami/configs/DL1r_Variables.yaml b/umami/configs/DL1r_Variables.yaml index 18a829a1c..536aa35b5 100644 --- a/umami/configs/DL1r_Variables.yaml +++ b/umami/configs/DL1r_Variables.yaml @@ -50,15 +50,15 @@ spectator_variables: - DL1r_pb - DL1r_pu - custom_defaults_vars: - JetFitter_energyFraction: 0 - JetFitter_significance3d: 0 - JetFitter_nVTX: -1 - JetFitter_nSingleTracks: -1 - JetFitter_nTracksAtVtx: -1 - JetFitter_N2Tpair: -1 - SV1_N2Tpair: -1 - SV1_NGTinSvx: -1 - SV1_efracsvx: 0 - secondaryVtx_nTrks: 0 - secondaryVtx_EFrac: 0 +custom_defaults_vars: + JetFitter_energyFraction: 0 + JetFitter_significance3d: 0 + JetFitter_nVTX: -1 + JetFitter_nSingleTracks: -1 + JetFitter_nTracksAtVtx: -1 + JetFitter_N2Tpair: -1 + SV1_N2Tpair: -1 + SV1_NGTinSvx: -1 + SV1_efracsvx: 0 + secondaryVtx_nTrks: 0 + secondaryVtx_EFrac: 0 diff --git a/umami/preprocessing.py b/umami/preprocessing.py index 7f5719671..258157bc5 100644 --- a/umami/preprocessing.py +++ b/umami/preprocessing.py @@ -45,9 +45,8 @@ def GetParser(): return args -def RunUndersampling(): +def RunUndersampling(args): """Applies required cuts to the samples and applies the downsampling.""" - args = GetParser() config = upt.Configuration(args.config_file) N_list = upt.GetNJetsPerIteration(config) @@ -157,28 +156,16 @@ def RunUndersampling(): # python Preprocessing.py --no_writing --downsampled --only_scale # --dummy_weights --input_file ${INPUTFILE} -f params_MC16D-2019-VRjets -o "" -def GetScaleDict(): - args = GetParser() +def GetScaleDict(args): config = upt.Configuration(args.config_file) - # TODO: find good way to get file names + # TODO: find good way to get file names, breaks if no iterations input_file = config.GetFileName(iteration=1, option='downsampled') infile_all = h5py.File(input_file, 'r') - # TODO: add properly Variable config - config.variable_config = "/home/fr/fr_fr/fr_mg1150/workspace/btagging/"\ - "umami/umami/configs/DL1r_Variables.yaml" - # TODO: check if dictfile already exists in proper way - # dict_dir = "./" - # dict_file = "test.json" - input_file = "test.h5" - print('Preprocessing', input_file) - - with open(config.variable_config, "r") as conf: + with open(args.var_dict, "r") as conf: variable_config = yaml.load(conf, Loader=yaml_loader) - var_list = [variable_config["label"], "category"] - var_list += variable_config["train_variables"] - var_list += variable_config["spectator_variables"] + var_list = variable_config["train_variables"] bjets = pd.DataFrame(infile_all['bjets'][:][var_list]) cjets = pd.DataFrame(infile_all['cjets'][:][var_list]) @@ -198,20 +185,21 @@ def GetScaleDict(): # no scaling and shifting is applied to the check variables scale_dict.append(upt.dict_in(var, 0., 1., None)) else: - dict_entry = upt.Get_Shift_Scale( + dict_entry = upt.GetScales( vec=X[var].values, - w=X['weight'].values, varname=var, + # TODO: implement weights + w=np.ones(len(X)), varname=var, custom_defaults_vars=variable_config["custom_defaults_vars"]) scale_dict.append(upt.dict_in(*dict_entry)) - + # save scale/shift dictionary to json file - scale_name = '%s/%s.json' % (args.dict_dir, args.dict_file) - with open(scale_name, 'w') as outfile: - json.dump(scale_dict, outfile, indent=4) - print("saved scale dictionary as", scale_name) + # scale_name = '%s/%s.json' % (args.dict_dir, args.dict_file) + # with open(scale_name, 'w') as outfile: + # json.dump(scale_dict, outfile, indent=4) + # print("saved scale dictionary as", scale_name) if __name__ == '__main__': args = GetParser() # RunDownsampling() - # GetScaleDict() + GetScaleDict(args) diff --git a/umami/preprocessing_tools/Configuration.py b/umami/preprocessing_tools/Configuration.py index bce5d026f..b8d30e77d 100644 --- a/umami/preprocessing_tools/Configuration.py +++ b/umami/preprocessing_tools/Configuration.py @@ -52,7 +52,7 @@ class Configuration(object): setattr(self, elem, self.default_config[elem]) def GetFileName(self, iteration=None, option=None): - if option is None: + if option is None and iteration is None: return self.outfile_name out_file = self.outfile_name idx = out_file.index(".h5") @@ -63,6 +63,5 @@ class Configuration(object): inserttxt = f"-{option}-file-{iteration:.0f}"\ f"_{self.iterations:.0f}" - print(inserttxt) out_file = out_file[:idx] + inserttxt + out_file[idx:] return out_file diff --git a/umami/preprocessing_tools/Resampling.py b/umami/preprocessing_tools/Resampling.py index 143c0918e..40efaaa04 100644 --- a/umami/preprocessing_tools/Resampling.py +++ b/umami/preprocessing_tools/Resampling.py @@ -85,8 +85,10 @@ def GetNJetsPerIteration(config): return N_list -def Get_Shift_Scale(vec, w, varname, custom_defaults_vars): +def GetScales(vec, w, varname, custom_defaults_vars): """Calculates the weighted average and std for vector vec and weight w.""" + if np.sum(w) == 0: + raise ValueError("Sum of weights has to be >0.") # find NaN values nans = np.isnan(vec) # check if variable has predefined default value @@ -101,7 +103,7 @@ def Get_Shift_Scale(vec, w, varname, custom_defaults_vars): vec[nans] = default average = np.ma.average(vec, weights=w) std = np.sqrt(np.average((vec - average) ** 2, weights=w)) - return [varname, average, std, default] + return varname, average, std, default def dict_in(varname, average, std, default): diff --git a/umami/preprocessing_tools/__init__.py b/umami/preprocessing_tools/__init__.py index 79cee7511..78f675b0b 100644 --- a/umami/preprocessing_tools/__init__.py +++ b/umami/preprocessing_tools/__init__.py @@ -1,4 +1,4 @@ # flake8: noqa -from umami.preprocessing_tools.Resampling import UnderSampling, GetNJetsPerIteration, Get_Shift_Scale +from umami.preprocessing_tools.Resampling import UnderSampling, GetNJetsPerIteration, GetScales, dict_in from umami.preprocessing_tools.Configuration import Configuration from umami.preprocessing_tools.Cuts import GetCuts diff --git a/umami/tests/test_preprocessing_tools.py b/umami/tests/test_preprocessing_tools.py index 7f21946d2..1c621d489 100644 --- a/umami/tests/test_preprocessing_tools.py +++ b/umami/tests/test_preprocessing_tools.py @@ -3,7 +3,7 @@ import numpy as np import pandas as pd import os from umami.preprocessing_tools import UnderSampling, Configuration -from umami.preprocessing_tools import GetNJetsPerIteration, GetCuts +from umami.preprocessing_tools import GetNJetsPerIteration, GetCuts, GetScales class UnderSamplingTestCase(unittest.TestCase): @@ -92,6 +92,11 @@ class ConfigurationTestCase(unittest.TestCase): out_file = config.GetFileName(option="test") self.assertIn("test", out_file) + def test_GetFileName_no_iterations_no_input(self): + config = Configuration(self.config_file) + out_file = config.GetFileName() + self.assertEqual(config.outfile_name, out_file) + class GetNJetsPerIterationTestCase(unittest.TestCase): """ @@ -227,3 +232,37 @@ class PreprocessingTestCuts(unittest.TestCase): cut_result = np.ones(len(jets)) np.put(cut_result, indices_to_remove, 0) self.assertTrue(np.array_equal(cut_result, np.array([1, 0, 0]))) + + +class GetScalesTestCase(unittest.TestCase): + """ + Test the implementation of the GetScales class. + """ + + def setUp(self): + self.arr_0 = np.zeros(500) + self.arr_1 = np.ones(500) + + def test_ZeroCase(self): + varname, average, std, default = GetScales( + self.arr_0, self.arr_1, "zeros", {}) + self.assertEqual(average, 0) + self.assertEqual(std, 0) + self.assertEqual(default, 0) + + def test_ReturnVarname(self): + varname, _, _, _ = GetScales(self.arr_0, self.arr_1, "zeros", {}) + self.assertEqual(varname, "zeros") + + def test_WeightZero(self): + with self.assertRaises(ValueError): + varname, average, std, default = GetScales( + self.arr_1, self.arr_0, "zeros", {}) + + def test_OneCase(self): + varname, average, std, default = GetScales( + self.arr_1, self.arr_1, "ones", {}) + self.assertEqual(average, 1) + self.assertEqual(std, 0) + self.assertEqual(default, 1) + -- GitLab From 7e61a0b487ba888d482af38aea31eec2d7b28073 Mon Sep 17 00:00:00 2001 From: Manuel Guth <manuel.guth@cern.ch> Date: Mon, 3 Feb 2020 11:28:27 +0000 Subject: [PATCH 2/2] switching to slim docker images and correcting linter errors --- .gitlab-ci.yml | 4 ++-- umami/preprocessing.py | 4 ++-- umami/tests/test_preprocessing.py | 23 ++++++++++------------- umami/tests/test_preprocessing_tools.py | 1 - 4 files changed, 14 insertions(+), 18 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index b4a6ef254..e2840c7cb 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -4,14 +4,14 @@ stages: unittest: stage: test - image: python:3.7 + image: python:3.7-slim script: - pip install -r requirements.txt - pytest ./umami/tests/ -v linter: stage: test - image: python:3.7 + image: python:3.7-slim allow_failure: true script: - pip install flake8 diff --git a/umami/preprocessing.py b/umami/preprocessing.py index 258157bc5..7eb88d45a 100644 --- a/umami/preprocessing.py +++ b/umami/preprocessing.py @@ -6,7 +6,7 @@ import pandas as pd import argparse import yaml from umami.tools import yaml_loader -import json +# import json def GetParser(): @@ -191,7 +191,7 @@ def GetScaleDict(args): w=np.ones(len(X)), varname=var, custom_defaults_vars=variable_config["custom_defaults_vars"]) scale_dict.append(upt.dict_in(*dict_entry)) - + # save scale/shift dictionary to json file # scale_name = '%s/%s.json' % (args.dict_dir, args.dict_file) # with open(scale_name, 'w') as outfile: diff --git a/umami/tests/test_preprocessing.py b/umami/tests/test_preprocessing.py index 1758f1e9a..00ba72bad 100644 --- a/umami/tests/test_preprocessing.py +++ b/umami/tests/test_preprocessing.py @@ -28,18 +28,15 @@ class PreprocessingTestParser(unittest.TestCase): parser = GetParser() self.assertEqual(parser.config_file, self.config_file) self.assertFalse(parser.tracks) - # self.assertIsNone(parser.cut_config_file) -# @mock.patch('argparse.ArgumentParser.parse_args', -# return_value=argparse.Namespace(config_file=config_file, -# tracks=True)) -# def test_ParserTracks(self, mock_args): -# self.parser = GetParser() -# self.assertTrue(self.parser.tracks) -# @mock.patch('argparse.ArgumentParser.parse_args', -# return_value=argparse.Namespace(config_file=config_file, -# undersampling=True)) -# def test_ParserTracks(self, mock_args): -# self.parser = GetParser() -# self.assertTrue(self.parser.tracks) +class PreprocessingTestGetScaleDict(unittest.TestCase): + """ + Test the implementation of the GetScaleDict function. + """ + config_file = os.path.join(os.path.dirname(__file__), + "test_preprocess_config.yaml") + + def setUp(self): + self.config_file = os.path.join(os.path.dirname(__file__), + "test_preprocess_config.yaml") diff --git a/umami/tests/test_preprocessing_tools.py b/umami/tests/test_preprocessing_tools.py index 1c621d489..aa0c76b65 100644 --- a/umami/tests/test_preprocessing_tools.py +++ b/umami/tests/test_preprocessing_tools.py @@ -265,4 +265,3 @@ class GetScalesTestCase(unittest.TestCase): self.assertEqual(average, 1) self.assertEqual(std, 0) self.assertEqual(default, 1) - -- GitLab