From b7658bf28a2d625c3e636bdd71310f01336dfed4 Mon Sep 17 00:00:00 2001
From: Manuel Guth <manuel.guth@cern.ch>
Date: Thu, 30 Jan 2020 12:45:37 +0000
Subject: [PATCH 1/2] adding tests for GetScales function

---
 umami/configs/DL1r_Variables.yaml          | 24 ++++++-------
 umami/preprocessing.py                     | 40 ++++++++-------------
 umami/preprocessing_tools/Configuration.py |  3 +-
 umami/preprocessing_tools/Resampling.py    |  6 ++--
 umami/preprocessing_tools/__init__.py      |  2 +-
 umami/tests/test_preprocessing_tools.py    | 41 +++++++++++++++++++++-
 6 files changed, 72 insertions(+), 44 deletions(-)

diff --git a/umami/configs/DL1r_Variables.yaml b/umami/configs/DL1r_Variables.yaml
index 18a829a1c..536aa35b5 100644
--- a/umami/configs/DL1r_Variables.yaml
+++ b/umami/configs/DL1r_Variables.yaml
@@ -50,15 +50,15 @@ spectator_variables:
   - DL1r_pb
   - DL1r_pu
 
-  custom_defaults_vars:
-    JetFitter_energyFraction: 0
-    JetFitter_significance3d: 0
-    JetFitter_nVTX: -1
-    JetFitter_nSingleTracks: -1
-    JetFitter_nTracksAtVtx: -1
-    JetFitter_N2Tpair: -1
-    SV1_N2Tpair: -1
-    SV1_NGTinSvx: -1
-    SV1_efracsvx: 0
-    secondaryVtx_nTrks: 0
-    secondaryVtx_EFrac: 0
+custom_defaults_vars:
+  JetFitter_energyFraction: 0
+  JetFitter_significance3d: 0
+  JetFitter_nVTX: -1
+  JetFitter_nSingleTracks: -1
+  JetFitter_nTracksAtVtx: -1
+  JetFitter_N2Tpair: -1
+  SV1_N2Tpair: -1
+  SV1_NGTinSvx: -1
+  SV1_efracsvx: 0
+  secondaryVtx_nTrks: 0
+  secondaryVtx_EFrac: 0
diff --git a/umami/preprocessing.py b/umami/preprocessing.py
index 7f5719671..258157bc5 100644
--- a/umami/preprocessing.py
+++ b/umami/preprocessing.py
@@ -45,9 +45,8 @@ def GetParser():
     return args
 
 
-def RunUndersampling():
+def RunUndersampling(args):
     """Applies required cuts to the samples and applies the downsampling."""
-    args = GetParser()
     config = upt.Configuration(args.config_file)
     N_list = upt.GetNJetsPerIteration(config)
 
@@ -157,28 +156,16 @@ def RunUndersampling():
 # python Preprocessing.py --no_writing --downsampled --only_scale
 # --dummy_weights --input_file ${INPUTFILE} -f params_MC16D-2019-VRjets -o ""
 
-def GetScaleDict():
-    args = GetParser()
+def GetScaleDict(args):
     config = upt.Configuration(args.config_file)
-    # TODO: find good way to get file names
+    # TODO: find a good way to get file names; breaks if no iterations
     input_file = config.GetFileName(iteration=1, option='downsampled')
     infile_all = h5py.File(input_file, 'r')
 
-    # TODO: add properly Variable config
-    config.variable_config = "/home/fr/fr_fr/fr_mg1150/workspace/btagging/"\
-        "umami/umami/configs/DL1r_Variables.yaml"
-    # TODO: check if dictfile already exists in proper way
-    # dict_dir = "./"
-    # dict_file = "test.json"
-    input_file = "test.h5"
-    print('Preprocessing', input_file)
-
-    with open(config.variable_config, "r") as conf:
+    with open(args.var_dict, "r") as conf:
         variable_config = yaml.load(conf, Loader=yaml_loader)
 
-    var_list = [variable_config["label"], "category"]
-    var_list += variable_config["train_variables"]
-    var_list += variable_config["spectator_variables"]
+    var_list = variable_config["train_variables"]
 
     bjets = pd.DataFrame(infile_all['bjets'][:][var_list])
     cjets = pd.DataFrame(infile_all['cjets'][:][var_list])
@@ -198,20 +185,21 @@ def GetScaleDict():
             # no scaling and shifting is applied to the check variables
             scale_dict.append(upt.dict_in(var, 0., 1., None))
         else:
-            dict_entry = upt.Get_Shift_Scale(
+            dict_entry = upt.GetScales(
                 vec=X[var].values,
-                w=X['weight'].values, varname=var,
+                # TODO: implement weights
+                w=np.ones(len(X)), varname=var,
                 custom_defaults_vars=variable_config["custom_defaults_vars"])
             scale_dict.append(upt.dict_in(*dict_entry))
-
+        
         # save scale/shift dictionary to json file
-        scale_name = '%s/%s.json' % (args.dict_dir, args.dict_file)
-        with open(scale_name, 'w') as outfile:
-            json.dump(scale_dict, outfile, indent=4)
-        print("saved scale dictionary as", scale_name)
+    #     scale_name = '%s/%s.json' % (args.dict_dir, args.dict_file)
+    #     with open(scale_name, 'w') as outfile:
+    #         json.dump(scale_dict, outfile, indent=4)
+    #     print("saved scale dictionary as", scale_name)
 
 
 if __name__ == '__main__':
     args = GetParser()
     # RunDownsampling()
-    # GetScaleDict()
+    GetScaleDict(args)
diff --git a/umami/preprocessing_tools/Configuration.py b/umami/preprocessing_tools/Configuration.py
index bce5d026f..b8d30e77d 100644
--- a/umami/preprocessing_tools/Configuration.py
+++ b/umami/preprocessing_tools/Configuration.py
@@ -52,7 +52,7 @@ class Configuration(object):
                 setattr(self, elem, self.default_config[elem])
 
     def GetFileName(self, iteration=None, option=None):
-        if option is None:
+        if option is None and iteration is None:
             return self.outfile_name
         out_file = self.outfile_name
         idx = out_file.index(".h5")
@@ -63,6 +63,5 @@ class Configuration(object):
             inserttxt = f"-{option}-file-{iteration:.0f}"\
                         f"_{self.iterations:.0f}"
 
-        print(inserttxt)
         out_file = out_file[:idx] + inserttxt + out_file[idx:]
         return out_file
diff --git a/umami/preprocessing_tools/Resampling.py b/umami/preprocessing_tools/Resampling.py
index 143c0918e..40efaaa04 100644
--- a/umami/preprocessing_tools/Resampling.py
+++ b/umami/preprocessing_tools/Resampling.py
@@ -85,8 +85,10 @@ def GetNJetsPerIteration(config):
     return N_list
 
 
-def Get_Shift_Scale(vec, w, varname, custom_defaults_vars):
+def GetScales(vec, w, varname, custom_defaults_vars):
     """Calculates the weighted average and std for vector vec and weight w."""
+    if np.sum(w) == 0:
+        raise ValueError("Sum of weights has to be >0.")
     # find NaN values
     nans = np.isnan(vec)
     # check if variable has predefined default value
@@ -101,7 +103,7 @@ def Get_Shift_Scale(vec, w, varname, custom_defaults_vars):
     vec[nans] = default
     average = np.ma.average(vec, weights=w)
     std = np.sqrt(np.average((vec - average) ** 2, weights=w))
-    return [varname, average, std, default]
+    return varname, average, std, default
 
 
 def dict_in(varname, average, std, default):
diff --git a/umami/preprocessing_tools/__init__.py b/umami/preprocessing_tools/__init__.py
index 79cee7511..78f675b0b 100644
--- a/umami/preprocessing_tools/__init__.py
+++ b/umami/preprocessing_tools/__init__.py
@@ -1,4 +1,4 @@
 # flake8: noqa
-from umami.preprocessing_tools.Resampling import UnderSampling, GetNJetsPerIteration, Get_Shift_Scale
+from umami.preprocessing_tools.Resampling import UnderSampling, GetNJetsPerIteration, GetScales, dict_in
 from umami.preprocessing_tools.Configuration import Configuration
 from umami.preprocessing_tools.Cuts import GetCuts
diff --git a/umami/tests/test_preprocessing_tools.py b/umami/tests/test_preprocessing_tools.py
index 7f21946d2..1c621d489 100644
--- a/umami/tests/test_preprocessing_tools.py
+++ b/umami/tests/test_preprocessing_tools.py
@@ -3,7 +3,7 @@ import numpy as np
 import pandas as pd
 import os
 from umami.preprocessing_tools import UnderSampling, Configuration
-from umami.preprocessing_tools import GetNJetsPerIteration, GetCuts
+from umami.preprocessing_tools import GetNJetsPerIteration, GetCuts, GetScales
 
 
 class UnderSamplingTestCase(unittest.TestCase):
@@ -92,6 +92,11 @@ class ConfigurationTestCase(unittest.TestCase):
         out_file = config.GetFileName(option="test")
         self.assertIn("test", out_file)
 
+    def test_GetFileName_no_iterations_no_input(self):
+        config = Configuration(self.config_file)
+        out_file = config.GetFileName()
+        self.assertEqual(config.outfile_name, out_file)
+
 
 class GetNJetsPerIterationTestCase(unittest.TestCase):
     """
@@ -227,3 +232,37 @@ class PreprocessingTestCuts(unittest.TestCase):
         cut_result = np.ones(len(jets))
         np.put(cut_result, indices_to_remove, 0)
         self.assertTrue(np.array_equal(cut_result, np.array([1, 0, 0])))
+
+
+class GetScalesTestCase(unittest.TestCase):
+    """
+    Test the implementation of the GetScales function.
+    """
+
+    def setUp(self):
+        self.arr_0 = np.zeros(500)
+        self.arr_1 = np.ones(500)
+
+    def test_ZeroCase(self):
+        varname, average, std, default = GetScales(
+            self.arr_0, self.arr_1, "zeros", {})
+        self.assertEqual(average, 0)
+        self.assertEqual(std, 0)
+        self.assertEqual(default, 0)
+
+    def test_ReturnVarname(self):
+        varname, _, _, _ = GetScales(self.arr_0, self.arr_1, "zeros", {})
+        self.assertEqual(varname, "zeros")
+
+    def test_WeightZero(self):
+        with self.assertRaises(ValueError):
+            varname, average, std, default = GetScales(
+                self.arr_1,  self.arr_0, "zeros", {})
+
+    def test_OneCase(self):
+        varname, average, std, default = GetScales(
+            self.arr_1, self.arr_1, "ones", {})
+        self.assertEqual(average, 1)
+        self.assertEqual(std, 0)
+        self.assertEqual(default, 1)
+
-- 
GitLab


From 7e61a0b487ba888d482af38aea31eec2d7b28073 Mon Sep 17 00:00:00 2001
From: Manuel Guth <manuel.guth@cern.ch>
Date: Mon, 3 Feb 2020 11:28:27 +0000
Subject: [PATCH 2/2] switching to slim docker images and correcting linter
 errors

---
 .gitlab-ci.yml                          |  4 ++--
 umami/preprocessing.py                  |  4 ++--
 umami/tests/test_preprocessing.py       | 23 ++++++++++-------------
 umami/tests/test_preprocessing_tools.py |  1 -
 4 files changed, 14 insertions(+), 18 deletions(-)

diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index b4a6ef254..e2840c7cb 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -4,14 +4,14 @@ stages:
 
 unittest:
   stage: test
-  image: python:3.7
+  image: python:3.7-slim
   script:
     - pip install -r requirements.txt
     - pytest ./umami/tests/ -v
 
 linter:
   stage: test
-  image: python:3.7
+  image: python:3.7-slim
   allow_failure: true
   script:
     - pip install flake8
diff --git a/umami/preprocessing.py b/umami/preprocessing.py
index 258157bc5..7eb88d45a 100644
--- a/umami/preprocessing.py
+++ b/umami/preprocessing.py
@@ -6,7 +6,7 @@ import pandas as pd
 import argparse
 import yaml
 from umami.tools import yaml_loader
-import json
+# import json
 
 
 def GetParser():
@@ -191,7 +191,7 @@ def GetScaleDict(args):
                 w=np.ones(len(X)), varname=var,
                 custom_defaults_vars=variable_config["custom_defaults_vars"])
             scale_dict.append(upt.dict_in(*dict_entry))
-        
+
         # save scale/shift dictionary to json file
     #     scale_name = '%s/%s.json' % (args.dict_dir, args.dict_file)
     #     with open(scale_name, 'w') as outfile:
diff --git a/umami/tests/test_preprocessing.py b/umami/tests/test_preprocessing.py
index 1758f1e9a..00ba72bad 100644
--- a/umami/tests/test_preprocessing.py
+++ b/umami/tests/test_preprocessing.py
@@ -28,18 +28,15 @@ class PreprocessingTestParser(unittest.TestCase):
         parser = GetParser()
         self.assertEqual(parser.config_file, self.config_file)
         self.assertFalse(parser.tracks)
-        # self.assertIsNone(parser.cut_config_file)
 
-#    @mock.patch('argparse.ArgumentParser.parse_args',
-#                return_value=argparse.Namespace(config_file=config_file,
-#                                                tracks=True))
-#     def test_ParserTracks(self, mock_args):
-#        self.parser = GetParser()
-#        self.assertTrue(self.parser.tracks)
 
-#     @mock.patch('argparse.ArgumentParser.parse_args',
-#                 return_value=argparse.Namespace(config_file=config_file,
-#                                                 undersampling=True))
-#     def test_ParserTracks(self, mock_args):
-#         self.parser = GetParser()
-#         self.assertTrue(self.parser.tracks)
+class PreprocessingTestGetScaleDict(unittest.TestCase):
+    """
+    Test the implementation of the GetScaleDict function.
+    """
+    config_file = os.path.join(os.path.dirname(__file__),
+                               "test_preprocess_config.yaml")
+
+    def setUp(self):
+        self.config_file = os.path.join(os.path.dirname(__file__),
+                                        "test_preprocess_config.yaml")
diff --git a/umami/tests/test_preprocessing_tools.py b/umami/tests/test_preprocessing_tools.py
index 1c621d489..aa0c76b65 100644
--- a/umami/tests/test_preprocessing_tools.py
+++ b/umami/tests/test_preprocessing_tools.py
@@ -265,4 +265,3 @@ class GetScalesTestCase(unittest.TestCase):
         self.assertEqual(average, 1)
         self.assertEqual(std, 0)
         self.assertEqual(default, 1)
-
-- 
GitLab