diff --git a/freeforestml/model.py b/freeforestml/model.py index de45ed5bda1113c178f76554caca077e7554f427..3f42c22e69850ea117657dbc185bc23a27b61a6d 100644 --- a/freeforestml/model.py +++ b/freeforestml/model.py @@ -7,6 +7,7 @@ import json import numpy as np import pandas as pd + import tensorflow from freeforestml.variable import Variable @@ -69,7 +70,7 @@ class CrossValidator(ABC): """ @abstractmethod - def select_training(self, df, fold_i): + def select_training(self, df, fold_i, for_predicting = False): """ Returns the index array to select all training events from the dataset for the given fold. @@ -89,7 +90,7 @@ class CrossValidator(ABC): given fold. """ - def select_cv_set(self, df, cv, fold_i): + def select_cv_set(self, df, cv, fold_i, for_predicting = False): """ Returns the index array to select all events from the cross validator set specified with cv ('train', 'val', 'test') for the given fold. @@ -98,7 +99,7 @@ class CrossValidator(ABC): raise ValueError("Argument 'cv' must be one of 'train', 'val', " "'test', 'all'; but was %s." % repr(cv)) if cv == "train": - selected = self.select_training(df, fold_i) + selected = self.select_training(df, fold_i, for_predicting = for_predicting) elif cv == "val": selected = self.select_validation(df, fold_i) else: @@ -108,11 +109,15 @@ class CrossValidator(ABC): def retrieve_fold_info(self, df, cv): """ Returns and array of integers to specify which event was used - for train/val/test in which fold + for train/val/test in which fold. Mostly useful for the inference/predict + step. For cross validators with a high number of folds, so that an event + is used in multiple folds for the training set, a single fold number is + retrieved so that the folds are equally represented in the predicted + training data. """ fold_info = np.zeros(len(df), dtype='bool') - 1 for fold_i in range(self.k): - selected = self.select_cv_set(df, cv, fold_i) + selected = self.select_cv_set(df, cv, fold_i, True) fold_info[selected] = fold_i return fold_info @@ -179,7 +184,7 @@ class ClassicalCV(CrossValidator): return (slice_id / (self.k * 2.0) <= variable) \ & (variable < (slice_id + 1.0) / (self.k * 2)) - def select_training(self, df, fold_i): + def select_training(self, df, fold_i, for_predicting = False): """ Returns the index array to select all training events from the dataset for the given fold. @@ -190,7 +195,6 @@ class ClassicalCV(CrossValidator): continue selected = selected | self.select_slice(df, slice_i) - return selected def select_validation(self, df, fold_i): @@ -207,9 +211,8 @@ class ClassicalCV(CrossValidator): """ selected = np.zeros(len(df), dtype='bool') for slice_i in range(self.k, self.k * 2): - selected = selected | self.select_slice(df, slice_i) - + return selected @@ -247,7 +250,7 @@ class NoTestCV(CrossValidator): return (slice_id / self.k <= variable) \ & (variable < (slice_id + 1.0) / self.k) - def select_training(self, df, fold_i): + def select_training(self, df, fold_i, for_predicting = False): """ Returns the index array to select all training events from the dataset. The fold_i parameter has no effect. @@ -307,7 +310,7 @@ class BinaryCV(CrossValidator): return (slice_id / self.k <= variable) \ & (variable < (slice_id + 1.0) / self.k) - def select_training(self, df, fold_i): + def select_training(self, df, fold_i, for_predicting = False): """ Returns the index array to select all training events from the dataset for the given fold. @@ -357,18 +360,47 @@ class MixedCV(CrossValidator): return (slice_id / self.k <= variable) \ & (variable < (slice_id + 1.0) / self.k) - def select_training(self, df, fold_i): + def select_training_slices(self, fold_i, for_predicting = False): + """ + Returns array with integers corresponding + to the data slices used in training fold_i. + If 'for_predicting' is set to True only one slice + is returned for each fold so that the folds are equally represented + in the predicted training data. + """ + all_slices_for_folds = [] + for fold in range(self.k): + all_slices_for_folds.append([]) + for slice_i in range(self.k): + if (slice_i + fold) % self.k == self.k - 1: + continue + if (slice_i + fold) % self.k == self.k - 2: + continue + all_slices_for_folds[-1].append(slice_i) + + # if we select the slices for training we are done + if not for_predicting: return all_slices_for_folds[fold_i] + + # all_slices_for_folds looks e.g. like: + # [[0, 1, 2], [0, 1, 4], [0, 3, 4], [2, 3, 4], [1, 2, 3]] + # need to select array with uniq entries: + # [0, 1, 2, 4, 3] + uniq_el = lambda ar: set(x for l in ar for x in l) + exclusive_slices = [] + for i, slices in enumerate(all_slices_for_folds): + for sl in slices: + if sl not in exclusive_slices and sl in uniq_el(all_slices_for_folds[i:]): + exclusive_slices.append(sl) + return [exclusive_slices[fold_i]] + + def select_training(self, df, fold_i, for_predicting = False): """ Returns the index array to select all training events from the dataset for the given fold. """ selected = np.zeros(len(df), dtype='bool') - for slice_i in range(self.k): - if (slice_i + fold_i) % self.k == self.k - 1: - continue - if (slice_i + fold_i) % self.k == self.k - 2: - continue - + slices = self.select_training_slices(fold_i, for_predicting = for_predicting) + for slice_i in slices: selected = selected | self.select_slice(df, slice_i) return selected @@ -633,14 +665,20 @@ class HepNet: return True - def fit(self, df, weight=None, **kwds): + def fit(self, df, train_weight=None, event_weight=None, all_callbacks=None, **kwds): """ Calls fit() on all folds. All kwds are passed to fit(). """ - if weight is None: - weight = Variable("unity", lambda d: np.ones(len(d))) - elif isinstance(weight, str): - weight = Variable(weight, weight) + # For the weights we have event weights and sample weights. + # We want to distinguish between both to use custom callbacks + if train_weight is None: + train_weight = Variable("unity", lambda d: np.ones(len(d))) + elif isinstance(train_weight, str): + train_weight = Variable(train_weight, train_weight) + if event_weight is None: + event_weight = Variable("unity", lambda d: np.ones(len(d))) + elif isinstance(event_weight, str): + event_weight = Variable(event_weight, event_weight) ### Loop over folds: self.norms = [] @@ -666,15 +704,27 @@ class HepNet: model = self.model_cls() self.models.append(model) - + # check for and initialize custom callbacks + lazily_initialized_callbacks = [] + lazily_initialized_callbacks_names = [] + for cc in all_callbacks: + if cc == "Z0Callback": # callback that retrieves significance + lazily_initialized_callbacks.append(Z0Callback(validation_df[self.input_list], + validation_df[self.output_list], + # only use event weights, no sample weights + np.array(event_weight(validation_df)) )) + lazily_initialized_callbacks_names.append(cc) + callbacks = [c for c in all_callbacks if not c in lazily_initialized_callbacks_names] + lazily_initialized_callbacks + history = model.fit(training_df[self.input_list], training_df[self.output_list], validation_data=( validation_df[self.input_list], validation_df[self.output_list], - np.array(weight(validation_df)), + np.array(train_weight(validation_df)), ), - sample_weight=np.array(weight(training_df)), + sample_weight=np.array(train_weight(training_df)), + callbacks = callbacks, **kwds) history = history.history @@ -702,7 +752,7 @@ class HepNet: norm = self.norms[fold_i] # identify fold - selected = self.cv.select_cv_set(df, cv, fold_i) + selected = self.cv.select_cv_set(df, cv, fold_i, for_predicting = True) test_set |= selected out[selected] = model.predict(norm(df[selected][self.input_list]), @@ -739,6 +789,9 @@ class HepNet: with h5py.File(path, "w") as output_file: # save default model class # since this is a arbitrary piece of python code we need to use the python_to_str function + # The following is not working with tensorflow 2.0 and disabled eager_execution + # the following error is thrown: + # NotImplementedError: numpy() is only available when eager execution is enabled. group = output_file.create_group("models/default") group.attrs["model_cls"] = np.string_(python_to_str(self.model_cls)) @@ -856,3 +909,37 @@ class HepNet: f"{path_base}_vars_{fold_i}.json " f"{path_base}_wght_{fold_i}.h5 " f"> {path_base}_{fold_i}.json", file=script_file) + +class Z0Callback(tensorflow.keras.callbacks.Callback): + + def __init__(self, X_valid=0, Y_valid=0, W_valid = 0): + self.X_valid = np.array(X_valid) + self.Y_valid = np.array(Y_valid) + self.W_valid = np.array(W_valid) + self.W_valid = self.W_valid.reshape((self.W_valid.shape[0], 1)) + + def add_to_history(self, Z0): + if "Z0" in self.model.history.history.keys(): + self.model.history.history["Z0"].append(Z0) + else: # first epoch + self.model.history.history["Z0"] = [Z0] + + def on_epoch_end(self, epoch, logs=None): + + y_pred = np.array(self.model.predict(self.X_valid, batch_size=4096)) + w_bkg = self.W_valid[self.Y_valid == 0] + w_sig = self.W_valid[self.Y_valid == 1] + y_bkg = y_pred[self.Y_valid == 0] + y_sig = y_pred[self.Y_valid == 1] + + c_sig , edges = np.histogram(y_sig, 20, weights=w_sig, range = (0, 1)) + c_bkg , edges = np.histogram(y_bkg, 20, weights=w_bkg, range = (0, 1)) + + Z0_func = lambda s, b: np.sqrt( 2*((s+b)* np.log1p (s/b) - s)) + z_list = [Z0_func(si, bi) for si, bi in zip(c_sig, c_bkg) if bi > 0 and si > 0] + Z0 = np.sqrt( np.sum( np.square(z_list) ) ) + + self.add_to_history(Z0) + + print("\nINFO: Significance in epoch {} is Z0 = {}".format(epoch, Z0)) + diff --git a/freeforestml/plot.py b/freeforestml/plot.py index 3eda8b4b8ed6fbbfcbb5a053dbf329eef346f08a..4a367a7819bff9723eea1f520e5986d0061333d1 100644 --- a/freeforestml/plot.py +++ b/freeforestml/plot.py @@ -333,9 +333,9 @@ def hist(dataframe, variable, bins, stacks, selection=None, for yield_name in stack_item.yield_names] return sum(process_totals) - if y_log: - for stack in uhepp_obj.stacks: - stack.content.sort(key=lambda x: total_stackitem(uhepp_obj, x)) + #if y_log: + #for stack in uhepp_obj.stacks: + #stack.content.sort(key=lambda x: total_stackitem(uhepp_obj, x)) ################## # Vertical lines @@ -579,7 +579,7 @@ def correlation_matrix(df, variables, columns=[v.name for v in variables], index=[v.name for v in variables]) - sns.heatmap(data * 100, **dict(vmin=-100, vmax=100, cmap="RdBu_r", ax=axes, + res = sns.heatmap(data * 100, **dict(vmin=-100, vmax=100, cmap="RdBu_r", ax=axes, cbar_kws={ 'label': "Correlation coefficient $\\rho \\cdot 100$" } @@ -591,4 +591,4 @@ def correlation_matrix(df, variables, atlasify(*fill_labels(atlas, info), axes=axes, outside=True) figure.subplots_adjust(top=1/enlarge) - return data + return res, data diff --git a/freeforestml/stack.py b/freeforestml/stack.py index 0aba6319ba80725e335d7bde08c74157bf090d90..ba6511175b15866885961327ac2c2698e986d56e 100644 --- a/freeforestml/stack.py +++ b/freeforestml/stack.py @@ -68,7 +68,7 @@ class Stack: self.data_uncertainties.append(data_uncertainty) self.aux.append(aux) - def get_hist(self, df, i, bins, variable, weight, include_outside=False): + def get_hist(self, df, i, bins, variable, weight, range = None, include_outside=False): """ Returns the yields per bin for the i-th process in the stack. The bins argument specifies the bin edges. @@ -93,7 +93,7 @@ class Stack: else: func = np.histogram - total, _ = func(variable, bins=bins, weights=weight) + total, _ = func(variable, bins=bins, weights=weight, range = range) return total def get_total(self, df, bins, variable, weight, include_outside=False): @@ -108,7 +108,7 @@ class Stack: return total - def get_uncertainty(self, df, i, bins, variable, weight, + def get_uncertainty(self, df, i, bins, variable, weight, range = None, include_outside=False): """ Returns the uncertainty of the total yield per bin. The bins argument @@ -121,11 +121,11 @@ class Stack: uncertainty_2 = 0 if self.is_data_uncertainty(i): - uncertainty_2 += self.get_hist(df, i, bins, variable, weight, + uncertainty_2 += self.get_hist(df, i, bins, variable, weight, range = range, include_outside=include_outside) else: uncertainty_2 += self.get_hist(df, i, bins, variable, - weight=weight_2, + weight=weight_2, range = range, include_outside=include_outside) return np.sqrt(uncertainty_2) diff --git a/freeforestml/tests/test_model.py b/freeforestml/tests/test_model.py index d7c9b809bd5fd749bedd6d0a1c92e24f172ced70..c596b988e1340719c46e29afb1ba90eb3d82b231 100644 --- a/freeforestml/tests/test_model.py +++ b/freeforestml/tests/test_model.py @@ -908,8 +908,7 @@ class HepNetTestCase(unittest.TestCase): df["is_sig"] = (df.fpid == 1) df["is_ztt"] = (df.fpid == 0) - net.fit(df.compute(), epochs=5, verbose=0, - weight=Variable("weight", "weight")) + net.fit(df, epochs=5, verbose=0) fd, path = tempfile.mkstemp()