diff --git a/freeforestml/model.py b/freeforestml/model.py
index 24d42abf95fcb799c416f4f574b9488874d338d8..7d7e7ce4efc53310859b65007d8a81bcca7ab4ae 100644
--- a/freeforestml/model.py
+++ b/freeforestml/model.py
@@ -69,7 +69,7 @@ class CrossValidator(ABC):
     """
 
     @abstractmethod
-    def select_training(self, df, fold_i):
+    def select_training(self, df, fold_i, for_predicting=False):
         """
         Returns the index array to select all training events from the
         dataset for the given fold.
@@ -89,7 +89,7 @@ class CrossValidator(ABC):
         given fold.
         """
 
-    def select_cv_set(self, df, cv, fold_i):
+    def select_cv_set(self, df, cv, fold_i, for_predicting=False):
         """
         Returns the index array to select all events from the cross
         validator set specified with cv ('train', 'val', 'test') for the given fold.
@@ -98,7 +98,7 @@ class CrossValidator(ABC):
             raise ValueError("Argument 'cv' must be one of 'train', 'val', "
                              "'test', 'all'; but was %s." % repr(cv))
         if cv == "train":
-            selected = self.select_training(df, fold_i)
+            selected = self.select_training(df, fold_i, for_predicting=for_predicting)
         elif cv == "val":
             selected = self.select_validation(df, fold_i)
         else:
@@ -108,11 +108,15 @@ class CrossValidator(ABC):
     def retrieve_fold_info(self, df, cv):
         """
         Returns and array of integers to specify which event was used
-        for train/val/test in which fold
+        for train/val/test in which fold. Mostly useful for the
+        inference/predict step. For cross validators where an event is used
+        in the training set of multiple folds, a single fold number is
+        retrieved so that the folds are equally represented in the
+        predicted training data.
         """
-        fold_info = np.zeros(len(df), dtype='bool') - 1
+        fold_info = np.zeros(len(df), dtype=int) - 1
         for fold_i in range(self.k):
-            selected = self.select_cv_set(df, cv, fold_i)
+            selected = self.select_cv_set(df, cv, fold_i, for_predicting=True)
             fold_info[selected] = fold_i
         return fold_info
 
@@ -190,7 +194,6 @@ class ClassicalCV(CrossValidator):
                 continue
 
             selected = selected | self.select_slice(df, slice_i)
-
         return selected
 
     def select_validation(self, df, fold_i):
@@ -207,6 +210,5 @@ class ClassicalCV(CrossValidator):
         """
         selected = np.zeros(len(df), dtype='bool')
         for slice_i in range(self.k, self.k * 2):
-            selected = selected | self.select_slice(df, slice_i)
-
+            selected = selected | self.select_slice(df, slice_i)
         return selected
@@ -357,18 +359,47 @@ class MixedCV(CrossValidator):
         return (slice_id / self.k <= variable) \
                & (variable < (slice_id + 1.0) / self.k)
 
-    def select_training(self, df, fold_i):
+    def select_training_slices(self, fold_i, for_predicting=False):
+        """
+        Returns a list of integers corresponding to the data slices used
+        in the training set of fold_i. If 'for_predicting' is True, only
+        one slice is returned for each fold so that the folds are equally
+        represented in the predicted training data.
+        """
+        all_slices_for_folds = []
+        for fold in range(self.k):
+            all_slices_for_folds.append([])
+            for slice_i in range(self.k):
+                if (slice_i + fold) % self.k == self.k - 1:
+                    continue
+                if (slice_i + fold) % self.k == self.k - 2:
+                    continue
+                all_slices_for_folds[-1].append(slice_i)
+
+        # if we select the slices for training we are done
+        if not for_predicting:
+            return all_slices_for_folds[fold_i]
+
+        # all_slices_for_folds looks e.g. like:
+        # [[0, 1, 2], [0, 1, 4], [0, 3, 4], [2, 3, 4], [1, 2, 3]]
+        # need to select an array with unique entries:
+        # [0, 1, 2, 4, 3]
+        uniq_el = lambda ar: set(x for l in ar for x in l)
+        exclusive_slices = []
+        for i, slices in enumerate(all_slices_for_folds):
+            for sl in slices:
+                if sl not in exclusive_slices and sl in uniq_el(all_slices_for_folds[i:]):
+                    exclusive_slices.append(sl)
+        return [exclusive_slices[fold_i]]
+
+    def select_training(self, df, fold_i, for_predicting=False):
         """
         Returns the index array to select all training events from the
         dataset for the given fold.
         """
         selected = np.zeros(len(df), dtype='bool')
-        for slice_i in range(self.k):
-            if (slice_i + fold_i) % self.k == self.k - 1:
-                continue
-            if (slice_i + fold_i) % self.k == self.k - 2:
-                continue
-
+        slices = self.select_training_slices(fold_i, for_predicting=for_predicting)
+        for slice_i in slices:
             selected = selected | self.select_slice(df, slice_i)
         return selected
 
@@ -720,7 +751,7 @@ class HepNet:
             norm = self.norms[fold_i]
 
             # identify fold
-            selected = self.cv.select_cv_set(df, cv, fold_i)
+            selected = self.cv.select_cv_set(df, cv, fold_i, for_predicting=True)
             test_set |= selected
 
             out[selected] = model.predict(norm(df[selected][self.input_list]),