Commit 29f6af12 authored by Manuel Guth's avatar Manuel Guth
Browse files

new pdf sampling prototype

parent 9c779889
import numpy as np
from umami.preprocessing import PDFSampling
# create some dummy data
x = np.random.default_rng().normal(size=1000)
y = np.random.default_rng().normal(1, 2, size=1000)
# get 2d histogram of our dummy data
h_original, x_bins, y_bins = np.histogram2d(x, y, [4, 5])
# calculate a custom function
pt = np.cos(x ** 2) + np.sin(x + y) + np.exp(x)
eta = 20 - y ** 2
h_target, _, _ = np.histogram2d(pt, eta, bins=[x_bins, y_bins])
ps = PDFSampling()
ps.CalculatePDFRatio(h_target, h_original, x_bins, y_bins)
ps.save("custom-pdf.pkl")
import pickle
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
......@@ -19,7 +21,7 @@ bjets = df.query("HadronConeExclTruthLabelID==5")
cjets = df.query("HadronConeExclTruthLabelID==4")
def GetInterpolation(x_y_target, x_y_original, bins=[100, 9]):
def GetInterpolation(x_y_target, x_y_original, bins=[100, 9], ratio_max=1.0):
# filling histograms
h_target, x_bins, y_bins = np.histogram2d(
x_y_target[0], x_y_target[1], bins
......@@ -46,7 +48,7 @@ def GetInterpolation(x_y_target, x_y_original, bins=[100, 9]):
),
where=(h_original != 0),
)
ratio = ratio / ratio.max() * ratio_max
# calculate interpolation function
print("retrieve interpolation function")
return RectBivariateSpline(x_bins, y_bins, ratio), x_bins, y_bins
......@@ -61,6 +63,11 @@ f, x_bins, y_bins = GetInterpolation(
x_y_original=[cjets["pt_btagJes"], cjets["absEta_btagJes"]],
)
with open("pdf-b_c.pkl", "wb") as file:
pickle.dump(f, file)
with open("pdf-b_c.pkl", "rb") as file:
f = pickle.load(file)
# read in an orthogonal sample to apply the re-sampling
df_val = pd.read_hdf(f"{path}/{file_valid}", key="jets")
df_val.query("pt_btagJes<250000", inplace=True)
......@@ -81,16 +88,36 @@ def Resample(f, x, y, size):
indices, p=r_resamp, size=size
)
return x[sampled_indices], y[sampled_indices], sampled_indices
def ResampleSingle(f, x, y):
r_resamp = f.ev(np.asarray(x), np.asarray(y))
rnd_numbers = np.random.default_rng().uniform(0, 1, len(r_resamp))
sampled_indices = np.where(rnd_numbers < r_resamp)
return x[sampled_indices], y[sampled_indices]
x_resampled, y_resampled = Resample(
x_resampled, y_resampled = ResampleSingle(
f,
x=cjets_val["pt_btagJes"].values,
y=cjets_val["absEta_btagJes"].values,
size=len(bjet_pt),
)
print("brefore sampling:", len(cjets_val))
print("after sampling:", len(x_resampled))
x_resampled, y_resampled, sampled_indices = Resample(
f,
x=cjets_val["pt_btagJes"].values,
y=cjets_val["absEta_btagJes"].values,
size=len(bjets_val),
)
(unique, counts) = np.unique(sampled_indices, return_counts=True)
frequencies = np.asarray((unique, counts)).T
print(frequencies)
print(np.average(counts))
print(np.max(counts))
counts_b, Bins_b = np.histogram(
bjets_val["pt_btagJes"],
......@@ -100,11 +127,13 @@ counts_c, Bins_c = np.histogram(
x_resampled,
bins=x_bins,
)
norm_b = counts_b.sum()
norm_c = counts_c.sum()
plt.hist(
x=Bins_b[:-1] / 1e3,
bins=Bins_b / 1e3,
weights=(counts_b),
weights=(counts_b / norm_b),
histtype="step",
linewidth=1.0,
color="#1f77b4",
......@@ -116,7 +145,7 @@ plt.hist(
plt.hist(
x=Bins_b[:-1] / 1e3,
bins=Bins_b / 1e3,
weights=(counts_c),
weights=(counts_c / norm_c),
histtype="step",
linewidth=1.0,
color="#ff7f0e",
......@@ -141,7 +170,7 @@ counts_c, Bins_c = np.histogram(
plt.hist(
x=Bins_b[:-1],
bins=Bins_b,
weights=(counts_b),
weights=(counts_b / norm_b),
histtype="step",
linewidth=1.0,
color="#1f77b4",
......@@ -153,7 +182,7 @@ plt.hist(
plt.hist(
x=Bins_b[:-1],
bins=Bins_b,
weights=(counts_c),
weights=(counts_c / norm_c),
histtype="step",
linewidth=1.0,
color="#ff7f0e",
......
import pickle
import numpy as np
from scipy.interpolate import RectBivariateSpline
from umami.configuration import logger
class PDFSampling(object):
"""
......@@ -13,31 +17,43 @@ class PDFSampling(object):
super(PDFSampling, self).__init__()
self.target_x = 1
self.target_y = 1
self.f = None
def GetInterpolation(x_y_target, x_y_original, bins=[100, 9]):
def CalculatePDF(
self, x_y_target, x_y_original, bins=[100, 9], ratio_max: float = 1
):
# from data points
# filling histograms
h_target, x_bins, y_bins = np.histogram2d(
h_target, self._x_bin_edges, self._y_bin_edges = np.histogram2d(
x_y_target[0], x_y_target[1], bins
)
h_original, _, _ = np.histogram2d(
x_y_original[0], x_y_original[1], [x_bins, y_bins]
x_y_original[0],
x_y_original[1],
[self._x_bin_edges, self._y_bin_edges],
)
self.CalculatePDFRatio(
h_target,
h_original,
self._x_bin_edges,
self._y_bin_edges,
ratio_max=ratio_max,
)
def CalculatePDFRatio(
self, h_target, h_original, x_bins, y_bins, ratio_max: float = 1
):
# normalise the histograms to unity
h_target = h_target / np.sum(h_target)
h_original = h_original / np.sum(h_original)
# transform bin edges to bin centres
x_bins = (x_bins[:-1] + x_bins[1:]) / 2
y_bins = (y_bins[:-1] + y_bins[1:]) / 2
self.x_bins = (x_bins[:-1] + x_bins[1:]) / 2
self.y_bins = (y_bins[:-1] + y_bins[1:]) / 2
# calculating the ratio of the reference distribution w.r.t. the target distribution
ratio = np.divide(
h_target,
h_original,
......@@ -47,6 +63,51 @@ class PDFSampling(object):
),
where=(h_original != 0),
)
# setting max ratio value
self._ratio = ratio / ratio.max() * ratio_max
# calculate interpolation function
print("retrieve interpolation function")
return RectBivariateSpline(x_bins, y_bins, ratio), x_bins, y_bins
logger.info("retrieve interpolation function")
self.f = RectBivariateSpline(self.x_bins, self.y_bins, self._ratio)
def save(self, file_name: str):
# check if self.f not None
with open(file_name, "wb") as file:
pickle.dump(self.f, file)
def load(self, file_name: str):
with open(file_name, "rb") as file:
self.f = pickle.load(file)
# the resampling is so far only working for a batch which is being normalised
# TODO: rename
def inMemoryResample(self, x, y, size):
r_resamp = self.f.ev(x, y)
indices = np.where(r_resamp >= 0)[0]
r_resamp = r_resamp[indices]
r_resamp = r_resamp / np.sum(r_resamp)
sampled_indices = np.random.default_rng().choice(
indices, p=r_resamp, size=size
)
return x[sampled_indices], y[sampled_indices]
# TODO: rename
def Resample(self, x, y):
# should be able to take an array as well as single values
r_resamp = self.f.ev(np.asarray(x), np.asarray(y))
rnd_numbers = np.random.default_rng().uniform(0, 1, len(r_resamp))
sampled_indices = np.where(rnd_numbers < r_resamp)
return x[sampled_indices], y[sampled_indices]
def __call__(
self,
):
# define custom functions
a = 1
print(a)
@property
def ratio(self):
return self._ratio
......@@ -9,19 +9,20 @@ from umami.preprocessing_tools.Merging import (
create_datasets,
get_size,
)
from umami.preprocessing_tools.PDF_Sampling import PDFSampling
from umami.preprocessing_tools.Preparation import get_jets
from umami.preprocessing_tools.Resampling import (
EnforceFraction,
Gen_default_dict,
GetNJetsPerIteration,
GetScales,
RunSampling,
RunStatSamples,
UnderSampling,
UnderSamplingProp,
UnderSamplingTemplate,
Weighting2D,
dict_in,
EnforceFraction,
RunStatSamples,
RunSampling,
)
from umami.preprocessing_tools.utils import (
GetBinaryLabels,
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment