Skip to content
GitLab
Menu
Projects
Groups
Snippets
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Sign in
Toggle navigation
Menu
Open sidebar
Tomke Schroer
Umami
Commits
29f6af12
Commit
29f6af12
authored
Aug 06, 2021
by
Manuel Guth
Browse files
new pdf sampling prototype
parent
9c779889
Changes
4
Hide whitespace changes
Inline
Side-by-side
examples/custom-pdf.py
0 → 100644
View file @
29f6af12
import
numpy
as
np
from
umami.preprocessing
import
PDFSampling
# create some dummy data
x
=
np
.
random
.
default_rng
().
normal
(
size
=
1000
)
y
=
np
.
random
.
default_rng
().
normal
(
1
,
2
,
size
=
1000
)
# get 2d histogram of our dummy data
h_original
,
x_bins
,
y_bins
=
np
.
histogram2d
(
x
,
y
,
[
4
,
5
])
# calculate a custom function
pt
=
np
.
cos
(
x
**
2
)
+
np
.
sin
(
x
+
y
)
+
np
.
exp
(
x
)
eta
=
20
-
y
**
2
h_target
,
_
,
_
=
np
.
histogram2d
(
pt
,
eta
,
bins
=
[
x_bins
,
y_bins
])
ps
=
PDFSampling
()
ps
.
CalculatePDFRatio
(
h_target
,
h_original
,
x_bins
,
y_bins
)
ps
.
save
(
"custom-pdf.pkl"
)
sampling-tests/sampling-test.py
View file @
29f6af12
import
pickle
import
matplotlib.pyplot
as
plt
import
numpy
as
np
import
pandas
as
pd
...
...
@@ -19,7 +21,7 @@ bjets = df.query("HadronConeExclTruthLabelID==5")
cjets
=
df
.
query
(
"HadronConeExclTruthLabelID==4"
)
def
GetInterpolation
(
x_y_target
,
x_y_original
,
bins
=
[
100
,
9
]):
def
GetInterpolation
(
x_y_target
,
x_y_original
,
bins
=
[
100
,
9
]
,
ratio_max
=
1.0
):
# filling histograms
h_target
,
x_bins
,
y_bins
=
np
.
histogram2d
(
x_y_target
[
0
],
x_y_target
[
1
],
bins
...
...
@@ -46,7 +48,7 @@ def GetInterpolation(x_y_target, x_y_original, bins=[100, 9]):
),
where
=
(
h_original
!=
0
),
)
ratio
=
ratio
/
ratio
.
max
()
*
ratio_max
# calculate interpolation function
print
(
"retrieve interpolation function"
)
return
RectBivariateSpline
(
x_bins
,
y_bins
,
ratio
),
x_bins
,
y_bins
...
...
@@ -61,6 +63,11 @@ f, x_bins, y_bins = GetInterpolation(
x_y_original
=
[
cjets
[
"pt_btagJes"
],
cjets
[
"absEta_btagJes"
]],
)
with
open
(
"pdf-b_c.pkl"
,
"wb"
)
as
file
:
pickle
.
dump
(
f
,
file
)
with
open
(
"pdf-b_c.pkl"
,
"rb"
)
as
file
:
f
=
pickle
.
load
(
file
)
# read in an orthogonal sample to apply the re-sampling
df_val
=
pd
.
read_hdf
(
f
"
{
path
}
/
{
file_valid
}
"
,
key
=
"jets"
)
df_val
.
query
(
"pt_btagJes<250000"
,
inplace
=
True
)
...
...
@@ -81,16 +88,36 @@ def Resample(f, x, y, size):
indices
,
p
=
r_resamp
,
size
=
size
)
return
x
[
sampled_indices
],
y
[
sampled_indices
],
sampled_indices
def
ResampleSingle
(
f
,
x
,
y
):
r_resamp
=
f
.
ev
(
np
.
asarray
(
x
),
np
.
asarray
(
y
))
rnd_numbers
=
np
.
random
.
default_rng
().
uniform
(
0
,
1
,
len
(
r_resamp
))
sampled_indices
=
np
.
where
(
rnd_numbers
<
r_resamp
)
return
x
[
sampled_indices
],
y
[
sampled_indices
]
x_resampled
,
y_resampled
=
Resample
(
x_resampled
,
y_resampled
=
Resample
Single
(
f
,
x
=
cjets_val
[
"pt_btagJes"
].
values
,
y
=
cjets_val
[
"absEta_btagJes"
].
values
,
size
=
len
(
bjet_pt
),
)
print
(
"brefore sampling:"
,
len
(
cjets_val
))
print
(
"after sampling:"
,
len
(
x_resampled
))
x_resampled
,
y_resampled
,
sampled_indices
=
Resample
(
f
,
x
=
cjets_val
[
"pt_btagJes"
].
values
,
y
=
cjets_val
[
"absEta_btagJes"
].
values
,
size
=
len
(
bjets_val
),
)
(
unique
,
counts
)
=
np
.
unique
(
sampled_indices
,
return_counts
=
True
)
frequencies
=
np
.
asarray
((
unique
,
counts
)).
T
print
(
frequencies
)
print
(
np
.
average
(
counts
))
print
(
np
.
max
(
counts
))
counts_b
,
Bins_b
=
np
.
histogram
(
bjets_val
[
"pt_btagJes"
],
...
...
@@ -100,11 +127,13 @@ counts_c, Bins_c = np.histogram(
x_resampled
,
bins
=
x_bins
,
)
norm_b
=
counts_b
.
sum
()
norm_c
=
counts_c
.
sum
()
plt
.
hist
(
x
=
Bins_b
[:
-
1
]
/
1e3
,
bins
=
Bins_b
/
1e3
,
weights
=
(
counts_b
),
weights
=
(
counts_b
/
norm_b
),
histtype
=
"step"
,
linewidth
=
1.0
,
color
=
"#1f77b4"
,
...
...
@@ -116,7 +145,7 @@ plt.hist(
plt
.
hist
(
x
=
Bins_b
[:
-
1
]
/
1e3
,
bins
=
Bins_b
/
1e3
,
weights
=
(
counts_c
),
weights
=
(
counts_c
/
norm_c
),
histtype
=
"step"
,
linewidth
=
1.0
,
color
=
"#ff7f0e"
,
...
...
@@ -141,7 +170,7 @@ counts_c, Bins_c = np.histogram(
plt
.
hist
(
x
=
Bins_b
[:
-
1
],
bins
=
Bins_b
,
weights
=
(
counts_b
),
weights
=
(
counts_b
/
norm_b
),
histtype
=
"step"
,
linewidth
=
1.0
,
color
=
"#1f77b4"
,
...
...
@@ -153,7 +182,7 @@ plt.hist(
plt
.
hist
(
x
=
Bins_b
[:
-
1
],
bins
=
Bins_b
,
weights
=
(
counts_c
),
weights
=
(
counts_c
/
norm_c
),
histtype
=
"step"
,
linewidth
=
1.0
,
color
=
"#ff7f0e"
,
...
...
umami/preprocessing_tools/PDF_Sampling.py
View file @
29f6af12
import
pickle
import
numpy
as
np
from
scipy.interpolate
import
RectBivariateSpline
from
umami.configuration
import
logger
class
PDFSampling
(
object
):
"""
...
...
@@ -13,31 +17,43 @@ class PDFSampling(object):
super
(
PDFSampling
,
self
).
__init__
()
self
.
target_x
=
1
self
.
target_y
=
1
self
.
f
=
None
def
GetInterpolation
(
x_y_target
,
x_y_original
,
bins
=
[
100
,
9
]):
def
CalculatePDF
(
self
,
x_y_target
,
x_y_original
,
bins
=
[
100
,
9
],
ratio_max
:
float
=
1
):
# from data points
# filling histograms
h_target
,
x_bins
,
y_bin
s
=
np
.
histogram2d
(
h_target
,
self
.
_x_bin_edges
,
self
.
_y_bin_edge
s
=
np
.
histogram2d
(
x_y_target
[
0
],
x_y_target
[
1
],
bins
)
h_original
,
_
,
_
=
np
.
histogram2d
(
x_y_original
[
0
],
x_y_original
[
1
],
[
x_bins
,
y_bins
]
x_y_original
[
0
],
x_y_original
[
1
],
[
self
.
_x_bin_edges
,
self
.
_y_bin_edges
],
)
self
.
CalculatePDFRatio
(
h_target
,
h_original
,
self
.
_x_bin_edges
,
self
.
_y_bin_edges
,
ratio_max
=
ratio_max
,
)
def
CalculatePDFRatio
(
self
,
h_target
,
h_original
,
x_bins
,
y_bins
,
ratio_max
:
float
=
1
):
# normalise the histograms to unity
h_target
=
h_target
/
np
.
sum
(
h_target
)
h_original
=
h_original
/
np
.
sum
(
h_original
)
# transform bin edges to bin centres
x_bins
=
(
x_bins
[:
-
1
]
+
x_bins
[
1
:])
/
2
y_bins
=
(
y_bins
[:
-
1
]
+
y_bins
[
1
:])
/
2
self
.
x_bins
=
(
x_bins
[:
-
1
]
+
x_bins
[
1
:])
/
2
self
.
y_bins
=
(
y_bins
[:
-
1
]
+
y_bins
[
1
:])
/
2
# calculating the ratio of the reference distribution w.r.t. the target distribution
ratio
=
np
.
divide
(
h_target
,
h_original
,
...
...
@@ -47,6 +63,51 @@ class PDFSampling(object):
),
where
=
(
h_original
!=
0
),
)
# setting max ratio value
self
.
_ratio
=
ratio
/
ratio
.
max
()
*
ratio_max
# calculate interpolation function
print
(
"retrieve interpolation function"
)
return
RectBivariateSpline
(
x_bins
,
y_bins
,
ratio
),
x_bins
,
y_bins
logger
.
info
(
"retrieve interpolation function"
)
self
.
f
=
RectBivariateSpline
(
self
.
x_bins
,
self
.
y_bins
,
self
.
_ratio
)
def
save
(
self
,
file_name
:
str
):
# check if self.f not None
with
open
(
file_name
,
"wb"
)
as
file
:
pickle
.
dump
(
self
.
f
,
file
)
def
load
(
self
,
file_name
:
str
):
with
open
(
file_name
,
"rb"
)
as
file
:
self
.
f
=
pickle
.
load
(
file
)
# the resampling is so far only working for a batch which is being normalised
# TODO: rename
def
inMemoryResample
(
self
,
x
,
y
,
size
):
r_resamp
=
self
.
f
.
ev
(
x
,
y
)
indices
=
np
.
where
(
r_resamp
>=
0
)[
0
]
r_resamp
=
r_resamp
[
indices
]
r_resamp
=
r_resamp
/
np
.
sum
(
r_resamp
)
sampled_indices
=
np
.
random
.
default_rng
().
choice
(
indices
,
p
=
r_resamp
,
size
=
size
)
return
x
[
sampled_indices
],
y
[
sampled_indices
]
# TODO: rename
def
Resample
(
self
,
x
,
y
):
# should be able to take an array as well as single values
r_resamp
=
self
.
f
.
ev
(
np
.
asarray
(
x
),
np
.
asarray
(
y
))
rnd_numbers
=
np
.
random
.
default_rng
().
uniform
(
0
,
1
,
len
(
r_resamp
))
sampled_indices
=
np
.
where
(
rnd_numbers
<
r_resamp
)
return
x
[
sampled_indices
],
y
[
sampled_indices
]
def
__call__
(
self
,
):
# define custom functions
a
=
1
print
(
a
)
@
property
def
ratio
(
self
):
return
self
.
_ratio
umami/preprocessing_tools/__init__.py
View file @
29f6af12
...
...
@@ -9,19 +9,20 @@ from umami.preprocessing_tools.Merging import (
create_datasets
,
get_size
,
)
from
umami.preprocessing_tools.PDF_Sampling
import
PDFSampling
from
umami.preprocessing_tools.Preparation
import
get_jets
from
umami.preprocessing_tools.Resampling
import
(
EnforceFraction
,
Gen_default_dict
,
GetNJetsPerIteration
,
GetScales
,
RunSampling
,
RunStatSamples
,
UnderSampling
,
UnderSamplingProp
,
UnderSamplingTemplate
,
Weighting2D
,
dict_in
,
EnforceFraction
,
RunStatSamples
,
RunSampling
,
)
from
umami.preprocessing_tools.utils
import
(
GetBinaryLabels
,
...
...
Write
Preview
Supports
Markdown
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment