Skip to content
GitLab
Menu
Projects
Groups
Snippets
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Sign in
Toggle navigation
Menu
Open sidebar
Tomke Schroer
Umami
Commits
dce2de1a
Commit
dce2de1a
authored
Aug 30, 2021
by
Philipp Gadow
Browse files
first draft of scaling class
parent
8e099563
Changes
2
Hide whitespace changes
Inline
Side-by-side
umami/preprocessing.py
View file @
dce2de1a
import
argparse
import
json
import
os
import
sys
import
h5py
...
...
@@ -101,109 +100,6 @@ def GetParser():
return
args
def GetScaleDict(args, config):
    """
    Calculate the scaling, shifting and default values for all training
    variables and save them to a json file (``config.dict_file``).

    The calculation is done only on the first iteration.

    Parameters
    ----------
    args: ArgumentParser output; must provide ``var_dict`` (path to the
        variable-config yaml) and ``tracks`` (bool — also compute track
        variable scalings).
    config: preprocessing config object; must provide ``GetFileName``,
        ``bool_process_taus`` and ``dict_file``.
    """
    # TODO: find good way to get file names, breaks if no iterations
    # check if var_dict is provided, otherwise exit
    if not args.var_dict:
        logger.error("Provide --var_dict to retrieve scaling and shifting factors")
        sys.exit(1)

    input_file = config.GetFileName(iteration=1, option="downsampled")
    logger.info(input_file)
    take_taus = config.bool_process_taus

    with open(args.var_dict, "r") as conf:
        variable_config = yaml.load(conf, Loader=yaml_loader)
    variables_header = variable_config["train_variables"]
    # flatten the per-category variable lists into one flat list
    var_list = [i for j in variables_header for i in variables_header[j]]

    # use a context manager so the hdf5 file is always closed
    # (previously the open file handle was leaked)
    with h5py.File(input_file, "r") as infile_all:
        bjets = pd.DataFrame(infile_all["bjets"][:][var_list])
        cjets = pd.DataFrame(infile_all["cjets"][:][var_list])
        ujets = pd.DataFrame(infile_all["ujets"][:][var_list])
        if take_taus:
            taujets = pd.DataFrame(infile_all["taujets"][:][var_list])
            X = pd.concat([bjets, cjets, ujets, taujets])
            del taujets
        else:
            X = pd.concat([bjets, cjets, ujets])
        del bjets, cjets, ujets

        # treat +-inf like missing values so they get the default treatment
        X.replace([np.inf, -np.inf], np.nan, inplace=True)

        logger.info("Retrieving scaling and shifting values for the jet variables")
        scale_dict = []
        for var in X.columns.values:
            if var in [variable_config["label"], "weight", "category"]:
                continue
            elif "isDefaults" in var:
                # no scaling and shifting is applied to the check variables
                scale_dict.append(upt.dict_in(var, 0.0, 1.0, None))
            else:
                dict_entry = upt.GetScales(
                    vec=X[var].values,
                    # TODO: implement weights
                    w=np.ones(len(X)),
                    varname=var,
                    custom_defaults_vars=variable_config["custom_defaults_vars"],
                )
                scale_dict.append(upt.dict_in(*dict_entry))

        scale_dict_trk = {}
        if args.tracks:
            logger.info(
                "Retrieving scaling and shifting values for the track variables"
            )
            logNormVars = variable_config["track_train_variables"]["logNormVars"]
            jointNormVars = variable_config["track_train_variables"][
                "jointNormVars"
            ]
            trkVars = logNormVars + jointNormVars

            btrks = np.asarray(infile_all["btrk"][:])
            ctrks = np.asarray(infile_all["ctrk"][:])
            utrks = np.asarray(infile_all["utrk"][:])
            if take_taus:
                tautrks = np.asarray(infile_all["tautrk"][:])
                trks = np.concatenate((tautrks, utrks, ctrks, btrks))
            else:
                trks = np.concatenate((utrks, ctrks, btrks))

            X_trk_train = np.stack(
                [np.nan_to_num(trks[v]) for v in trkVars], axis=-1
            )
            # padded tracks are all-zero across variables; exclude them from
            # the statistics — TODO confirm this padding convention upstream
            mask = ~np.all(X_trk_train == 0, axis=-1)
            eps = 1e-8
            # Take the log of the desired variables
            for i, v in enumerate(logNormVars):
                X_trk_train[:, :, i][mask] = np.log(
                    X_trk_train[:, :, i][mask] + eps
                )
            scale_dict_trk = upt.ScaleTracks(
                X_trk_train[:, :, :], logNormVars + jointNormVars
            )

    # save scale/shift dictionary to json file
    scale_dict = {"jets": scale_dict, "tracks": scale_dict_trk}
    os.makedirs(os.path.dirname(config.dict_file), exist_ok=True)
    with open(config.dict_file, "w") as outfile:
        json.dump(scale_dict, outfile, indent=4)
    logger.info(f"saved scale dictionary as {config.dict_file}")
def
ApplyScalesTrksNumpy
(
args
,
config
,
iteration
=
1
):
if
not
args
.
var_dict
:
logger
.
error
(
...
...
@@ -410,7 +306,8 @@ if __name__ == "__main__":
us
.
Run
()
# here the other options such as PDFSampling etc. would be called
if
args
.
scaling
:
GetScaleDict
(
args
,
config
)
scaling_tool
=
upt
.
Scaling
(
args
,
config
)
scaling_tool
.
GetScaleDict
()
if
args
.
apply_scales
:
ApplyScales
(
args
,
config
)
if
args
.
write
:
...
...
umami/preprocessing_tools/Scaling.py
0 → 100644
View file @
dce2de1a
"""
Helper functions to creating hybrid hdf5 samples from ttbar and Zprime ntuples
"""
import
json
import
os
import
h5py
import
numpy
as
np
import
pandas
as
pd
import
yaml
import
umami.preprocessing_tools
as
upt
from
umami.configuration
import
logger
from
umami.tools
import
yaml_loader
class Scaling(object):
    """
    Class for all scaling operations in umami.
    """

    def __init__(self, args, config) -> None:
        """
        Parameters
        ----------
        args: ArgumentParser output; must provide ``var_dict`` (path to the
            variable-config yaml).
        config: preprocessing config object; must provide ``preparation``,
            ``sampling``, ``GetFileName``, ``bool_process_taus`` and
            ``dict_file``.

        Raises
        ------
        KeyError
            If no variable dictionary is provided via ``args.var_dict``.
        """
        self.config = config
        self.__setup(args)

    def __setup(self, args):
        """Read the needed settings from args and the config object."""
        # check if var_dict is provided, otherwise exit
        if not args.var_dict:
            raise KeyError(
                "Please provide --var_dict to retrieve scaling and shifting factors."
            )
        self.var_dict = args.var_dict
        self.jets_key = "jets"
        # map class label name -> integer id, in config order
        self.class_labels_map = {
            label: label_id
            for label_id, label in enumerate(
                self.config.preparation["class_labels"]
            )
        }
        self.options = self.config.sampling.get("options")
        # dict.get with default replaces the verbose
        # `x["k"] if "k" in x.keys() else False` construct
        self.save_tracks = self.options.get("save_tracks", False)
        self.input_file = self.config.GetFileName()

    def GetScaleDict(self):
        """
        Calculates the scaling, shifting and default values and saves them
        to json (``self.config.dict_file``).

        The calculation is done only on the first iteration.
        """
        logger.info(self.input_file)
        take_taus = self.config.bool_process_taus

        with open(self.var_dict, "r") as conf:
            variable_config = yaml.load(conf, Loader=yaml_loader)
        variables_header = variable_config["train_variables"]
        # flatten the per-category variable lists into one flat list
        var_list = [i for j in variables_header for i in variables_header[j]]

        # use a context manager so the hdf5 file is always closed
        # (previously the open file handle was leaked)
        with h5py.File(self.input_file, "r") as infile_all:
            bjets = pd.DataFrame(infile_all["bjets"][:][var_list])
            cjets = pd.DataFrame(infile_all["cjets"][:][var_list])
            ujets = pd.DataFrame(infile_all["ujets"][:][var_list])
            if take_taus:
                taujets = pd.DataFrame(infile_all["taujets"][:][var_list])
                X = pd.concat([bjets, cjets, ujets, taujets])
                del taujets
            else:
                X = pd.concat([bjets, cjets, ujets])
            del bjets, cjets, ujets

            # treat +-inf like missing values so they get the default
            # treatment
            X.replace([np.inf, -np.inf], np.nan, inplace=True)

            logger.info(
                "Retrieving scaling and shifting values for the jet variables"
            )
            scale_dict = []
            for var in X.columns.values:
                if var in [variable_config["label"], "weight", "category"]:
                    continue
                elif "isDefaults" in var:
                    # no scaling and shifting is applied to the check variables
                    scale_dict.append(upt.dict_in(var, 0.0, 1.0, None))
                else:
                    dict_entry = upt.GetScales(
                        vec=X[var].values,
                        # TODO: implement weights
                        w=np.ones(len(X)),
                        varname=var,
                        custom_defaults_vars=variable_config[
                            "custom_defaults_vars"
                        ],
                    )
                    scale_dict.append(upt.dict_in(*dict_entry))

            scale_dict_trk = {}
            if self.save_tracks:
                logger.info(
                    "Retrieving scaling and shifting values for the track"
                    " variables"
                )
                logNormVars = variable_config["track_train_variables"][
                    "logNormVars"
                ]
                jointNormVars = variable_config["track_train_variables"][
                    "jointNormVars"
                ]
                trkVars = logNormVars + jointNormVars

                btrks = np.asarray(infile_all["btrk"][:])
                ctrks = np.asarray(infile_all["ctrk"][:])
                utrks = np.asarray(infile_all["utrk"][:])
                if take_taus:
                    tautrks = np.asarray(infile_all["tautrk"][:])
                    trks = np.concatenate((tautrks, utrks, ctrks, btrks))
                else:
                    trks = np.concatenate((utrks, ctrks, btrks))

                X_trk_train = np.stack(
                    [np.nan_to_num(trks[v]) for v in trkVars], axis=-1
                )
                # padded tracks are all-zero across variables; exclude them
                # from the statistics — TODO confirm this padding convention
                mask = ~np.all(X_trk_train == 0, axis=-1)
                eps = 1e-8
                # Take the log of the desired variables
                for i, v in enumerate(logNormVars):
                    X_trk_train[:, :, i][mask] = np.log(
                        X_trk_train[:, :, i][mask] + eps
                    )
                scale_dict_trk = upt.ScaleTracks(
                    X_trk_train[:, :, :], logNormVars + jointNormVars
                )

        # save scale/shift dictionary to json file
        scale_dict = {"jets": scale_dict, "tracks": scale_dict_trk}
        os.makedirs(os.path.dirname(self.config.dict_file), exist_ok=True)
        with open(self.config.dict_file, "w") as outfile:
            json.dump(scale_dict, outfile, indent=4)
        logger.info(f"saved scale dictionary as {self.config.dict_file}")
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment