Commit dce2de1a authored by Philipp Gadow's avatar Philipp Gadow
Browse files

first draft of scaling class

parent 8e099563
import argparse
import json
import os
import sys
import h5py
......@@ -101,109 +100,6 @@ def GetParser():
return args
def GetScaleDict(args, config):
    """
    Calculate the scaling, shifting and default values for the jet (and
    optionally track) training variables and save them to a json file.

    The calculation is done only on the downsampled file of the first
    iteration.

    Parameters
    ----------
    args : argparse.Namespace
        Parsed command line arguments; must provide ``var_dict`` (path to
        the variable yaml file) and the ``tracks`` flag.
    config : preprocessing configuration object
        Must provide ``GetFileName``, ``bool_process_taus`` and
        ``dict_file``.

    Raises
    ------
    SystemExit
        If ``--var_dict`` was not provided.
    """
    # TODO: find good way to get file names, breaks if no iterations
    # check if var_dict is provided, otherwise exit
    if not args.var_dict:
        logger.error(
            "Provide --var_dict to retrieve scaling and shifting factors"
        )
        sys.exit(1)
    input_file = config.GetFileName(iteration=1, option="downsampled")
    logger.info(input_file)
    take_taus = config.bool_process_taus
    with open(args.var_dict, "r") as conf:
        variable_config = yaml.load(conf, Loader=yaml_loader)
    variables_header = variable_config["train_variables"]
    var_list = [i for j in variables_header for i in variables_header[j]]
    # context manager guarantees the h5 file is closed even on error
    # (the previous version leaked the open file handle)
    with h5py.File(input_file, "r") as infile_all:
        bjets = pd.DataFrame(infile_all["bjets"][:][var_list])
        cjets = pd.DataFrame(infile_all["cjets"][:][var_list])
        ujets = pd.DataFrame(infile_all["ujets"][:][var_list])
        if take_taus:
            taujets = pd.DataFrame(infile_all["taujets"][:][var_list])
            X = pd.concat([bjets, cjets, ujets, taujets])
            del taujets
        else:
            X = pd.concat([bjets, cjets, ujets])
        del bjets, cjets, ujets
        # infinities cannot be scaled; treat them as missing values
        X.replace([np.inf, -np.inf], np.nan, inplace=True)
        logger.info(
            "Retrieving scaling and shifting values for the jet variables"
        )
        scale_dict = []
        for var in X.columns.values:
            if var in [variable_config["label"], "weight", "category"]:
                # labels and weights are not scaled
                continue
            elif "isDefaults" in var:
                # no scaling and shifting is applied to the check variables
                scale_dict.append(upt.dict_in(var, 0.0, 1.0, None))
            else:
                dict_entry = upt.GetScales(
                    vec=X[var].values,
                    # TODO: implement weights
                    w=np.ones(len(X)),
                    varname=var,
                    custom_defaults_vars=variable_config[
                        "custom_defaults_vars"
                    ],
                )
                scale_dict.append(upt.dict_in(*dict_entry))
        scale_dict_trk = {}
        if args.tracks:
            logger.info(
                "Retrieving scaling and shifting values for the track "
                "variables"
            )
            logNormVars = variable_config["track_train_variables"][
                "logNormVars"
            ]
            jointNormVars = variable_config["track_train_variables"][
                "jointNormVars"
            ]
            trkVars = logNormVars + jointNormVars
            btrks = np.asarray(infile_all["btrk"][:])
            ctrks = np.asarray(infile_all["ctrk"][:])
            utrks = np.asarray(infile_all["utrk"][:])
            if take_taus:
                tautrks = np.asarray(infile_all["tautrk"][:])
                trks = np.concatenate((tautrks, utrks, ctrks, btrks))
            else:
                trks = np.concatenate((utrks, ctrks, btrks))
            # stack the selected track variables into (jets, tracks, vars)
            X_trk_train = np.stack(
                [np.nan_to_num(trks[v]) for v in trkVars], axis=-1
            )
            # all-zero rows are padding; exclude them from the log
            mask = ~np.all(X_trk_train == 0, axis=-1)
            eps = 1e-8
            # Take the log of the desired variables
            for i, v in enumerate(logNormVars):
                X_trk_train[:, :, i][mask] = np.log(
                    X_trk_train[:, :, i][mask] + eps
                )
            scale_dict_trk = upt.ScaleTracks(
                X_trk_train[:, :, :], logNormVars + jointNormVars
            )
    # save scale/shift dictionary to json file
    scale_dict = {"jets": scale_dict, "tracks": scale_dict_trk}
    os.makedirs(os.path.dirname(config.dict_file), exist_ok=True)
    with open(config.dict_file, "w") as outfile:
        json.dump(scale_dict, outfile, indent=4)
    logger.info(f"saved scale dictionary as {config.dict_file}")
def ApplyScalesTrksNumpy(args, config, iteration=1):
if not args.var_dict:
logger.error(
......@@ -410,7 +306,8 @@ if __name__ == "__main__":
us.Run()
# here the other options such as PDFSampling etc. would be called
if args.scaling:
GetScaleDict(args, config)
scaling_tool = upt.Scaling(args, config)
scaling_tool.GetScaleDict()
if args.apply_scales:
ApplyScales(args, config)
if args.write:
......
"""
Helper functions to creating hybrid hdf5 samples from ttbar and Zprime ntuples
"""
import json
import os
import h5py
import numpy as np
import pandas as pd
import yaml
import umami.preprocessing_tools as upt
from umami.configuration import logger
from umami.tools import yaml_loader
class Scaling(object):
    """
    Class for all scaling operations in umami.

    Reads the variable dictionary and the preprocessing configuration and
    provides the calculation of scaling/shifting/default values for the
    training variables.
    """

    def __init__(self, args, config) -> None:
        """
        Parameters
        ----------
        args: ArgumentParser output; must provide ``var_dict``.
        config: preprocessing config file object.

        Raises
        ------
        KeyError
            If ``--var_dict`` was not provided.
        """
        self.config = config
        self.__setup(args)

    def __setup(self, args):
        """Extract the needed settings from args and config."""
        # check if var_dict is provided, otherwise exit
        if not args.var_dict:
            raise KeyError(
                "Please provide --var_dict to retrieve scaling and shifting factors."
            )
        self.var_dict = args.var_dict
        # name of the jet dataset in the input h5 file
        self.jets_key = "jets"
        # map the configured class labels to integer ids
        self.class_labels_map = {
            label: label_id
            for label_id, label in enumerate(
                self.config.preparation["class_labels"]
            )
        }
        self.options = self.config.sampling.get("options")
        # dict.get with default replaces the manual membership check
        self.save_tracks = self.options.get("save_tracks", False)
        self.input_file = self.config.GetFileName()

    def GetScaleDict(self):
        """
        Calculates the scaling, shifting and default values and saves them
        to json.

        The calculation is done only on the first iteration.
        """
        logger.info(self.input_file)
        take_taus = self.config.bool_process_taus
        with open(self.var_dict, "r") as conf:
            variable_config = yaml.load(conf, Loader=yaml_loader)
        variables_header = variable_config["train_variables"]
        var_list = [i for j in variables_header for i in variables_header[j]]
        # context manager guarantees the h5 file is closed even on error
        # (the previous version leaked the open file handle)
        with h5py.File(self.input_file, "r") as infile_all:
            bjets = pd.DataFrame(infile_all["bjets"][:][var_list])
            cjets = pd.DataFrame(infile_all["cjets"][:][var_list])
            ujets = pd.DataFrame(infile_all["ujets"][:][var_list])
            if take_taus:
                taujets = pd.DataFrame(infile_all["taujets"][:][var_list])
                X = pd.concat([bjets, cjets, ujets, taujets])
                del taujets
            else:
                X = pd.concat([bjets, cjets, ujets])
            del bjets, cjets, ujets
            # infinities cannot be scaled; treat them as missing values
            X.replace([np.inf, -np.inf], np.nan, inplace=True)
            logger.info(
                "Retrieving scaling and shifting values for the jet variables"
            )
            scale_dict = []
            for var in X.columns.values:
                if var in [variable_config["label"], "weight", "category"]:
                    # labels and weights are not scaled
                    continue
                elif "isDefaults" in var:
                    # no scaling and shifting is applied to the check variables
                    scale_dict.append(upt.dict_in(var, 0.0, 1.0, None))
                else:
                    dict_entry = upt.GetScales(
                        vec=X[var].values,
                        # TODO: implement weights
                        w=np.ones(len(X)),
                        varname=var,
                        custom_defaults_vars=variable_config[
                            "custom_defaults_vars"
                        ],
                    )
                    scale_dict.append(upt.dict_in(*dict_entry))
            scale_dict_trk = {}
            if self.save_tracks:
                logger.info(
                    "Retrieving scaling and shifting values for the track variables"
                )
                logNormVars = variable_config["track_train_variables"][
                    "logNormVars"
                ]
                jointNormVars = variable_config["track_train_variables"][
                    "jointNormVars"
                ]
                trkVars = logNormVars + jointNormVars
                btrks = np.asarray(infile_all["btrk"][:])
                ctrks = np.asarray(infile_all["ctrk"][:])
                utrks = np.asarray(infile_all["utrk"][:])
                if take_taus:
                    tautrks = np.asarray(infile_all["tautrk"][:])
                    trks = np.concatenate((tautrks, utrks, ctrks, btrks))
                else:
                    trks = np.concatenate((utrks, ctrks, btrks))
                # stack the selected track variables into (jets, tracks, vars)
                X_trk_train = np.stack(
                    [np.nan_to_num(trks[v]) for v in trkVars], axis=-1
                )
                # all-zero rows are padding; exclude them from the log
                mask = ~np.all(X_trk_train == 0, axis=-1)
                eps = 1e-8
                # Take the log of the desired variables
                for i, v in enumerate(logNormVars):
                    X_trk_train[:, :, i][mask] = np.log(
                        X_trk_train[:, :, i][mask] + eps
                    )
                scale_dict_trk = upt.ScaleTracks(
                    X_trk_train[:, :, :], logNormVars + jointNormVars
                )
        # save scale/shift dictionary to json file
        scale_dict = {"jets": scale_dict, "tracks": scale_dict_trk}
        os.makedirs(os.path.dirname(self.config.dict_file), exist_ok=True)
        with open(self.config.dict_file, "w") as outfile:
            json.dump(scale_dict, outfile, indent=4)
        logger.info(f"saved scale dictionary as {self.config.dict_file}")
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment