Commit af0d2fac authored by Joschka Birk's avatar Joschka Birk
Browse files

Merge branch alfroch-update-tf-tools with refs/heads/master into refs/merge-requests/444/train

parents 3b4d15dd a582acb2
Pipeline #3635751 passed with stages
in 15 minutes and 39 seconds
......@@ -352,14 +352,14 @@ precision: float16
# Options for the conversion to tfrecords
convert_to_tfrecord:
chunk_size: 5000
N_Add_Vars: 4
N_add_vars: null
```
In the last part, the path to the variable dict `var_file` and the scale dict `dict_file` is defined. Those values are set in the `parameters` file. For example, the training variables for DL1r are defined in [DL1r_Variables.yaml](https://gitlab.cern.ch/atlas-flavor-tagging-tools/algorithms/umami/-/blob/master/umami/configs/DL1r_Variables.yaml).
The `outfile_name` is also defined here (it is included in `parameters` as well). The `plot_name` defines the names of the control plots which are produced in the preprocessing.
If you want to save the samples as TFRecord files you can specify under `convert_to_tfrecord` the `chunk_size`, i.e. the number of samples to be loaded and saved per file.
If you want to save the samples as TFRecord files you can specify under `convert_to_tfrecord` the `chunk_size`, i.e. the number of samples to be loaded and saved per file. The `N_add_vars` option is needed when you want to train a conditional attention model, like Umami Cond Att or CADS. This is an `int` with the number of jet variables that are given to the dips-like block as conditional information.
??? info "TF records"
......
......@@ -285,3 +285,4 @@ precision: float16
# Options for the conversion to tfrecords
convert_to_tfrecord:
chunk_size: 5000
N_add_vars: null
......@@ -277,4 +277,4 @@ precision: float16
# Options for the conversion to tfrecords
convert_to_tfrecord:
chunk_size: 5000
N_add_vars:
N_add_vars: null
......@@ -29,9 +29,12 @@
- test_train_dips
- test_train_tfrecords_dips
- test_train_cads
- test_train_tfrecords_cads
- test_train_dl1r
- test_train_umami
- test_train_tfrecords_umami
- test_train_umami_cond_att
- test_train_tfrecords_umami_cond_att
- test_evaluate_tagger_in_files
......@@ -212,6 +215,26 @@ test_train_cads:
- test_cads_model/
- coverage_files/
test_train_tfrecords_cads:
<<: *test_template
stage: integration_test_tagger
needs:
- test_preprocessing_umami_count
- test_preprocessing_umami_pdf
- test_preprocessing_umami_weighting
dependencies:
- test_preprocessing_umami_count
- test_preprocessing_umami_pdf
- test_preprocessing_umami_weighting
script:
- pytest --cov=./ --cov-report= ./umami/tests/integration/test_train.py -v -s --junitxml=report.xml -k "test_train_tfrecords_cads"
- cp .coverage ./coverage_files/.coverage.test_train_tfrecords_cads
artifacts:
<<: *artifact_template
paths:
- test_cads_model_tfrecords/
- coverage_files/
test_train_dl1r:
<<: *test_template
stage: integration_test_tagger
......@@ -272,6 +295,46 @@ test_train_tfrecords_umami:
- test_umami_model_tfrecords/
- coverage_files/
test_train_umami_cond_att:
<<: *test_template
stage: integration_test_tagger
needs:
- test_preprocessing_umami_count
- test_preprocessing_umami_pdf
- test_preprocessing_umami_weighting
dependencies:
- test_preprocessing_umami_count
- test_preprocessing_umami_pdf
- test_preprocessing_umami_weighting
script:
- pytest --cov=./ --cov-report= ./umami/tests/integration/test_train.py -v -s --junitxml=report.xml -k "test_train_umami_cond_att"
- cp .coverage ./coverage_files/.coverage.test_train_umami_cond_att
artifacts:
<<: *artifact_template
paths:
- test_umami_cond_att_model/
- coverage_files/
test_train_tfrecords_umami_cond_att:
<<: *test_template
stage: integration_test_tagger
needs:
- test_preprocessing_umami_count
- test_preprocessing_umami_pdf
- test_preprocessing_umami_weighting
dependencies:
- test_preprocessing_umami_count
- test_preprocessing_umami_pdf
- test_preprocessing_umami_weighting
script:
- pytest --cov=./ --cov-report= ./umami/tests/integration/test_train.py -v -s --junitxml=report.xml -k "test_train_tfrecords_umami_cond_att"
- cp .coverage ./coverage_files/.coverage.test_train_tfrecords_umami_cond_att
artifacts:
<<: *artifact_template
paths:
- test_umami_cond_att_model_tfrecords/
- coverage_files/
test_evaluate_tagger_in_files:
<<: *test_template
stage: integration_test_tagger
......
......@@ -86,6 +86,7 @@ def EvaluateModel(
preprocess_config: object,
test_file: str,
data_set_name: str,
tagger: str,
):
"""
Evaluate only the taggers in the files or also the UMAMI tagger.
......@@ -103,6 +104,9 @@ def EvaluateModel(
data_set_name : str
Dataset name for the results files. The results will be saved in
dicts. The key will be this dataset name.
tagger : str
Name of the tagger that is to be evaluated. Can either be umami or
umami_cond_att, depending on which architecture is used.
Raises
------
......@@ -193,26 +197,65 @@ def EvaluateModel(
if "exclude" in train_config.config:
exclude = train_config.config["exclude"]
# Load the test jets
X_test, X_test_trk, _ = utt.GetTestFile(
input_file=test_file,
var_dict=train_config.var_dict,
preprocess_config=preprocess_config,
class_labels=class_labels,
tracks_name=tracks_name,
nJets=nJets,
exclude=exclude,
cut_vars_dict=var_cuts,
)
# Check which test files need to be loaded depending on the CADS version
if tagger.casefold() == "umami_cond_att".casefold():
# Load the test jets
X_test, X_test_trk, _ = utt.GetTestFile(
input_file=test_file,
var_dict=train_config.var_dict,
preprocess_config=preprocess_config,
class_labels=class_labels,
tracks_name=tracks_name,
nJets=nJets,
exclude=exclude,
cut_vars_dict=var_cuts,
print_logger=False,
)
# Form the inputs for the network
X = [
X_test_trk,
X_test[
[
global_config.etavariable,
global_config.pTvariable,
]
],
X_test,
]
else:
# Get the testfile with the needed configs
X_test, X_test_trk, _ = utt.GetTestFile(
input_file=test_file,
var_dict=train_config.var_dict,
preprocess_config=preprocess_config,
class_labels=class_labels,
tracks_name=tracks_name,
nJets=nJets,
exclude=exclude,
cut_vars_dict=var_cuts,
)
# Form the inputs for the network
X = [X_test_trk, X_test]
# Load the model for evaluation. Note: The Sum is needed here!
with CustomObjectScope({"Sum": utf.Sum}):
with CustomObjectScope(
{
"Sum": utf.Sum,
"Attention": utf.Attention,
"DeepSet": utf.DeepSet,
"AttentionPooling": utf.AttentionPooling,
"DenseNet": utf.DenseNet,
"ConditionalAttention": utf.ConditionalAttention,
"ConditionalDeepSet": utf.ConditionalDeepSet,
}
):
model = load_model(model_file)
# Predict the output of the model on the test jets
pred_dips, pred_umami = model.predict(
[X_test_trk, X_test], batch_size=5000, verbose=0
)
pred_dips, pred_umami = model.predict(X, batch_size=5000, verbose=0)
# Fill the tagger_names and tagger_preds
tagger_names = ["dips", "umami"]
......@@ -231,9 +274,9 @@ def EvaluateModel(
variables += list(set(label_var_list))
# Add the predictions labels for the defined taggers to variables list
for tagger in tagger_list:
for tagger_iter in tagger_list:
variables += uct.get_class_prob_var_names(
tagger_name=f"{tagger}", class_labels=class_labels
tagger_name=f"{tagger_iter}", class_labels=class_labels
)
# Load the jets and truth labels (internal) with selected variables
......@@ -996,9 +1039,12 @@ if __name__ == "__main__":
tagger=tagger_name,
)
elif tagger_name == "umami" or not evaluate_trained_model:
if tagger_name == "umami":
logger.info("Start evaluating UMAMI with test files...")
elif (
tagger_name.casefold() in ("umami", "umami_cond_att")
or not evaluate_trained_model
):
if tagger_name.casefold() in ("umami", "umami_cond_att"):
logger.info(f"Start evaluating {tagger_name} with test files...")
else:
logger.info("Start evaluating in-file taggers with test files...")
......@@ -1012,6 +1058,7 @@ if __name__ == "__main__":
preprocess_config=preprocessing_config,
test_file=test_file_config["path"],
data_set_name=test_file_identifier,
tagger=tagger_name,
)
else:
......
......@@ -105,10 +105,6 @@ def Cads(args, train_config, preprocess_config):
Raises
------
ValueError
If input is neither a h5 nor a directory.
KeyError
When no metadata file is given for tfrecords.
ValueError
If input is neither a h5 nor a directory.
"""
......@@ -117,27 +113,29 @@ def Cads(args, train_config, preprocess_config):
NN_structure = train_config.NN_structure
val_params = train_config.Validation_metrics_settings
eval_params = train_config.Eval_parameters_validation
tracks_key = train_config.tracks_key
tracks_name = train_config.tracks_name
# Init a list for the callbacks
callbacks = []
# Get needed variable from the train config
WP = float(val_params["WP"]) if "WP" in val_params else float(eval_params["WP"])
n_jets = (
n_jets_val = (
int(val_params["n_jets"])
if "n_jets" in val_params
else int(eval_params["n_jets"])
)
if ".h5" in train_config.train_file:
# Init a metadata dict
metadata = {}
# Get the shapes for training
with h5py.File(train_config.train_file, "r") as f:
nJets, nTrks, nFeatures = f[tracks_key].shape
nJets, nDim = f["Y_train"].shape
if NN_structure["nJets_train"] is not None:
nJets = int(NN_structure["nJets_train"])
metadata["n_jets"], metadata["n_trks"], metadata["n_trk_features"] = f[
f"X_{tracks_name}_train"
].shape
_, metadata["n_dim"] = f["Y_train"].shape
if NN_structure["use_sample_weights"]:
tensor_types = (
......@@ -147,10 +145,12 @@ def Cads(args, train_config, preprocess_config):
)
tensor_shapes = (
{
"input_1": tf.TensorShape([None, nTrks, nFeatures]),
"input_1": tf.TensorShape(
[None, metadata["n_trks"], metadata["n_trk_features"]]
),
"input_2": tf.TensorShape([None, NN_structure["N_Conditions"]]),
},
tf.TensorShape([None, nDim]),
tf.TensorShape([None, metadata["n_dim"]]),
tf.TensorShape([None]),
)
else:
......@@ -160,10 +160,12 @@ def Cads(args, train_config, preprocess_config):
)
tensor_shapes = (
{
"input_1": tf.TensorShape([None, nTrks, nFeatures]),
"input_1": tf.TensorShape(
[None, metadata["n_trks"], metadata["n_trk_features"]]
),
"input_2": tf.TensorShape([None, NN_structure["N_Conditions"]]),
},
tf.TensorShape([None, nDim]),
tf.TensorShape([None, metadata["n_dim"]]),
)
# Get training set from generator
......@@ -172,9 +174,12 @@ def Cads(args, train_config, preprocess_config):
utf.cads_generator(
train_file_path=train_config.train_file,
X_Name="X_train",
X_trk_Name=tracks_key,
X_trk_Name=f"X_{tracks_name}_train",
Y_Name="Y_train",
n_jets=nJets,
n_jets=int(NN_structure["nJets_train"])
if "nJets_train" in NN_structure
and NN_structure["nJets_train"] is not None
else metadata["n_jets"],
batch_size=NN_structure["batch_size"],
nConds=NN_structure["N_Conditions"],
chunk_size=int(1e6),
......@@ -188,69 +193,21 @@ def Cads(args, train_config, preprocess_config):
)
elif os.path.isdir(train_config.train_file):
# Get the files in dir
train_file_names = os.listdir(train_config.train_file)
# Loop over files in dir
for train_file_name in train_file_names:
# Check if file is tfrecords or .h5
if not (".tfrecord" in train_file_name) and not (
train_file_name == "metadata.json"
):
raise ValueError(
f"Input file {train_config.train_file} is neither a "
".h5 file nor a directory with TF Record Files. "
"You should check this."
)
# Check if train file is in metadata
if "metadata.json" not in train_file_names:
raise KeyError("No metadata file in directory.")
# Check if nfiles is given. Otherwise set to 5
try:
nfiles = train_config.config["nfiles"]
except KeyError:
logger.warning(
"No number of files to be loaded in parallel defined. Set to 5"
)
nfiles = 5
# Get the tfrecords
tfrecord_reader = utf.TFRecordReader(
train_config.train_file,
NN_structure["batch_size"],
nfiles,
NN_structure["use_sample_weights"],
NN_structure["N_Conditions"],
train_dataset, metadata = utf.load_tfrecords_train_dataset(
train_config=train_config
)
# Load the dataset from reader
train_dataset = tfrecord_reader.load_Dataset()
# Get the metadata name
metadata_name = (train_config.train_file + "/metadata.json").replace("//", "/")
# Load metadata in file
with open(metadata_name, "r") as metadata_file:
metadata = json.load(metadata_file)
nJets = metadata["nJets"]
nTrks = metadata["nTrks"]
nFeatures = metadata["nFeatures"]
nDim = metadata["nDim"]
else:
raise ValueError(
f"input file {train_config.train_file} is neither a .h5 file nor "
"a directory with TF Record Files. You should check this."
)
logger.info(f"nJets: {nJets}, nTrks: {nTrks}")
# Init CADS model
cads, epochs = Cads_model(train_config=train_config, input_shape=(nTrks, nFeatures))
cads, epochs = Cads_model(
train_config=train_config,
input_shape=(metadata["n_trks"], metadata["n_trk_features"]),
)
# Check if epochs is set via argparser or not
if args.epochs is None:
......@@ -282,12 +239,12 @@ def Cads(args, train_config, preprocess_config):
# Load validation data for callback
val_data_dict = None
if n_jets > 0:
if n_jets_val > 0:
if NN_structure["N_Conditions"] is None:
val_data_dict = utt.load_validation_data_dips(
train_config=train_config,
preprocess_config=preprocess_config,
nJets=n_jets,
nJets=n_jets_val,
convert_to_tensor=True,
)
......@@ -303,7 +260,7 @@ def Cads(args, train_config, preprocess_config):
val_data_dict = utt.load_validation_data_umami(
train_config=train_config,
preprocess_config=preprocess_config,
nJets=n_jets,
nJets=n_jets_val,
convert_to_tensor=True,
jets_var_list=["absEta_btagJes", "pt_btagJes"],
)
......@@ -330,7 +287,7 @@ def Cads(args, train_config, preprocess_config):
frac_dict=eval_params["frac_values"],
dict_file_name=utt.get_validation_dict_name(
WP=WP,
n_jets=n_jets,
n_jets=n_jets_val,
dir_name=train_config.model_name,
),
)
......@@ -345,7 +302,9 @@ def Cads(args, train_config, preprocess_config):
# TODO: Add a representative validation dataset for training (shown in stdout)
# validation_data=validation_data,
callbacks=callbacks,
steps_per_epoch=nJets / NN_structure["batch_size"],
steps_per_epoch=int(NN_structure["nJets_train"]) / NN_structure["batch_size"]
if "nJets_train" in NN_structure and NN_structure["nJets_train"] is not None
else metadata["n_jets"] / NN_structure["batch_size"],
use_multiprocessing=True,
workers=8,
)
......
......@@ -126,7 +126,12 @@ def TrainLargeFile(args, train_config, preprocess_config):
preprocess_config : object
preprocessing configuration
Raises
------
ValueError
If input is neither a h5 nor a directory.
"""
# Load NN Structure and training parameter from file
NN_structure = train_config.NN_structure
val_params = train_config.Validation_metrics_settings
......@@ -137,7 +142,7 @@ def TrainLargeFile(args, train_config, preprocess_config):
# Get needed variable from the train config
WP = float(val_params["WP"]) if "WP" in val_params else float(eval_params["WP"])
n_jets = (
n_jets_val = (
int(val_params["n_jets"])
if "n_jets" in val_params
else int(eval_params["n_jets"])
......@@ -165,56 +170,66 @@ def TrainLargeFile(args, train_config, preprocess_config):
logger.info(f"Repeating the following variables in the last layer {repeat_end}")
feature_connect_indices = utt.get_jet_feature_position(repeat_end, variables)
# Get the shapes for training
with h5py.File(train_config.train_file, "r") as f:
nJets, nFeatures = f["X_train"].shape
nJets, nDim = f["Y_train"].shape
if excluded_var is not None:
nFeatures -= len(excluded_var)
if NN_structure["nJets_train"] is not None:
nJets = int(NN_structure["nJets_train"])
# Print how many jets are used
logger.info(f"Number of Jets used for training: {nJets}")
# pass correct tensor types/shapes depending on using sample weights
if NN_structure["use_sample_weights"]:
tensor_types = (tf.float32, tf.float32, tf.float32)
tensor_shapes = (
tf.TensorShape([None, nFeatures]),
tf.TensorShape([None, nDim]),
tf.TensorShape([None]),
if ".h5" in train_config.train_file:
# Init a metadata dict
metadata = {}
# Get the shapes for training
with h5py.File(train_config.train_file, "r") as f:
metadata["n_jets"], metadata["n_dim"] = f["Y_train"].shape
_, metadata["n_jet_features"] = f["X_train"].shape
if NN_structure["use_sample_weights"]:
tensor_types = (tf.float32, tf.float32, tf.float32)
tensor_shapes = (
tf.TensorShape([None, metadata["n_jet_features"]]),
tf.TensorShape([None, metadata["n_dim"]]),
tf.TensorShape([None]),
)
else:
tensor_types = (tf.float32, tf.float32)
tensor_shapes = (
tf.TensorShape([None, metadata["n_jet_features"]]),
tf.TensorShape([None, metadata["n_dim"]]),
)
# Build train_datasets for training
train_dataset = (
tf.data.Dataset.from_generator(
utf.dl1_generator(
train_file_path=train_config.train_file,
X_Name="X_train",
Y_Name="Y_train",
n_jets=int(NN_structure["nJets_train"])
if "nJets_train" in NN_structure
and NN_structure["nJets_train"] is not None
else metadata["n_jets"],
batch_size=NN_structure["batch_size"],
excluded_var=excluded_var,
sample_weights=NN_structure["use_sample_weights"],
),
tensor_types,
tensor_shapes,
)
.repeat()
.prefetch(tf.data.AUTOTUNE)
)
elif os.path.isdir(train_config.train_file):
train_dataset, metadata = utf.load_tfrecords_train_dataset(
train_config=train_config
)
else:
tensor_types = (tf.float32, tf.float32)
tensor_shapes = (
tf.TensorShape([None, nFeatures]),
tf.TensorShape([None, nDim]),
raise ValueError(
f"input file {train_config.train_file} is neither a .h5 file nor a"
" directory with TF Record Files. You should check this."
)
# Build train_datasets for training
train_dataset = (
tf.data.Dataset.from_generator(
utf.dl1_generator(
train_file_path=train_config.train_file,
X_Name="X_train",
Y_Name="Y_train",
n_jets=nJets,
batch_size=NN_structure["batch_size"],
excluded_var=excluded_var,
sample_weights=NN_structure["use_sample_weights"],
),
tensor_types,
tensor_shapes,
)
.repeat()
.prefetch(3)
)
# Load model and epochs
model, epochs = DL1_model(
train_config=train_config,
input_shape=(nFeatures,),
input_shape=(metadata["n_jet_features"],),
feature_connect_indices=feature_connect_indices,
)
......@@ -248,11 +263,11 @@ def TrainLargeFile(args, train_config, preprocess_config):
# Load validation data for callback
val_data_dict = None
if n_jets > 0:
if n_jets_val > 0:
val_data_dict = utt.load_validation_data_dl1(
train_config=train_config,
preprocess_config=preprocess_config,
nJets=n_jets,
nJets=n_jets_val,
)
# Set my_callback as callback. Writes history information
......@@ -266,7 +281,7 @@ def TrainLargeFile(args, train_config, preprocess_config):
frac_dict=eval_params["frac_values"],
dict_file_name=utt.get_validation_dict_name(