Commit de74c905 authored by toschroe
parents 6cfbfa24 555457e5
......@@ -345,12 +345,18 @@ Also the `outfile_name` is defined (which is also included in `parameters`). The
To run the sample preparation for the ttbar b-jet sample `training_ttbar_bjets`, which has been defined in the config file in the `preparation: samples:` block, execute:
```
```bash
preprocessing.py --config <path to config file> --sample training_ttbar_bjets --prepare
```
As a result, an output file will be written to the output path you specified via `sample_path`. The file will have the name defined in the `preparation` block.
If you want to prepare all the samples defined in the `preparation: samples:` block, just leave out the `--sample` option. Also, if you want to use tracks, you need to add the flag `--tracks`. An example command would look like this:
```bash
preprocessing.py --config <path to config file> --prepare --tracks
```
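The same all-samples preparation can also be scripted explicitly by looping over the `preparation: samples:` block yourself, which can be handy for batch submission. A minimal sketch (assuming PyYAML is available, `preprocessing.py` is on your `PATH`, and the config path below is a placeholder):
```python
# Sketch only: run the preparation once per sample listed under
# preparation: samples:, equivalent in effect to one call without --sample.
import subprocess

import yaml  # assumption: PyYAML is available

CONFIG = "examples/PFlow-Preprocessing.yaml"  # adapt to your config path

with open(CONFIG) as f:
    config = yaml.safe_load(f)

for sample in config["preparation"]["samples"]:
    # One preparation call per sample name defined in the config
    subprocess.run(
        ["preprocessing.py", "--config", CONFIG, "--sample", sample, "--prepare"],
        check=True,
    )
```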
### Running the preprocessing
After the preparation of the samples, the next step is the actual preprocessing for the training, which is also done with the [`preprocessing.py`](https://gitlab.cern.ch/atlas-flavor-tagging-tools/algorithms/umami/-/blob/master/preprocessing.py) script. Again, the preprocessing configuration is defined in the config file [PFlow-Preprocessing.yaml](https://gitlab.cern.ch/atlas-flavor-tagging-tools/algorithms/umami/-/blob/master/examples/PFlow-Preprocessing.yaml), which you need to adapt to your needs.
......@@ -360,31 +366,31 @@ The steps defined in the following segment are only performed on the training sa
1. Running the resampling:
```bash
preprocessing.py -c examples/PFlow-Preprocessing.yaml --resampling
preprocessing.py --config <path to config file> --resampling
```
If you also want to use the tracks of the jets, you need to pass the extra flag `--tracks`. Track information is not needed for DL1r, but it is required for DIPS and Umami. If you want to train one of those, process the track information as well by setting the `--tracks` flag:
```bash
preprocessing.py -c examples/PFlow-Preprocessing.yaml --resampling --tracks
preprocessing.py --config <path to config file> --resampling --tracks
```
2. Retrieving scaling and shifting factors:
```bash
preprocessing.py -c examples/PFlow-Preprocessing.yaml --scaling --tracks
preprocessing.py --config <path to config file> --scaling --tracks
```
3. Applying shifting and scaling factors:
```bash
preprocessing.py -c examples/PFlow-Preprocessing.yaml --apply_scales --tracks
preprocessing.py --config <path to config file> --apply_scales --tracks
```
4. Writing the samples to disk in the correct format for training (the full chain is sketched after this list):
```bash
preprocessing.py -c examples/PFlow-Preprocessing.yaml --write --tracks
preprocessing.py --config <path to config file> --write --tracks
```
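Since each stage is a separate invocation, the whole chain can be scripted. A minimal sketch that runs the four stages above in order (assuming `preprocessing.py` is on your `PATH`, that tracks are needed, and using the example config path as a placeholder):
```python
# Sketch only: run the preprocessing stages in the order described above.
import subprocess

CONFIG = "examples/PFlow-Preprocessing.yaml"  # adapt to your config path

for stage in ("--resampling", "--scaling", "--apply_scales", "--write"):
    # --tracks is only needed for track-based taggers such as DIPS and Umami
    subprocess.run(
        ["preprocessing.py", "--config", CONFIG, stage, "--tracks"],
        check=True,
    )
```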
## Full example
......
......@@ -98,32 +98,36 @@ Eval_parameters_validation:
}
# Cuts which are applied to the different datasets used for evaluation
variable_cuts: {
"ttbar_r21": {
"pt_btagJes": {
"operator": "<=",
"condition": 250000,
}
},
"ttbar_r22": {
"pt_btagJes": {
"operator": "<=",
"condition": 250000,
}
},
"zpext_r21": {
"pt_btagJes": {
"operator": ">",
"condition": 250000,
}
},
"zpext_r22": {
"pt_btagJes": {
"operator": ">",
"condition": 250000,
}
},
}
variable_cuts:
validation_file:
pt_btagJes:
operator: "<="
condition: 250000
ttbar_r21:
pt_btagJes:
operator: "<="
condition: 250000
ttbar_r22:
pt_btagJes:
operator: "<="
condition: 250000
add_validation_file:
pt_btagJes:
operator: ">"
condition: 250000
zpext_r21:
pt_btagJes:
operator: ">"
condition: 250000
zpext_r22:
pt_btagJes:
operator: ">"
condition: 250000
# A list to add available variables to the evaluation files
add_variables_eval: ["actualInteractionsPerCrossing"]
......
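For orientation, each entry in the new per-file `variable_cuts` layout maps a variable to an `operator`/`condition` pair. A minimal sketch of how such a cut could be turned into a selection mask (illustrative only; `apply_cuts` is a hypothetical helper, not the umami implementation):
```python
# Sketch only: translate a variable_cuts entry into a boolean selection mask.
import operator

import numpy as np

# Mapping from the operator strings used in the config to comparison functions
OPERATORS = {
    "<=": operator.le,
    "<": operator.lt,
    ">=": operator.ge,
    ">": operator.gt,
    "==": operator.eq,
}


def apply_cuts(jets: np.ndarray, cuts: dict) -> np.ndarray:
    """Return a mask selecting the jets that pass all configured cuts."""
    mask = np.ones(len(jets), dtype=bool)
    for variable, cut in cuts.items():
        mask &= OPERATORS[cut["operator"]](jets[variable], cut["condition"])
    return mask


# Example with the cut from above: keep jets with pt_btagJes <= 250000
jets = np.array([(150_000.0,), (300_000.0,)], dtype=[("pt_btagJes", "f4")])
print(apply_cuts(jets, {"pt_btagJes": {"operator": "<=", "condition": 250000}}))
# -> [ True False]
```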
......@@ -97,32 +97,36 @@ Eval_parameters_validation:
}
# Cuts which are applied to the different datasets used for evaluation
variable_cuts: {
"ttbar_r21": {
"pt_btagJes": {
"operator": "<=",
"condition": 250000,
}
},
"ttbar_r22": {
"pt_btagJes": {
"operator": "<=",
"condition": 250000,
}
},
"zpext_r21": {
"pt_btagJes": {
"operator": ">",
"condition": 250000,
}
},
"zpext_r22": {
"pt_btagJes": {
"operator": ">",
"condition": 250000,
}
},
}
variable_cuts:
validation_file:
pt_btagJes:
operator: "<="
condition: 250000
ttbar_r21:
pt_btagJes:
operator: "<="
condition: 250000
ttbar_r22:
pt_btagJes:
operator: "<="
condition: 250000
add_validation_file:
pt_btagJes:
operator: ">"
condition: 250000
zpext_r21:
pt_btagJes:
operator: ">"
condition: 250000
zpext_r22:
pt_btagJes:
operator: ">"
condition: 250000
# Working point used in the evaluation
WP: 0.77
......
......@@ -100,20 +100,26 @@ Eval_parameters_validation:
}
# Cuts which are applied to the different datasets used for evaluation
variable_cuts: {
"ttbar": {
"pt_btagJes": {
"operator": "<=",
"condition": 250000,
}
},
"zpext": {
"pt_btagJes": {
"operator": ">",
"condition": 250000,
}
},
}
variable_cuts:
validation_file:
pt_btagJes:
operator: "<="
condition: 250000
ttbar:
pt_btagJes:
operator: "<="
condition: 250000
add_validation_file:
pt_btagJes:
operator: ">"
condition: 250000
zpext:
pt_btagJes:
operator: ">"
condition: 250000
# Working point used in the evaluation
WP: 0.77
......
......@@ -140,7 +140,7 @@ def EvaluateModel(
# Load the test jets
X_test, X_test_trk, Y_test = utt.GetTestFile(
file=test_file,
input_file=test_file,
var_dict=train_config.var_dict,
preprocess_config=preprocess_config,
class_labels=class_labels,
......
......@@ -51,7 +51,7 @@ def GetParser():
"-t",
"--tagger",
type=str,
default=None,
required=True,
help="Model type which is used. You can either use 'dips', 'dl1' or 'umami'.",
)
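Making `--tagger` required means argparse itself aborts with a usage error when the flag is missing, before the checks in `main()` run. A small standalone illustration of that behaviour (not the umami parser itself):
```python
# Sketch only: required=True lets argparse reject a missing --tagger on its own.
import argparse

parser = argparse.ArgumentParser()
parser.add_argument(
    "-t",
    "--tagger",
    type=str,
    required=True,
    help="Model type which is used. You can either use 'dips', 'dl1' or 'umami'.",
)

# Parsing without --tagger now exits with
# "error: the following arguments are required: -t/--tagger"
args = parser.parse_args(["--tagger", "dips"])
print(args.tagger)  # -> dips
```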
......@@ -67,29 +67,31 @@ def main(args, train_config, preprocess_config):
else:
nJets = args.nJets
if args.tagger is None:
raise ValueError("You need to give a model type with -t or --tagger")
if args.dict:
output_file_name = args.dict
parameters = utt.get_parameters_from_validation_dict_name(
output_file_name
)
beff = parameters["WP"]
else:
output_file_name = utt.calc_validation_metrics(
train_config=train_config,
preprocess_config=preprocess_config,
target_beff=args.beff
if args.beff
else train_config.Eval_parameters_validation["WP"],
nJets=nJets,
tagger=args.tagger,
)
beff = train_config.Eval_parameters_validation["WP"]
# Check if the tagger given is supported
if args.tagger in ["umami", "dl1", "dips"]:
# If dict is given, the re-calculation is skipped
if args.dict:
output_file_name = args.dict
parameters = utt.get_parameters_from_validation_dict_name(
output_file_name
)
beff = parameters["WP"]
else:
# Calculate the validation metrics and save them
output_file_name = utt.calc_validation_metrics(
train_config=train_config,
preprocess_config=preprocess_config,
target_beff=args.beff
if args.beff
else train_config.Eval_parameters_validation["WP"],
nJets=nJets,
tagger=args.tagger,
)
beff = train_config.Eval_parameters_validation["WP"]
# Run the Performance check with the values from the dict and plot them
RunPerformanceCheck(
train_config=train_config,
tagger=args.tagger,
......@@ -109,7 +111,7 @@ def main(args, train_config, preprocess_config):
else:
raise ValueError(
"You need to define a model type. You can either use 'dips', 'dl1' or 'umami'."
"You need to define a model type! You can either use 'dips', 'dl1' or 'umami'."
)
......
import argparse
import umami.preprocessing_tools as upt
from umami.configuration import logger
def GetParser():
......@@ -92,23 +93,65 @@ if __name__ == "__main__":
args = GetParser()
config = upt.Configuration(args.config_file)
# Check for preparation
if args.prepare:
preparation_tool = upt.PrepareSamples(args, config)
preparation_tool.Run()
if args.resampling:
# Check if one specific sample is given
if args.sample:
preparation_tool = upt.PrepareSamples(args, config)
preparation_tool.Run()
# If no specific sample is given
else:
logger.warning(
"No --sample was selected, using all in config file! This can take a lot of time!"
)
# Iterate over the samples defined in the config
for iter_sample in config.preparation["samples"].keys():
# Set the argument in args to this sample and run prepare
args.sample = iter_sample
preparation_tool = upt.PrepareSamples(args, config)
preparation_tool.Run()
# Check for resampling
elif args.resampling:
# Check the method which should be used for resampling
if config.sampling["method"] == "count":
sampler = upt.UnderSampling(config)
elif config.sampling["method"] == "pdf":
sampler = upt.PDFSampling(config)
if config.sampling["method"] == "probability_ratio":
elif config.sampling["method"] == "probability_ratio":
sampler = upt.ProbabilityRatioUnderSampling(config)
else:
raise ValueError(
f'{config.sampling["method"]} as sampling method is not supported!'
)
# Run the sampling with the selected method
sampler.Run()
if args.scaling:
# Calculate the scale dicts of the previous resampled files
elif args.scaling:
Scaling = upt.Scaling(config)
Scaling.GetScaleDict(chunkSize=args.chunk_size)
if args.apply_scales:
# Apply scaling of the previous calculated scale dicts
elif args.apply_scales:
Scaling = upt.Scaling(config)
Scaling.ApplyScales()
if args.write:
# Check for final writing to disk in train format
elif args.write:
Writer = upt.TrainSampleWriter(config)
Writer.WriteTrainSample()
# Give error when nothing is used
else:
raise ValueError(
"You need to define which part of the preprocessing you want to run!"
)
......@@ -133,6 +133,7 @@ class PrepareSamples:
rng = np.random.default_rng(seed=42)
rng.shuffle(jets)
if self.save_tracks:
rng = np.random.default_rng(seed=42)
rng.shuffle(tracks)
if self.create_file:
......
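The added re-seeding is what keeps jets and tracks aligned: a fresh `default_rng` with the same seed produces the identical permutation, so row *i* of the shuffled tracks still belongs to row *i* of the shuffled jets. A small standalone check of that property (illustrative only):
```python
# Sketch only: two generators with the same seed shuffle arrays identically.
import numpy as np

jets = np.arange(10)
tracks = np.arange(10) * 100  # stand-in for the per-jet track records

rng = np.random.default_rng(seed=42)
rng.shuffle(jets)

# Re-create the generator with the same seed before shuffling the tracks
rng = np.random.default_rng(seed=42)
rng.shuffle(tracks)

# The permutations match, so jets and tracks stay row-aligned
assert np.array_equal(tracks, jets * 100)
```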
......@@ -440,8 +440,10 @@ class Resampling(object):
pbar.update(jets.size)
rng = np.random.default_rng(seed=self.rnd_seed)
rng.shuffle(jets)
rng = np.random.default_rng(seed=42)
rng.shuffle(labels)
if self.save_tracks:
rng = np.random.default_rng(seed=42)
rng.shuffle(tracks)
if create_file:
......@@ -546,23 +548,25 @@ class PDFSampling(Resampling):
in_file = GetPreparationSamplePath(preparation_sample)
samples = {}
with h5py.File(in_file, "r") as f:
Njets_initial = None
Njets_initial = len(f["jets"])
if (
"custom_njets_initial" in self.options
and self.options["custom_njets_initial"] is not None
and sample in list(self.options["custom_njets_initial"])
):
Njets_initial = int(
self.options["custom_njets_initial"][sample]
)
else:
Njets_initial = len(f["jets"])
Njets_asked = int(self.options["custom_njets_initial"][sample])
if Njets_initial <= Njets_asked:
logger.warning(
f"For sample {sample}, demanding more initial jets ({Njets_asked}) than available ({Njets_initial}). Forcing to available."
)
else:
Njets_initial = Njets_asked
start_ind = 0
end_ind = int(start_ind + chunk_size)
# create indices and then shuffle those to be used in the loop below
tupled_indices = []
while end_ind <= Njets_initial or start_ind == 0:
while end_ind < Njets_initial or start_ind == 0:
if end_ind + chunk_size > Njets_initial:
# Missing less than a chunk, joining to last chunk
end_ind = Njets_initial
......@@ -618,15 +622,20 @@ class PDFSampling(Resampling):
in_file = GetPreparationSamplePath(preparation_sample)
samples = {}
with h5py.File(in_file, "r") as f:
Njets_initial = None
Njets_initial = len(f["jets"])
if (
"custom_njets_initial" in self.options
and self.options["custom_njets_initial"] is not None
and sample in list(self.options["custom_njets_initial"])
):
Njets_initial = int(
self.options["custom_njets_initial"][sample]
)
Njets_asked = int(self.options["custom_njets_initial"][sample])
if Njets_initial <= Njets_asked:
logger.warning(
f"For sample {sample}, demanding more initial jets ({Njets_asked}) than available ({Njets_initial}). Forcing to available."
)
else:
Njets_initial = Njets_asked
jets_x = np.asarray(f["jets"][self.var_x])[:Njets_initial]
jets_y = np.asarray(f["jets"][self.var_y])[:Njets_initial]
logger.info(
......@@ -1153,9 +1162,8 @@ class PDFSampling(Resampling):
to_sample = self.number_to_sample[sample_name] - sampled_jets
weights = weights / np.sum(weights)
selected_indices = self.Resample_chunk(
weights, size=round(to_sample)
)
selected_ind = self.Resample_chunk(weights, size=round(to_sample))
selected_indices = np.sort(selected_ind).astype(int)
sampled_jets += len(selected_indices)
pbar.update(selected_indices.size)
if create_file:
......@@ -1234,8 +1242,10 @@ class PDFSampling(Resampling):
pbar.update(jets.size)
rng = np.random.default_rng(seed=42)
rng.shuffle(jets)
rng = np.random.default_rng(seed=42)
rng.shuffle(labels)
if self.save_tracks:
rng = np.random.default_rng(seed=42)
rng.shuffle(tracks)
if create_file:
......@@ -1350,8 +1360,10 @@ class PDFSampling(Resampling):
pbar.update(jets.size)
rng = np.random.default_rng(seed=42)
rng.shuffle(jets)
rng = np.random.default_rng(seed=42)
rng.shuffle(labels)
if self.save_tracks:
rng = np.random.default_rng(seed=42)
rng.shuffle(tracks)
if create_file:
......@@ -2020,16 +2032,23 @@ class UnderSampling(Resampling):
f"Loading sampling variables from {preparation_sample_path}"
)
with h5py.File(preparation_sample_path, "r") as f:
nJets_initial = None
nJets_initial = len(f["jets"])
if (
"custom_njets_initial" in self.options
and self.options["custom_njets_initial"] is not None
and sample
in list(self.options["custom_njets_initial"])
):
nJets_initial = int(
nJets_asked = int(
self.options["custom_njets_initial"][sample]
)
if nJets_initial <= nJets_asked:
logger.warning(
f"For sample {sample}, demanding more initial jets ({nJets_asked}) than available ({nJets_initial}). Forcing to available."
)
else:
nJets_initial = nJets_asked
jets_x = np.asarray(f["jets"][self.var_x])[:nJets_initial]
jets_y = np.asarray(f["jets"][self.var_y])[:nJets_initial]
logger.info(
......
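The same guard appears three times in this commit (twice in `PDFSampling`, once in `UnderSampling`); functionally it clamps the requested number of initial jets to what the file actually contains and warns when the request is larger. A condensed, illustrative version of that logic (not the code used in the classes above):
```python
# Sketch only: clamp a requested number of initial jets to the number available.
import logging

logger = logging.getLogger(__name__)


def clamp_njets(n_available: int, n_asked: int, sample: str) -> int:
    """Return the number of jets to use, never exceeding what is available."""
    if n_available <= n_asked:
        logger.warning(
            f"For sample {sample}, demanding more initial jets ({n_asked}) "
            f"than available ({n_available}). Forcing to available."
        )
        return n_available
    return n_asked


print(clamp_njets(1_000_000, 2_000_000, "training_ttbar_bjets"))  # -> 1000000
```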
......@@ -28,12 +28,49 @@ from umami.train_tools.NN_tools import (
get_jet_feature_indices,
get_parameters_from_validation_dict_name,
get_validation_dict_name,
get_variable_cuts,
load_validation_data_dips,
load_validation_data_umami,
setup_output_directory,
)
class get_variable_cuts_TestCase(unittest.TestCase):
def setUp(self):
self.Eval_parameters = {
"variable_cuts": {
"validation_file": {
"pt_btagJes": {
"operator": "<=",
"condition": 250000,
}
}
}
}
self.file = "validation_file"
self.control_dict = {
"pt_btagJes": {
"operator": "<=",
"condition": 250000,
}
}
def test_get_variable_cuts(self):
# Get dict
cut_dict = get_variable_cuts(self.Eval_parameters, self.file)
# Check dict
self.assertEqual(cut_dict, self.control_dict)
def test_get_variable_cuts_None(self):
# Get dict
cut_dict = get_variable_cuts(self.Eval_parameters, "error")
# Check dict
self.assertEqual(cut_dict, None)
class get_epoch_from_string_TestCase(unittest.TestCase):
def setUp(self):
self.test_string = "model_epoch11.h5"
......@@ -733,6 +770,7 @@ class GetSamples_TestCase(unittest.TestCase):
"""
def setUp(self):
self.Eval_parameters_validation = {}
self.NN_structure = {"class_labels": ["bjets", "cjets", "ujets"]}
self.preparation = {"class_labels": ["bjets", "cjets", "ujets"]}
self.test_dir = tempfile.TemporaryDirectory()
......@@ -875,7 +913,7 @@ class GetSamples_TestCase(unittest.TestCase):
def test_GetTestFile(self):
(X_valid, X_valid_trk, Y_valid,) = GetTestFile(
file=self.validation_file,
input_file=self.validation_file,
var_dict=self.var_dict,
preprocess_config=self,
class_labels=self.class_labels,
......@@ -890,6 +928,22 @@ class GetSamples_TestCase(unittest.TestCase):
self.assertEqual(Y_valid.shape, (len(Y_valid), 3))
def test_load_validation_data_umami(self):
self.Eval_parameters_validation = {
"variable_cuts": {
"validation_file": {
"pt_btagJes": {
"operator": "<=",
"condition": 50_000,
}
},
"add_validation_file": {
"pt_btagJes": {
"operator": ">",
"condition": 50_000,
}
},