Commit ab5d7627 authored by Manuel Guth's avatar Manuel Guth
Browse files

Merge branch 'alfroch-tracks-flag' into 'master'

Removing --tracks flag in preprocessing

See merge request atlas-flavor-tagging-tools/algorithms/umami!267
parents bb781745 27eb80e5
......@@ -152,10 +152,10 @@ info_text_event = f"We are using the {jet_collection} jet collection and have {n
In Python 3 a dedicated integer division was introduced.
```python
# standard division -> returns by default a float
# standard division -> returns by default a float (no rounding)
nEvents = nJets / 4
# integer division -> returns an integer
# integer division -> floors the result to an integer
nEvents = nJets // 4
```
......
......@@ -328,6 +328,7 @@ sampling:
# number of training jets
njets: 25e6
save_tracks: True
tracks_name: "tracks"
# this stores the indices per sample into an intermediate file
intermediate_index_file: indices.h5
```
......@@ -341,7 +342,7 @@ In `sampling`, we can define the method which is used in the preprocessing for r
The `options` are some options for the different resampling methods. You need to define the sampling variables which are used for resampling. For example, if you want to resample in `pt_btagJes` and `absEta_btagJes` bins, you just define them with their respective bins.
Another thing you need to define are the `samples` which are to be resampled. You need to define them for `ttbar` and `zprime`. The samples defined in here are the ones we prepared in the step above. To ensure a smooth hybrid sample of ttbar and zprime, we need to define some empirically derived values for the ttbar samples in `custom_njets_initial`.
`fractions` gives us the fractions of ttbar and zprime in the final training sample. These values need to add up to 1! The rest of the variables are pretty self-explanatory.
`fractions` gives us the fractions of ttbar and zprime in the final training sample. These values need to add up to 1! The `save_tracks` and the `tracks_name` options define whether and how tracks are used. `save_tracks` is a bool while `tracks_name` is a string. The latter is the name under which the tracks are stored in the .h5 files coming from the dumper. After the preparation stage, they will have the name `tracks`. The rest of the variables are pretty self-explanatory.
If you want to use the PDF sampling, have a look at the example config [PFlow-Preprocessing-taus.yaml](https://gitlab.cern.ch/atlas-flavor-tagging-tools/algorithms/umami/-/blob/master/examples/PFlow-Preprocessing-taus.yaml).
......@@ -398,28 +399,24 @@ The steps defined in the following segment are only performed on the training sa
preprocessing.py --config <path to config file> --resampling
```
If you want to also use the tracks of the jets, you need to give an extra flag `--tracks`. Track information are not needed for the DL1r but for DIPS and Umami. If you want to train one of those, you need to process the track information too with setting the `--tracks` flag:
```bash
preprocessing.py --config <path to config file> --resampling --tracks
```
If you want to also use the tracks of the jets, you need to set the option `save_tracks` in the preprocessing config to `True`. If the tracks have a different name than `"tracks"` in the .h5 files coming from the dumper, you can also change `tracks_name` to your needs. Track information is not needed for DL1r, but it is for DIPS and Umami.
2. Retrieving scaling and shifting factors:
```bash
preprocessing.py --config <path to config file> --scaling --tracks
preprocessing.py --config <path to config file> --scaling
```
3. Applying shifting and scaling factors
```bash
preprocessing.py --config <path to config file> --apply_scales --tracks
preprocessing.py --config <path to config file> --apply_scales
```
4. Writing the samples to disk in the correct format for training.
```bash
preprocessing.py --config <path to config file> --write --tracks
preprocessing.py --config <path to config file> --write
```
## Full example
......
......@@ -230,6 +230,7 @@ sampling:
# If set to -1: max out to target numbers (limited by fractions ratio)
njets: 25e6
save_tracks: True
tracks_name: "tracks"
# this stores the indices per sample into an intermediate file
intermediate_index_file: indices.h5
# outputfiles are split into 5 -> needs to be implemented
......
......@@ -233,6 +233,7 @@ sampling:
# If set to -1: max out to target numbers (limited by fractions ratio)
njets: -1
save_tracks: False
tracks_name:
# this stores the indices per sample into an intermediate file
intermediate_index_file: indices.h5
# outputfiles are split into 5 -> needs to be implemented
......
......@@ -233,6 +233,7 @@ sampling:
# If set to -1: max out to target numbers (limited by fractions ratio)
njets: 25e6
save_tracks: True
tracks_name: "tracks"
# this stores the indices per sample into an intermediate file
intermediate_index_file: indices.h5
# outputfiles are split into 5 -> needs to be implemented
......
......@@ -18,17 +18,6 @@ def GetParser():
help="Enter the name of the config file to create the"
" hybrid sample.",
)
parser.add_argument(
"-t",
"--tracks",
action="store_true",
help="Stores also track information.",
)
parser.add_argument(
"--tracks_name",
default="tracks",
help="Enter the name of the tracks dataset.",
)
parser.add_argument(
"--sample",
default=None,
......
......@@ -70,8 +70,20 @@ class PrepareSamples:
)
self.cuts = cuts + category_cuts
self.n_jets_to_get = int(sample.get("n_jets", 0))
self.save_tracks = args.tracks
self.tracks_name = args.tracks_name
# Check if tracks are used
self.save_tracks = self.config.sampling["options"]["save_tracks"]
# Check for tracks name. If not there, use default
if (
"tracks_name" in self.config.sampling["options"]
and self.config.sampling["options"]["tracks_name"] is not None
):
self.tracks_name = self.config.sampling["options"]["tracks_name"]
else:
self.tracks_name = "tracks"
output_path = sample.get("f_output")["path"]
self.output_file = os.path.join(
output_path, sample.get("f_output")["file"]
......
......@@ -45,30 +45,17 @@ def runPreprocessing(config, tagger):
"training_zprime_cjets",
"training_zprime_ujets",
]:
if tagger == "dl1r":
run_prepare = run(
[
"preprocessing.py",
"-c",
f"{config}",
"--sample",
f"{sample}",
"--prepare",
]
)
run_prepare = run(
[
"preprocessing.py",
"-c",
f"{config}",
"--sample",
f"{sample}",
"--prepare",
]
)
else:
run_prepare = run(
[
"preprocessing.py",
"-c",
f"{config}",
"--sample",
f"{sample}",
"--tracks",
"--prepare",
]
)
try:
run_prepare.check_returncode()
except CalledProcessError:
......@@ -79,26 +66,15 @@ def runPreprocessing(config, tagger):
run_prepare
logger.info("Test: running the resampling...")
if tagger == "dl1r":
run_resampling = run(
[
"preprocessing.py",
"-c",
f"{config}",
"--resampling",
]
)
run_resampling = run(
[
"preprocessing.py",
"-c",
f"{config}",
"--resampling",
]
)
else:
run_resampling = run(
[
"preprocessing.py",
"-c",
f"{config}",
"--tracks",
"--resampling",
]
)
try:
run_resampling.check_returncode()
except CalledProcessError:
......@@ -109,31 +85,16 @@ def runPreprocessing(config, tagger):
run_resampling
logger.info("Test: retrieving scaling and shifting factors...")
if tagger == "dl1r":
run_scaling = run(
[
"preprocessing.py",
"-c",
f"{config}",
"--scaling",
"--chunk_size",
"1000",
]
)
else:
run_scaling = run(
[
"preprocessing.py",
"-c",
f"{config}",
"--scaling",
"--chunk_size",
"1000",
"--tracks",
]
)
run_scaling = run(
[
"preprocessing.py",
"-c",
f"{config}",
"--scaling",
"--chunk_size",
"1000",
]
)
try:
run_scaling.check_returncode()
except CalledProcessError:
......@@ -144,26 +105,14 @@ def runPreprocessing(config, tagger):
run_scaling
logger.info("Test: applying shifting and scaling factors...")
if tagger == "dl1r":
run_apply_scales = run(
[
"preprocessing.py",
"-c",
f"{config}",
"--apply_scales",
]
)
else:
run_apply_scales = run(
[
"preprocessing.py",
"-c",
f"{config}",
"--apply_scales",
"--tracks",
]
)
run_apply_scales = run(
[
"preprocessing.py",
"-c",
f"{config}",
"--apply_scales",
]
)
try:
run_apply_scales.check_returncode()
except CalledProcessError:
......@@ -176,26 +125,15 @@ def runPreprocessing(config, tagger):
logger.info(
"Test: shuffling the samples and writing the samples to disk..."
)
if tagger == "dl1r":
run_write = run(
[
"preprocessing.py",
"-c",
f"{config}",
"--write",
]
)
run_write = run(
[
"preprocessing.py",
"-c",
f"{config}",
"--write",
]
)
else:
run_write = run(
[
"preprocessing.py",
"-c",
f"{config}",
"--write",
"--tracks",
]
)
try:
run_write.check_returncode()
except CalledProcessError:
......
......@@ -45,30 +45,17 @@ def runPreprocessing(config, tagger):
"training_zprime_cjets",
"training_zprime_ujets",
]:
if tagger == "dl1r":
run_prepare = run(
[
"preprocessing.py",
"-c",
f"{config}",
"--sample",
f"{sample}",
"--prepare",
]
)
run_prepare = run(
[
"preprocessing.py",
"-c",
f"{config}",
"--sample",
f"{sample}",
"--prepare",
]
)
else:
run_prepare = run(
[
"preprocessing.py",
"-c",
f"{config}",
"--sample",
f"{sample}",
"--tracks",
"--prepare",
]
)
try:
run_prepare.check_returncode()
except CalledProcessError:
......@@ -79,26 +66,15 @@ def runPreprocessing(config, tagger):
run_prepare
logger.info("Test: running the resampling...")
if tagger == "dl1r":
run_resampling = run(
[
"preprocessing.py",
"-c",
f"{config}",
"--resampling",
]
)
run_resampling = run(
[
"preprocessing.py",
"-c",
f"{config}",
"--resampling",
]
)
else:
run_resampling = run(
[
"preprocessing.py",
"-c",
f"{config}",
"--tracks",
"--resampling",
]
)
try:
run_resampling.check_returncode()
except CalledProcessError:
......@@ -109,31 +85,17 @@ def runPreprocessing(config, tagger):
run_resampling
logger.info("Test: retrieving scaling and shifting factors...")
run_scaling = run(
[
"preprocessing.py",
"-c",
f"{config}",
"--scaling",
"--chunk_size",
"1000",
]
)
if tagger == "dl1r":
run_scaling = run(
[
"preprocessing.py",
"-c",
f"{config}",
"--scaling",
"--chunk_size",
"1000",
]
)
else:
run_scaling = run(
[
"preprocessing.py",
"-c",
f"{config}",
"--scaling",
"--chunk_size",
"1000",
"--tracks",
]
)
try:
run_scaling.check_returncode()
except CalledProcessError:
......@@ -144,26 +106,15 @@ def runPreprocessing(config, tagger):
run_scaling
logger.info("Test: applying shifting and scaling factors...")
if tagger == "dl1r":
run_apply_scales = run(
[
"preprocessing.py",
"-c",
f"{config}",
"--apply_scales",
]
)
run_apply_scales = run(
[
"preprocessing.py",
"-c",
f"{config}",
"--apply_scales",
]
)
else:
run_apply_scales = run(
[
"preprocessing.py",
"-c",
f"{config}",
"--apply_scales",
"--tracks",
]
)
try:
run_apply_scales.check_returncode()
except CalledProcessError:
......@@ -176,26 +127,15 @@ def runPreprocessing(config, tagger):
logger.info(
"Test: shuffling the samples and writing the samples to disk..."
)
if tagger == "dl1r":
run_write = run(
[
"preprocessing.py",
"-c",
f"{config}",
"--write",
]
)
run_write = run(
[
"preprocessing.py",
"-c",
f"{config}",
"--write",
]
)
else:
run_write = run(
[
"preprocessing.py",
"-c",
f"{config}",
"--write",
"--tracks",
]
)
try:
run_write.check_returncode()
except CalledProcessError:
......
......@@ -112,7 +112,8 @@ sampling:
ttbar: 0.65
zprime: 0.35
# can change after applying resampling in the hybrid sample creation
save_tracks: True
save_tracks: False
tracks_name: "tracks"
njets: 5.5e6
intermediate_index_file: indices.h5
# outputfiles are split into 5
......
......@@ -160,8 +160,6 @@ class PrepareSamplesTestCase(unittest.TestCase):
def __init__(self) -> None:
self.sample = "ttbar"
self.config_file = "test_preprocess_config.yaml"
self.tracks = False
self.tracks_name = "tracks"
self.shuffle_array = True
def setUp(self):
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment