diff --git a/.gitmodules b/.gitmodules
index aed4fd3e7e9995d169ec516e4ef3eae15e3ff092..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -1,5 +0,0 @@
-[submodule "cms/mlpf/particleflow"]
-	path = cms/mlpf/particleflow
-	url = https://github.com/dcsouthwick/particleflow
-	branch = benchmark_suite
-	fetchRecurseSubmodules = true
diff --git a/WL_list.md b/WL_list.md
index a66f27314cc5553b327bb21b59d26874b6b85361..c00ab08044f088a098f2ec4d5c571c0192e1245a 100644
--- a/WL_list.md
+++ b/WL_list.md
@@ -21,22 +21,22 @@ where:
 | Experiment | WL repo | SIF image registry | Docker image registry| Latest Built Version | Latest Pipeline status | Unpacked container size |
 | -------- | -------- | -------- | -------- | -------- | -------- | -------- |
-| alice | [digi-reco-core-run3-ma][alice_digi-reco-core-run3-ma_code] | [click for link][alice_digi-reco-core-run3-ma_sif] | [click for link][alice_digi-reco-core-run3-ma_img] | [v3.0][alice_digi-reco-core-run3-ma_pipelink] | ![ci][alice_digi-reco-core-run3-ma_pipeline]| 16G |
-| atlas | [gen_sherpa-ma][atlas_gen_sherpa-ma_code] | [click for link][atlas_gen_sherpa-ma_sif] | [click for link][atlas_gen_sherpa-ma_img] | [v2.2][atlas_gen_sherpa-ma_pipelink] | ![ci][atlas_gen_sherpa-ma_pipeline]| 22G |
-| atlas | [reco_mt-ma][atlas_reco_mt-ma_code] | [click for link][atlas_reco_mt-ma_sif] | [click for link][atlas_reco_mt-ma_img] | [v2.3][atlas_reco_mt-ma_pipelink] | ![ci][atlas_reco_mt-ma_pipeline]| 22G |
-| atlas | [sim_mt-ma][atlas_sim_mt-ma_code] | [click for link][atlas_sim_mt-ma_sif] | [click for link][atlas_sim_mt-ma_img] | [v2.1][atlas_sim_mt-ma_pipelink] | ![ci][atlas_sim_mt-ma_pipeline]| 20G |
-| belle2 | [gen-sim-reco-ma][belle2_gen-sim-reco-ma_code] | [click for link][belle2_gen-sim-reco-ma_sif] | [click for link][belle2_gen-sim-reco-ma_img] | [v2.2][belle2_gen-sim-reco-ma_pipelink] | ![ci][belle2_gen-sim-reco-ma_pipeline]| 3.4G |
-| cms | [digi-run3-ma][cms_digi-run3-ma_code] | [click for link][cms_digi-run3-ma_sif] | [click for link][cms_digi-run3-ma_img] | [v1.1][cms_digi-run3-ma_pipelink] | ![ci][cms_digi-run3-ma_pipeline]| 5.8G |
-| cms | [gen-sim-run3-ma][cms_gen-sim-run3-ma_code] | [click for link][cms_gen-sim-run3-ma_sif] | [click for link][cms_gen-sim-run3-ma_img] | [v1.1][cms_gen-sim-run3-ma_pipelink] | ![ci][cms_gen-sim-run3-ma_pipeline]| 6.2G |
-| cms | [hlt-ma][cms_hlt-ma_code] | [click for link][cms_hlt-ma_sif] | [click for link][cms_hlt-ma_img] | [v0.2][cms_hlt-ma_pipelink] | ![ci][cms_hlt-ma_pipeline]| 19G |
-| cms | [mlpf][cms_mlpf_code] | [click for link][cms_mlpf_sif] | [click for link][cms_mlpf_img] | [v0.1][cms_mlpf_pipelink] | ![ci][cms_mlpf_pipeline]| |
-| cms | [reco-run3-ma][cms_reco-run3-ma_code] | [click for link][cms_reco-run3-ma_sif] | [click for link][cms_reco-run3-ma_img] | [v1.2][cms_reco-run3-ma_pipelink] | ![ci][cms_reco-run3-ma_pipeline]| 6.5G |
-| hello | [world-c7-ma][hello_world-c7-ma_code] | [click for link][hello_world-c7-ma_sif] | [click for link][hello_world-c7-ma_img] | [v1.0][hello_world-c7-ma_pipelink] | ![ci][hello_world-c7-ma_pipeline]| 759M |
-| hello | [world-cs8-ma][hello_world-cs8-ma_code] | [click for link][hello_world-cs8-ma_sif] | [click for link][hello_world-cs8-ma_img] | [ci-v1.0][hello_world-cs8-ma_pipelink] | ![ci][hello_world-cs8-ma_pipeline]| 518M |
-| igwn | [pe][igwn_pe_code] | [click for link][igwn_pe_sif] | [click for link][igwn_pe_img] | [v0.5][igwn_pe_pipelink] | ![ci][igwn_pe_pipeline]| 2.9G |
-| juno | [gen-sim-reco][juno_gen-sim-reco_code] | [click for link][juno_gen-sim-reco_sif] | [click for link][juno_gen-sim-reco_img] | [v3.1][juno_gen-sim-reco_pipelink] | ![ci][juno_gen-sim-reco_pipeline]| 3.3G |
-| lhcb | [sim-run3-ma][lhcb_sim-run3-ma_code] | [click for link][lhcb_sim-run3-ma_sif] | [click for link][lhcb_sim-run3-ma_img] | [v1.1][lhcb_sim-run3-ma_pipelink] | ![ci][lhcb_sim-run3-ma_pipeline]| 5.4G |
-| mg5amc | [madgraph4gpu-2022][mg5amc_madgraph4gpu-2022_code] | [click for link][mg5amc_madgraph4gpu-2022_sif] | [click for link][mg5amc_madgraph4gpu-2022_img] | [ci-v0.10][mg5amc_madgraph4gpu-2022_pipelink] | ![ci][mg5amc_madgraph4gpu-2022_pipeline]| 11G |
+| alice | [digi-reco-core-run3-ma][alice_digi-reco-core-run3-ma_code] | [click for link][alice_digi-reco-core-run3-ma_sif] | [click for link][alice_digi-reco-core-run3-ma_img] | [v3.0][alice_digi-reco-core-run3-ma_pipelink] | ![ci][alice_digi-reco-core-run3-ma_pipeline]| 16G |
+| atlas | [gen_sherpa-ma][atlas_gen_sherpa-ma_code] | [click for link][atlas_gen_sherpa-ma_sif] | [click for link][atlas_gen_sherpa-ma_img] | [v2.2][atlas_gen_sherpa-ma_pipelink] | ![ci][atlas_gen_sherpa-ma_pipeline]| 22G |
+| atlas | [reco_mt-ma][atlas_reco_mt-ma_code] | [click for link][atlas_reco_mt-ma_sif] | [click for link][atlas_reco_mt-ma_img] | [v2.3][atlas_reco_mt-ma_pipelink] | ![ci][atlas_reco_mt-ma_pipeline]| 22G |
+| atlas | [sim_mt-ma][atlas_sim_mt-ma_code] | [click for link][atlas_sim_mt-ma_sif] | [click for link][atlas_sim_mt-ma_img] | [v2.1][atlas_sim_mt-ma_pipelink] | ![ci][atlas_sim_mt-ma_pipeline]| 20G |
+| belle2 | [gen-sim-reco-ma][belle2_gen-sim-reco-ma_code] | [click for link][belle2_gen-sim-reco-ma_sif] | [click for link][belle2_gen-sim-reco-ma_img] | [v2.2][belle2_gen-sim-reco-ma_pipelink] | ![ci][belle2_gen-sim-reco-ma_pipeline]| 3.4G |
+| cms | [digi-run3-ma][cms_digi-run3-ma_code] | [click for link][cms_digi-run3-ma_sif] | [click for link][cms_digi-run3-ma_img] | [v1.1][cms_digi-run3-ma_pipelink] | ![ci][cms_digi-run3-ma_pipeline]| 5.8G |
+| cms | [gen-sim-run3-ma][cms_gen-sim-run3-ma_code] | [click for link][cms_gen-sim-run3-ma_sif] | [click for link][cms_gen-sim-run3-ma_img] | [v1.1][cms_gen-sim-run3-ma_pipelink] | ![ci][cms_gen-sim-run3-ma_pipeline]| 6.2G |
+| cms | [hlt-ma][cms_hlt-ma_code] | [click for link][cms_hlt-ma_sif] | [click for link][cms_hlt-ma_img] | [v0.2][cms_hlt-ma_pipelink] | ![ci][cms_hlt-ma_pipeline]| 19G |
+| cms | [mlpf][cms_mlpf_code] | [click for link][cms_mlpf_sif] | [click for link][cms_mlpf_img] | [ci-v0.4][cms_mlpf_pipelink] | ![ci][cms_mlpf_pipeline]| 7.6G |
+| cms | [reco-run3-ma][cms_reco-run3-ma_code] | [click for link][cms_reco-run3-ma_sif] | [click for link][cms_reco-run3-ma_img] | [v1.2][cms_reco-run3-ma_pipelink] | ![ci][cms_reco-run3-ma_pipeline]| 6.5G |
+| hello | [world-c7-ma][hello_world-c7-ma_code] | [click for link][hello_world-c7-ma_sif] | [click for link][hello_world-c7-ma_img] | [v1.0][hello_world-c7-ma_pipelink] | ![ci][hello_world-c7-ma_pipeline]| 759M |
+| hello | [world-cs8-ma][hello_world-cs8-ma_code] | [click for link][hello_world-cs8-ma_sif] | [click for link][hello_world-cs8-ma_img] | [ci-v1.0][hello_world-cs8-ma_pipelink] | ![ci][hello_world-cs8-ma_pipeline]| 518M |
+| igwn | [pe][igwn_pe_code] | [click for link][igwn_pe_sif] | [click for link][igwn_pe_img] | [v0.5][igwn_pe_pipelink] | ![ci][igwn_pe_pipeline]| 2.9G |
+| juno | [gen-sim-reco][juno_gen-sim-reco_code] | [click for link][juno_gen-sim-reco_sif] | [click for link][juno_gen-sim-reco_img] | [v3.1][juno_gen-sim-reco_pipelink] | ![ci][juno_gen-sim-reco_pipeline]| 3.3G |
+| lhcb | [sim-run3-ma][lhcb_sim-run3-ma_code] | [click for link][lhcb_sim-run3-ma_sif] | [click for link][lhcb_sim-run3-ma_img] | [v1.1][lhcb_sim-run3-ma_pipelink] | ![ci][lhcb_sim-run3-ma_pipeline]| 5.4G |
+| mg5amc | [madgraph4gpu-2022][mg5amc_madgraph4gpu-2022_code] | [click for link][mg5amc_madgraph4gpu-2022_sif] | [click for link][mg5amc_madgraph4gpu-2022_img] | [ci-v0.10][mg5amc_madgraph4gpu-2022_pipelink] | ![ci][mg5amc_madgraph4gpu-2022_pipeline]| 11G |
 
 [alice_digi-reco-core-run3-ma_code]: https://gitlab.cern.ch/hep-benchmarks/hep-workloads/-/blob/master/alice/digi-reco-core-run3-ma/alice-digi-reco-core-run3-ma
 [alice_digi-reco-core-run3-ma_sif]: https://gitlab.cern.ch/hep-benchmarks/hep-workloads-sif/container_registry/?search%5B%5D=alice-digi-reco-core-run3-ma-bmk
@@ -89,7 +89,7 @@ where:
 [cms_mlpf_code]: https://gitlab.cern.ch/hep-benchmarks/hep-workloads/-/blob/master/cms/mlpf/cms-mlpf
 [cms_mlpf_sif]: https://gitlab.cern.ch/hep-benchmarks/hep-workloads-sif/container_registry/?search%5B%5D=cms-mlpf-bmk
 [cms_mlpf_img]: https://gitlab.cern.ch/hep-benchmarks/hep-workloads/container_registry/?search%5B%5D=cms-mlpf-bmk
-[cms_mlpf_pipelink]: https://gitlab.cern.ch/hep-benchmarks/hep-workloads/-/pipelines/4175345
+[cms_mlpf_pipelink]: https://gitlab.cern.ch/hep-benchmarks/hep-workloads/-/pipelines/8569540
 [cms_mlpf_pipeline]: https://gitlab.cern.ch/hep-benchmarks/hep-workloads/badges/qa-build-cms-mlpf/pipeline.svg
 
 [cms_reco-run3-ma_code]: https://gitlab.cern.ch/hep-benchmarks/hep-workloads/-/blob/master/cms/reco-run3-ma/cms-reco-run3-ma
diff --git a/cms/mlpf/Dockerfile.append b/cms/mlpf/Dockerfile.append
index dd54f87cee6e20f3fa37da0328aacd377fdc9ac3..1b8e0556a33cd875ba58fd7ea8b8aa331965622d 100644
--- a/cms/mlpf/Dockerfile.append
+++ b/cms/mlpf/Dockerfile.append
@@ -1,14 +1,12 @@
-# Get the source code for MLPF
-COPY particleflow /bmk/cms-mlpf/particleflow
-
-# Copy in the dependencies
-COPY cms-mlpf/requirements.txt cms-mlpf/cms-mlpf-bmk.sh cms-mlpf/prepare-dataset.sh /bmk/cms-mlpf/
-
 # Install dependencies
 RUN \
-  # cd /bmk/cms-mlpf && git submodule update --init --recursive
-  python3 -m pip install --upgrade pip==21.3.1 setuptools==59.5.0 && \
-  python3 -m pip install --no-cache-dir -r /bmk/cms-mlpf/requirements.txt && \
-  python3 -m pip install --no-cache-dir /bmk/cms-mlpf/particleflow/hep_tfds && \
-  # Download and pre-process the dataset
-  bash /bmk/cms-mlpf/prepare-dataset.sh
+  dnf install -y git && \
+  git clone https://gitlab.cern.ch/dsouthwi/particleflow.git --depth 1 -b bmk_torch /bmk/cms-mlpf && \
+  python3 -m ensurepip && \
+  python3 -m pip install torch==2.4.1 torchvision==0.19.1 torchaudio==2.4.1 --index-url https://download.pytorch.org/whl/cu124 && \
+  # python3 -m pip install --no-cache-dir torch torchvision torchaudio  # torch 2.5 is broken for now
+  python3 -m pip install --no-cache-dir comet-ml awkward boost-histogram fastjet tqdm scikit-learn pandas mplhep numba tfds-nightly wheel pyyaml tensorboard

+# Get dataset
+RUN curl -LO https://dsouthwi.web.cern.ch/files/clic_edm_qq_pf_2.1.0.tar.gz && \
+  tar zxf clic_edm_qq_pf_2.1.0.tar.gz -C /bmk/cms-mlpf && rm clic_edm_qq_pf_2.1.0.tar.gz
diff --git a/cms/mlpf/cms-mlpf.spec b/cms/mlpf/cms-mlpf.spec
index 21277af610b5b1c74adae7a8ba8af0dad172fab0..a5bee5f656507c253a0ad92b01d2828bb602a6a1 100644
--- a/cms/mlpf/cms-mlpf.spec
+++ b/cms/mlpf/cms-mlpf.spec
@@ -1,9 +1,12 @@
 HEPWL_BMKEXE=cms-mlpf-bmk.sh
-HEPWL_BMKOPTS=""
+# Docker may run out of shared memory for events > 1000 when running CPU-only;
+# critically, reduce the number of convolutions to 1 for the CPU-only CI
+HEPWL_BMKOPTS="-x '--ntrain 300 --nvalid 300 --gpus 1 --num-epochs 2 --num-convs 1'"
 HEPWL_BMKDIR="cms-mlpf"
-HEPWL_BMKDESCRIPTION=""
+HEPWL_BMKDESCRIPTION="CMS Machine-Learned ParticleFlow (MLPF)"
 HEPWL_DOCKERIMAGENAME=cms-mlpf-bmk
-HEPWL_DOCKERIMAGETAG=v0.1
+HEPWL_DOCKERIMAGETAG=ci-v0.4 # NB ci-vX.Y for tests, vX.Y for tags
-HEPWL_CVMFSREPOS=sft.cern.ch
+HEPWL_CVMFSREPOS=NONE
-HEPWL_BMKOS="gitlab-registry.cern.ch/dsouthwi/tensorflow/tensorflow-gpu:latest"
+HEPWL_BMKOS="gitlab-registry.cern.ch/linuxsupport/alma9-base:latest"
 HEPWL_BMKUSEGPU=1
+HEPWL_BUILDARCH="x86_64"
diff --git a/cms/mlpf/cms-mlpf/DESCRIPTION b/cms/mlpf/cms-mlpf/DESCRIPTION
index c20341cbb18bcba679e508d5ef6181dc353ea1dc..03674f2e805bd65f5287792d0c1abe20946029bb 100644
--- a/cms/mlpf/cms-mlpf/DESCRIPTION
+++ b/cms/mlpf/cms-mlpf/DESCRIPTION
@@ -1 +1,3 @@
-ML-based CMS workload reconstructing events using https://github.com/jpata/particleflow
+ML-based CMS particleflow workload reconstructing events using <https://github.com/jpata/particleflow>
+
+Intended for GPU usage; running on CPU will take much longer.
diff --git a/cms/mlpf/cms-mlpf/cms-mlpf-bmk.dev.sh b/cms/mlpf/cms-mlpf/cms-mlpf-bmk.dev.sh
new file mode 100644
index 0000000000000000000000000000000000000000..7798b34a9a2bd94e51d97a327dc78d37805e9332
--- /dev/null
+++ b/cms/mlpf/cms-mlpf/cms-mlpf-bmk.dev.sh
@@ -0,0 +1,175 @@
+#!/bin/bash
+
+# Copyright 2019-2020 CERN. See the COPYRIGHT file at the top-level
+# directory of this distribution. For licensing information, see the
+# COPYING file at the top-level directory of this distribution.
+
+#set -x # enable debug printouts
+
+#set -e # immediate exit on error
+
+# Function doOne must be defined in each benchmark
+# Input argument $1: process index (between 1 and $NCOPIES)
+# Return value: please return 0 if this workload copy was successful, 1 otherwise
+# The following variables are guaranteed to be defined and exported: NCOPIES, NTHREADS, NEVENTS_THREAD, BMKDIR, DEBUG
+# The function is started in process-specific working directory <basewdir>/proc_$1:
+# please store here the individual log files for each of the NCOPIES processes
+function doOne(){
+  if [ "$1" == "" ] || [ "$2" != "" ]; then echo "[doOne] ERROR! Invalid arguments '$@' to doOne"; return 1; fi
+  echo "[doOne ($1)] $(date) starting in $(pwd)"
+  sleep 5s
+  echo "Hello world" > out_$1.log 2>&1
+  echo "[doOne ($1)] $(date) EXTRA_ARGS='$EXTRA_ARGS'"
+
+  # parse extra args if any
+  options=$(getopt -a -n cms-mlpf-bmk -o g:D:B: --long nepochs:,ntrain:,ntest:,nvalid:,batch-multiplier:,gpus:,dtype:,nworkers: -- "$EXTRA_ARGS")
+  eval set -- "$options"
+  while [ : ]; do
+    case "$1" in
+      --ntrain ) NTRAIN="$2"; shift;;
+      --ntest ) NTEST="$2"; shift;;
+      --nvalid ) NVALID="$2"; shift;;
+      --nepochs ) NEPOCHS="$2"; shift;;
+      --nworkers ) NWORKERS="$2"; shift;;
+      --gpus | -g ) NGPUS="$2"; shift;;
+      --batch-multiplier | -B ) BMULT="$2"; shift;;
+      --dtype | -D ) DTYPE="$2"; shift;;
+      --train ) TRAIN="--train";;
+      -- ) shift; break;;
+    esac
+    shift
+  done
+
+
+  # Run the workload
+  python3 mlpf/pyg_pipeline.py ${TRAIN} \
+    --config parameters/pytorch/pyg-clic.yaml \
+    --benchmark_dir $resultsDir \
+    --gpus $NGPUS \
+    --batch_size $BSIZE \
+    --num-epochs $NEPOCHS \
+    --ntrain $NTRAIN \
+    --ntest $NTEST \
+    --nvalid $NVALID
+    # --prefix /tmp/train_
+  status=${?}
+  echo "[doOne ($1)] $(date) completed (status=$status)"
+  # Return 0 if this workload copy was successful, 1 otherwise
+  return $status
+}
+
+# Default values for NCOPIES, NTHREADS, NEVENTS_THREAD must be set in each benchmark
+NCOPIES=1 # cannot be changed by user input ()
+NTHREADS=1 # cannot be changed by user input (single-threaded single-process WL)
+NEVENTS_THREAD=1 # not relevant for GPUs
+# specific to MLPF
+NEPOCHS=6 # must be >1 as 1st epoch is thrown away
+NTRAIN=120000 # 0 is None, events to train on
+NTEST=36000 # 0 is None, events to test training
+BSIZE=8 # 8 is Default, batch size (too small device is under-loaded, too large OOM)
+NGPUS=1 # 0 is Default, GPUs
+TRAIN="--train"
+DEBUG=0
+resultsDir="/results"
+
+function usage_detailed(){
+  echo ""
+  echo "Additional MLPF parameters:"
+  echo " --nepochs : (int) Number of epochs >1 (default: $NEPOCHS)"
+  echo " --ntrain : (int) Train steps limit (default: $NTRAIN)"
+  echo " --ntest : (int) Test steps limit (default: $NTEST)"
+  echo " --nvalid : (int) Validation steps limit (default: $NVALID)"
+  echo " --batch_size : (int) Batch size (default: $BSIZE)"
+  echo " --dtype : (string) Data type {float32, float16, bfloat16} (default: $DTYPE)"
+  echo " -B --batch-multiplier : (int) Batch multiplier, 1=16G, 5=80G GPU memory (default: $BMULT)"
+  echo " -g --gpus : (int) Number of gpus to use (default: $NGPUS)"
+
+}
+
+
+if [ -f /run/.containerenv ]; then FLAVOR="podman"
+elif [ -f /.dockerenv ]; then FLAVOR="docker"
+elif [ -f /singularity ]; then FLAVOR="singularity"
+else FLAVOR="unknown";
+fi
+
+# Source the common benchmark driver
+if [ -f $(dirname $0)/bmk-driver.sh ]; then
+  . $(dirname $0)/bmk-driver.sh
+else
+  . $(dirname $0)/../../../common/bmk-driver.sh
+fi
+
+
+##############################
+
+
+set -e
+
+
+log() {
+  case $1 in
+    error) shift 1; echo -e "\e[31m>>> ERROR:\e[0m $*\n" | tee -a $resultsDir/out.log ; exit 2 ;;
+    info) shift 1; echo -e "\e[34m$*\e[0m\n" | tee -a $resultsDir/out.log ;;
+    silent) shift 1; echo "$*" >> $resultsDir/out.log ;;
+    *) echo "$*" | tee -a $resultsDir/out.log ;
+  esac
+}
+
+
+
+# set CUDA_VISIBLE_DEVICES for tensorflow based on nvidia-smi (dirty nvidia-only check)
+if type -P "nvidia-smi" &>/dev/null; then
+  DEVICES=$(nvidia-smi -L | wc -l)
+  log info "Detected $DEVICES nvidia GPUs"
+  export CUDA_VISIBLE_DEVICES=$(seq -s, 0 $(($DEVICES-1)))
+fi
+
+# create /results/build to satisfy common build script (mimic bmk-driver.sh)
+log silent "Creating /results/build"
+mkdir -p $resultsDir/build
+touch $resultsDir/build/.pointlessfile
+
+log info "Running benchmark MLPF"
+log silent "Executing 'python3 mlpf/pipeline.py train \
+  --config parameters/delphes-benchmark.yaml \
+  --prefix /tmp/train_ \
+  --plot-freq 1000000 \
+  --benchmark_dir $resultsDir \
+  --num_devices $NDEVICES \
+  --batch_size $BSIZE \
+  --nepochs $NEPOCHS \
+  --ntrain $NTRAIN \
+  --ntest $NTEST'"
+cd /bmk/cms-mlpf/particleflow/
+
+
+REPORT=$(cat $resultsDir/result.json)
+
+generate_json() {
+  jq -n \
+    --argjson nepochs "$NEPOCHS" \
+    --argjson report "$REPORT" \
+    --arg containment "$FLAVOR" \
+    --arg description "$DESCRIPTION" \
+    '{
+      "run_info":{
+        $nepochs
+      },
+      $report,
+      "app":{
+        $containment,
+        $description
+      }
+    }'
+}
+mkdir -p $resultsDir/report
+if [ $skipSubDir -eq 0 ]; then
+  REPORT_PATH=$resultsDir/report/cms-mlpf_summary.json
+else
+  REPORT_PATH=$resultsDir/cms-mlpf_summary.json
+fi
+generate_json > $REPORT_PATH
+log info "Finished running MLPF. Final report written to $REPORT_PATH"
+
+# sourcing bmk-driver excluded for now pending rework to override common args
diff --git a/cms/mlpf/cms-mlpf/cms-mlpf-bmk.sh b/cms/mlpf/cms-mlpf/cms-mlpf-bmk.sh
index 838513a67de7e75661cc2a68d7a0561e074df5d7..1c1c8a67dfd6db734c1782f1e479c92f8910f9ce 100755
--- a/cms/mlpf/cms-mlpf/cms-mlpf-bmk.sh
+++ b/cms/mlpf/cms-mlpf/cms-mlpf-bmk.sh
@@ -1,172 +1,127 @@
 #!/bin/bash
-set -e
+# Copyright 2019-2020 CERN. See the COPYRIGHT file at the top-level
+# directory of this distribution. For licensing information, see the
+# COPYING file at the top-level directory of this distribution.
-if [ -f /run/.containerenv ]; then FLAVOR="podman"
-elif [ -f /.dockerenv ]; then FLAVOR="docker"
-elif [ -f /singularity ]; then FLAVOR="singularity"
-else FLAVOR="unknown";
-fi
+#set -x # enable debug printouts
 
-# Default config
-NEPOCHS=2 # must be >1 as 1st epoch is thrown away
-NTRAIN=0 # 0 is None
-NTEST=0 # 0 is None
-BSIZE=4 # 4 is Default
-NDEVICES=0 # 0 is Default
-DEBUG=0
-resultsDir="/results"
-skipSubDir=0
-MOP="none"
-DESCRIPTION="Machine Learning Particle Flow (MLPF) benchmark"
+#set -e # immediate exit on error
 
-log() {
-  case $1 in
-    error) shift 1; echo -e "\e[31m>>> ERROR:\e[0m $*\n" | tee -a $resultsDir/out.log ; exit 2 ;;
-    info) shift 1; echo -e "\e[34m$*\e[0m\n" | tee -a $resultsDir/out.log ;;
-    silent) shift 1; echo "$*" >> $resultsDir/out.log ;;
-    *) echo "$*" | tee -a $resultsDir/out.log ;
-  esac
-}
+# Function doOne must be defined in each benchmark
+# Input argument $1: process index (between 1 and $NCOPIES)
+# Return value: please return 0 if this workload copy was successful, 1 otherwise
+# The following variables are guaranteed to be defined and exported: NCOPIES, NTHREADS, NEVENTS_THREAD, BMKDIR, DEBUG
+# The function is started in process-specific working directory <basewdir>/proc_$1:
+# please store here the individual log files for each of the NCOPIES processes
+function doOne(){
+  if [ "$1" == "" ] || [ "$2" != "" ]; then echo "[doOne] ERROR! Invalid arguments '$@' to doOne"; return 1; fi
+  echo "[doOne ($1)] $(date) starting in $(pwd)"
+  sleep 5s
+  echo "Hello world" > out_$1.log 2>&1
+  echo "[doOne ($1)] $(date) EXTRA_ARGS='$EXTRA_ARGS'"
+  echo "Starting ($1) with $EXTRA_ARGS" >> out_$1.log
+  echo "[doOne ($1)] $(date) resultsDir='$resultsDir'"
 
-function usage(){
-  echo ""
-  echo "Usage: $0 [-w | --resultsdir <resultsDir>] [-W] [-c | --copies <NCOPIES>] [-n | --nepochs <NEPOCHS>] " \
-       "[-B | --batch_size <BSIZE>] [-D | --num_devices <NDEVICES>] [--ntrain <NTRAIN>] [--ntest <NTEST>] " \
-       "[-m | --mop <mode>] [-d | --debug] [-h | --help]"
-  echo " -w --resultsdir <resultsDir> : (path) results directory (default: /results , current: $resultsDir)"
-  echo " -W : store results in <resultsDir> directly"
-  echo " -n --nepochs : (int) Number of epochs >1 (default: 2, current: $NEPOCHS)"
-  echo " -B --batch_size : (int) Batch size per device (default: 4, current: $BSIZE)"
-  echo " -D --num_devices : (int) Number of devices to use (default: 0, current: $NDEVICES)"
-  echo " --ntrain : (int) Train steps limit (default: 0, current: $NTRAIN)"
-  echo " --ntest : (int) Test steps limit (default: 0, current: $NTEST)"
-  echo " -m --mop : (none|all|custom) clean working directory mode: none/all/custom (current: $MOP)"
-  echo " -d --debug : debug mode"
-  echo " -h --help : display this help and exit"
-  echo ""
-  echo "Mop mode:
-  none == do not remove working files,
-  all == remove all produced files (but summary json),
-  custom == custom implementation"
-  echo "Without -W (default): results are stored in a new subdirectory of <resultsDir>:"
-  echo " <resultsDir>/<uniqueid>/*.json"
-  echo " <resultsDir>/<uniqueid>/proc_1/*.log"
-  echo " <resultsDir>/<uniqueid>/proc_.../*.log"
-  echo " <resultsDir>/<uniqueid>/proc_<COPIES>/*.log"
-  echo "With -W (e.g. in the CI): results are stored in <resultsDir> directly:"
-  echo " <resultsDir>/*.json"
-  echo " <resultsDir>/proc_1/*.log"
-  echo " <resultsDir>/proc_.../*.log"
-  echo " <resultsDir>/proc_<NCOPIES>/*.log"
-  echo ""
-  echo "Without -w (default) and without -W: <resultsDir> is /results"
-  echo "Without -w (default) and with -W: <resultsDir> is a tmp directory /tmp/xxxx"
-  echo ""
-  if [ "$(type -t usage_detailed)" == "function" ]; then
-    echo -e "\nDetailed Usage:\n----------------\n"
-    ( usage_detailed ) # as a subprocess, just in case this has a 0 exit code...
-  fi
-  echo -e "DESCRIPTION\n"
-  if [ -e $BMKDIR/DESCRIPTION ]; then
-    cat $BMKDIR/DESCRIPTION
-  else
-    echo "Sorry there is no description included."
-  fi
-  echo ""
-  exit 2 # early termination (help or invalid arguments to benchmark script)
-}
+  # parse extra args if any
 
-parse_args() {
-  options=$(getopt -a -n cms-mlpf-bmk -o w:Wm:n:dD:B:h --long resultsdir:,nepochs:,ntrain:,ntest:,batch_size:,num_devices:,debug,help,mop -- "$@")
-  if [ $? != 0 ]; then echo "Invalid options provided." >&2; usage; fi
-  eval set -- "$options"
-  while true; do
-    case "$1" in
-      --help | -h ) usage; exit 0;;
-      --debug | -d ) DEBUG=1 ;;
-      -W ) skipSubDir=1;;
-      --mop | -m ) MOP="$2"; shift;;
-      --resultsdir | -w ) resultsDir="$2"; shift;;
-      --ntrain ) NTRAIN="$2"; shift;;
-      --ntest ) NTEST="$2"; shift;;
-      --nepochs | -n ) NEPOCHS="$2"; shift;;
-      --num_devices | -D ) NDEVICES="$2"; shift;;
-      --batch_size | -B ) BSIZE="$2"; shift;;
-      -- ) shift; break;;
-    esac
-    shift
-  done
-}
+  # TODO: handle inherited args as well (-d --debug $DEBUG)
 
-# TODO: implement MOP, DEBUG
+  # options=$(getopt -a -n cms-mlpf-bmk -o g:D:B: --long nepochs:,ntrain:,ntest:,nvalid:,batch-multiplier:,gpus:,dtype:,nworkers: -- "$EXTRA_ARGS")
+  # eval set -- "$options"
+  # while [ : ]; do
+  #   case "$1" in
+  #     --ntrain ) NTRAIN="$2"; shift;;
+  #     --ntest ) NTEST="$2"; shift;;
+  #     --nvalid ) NVALID="$2"; shift;;
+  #     --nepochs ) NEPOCHS="$2"; shift;;
+  #     --nworkers ) NWORKERS="$2"; shift;;
+  #     --gpus | -g ) NGPUS="$2"; shift;;
+  #     --gpu-batch-multiplier | -B ) BSIZE="$2"; shift;;
+  #     --dtype | -D ) DTYPE="$2"; shift;;
+  #     --train ) TRAIN="--train";;
+  #     -- ) shift; break;;
+  #   esac
+  #   shift
+  # done
 
-parse_args $*
+  # the common driver now invokes doOne from the per-process results directory,
+  # so move to the benchmark directory before launching the pipeline
+  pwd
+  cd /bmk/cms-mlpf
 
-if [ -f "$resultsDir"/out.log ]; then rm "$resultsDir"/out.log; fi
-log info "Base working directory: $resultsDir"
+  # Run the workload
+  python3 mlpf/pyg_pipeline.py ${TRAIN} \
+    --config parameters/pytorch/pyg-clic-bmk.yaml \
+    --benchmark \
+    --experiments-dir $workDir \
+    ${EXTRA_ARGS}
+    # --gpus $NGPUS \
+    # --gpu-batch-multiplier $BSIZE \
+    # --num-epochs $NEPOCHS \
+    # --ntrain $NTRAIN \
+    # --ntest $NTEST \
+    # --nvalid $NVALID \
+    # --dtype $DTYPE \
+    # $TRAIN
+
-# set CUDA_VISIBLE_DEVICES for tensorflow based on nvidia-smi (dirty nvidia-only check)
-if type -P "nvidia-smi" &>/dev/null; then
-  DEVICES=$(nvidia-smi -L | wc -l)
-  log info "Detected $DEVICES nvidia GPUs"
-  export CUDA_VISIBLE_DEVICES=$(seq -s, 0 $(($DEVICES-1)))
-fi
+  # --prefix /tmp/train_
+  status=${?}
+  echo "[doOne ($1)] $(date) completed (status=$status)"
+  # Return 0 if this workload copy was successful, 1 otherwise
+  return $status
+}
 
-# create /results/build to satisfy common build script (mimic bmk-driver.sh)
-log silent "Creating /results/build"
-mkdir -p $resultsDir/build
-touch $resultsDir/build/.pointlessfile
+function parseResults(){
+  echo "[parseResults] Parsing results from baseWDir=$baseWDir"
+  resJSON=$(python3 /bmk/cms-mlpf/parseResults.py $baseWDir)
+  pystatus=$?
+  if [ "$pystatus" == "0" ]; then
+    echo "$resJSON" > $baseWDir/parser_output.json
+    cat $baseWDir/parser_output.json
+  fi
+  echo "[parseResults] python parser completed (status=$pystatus)"
+  return $pystatus
+}
+
+# Default values for NCOPIES, NTHREADS, NEVENTS_THREAD must be set in each benchmark
+NCOPIES=1 # cannot be changed by user input ()
+NTHREADS=1 # cannot be changed by user input (single-threaded single-process WL)
+NEVENTS_THREAD=1 # not relevant for GPUs
+# specific to MLPF
+NEPOCHS=6 # must be >1 as 1st epoch is thrown away
+NTRAIN=120000 # 0 is None, events to train on
+NTEST=36000 # 0 is None, events to test training
+NVALID=0 # 0 is None, events to validate on (referenced by usage_detailed below)
+BSIZE=8 # 8 is Default, batch size (too small device is under-loaded, too large OOM)
+NGPUS=1 # 0 is Default, GPUs
+TRAIN="--train"
+DTYPE="bfloat16" # float32, float16, bfloat16
+DEBUG=0
+#resultsDir="/results"
 
-log info "Running benchmark MLPF"
-log silent "Executing 'python3 mlpf/pipeline.py train \
-  --config parameters/delphes-benchmark.yaml \
-  --prefix /tmp/train_ \
-  --plot-freq 1000000 \
-  --benchmark_dir $resultsDir \
-  --num_devices $NDEVICES \
-  --batch_size $BSIZE \
-  --nepochs $NEPOCHS \
-  --ntrain $NTRAIN \
-  --ntest $NTEST'"
-cd /bmk/cms-mlpf/particleflow/
-python3 mlpf/pipeline.py train \
-  --config parameters/delphes-benchmark.yaml \
-  --prefix /tmp/train_ \
-  --plot-freq 1000000 \
-  --benchmark_dir $resultsDir \
-  --num_devices $NDEVICES \
-  --batch_size $BSIZE \
-  --nepochs $NEPOCHS \
-  --ntrain $NTRAIN \
-  --ntest $NTEST
+function usage_detailed(){
+  echo ""
+  echo "Additional MLPF parameters: use -x '<EXTRA_ARGS>'"
+  echo " --num-epochs : (int) Number of epochs >1 (default: $NEPOCHS)"
+  echo " --ntrain : (int) Train steps limit (default: $NTRAIN)"
+  echo " --ntest : (int) Test steps limit (default: $NTEST)"
+  echo " --nvalid : (int) Validation steps limit (default: $NVALID)"
+  echo " --gpu-batch-multiplier : (int) Increases GPU batch size by constant multiplier 1=1G, 8=10G (default: $BSIZE)"
+  echo " --dtype : (string) Data type {float32, float16, bfloat16} (default: $DTYPE)"
+  echo " --gpus : (int) Number of gpus to use (default: $NGPUS)"
+
+}
 
-REPORT=$(cat $resultsDir/result.json)
-
-generate_json() {
-  jq -n \
-    --argjson nepochs "$NEPOCHS" \
-    --argjson report "$REPORT" \
-    --arg containment "$FLAVOR" \
-    --arg description "$DESCRIPTION" \
-    '{
-      "run_info":{
-        $nepochs
-      },
-      $report,
-      "app":{
-        $containment,
-        $description
-      }
-    }'
-}
-mkdir -p $resultsDir/report
-if [ $skipSubDir -eq 0 ]; then
-  REPORT_PATH=$resultsDir/report/cms-mlpf_summary.json
-else
-  REPORT_PATH=$resultsDir/cms-mlpf_summary.json
+if [ -f /run/.containerenv ]; then FLAVOR="podman"
+elif [ -f /.dockerenv ]; then FLAVOR="docker"
+elif [ -f /singularity ]; then FLAVOR="singularity"
+else FLAVOR="unknown"
 fi
-generate_json > $REPORT_PATH
-log info "Finished running MLPF. Final report written to $REPORT_PATH"
-
-# sourcing bmk-driver excluded for now pending rework to override common args
+
+# Source the common benchmark driver
+if [ -f $(dirname $0)/bmk-driver.sh ]; then
+  . $(dirname $0)/bmk-driver.sh
+else
+  . $(dirname $0)/../../../common/bmk-driver.sh
+fi
diff --git a/cms/mlpf/cms-mlpf/parseResults.py b/cms/mlpf/cms-mlpf/parseResults.py
new file mode 100644
index 0000000000000000000000000000000000000000..cf6cdf0022a4ff25dfacd6868010c8b2d9159a76
--- /dev/null
+++ b/cms/mlpf/cms-mlpf/parseResults.py
@@ -0,0 +1,69 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+parseResults.py
+
+This is meant to be called from the parseResults function in `cms-mlpf-bmk.sh`
+
+"""
+import argparse
+import json
+from pathlib import Path
+import yaml
+
+directory = "history"  # subdirectory (under proc_1) holding the per-epoch history JSON files
+
+wl_stats = {}
+wl_scores = {}
+epoch_times = []
+
+def parse_results(results_dir):
+    """results_dir is a path-like object to the directory containing a train-config yaml and history folder."""
+    results_dir = Path(results_dir)
+
+    if not results_dir.is_dir():
+        raise ValueError(f"{results_dir} is not a directory")
+
+    # the driver passes $baseWDir; the per-process results live under proc_1
+    results_dir = results_dir.joinpath("proc_1")
+
+    with results_dir.joinpath("train-config.yaml").open(encoding="utf-8") as file:
+        config = yaml.safe_load(file)
+
+    wl_stats["gpus"] = config["gpus"]
+    wl_stats["dtype"] = config["dtype"]
+    wl_stats["events"] = config["ntrain"]
+    wl_stats["gpu_batch_multiplier"] = config["gpu_batch_multiplier"]
+    wl_stats["batch_size"] = config["train_dataset"][config["dataset"]]["physical"]["batch_size"]
+    wl_stats["events_per_batch"] = (
+        wl_stats["gpu_batch_multiplier"] * wl_stats["batch_size"]
+        if wl_stats["gpus"] > 0
+        else wl_stats["batch_size"]
+    )
+    wl_stats["conv_type"] = config["conv_type"]
+    wl_stats["dataset"] = config["train_dataset"][config["dataset"]]["physical"]["samples"]
+
+    # parse all results json
+    for filepath in results_dir.joinpath(directory).rglob("*.json"):
+        with filepath.open() as file:
+            data = json.load(file)
+            if "epoch_train_time" in data:
+                epoch_times.append(round(data["epoch_train_time"], 4))
+
+    wl_stats["num_epochs"] = len(epoch_times)
+    wl_stats["epoch_times"] = sorted(epoch_times, reverse=True)
+    wl_stats["train_time"] = round(sum(epoch_times), 4)
+    wl_stats["throughput_per_epoch"] = [round(wl_stats["events"] / t, 4) for t in wl_stats["epoch_times"]]
+
+    # the slowest (warm-up) epoch is excluded from the scores
+    wl_scores["mean_throughput"] = round(sum(wl_stats["throughput_per_epoch"][1:]) / (wl_stats["num_epochs"] - 1), 4)
+    wl_scores["mean_epoch_time"] = round(sum(wl_stats["epoch_times"][1:]) / (wl_stats["num_epochs"] - 1), 4)
+
+    report = {"wl-stats": wl_stats, "wl-scores": wl_scores}
+    print(json.dumps(report, indent=4))
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("results_dir", type=Path, help="path to results directory")
+    args = parser.parse_args()
+
+    parse_results(args.results_dir)
diff --git a/cms/mlpf/cms-mlpf/prepare-dataset.sh b/cms/mlpf/cms-mlpf/prepare-dataset.sh
deleted file mode 100755
index 0f876447a78f287dad8656a9dfabe903030cbf10..0000000000000000000000000000000000000000
--- a/cms/mlpf/cms-mlpf/prepare-dataset.sh
+++ /dev/null
@@ -1,37 +0,0 @@
-#!/bin/bash
-# Download the particleflow dataset from Zenodo
-set -e
-
-# each split is about 50MB pickled and gzipped
-MIRROR=https://zenodo.org/record/4559324/files
-
-DOWNLOAD_DIR=/bmk/data/zenodo
-DELPHES_DIR=$DOWNLOAD_DIR/delphes_pf
-echo "Download directory: $DOWNLOAD_DIR"
-
-PF_DIR=/bmk/cms-mlpf/particleflow
-DATA_DIR=/bmk/cms-mlpf/tensorflow_datasets
-echo "Data directory: $DATA_DIR"
-
-# create the download dir
-mkdir -p $DELPHES_DIR
-
-# Test splits
-for i in $(seq 0 19) ; do
-  TARGET=tev14_pythia8_ttbar_0_${i}.pkl.bz2
-  echo "Downloading train split: $TARGET"
-  wget -q -P $DELPHES_DIR $MIRROR/$TARGET
-done
-
-# Train splits
-for i in $(seq 0 1) ; do
-  TARGET=tev14_pythia8_qcd_10_${i}.pkl.bz2
-  echo "Downloading test split: $TARGET"
-  wget -q -P $DELPHES_DIR $MIRROR/$TARGET
-done
-
-# build TDFS datasets
-cd $PF_DIR
-tfds build hep_tfds/heptfds/delphes_pf --download_dir $DOWNLOAD_DIR --data_dir $DATA_DIR
-
-rm -rf $DOWNLOAD_DIR
diff --git a/cms/mlpf/cms-mlpf/requirements.txt b/cms/mlpf/cms-mlpf/requirements.txt
deleted file mode 100644
index 0af52078d53cbe94c726ebb2d122cc83e4c37e16..0000000000000000000000000000000000000000
--- a/cms/mlpf/cms-mlpf/requirements.txt
+++ /dev/null
@@ -1,38 +0,0 @@
-numpy==1.19.5
-click==8.0.1
-tqdm==4.61.1
-seaborn==0.11.2
-scikit-optimize
-tensorflow==2.6.0
-keras==2.6.0
-tensorflow-addons==0.13.0
-tensorflow-datasets==4.4.0
-tf-models-official==2.6.0
-tensorflow-estimator==2.6.0
-tensorflow-probability==0.14.1
-keras-tuner==1.0.3
-tf2onnx==1.9.2
-onnxruntime==1.8.1
-mplhep
-ray==1.7.0
-ray[tune]==1.7.0
-nevergrad==0.4.3.post8
-
-# The below is not needed to run mlpf-bmk.sh (when using tensorflow's official docker image as base)
-# pandas==1.1.5
-# scikit-learn==0.24.2
-# scipy==1.5.4
-# scikit-optimize==0.9.0
-# matplotlib==3.2.2
-# hpbandster==0.7.4
-# hyperopt==0.2.5
-# hpbandster
-# ConfigSpace==0.4.19
-# pyaml==6.0
-# onnx==1.10.1
-# tensorflow-text==2.6.0
-# tensorboard==2.6.0
-# tensorflow-estimator==2.6.0
-# tensorflow-metadata==1.1.0
-# tensorflow-model-optimization==0.7.0
-# comet-ml==3.15.4
diff --git a/cms/mlpf/particleflow b/cms/mlpf/particleflow
deleted file mode 160000
index 9324c9625555addd9367133193a3c491fbc74376..0000000000000000000000000000000000000000
--- a/cms/mlpf/particleflow
+++ /dev/null
@@ -1 +0,0 @@
-Subproject commit 9324c9625555addd9367133193a3c491fbc74376
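
---

For reference, a minimal smoke-test invocation of the updated workload, mirroring the CI options set in `HEPWL_BMKOPTS` above. This is a sketch only: the image path follows the usual hep-benchmarks registry layout, and the `--gpus all` and `-v` docker flags are assumptions, not taken from this diff.

```bash
# Hypothetical example: run the PyTorch MLPF benchmark container with the
# CI-sized options from cms-mlpf.spec. With -W, results (including the
# wl-stats / wl-scores summary emitted by parseResults.py) land directly
# in the mounted /results directory.
docker run --rm --gpus all \
    -v /tmp/mlpf-results:/results \
    gitlab-registry.cern.ch/hep-benchmarks/hep-workloads/cms-mlpf-bmk:ci-v0.4 \
    -W -x '--ntrain 300 --nvalid 300 --gpus 1 --num-epochs 2 --num-convs 1'
```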