Commit 37cf1699 authored by David Southwick

Test extra args

parent caad71bb
[submodule "cms/mlpf/particleflow"]
path = cms/mlpf/particleflow
url = https://github.com/dcsouthwick/particleflow
branch = benchmark_suite
fetchRecurseSubmodules = true
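# A minimal sketch (not part of this commit): initializing the submodule in a
# local checkout, matching the path and branch declared above.
#   git clone --recurse-submodules <repo-url>
#   git submodule update --init --recursive cms/mlpf/particleflow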
# Get the source code for MLPF
COPY particleflow /bmk/cms-mlpf/particleflow
# Copy in the dependencies
COPY cms-mlpf/requirements.txt cms-mlpf/cms-mlpf-bmk.sh cms-mlpf/prepare-dataset.sh /bmk/cms-mlpf/
# Install dependencies
RUN \
# cd /bmk/cms-mlpf && git submodule update --init --recursive
python3 -m pip install --upgrade pip==21.3.1 setuptools==59.5.0 && \
python3 -m pip install --no-cache-dir -r /bmk/cms-mlpf/requirements.txt && \
python3 -m pip install --no-cache-dir /bmk/cms-mlpf/particleflow/hep_tfds && \
# Download and pre-process the dataset
bash /bmk/cms-mlpf/prepare-dataset.sh
# PyTorch-based variant: fetch MLPF and its dependencies directly
RUN \
dnf install -y git && \
git clone https://gitlab.cern.ch/dsouthwi/particleflow.git --depth 1 -b bmk_torch /bmk/cms-mlpf && \
python3 -m ensurepip && \
python3 -m pip install --no-cache-dir torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu124 && \
python3 -m pip install --no-cache-dir comet-ml awkward boost-histogram fastjet tqdm scikit-learn pandas mplhep numba tfds-nightly wheel pyyaml tensorboard && \
# temporary dataset hosting until zenodo link is ready
curl -LO https://dsouthwi.web.cern.ch/files/clic_edm_qq_pf_2.1.0.tar.gz && \
tar zxf clic_edm_qq_pf_2.1.0.tar.gz -C /bmk/cms-mlpf && rm clic_edm_qq_pf_2.1.0.tar.gz
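# A hedged sketch of building and running the image (name/tag taken from the
# spec that follows; assumes the image's entrypoint is cms-mlpf-bmk.sh, so the
# trailing flags are the benchmark script's own options):
#   docker build -t cms-mlpf-bmk:ci-v0.3 .
#   docker run --rm --gpus all -v /tmp/results:/results cms-mlpf-bmk:ci-v0.3 -W -n 2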
HEPWL_BMKEXE=cms-mlpf-bmk.sh
HEPWL_BMKOPTS=""
HEPWL_BMKDIR="cms-mlpf"
HEPWL_BMKDESCRIPTION="CMS Machine-Learned ParticleFlow (MLPF)"
HEPWL_DOCKERIMAGENAME=cms-mlpf-bmk
HEPWL_DOCKERIMAGETAG=ci-v0.3 # NB ci-vX.Y for tests, vX.Y for tags
HEPWL_CVMFSREPOS=NONE
HEPWL_BMKOS="gitlab-registry.cern.ch/linuxsupport/alma9-base:latest"
HEPWL_BMKUSEGPU=1
HEPWL_BUILDARCH="x86_64"
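# A minimal sketch of how a build/driver script might consume this spec
# (the spec filename below is an assumption):
#   source cms-mlpf.spec
#   echo "Building ${HEPWL_DOCKERIMAGENAME}:${HEPWL_DOCKERIMAGETAG} from ${HEPWL_BMKOS}"
#   [ "$HEPWL_BMKUSEGPU" = "1" ] && GPU_FLAGS="--gpus all"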
#!/bin/bash
set -e
# Copyright 2019-2020 CERN. See the COPYRIGHT file at the top-level
# directory of this distribution. For licensing information, see the
# COPYING file at the top-level directory of this distribution.
if [ -f /run/.containerenv ]; then FLAVOR="podman"
elif [ -f /.dockerenv ]; then FLAVOR="docker"
elif [ -f /singularity ]; then FLAVOR="singularity"
else FLAVOR="unknown";
fi
#set -x # enable debug printouts
# Default config
NEPOCHS=2 # must be >1 as 1st epoch is thrown away
NTRAIN=0 # 0 is None
NTEST=0 # 0 is None
BSIZE=4 # 4 is Default
NDEVICES=0 # 0 is Default
DEBUG=0
resultsDir="/results"
skipSubDir=0
MOP="none"
DESCRIPTION="Machine Learning Particle Flow (MLPF) benchmark"
log() {
case $1 in
error) shift 1; echo -e "\e[31m>>> ERROR:\e[0m $*\n" | tee -a $resultsDir/out.log ; exit 2 ;;
info) shift 1; echo -e "\e[34m$*\e[0m\n" | tee -a $resultsDir/out.log ;;
silent) shift 1; echo "$*" >> $resultsDir/out.log ;;
*) echo "$*" | tee -a $resultsDir/out.log ;;
esac
}
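# Typical calls, for reference: "info" and "error" tee a colored message into
# $resultsDir/out.log ("error" also exits 2); "silent" only appends to the log.
#   log info "Running benchmark MLPF"
#   log silent "Creating /results/build"
#   log error "results directory $resultsDir is not writable"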
#set -e # immediate exit on error
function usage(){
echo ""
echo "Usage: $0 [-w | --resultsdir <resultsDir>] [-W] [-c | --copies <NCOPIES>] [-n | --nepochs <NEPOCHS>] " \
"[-B | --batch_size <BSIZE>] [-D | --num_devices <NDEVICES>] [--ntrain <NTRAIN>] [--ntest <NTEST>] " \
"[-m | --mop <mode>] [-d | --debug] [-h | --help]"
echo " -w --resultsdir <resultsDir> : (path) results directory (default: /results , current: $resultsDir)"
echo " -W : store results in <resultsDir> directly"
echo " -n --nepochs : (int) Number of epochs >1 (default: 2, current: $NEPOCHS)"
echo " -B --batch_size : (int) Batch size per device (default: 4, current: $BSIZE)"
echo " -D --num_devices : (int) Number of devices to use (default: 0, current: $NDEVICES)"
echo " --ntrain : (int) Train steps limit (default: 0, current: $NTRAIN)"
echo " --ntest : (int) Test steps limit (default: 0, current: $NTEST)"
echo " -m --mop : (none|all|custom) clean working directory mode: none/all/custom (current: $MOP)"
echo " -d --debug : debug mode"
echo " -h --help : display this help and exit"
echo ""
echo "Mop mode:
none == do not remove working files,
all == remove all produced files (but summary json),
custom == custom implementation"
echo "Without -W (default): results are stored in a new subdirectory of <resultsDir>:"
echo " <resultsDir>/<uniqueid>/*.json"
echo " <resultsDir>/<uniqueid>/proc_1/*.log"
echo " <resultsDir>/<uniqueid>/proc_.../*.log"
echo " <resultsDir>/<uniqueid>/proc_<COPIES>/*.log"
echo "With -W (e.g. in the CI): results are stored in <resultsDir> directly:"
echo " <resultsDir>/*.json"
echo " <resultsDir>/proc_1/*.log"
echo " <resultsDir>/proc_.../*.log"
echo " <resultsDir>/proc_<NCOPIES>/*.log"
echo ""
echo "Without -w (default) and without -W: <resultsDir> is /results"
echo "Without -w (default) and with -W: <resultsDir> is a tmp directory /tmp/xxxx"
echo ""
if [ "$(type -t usage_detailed)" == "function" ]; then
echo -e "\nDetailed Usage:\n----------------\n"
( usage_detailed ) # as a subprocess, just in case this has a 0 exit code...
fi
echo -e "DESCRIPTION\n"
if [ -e $BMKDIR/DESCRIPTION ]; then
cat $BMKDIR/DESCRIPTION
else
echo "Sorry there is no description included."
fi
echo ""
exit 2 # early termination (help or invalid arguments to benchmark script)
}
parse_args() {
options=$(getopt -a -n cms-mlpf-bmk -o w:Wm:n:dD:B:h --long resultsdir:,nepochs:,ntrain:,ntest:,batch_size:,num_devices:,debug,help,mop: -- "$@")
if [ $? != 0 ]; then echo "Invalid options provided." >&2; usage; fi
eval set -- "$options"
while true; do
case "$1" in
--help | -h ) usage;;
--debug | -d ) DEBUG=1 ;;
-W ) skipSubDir=1;;
--mop | -m ) MOP="$2"; shift;;
--resultsdir | -w ) resultsDir="$2"; shift;;
--ntrain ) NTRAIN="$2"; shift;;
--ntest ) NTEST="$2"; shift;;
--nepochs | -n ) NEPOCHS="$2"; shift;;
--num_devices | -D ) NDEVICES="$2"; shift;;
--batch_size | -B ) BSIZE="$2"; shift;;
-- ) shift; break;;
esac
shift
done
}
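# Example invocations accepted by the getopt spec above (illustrative paths and values):
#   ./cms-mlpf-bmk.sh -w /tmp/results -n 4 -B 8 -D 1
#   ./cms-mlpf-bmk.sh --resultsdir /tmp/results --nepochs 4 --ntrain 1000 --mop all -W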
# Function doOne must be defined in each benchmark
# Input argument $1: process index (between 1 and $NCOPIES)
# Return value: please return 0 if this workload copy was successful, 1 otherwise
# The following variables are guaranteed to be defined and exported: NCOPIES, NTHREADS, NEVENTS_THREAD, BMKDIR, DEBUG
# The function is started in process-specific working directory <basewdir>/proc_$1:
# please store here the individual log files for each of the NCOPIES processes
function doOne(){
if [ "$1" == "" ] || [ "$2" != "" ]; then echo "[doOne] ERROR! Invalid arguments '$@' to doOne"; return 1; fi
echo "[doOne ($1)] $(date) starting in $(pwd)"
echo "[doOne ($1)] $(date) EXTRA_ARGS='$EXTRA_ARGS'"
echo "[doOne ($1)] $(date) resultsDir='$resultsDir'"
# TODO: implement MOP, DEBUG
# parse extra args if any (disabled for now)
# options=$(getopt -a -n cms-mlpf-bmk -o g:D:B: --long nepochs:,ntrain:,ntest:,nvalid:,batch-multiplier:,gpus:,dtype:,nworkers:,train -- "$EXTRA_ARGS")
# eval set -- "$options"
# while [ : ]; do
# case "$1" in
# --ntrain ) NTRAIN="$2"; shift;;
# --ntest ) NTEST="$2"; shift;;
# --nvalid ) NVALID="$2"; shift;;
# --nepochs ) NEPOCHS="$2"; shift;;
# --nworkers ) NWORKERS="$2"; shift;;
# --gpus | -g ) NGPUS="$2"; shift;;
# --batch-multiplier | -B ) BMULT="$2"; shift;;
# --dtype | -D ) DTYPE="$2"; shift;;
# --train ) TRAIN="--train";;
# -- ) shift; break;;
# esac
# shift
# done
# set CUDA_VISIBLE_DEVICES for the workload based on nvidia-smi (dirty nvidia-only check),
# before the training process starts so that it sees all detected GPUs
if type -P "nvidia-smi" &>/dev/null; then
DEVICES=$(nvidia-smi -L | wc -l)
log info "Detected $DEVICES nvidia GPUs"
export CUDA_VISIBLE_DEVICES=$(seq -s, 0 $(($DEVICES-1)))
fi
# Run the workload, capturing its output in the process-specific log file
python3 mlpf/pyg_pipeline.py --train \
--config parameters/pytorch/pyg-clic.yaml \
--benchmark_dir $resultsDir \
${EXTRA_ARGS} > out_$1.log 2>&1
status=${?}
echo "[doOne ($1)] $(date) completed (status=$status)"
# Return 0 if this workload copy was successful, 1 otherwise
return $status
}
parse_args "$@"
if [ -f "$resultsDir"/out.log ]; then rm "$resultsDir"/out.log; fi
log info "Base working directory: $resultsDir"
# create $resultsDir/build to satisfy the common build script (mimic bmk-driver.sh)
log silent "Creating $resultsDir/build"
mkdir -p $resultsDir/build
touch $resultsDir/build/.pointlessfile
# Default values for NCOPIES, NTHREADS, NEVENTS_THREAD must be set in each benchmark
NCOPIES=1 # cannot be changed by user input (single-copy workload)
NTHREADS=1 # cannot be changed by user input (single-threaded single-process WL)
NEVENTS_THREAD=1 # not relevant for GPUs
# specific to MLPF
NEPOCHS=6 # must be >1 as 1st epoch is thrown away
NTRAIN=120000 # 0 is None, events to train on
NTEST=36000 # 0 is None, events to test training
BSIZE=8 # 8 is default; batch size (too small under-loads the device, too large risks OOM)
NGPUS=1 # 0 is Default, GPUs
TRAIN="--train"
log info "Running benchmark MLPF"
log silent "Executing 'python3 mlpf/pipeline.py train \
--config parameters/delphes-benchmark.yaml \
--prefix /tmp/train_ \
--plot-freq 1000000 \
--benchmark_dir $resultsDir \
--num_devices $NDEVICES \
--batch_size $BSIZE \
--nepochs $NEPOCHS \
--ntrain $NTRAIN \
--ntest $NTEST'"
cd /bmk/cms-mlpf/particleflow/
python3 mlpf/pipeline.py train \
--config parameters/delphes-benchmark.yaml \
--prefix /tmp/train_ \
--plot-freq 1000000 \
--benchmark_dir $resultsDir \
--num_devices $NDEVICES \
--batch_size $BSIZE \
--nepochs $NEPOCHS \
--ntrain $NTRAIN \
--ntest $NTEST
function usage_detailed(){
echo ""
echo "Additional MLPF parameters:"
echo " --nepochs : (int) Number of epochs >1 (default: $NEPOCHS)"
echo " --ntrain : (int) Train steps limit (default: $NTRAIN)"
echo " --ntest : (int) Test steps limit (default: $NTEST)"
echo " --nvalid : (int) Validation steps limit (default: $NVALID)"
echo " --batch_size : (int) Batch size (default: $BSIZE)"
echo " --dtype : (string) Data type {float32, float16, bfloat16}(default: $DTYPE)"
echo " -B --batch- : (int) Batch multiplier, 1=16G,5=80G GPU memory (default: $BATCH_MULTIPLIER)"
echo " -g --gpus : (int) Number of gpus to use (default: $NGPUS)"
}
REPORT=$(cat $resultsDir/result.json)
generate_json() {
jq -n \
--argjson nepochs "$NEPOCHS" \
--argjson report "$REPORT" \
--arg containment "$FLAVOR" \
--arg description "$DESCRIPTION" \
'{
"run_info":{
$nepochs
},
$report,
"app":{
$containment,
$description
}
}'
}
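# Illustrative only, with hypothetical values: inside a jq object, {$name}
# expands to "name": $name, so a bare $report becomes a "report" key.
#   jq -n --argjson nepochs 2 --argjson report '{"wl-scores":{"train":0.0}}' \
#      --arg containment docker --arg description demo \
#      '{"run_info":{$nepochs}, $report, "app":{$containment,$description}}'
#   # => {"run_info":{"nepochs":2},"report":{"wl-scores":{"train":0.0}},"app":{"containment":"docker","description":"demo"}}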
mkdir -p $resultsDir/report
if [ $skipSubDir -eq 0 ]; then
REPORT_PATH=$resultsDir/report/cms-mlpf_summary.json
else
REPORT_PATH=$resultsDir/cms-mlpf_summary.json
fi
generate_json > $REPORT_PATH
log info "Finished running MLPF. Final report written to $REPORT_PATH"
# Source the common benchmark driver
if [ -f $(dirname $0)/bmk-driver.sh ]; then
. $(dirname $0)/bmk-driver.sh
else
. $(dirname $0)/../../../common/bmk-driver.sh
fi
#!/bin/bash
# Download the particleflow dataset from Zenodo
set -e
# each split is about 50MB pickled and bzip2-compressed
MIRROR=https://zenodo.org/record/4559324/files
DOWNLOAD_DIR=/bmk/data/zenodo
DELPHES_DIR=$DOWNLOAD_DIR/delphes_pf
echo "Download directory: $DOWNLOAD_DIR"
PF_DIR=/bmk/cms-mlpf/particleflow
DATA_DIR=/bmk/cms-mlpf/tensorflow_datasets
echo "Data directory: $DATA_DIR"
# create the download dir
mkdir -p $DELPHES_DIR
# Train splits
for i in $(seq 0 19) ; do
TARGET=tev14_pythia8_ttbar_0_${i}.pkl.bz2
echo "Downloading train split: $TARGET"
wget -q -P $DELPHES_DIR $MIRROR/$TARGET
done
# Test splits
for i in $(seq 0 1) ; do
TARGET=tev14_pythia8_qcd_10_${i}.pkl.bz2
echo "Downloading test split: $TARGET"
wget -q -P $DELPHES_DIR $MIRROR/$TARGET
done
# build TDFS datasets
cd $PF_DIR
tfds build hep_tfds/heptfds/delphes_pf --download_dir $DOWNLOAD_DIR --data_dir $DATA_DIR
rm -rf $DOWNLOAD_DIR
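# Optional sanity check (a sketch; the TFDS dataset name 'delphes_pf' is an
# assumption based on the heptfds package built above):
#   ls $DATA_DIR
#   python3 -c "import tensorflow_datasets as tfds; print(tfds.builder('delphes_pf', data_dir='/bmk/cms-mlpf/tensorflow_datasets').info.splits)"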
numpy==1.19.5
click==8.0.1
tqdm==4.61.1
seaborn==0.11.2
scikit-optimize
tensorflow==2.6.0
keras==2.6.0
tensorflow-addons==0.13.0
tensorflow-datasets==4.4.0
tf-models-official==2.6.0
tensorflow-estimator==2.6.0
tensorflow-probability==0.14.1
keras-tuner==1.0.3
tf2onnx==1.9.2
onnxruntime==1.8.1
mplhep
ray==1.7.0
ray[tune]==1.7.0
nevergrad==0.4.3.post8
# The packages below are not needed to run cms-mlpf-bmk.sh (they are already provided when using TensorFlow's official Docker image as the base)
# pandas==1.1.5
# scikit-learn==0.24.2
# scipy==1.5.4
# scikit-optimize==0.9.0
# matplotlib==3.2.2
# hpbandster==0.7.4
# hyperopt==0.2.5
# hpbandster
# ConfigSpace==0.4.19
# pyaml==6.0
# onnx==1.10.1
# tensorflow-text==2.6.0
# tensorboard==2.6.0
# tensorflow-estimator==2.6.0
# tensorflow-metadata==1.1.0
# tensorflow-model-optimization==0.7.0
# comet-ml==3.15.4
Subproject commit 9324c9625555addd9367133193a3c491fbc74376