Commit 37cf1699 authored by David Southwick

Test extra args

parent caad71bb
[submodule "cms/mlpf/particleflow"]
path = cms/mlpf/particleflow
url = https://github.com/dcsouthwick/particleflow
branch = benchmark_suite
fetchRecurseSubmodules = true
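# A minimal sketch (not part of this commit): initializing the submodule in a
# local checkout, matching the path and branch declared above.
#   git clone --recurse-submodules <repo-url>
#   git submodule update --init --recursive cms/mlpf/particleflow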
# Get the source code for MLPF
COPY particleflow /bmk/cms-mlpf/particleflow
# Copy in the dependencies
COPY cms-mlpf/requirements.txt cms-mlpf/cms-mlpf-bmk.sh cms-mlpf/prepare-dataset.sh /bmk/cms-mlpf/
# Install dependencies
RUN \
# cd /bmk/cms-mlpf && git submodule update --init --recursive
python3 -m pip install --upgrade pip==21.3.1 setuptools==59.5.0 && \
python3 -m pip install --no-cache-dir -r /bmk/cms-mlpf/requirements.txt && \
python3 -m pip install --no-cache-dir /bmk/cms-mlpf/particleflow/hep_tfds && \
# Download and pre-process the dataset
bash /bmk/cms-mlpf/prepare-dataset.sh
# PyTorch-based variant: fetch MLPF and its dependencies directly
RUN \
dnf install -y git && \
git clone https://gitlab.cern.ch/dsouthwi/particleflow.git --depth 1 -b bmk_torch /bmk/cms-mlpf && \
python3 -m ensurepip && \
python3 -m pip install --no-cache-dir torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu124 && \
python3 -m pip install --no-cache-dir comet-ml awkward boost-histogram fastjet tqdm scikit-learn pandas mplhep numba tfds-nightly wheel pyyaml tensorboard && \
# temporary dataset hosting until zenodo link is ready
curl -LO https://dsouthwi.web.cern.ch/files/clic_edm_qq_pf_2.1.0.tar.gz && \
tar zxf clic_edm_qq_pf_2.1.0.tar.gz -C /bmk/cms-mlpf && rm clic_edm_qq_pf_2.1.0.tar.gz
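# A hedged sketch of building and running the image (name/tag taken from the
# spec that follows; assumes the image's entrypoint is cms-mlpf-bmk.sh, so the
# trailing flags are the benchmark script's own options):
#   docker build -t cms-mlpf-bmk:ci-v0.3 .
#   docker run --rm --gpus all -v /tmp/results:/results cms-mlpf-bmk:ci-v0.3 -W -n 2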
HEPWL_BMKEXE=cms-mlpf-bmk.sh
HEPWL_BMKOPTS=""
HEPWL_BMKDIR="cms-mlpf"
HEPWL_BMKDESCRIPTION="CMS Machine-Learned ParticleFlow (MLPF)"
HEPWL_DOCKERIMAGENAME=cms-mlpf-bmk
HEPWL_DOCKERIMAGETAG=ci-v0.3 # NB ci-vX.Y for tests, vX.Y for tags
HEPWL_CVMFSREPOS=NONE
HEPWL_BMKOS="gitlab-registry.cern.ch/linuxsupport/alma9-base:latest"
HEPWL_BMKUSEGPU=1
HEPWL_BUILDARCH="x86_64"
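# A minimal sketch of how a build/driver script might consume this spec
# (the spec filename below is an assumption):
#   source cms-mlpf.spec
#   echo "Building ${HEPWL_DOCKERIMAGENAME}:${HEPWL_DOCKERIMAGETAG} from ${HEPWL_BMKOS}"
#   [ "$HEPWL_BMKUSEGPU" = "1" ] && GPU_FLAGS="--gpus all"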
#!/bin/bash
set -e
# Copyright 2019-2020 CERN. See the COPYRIGHT file at the top-level
# directory of this distribution. For licensing information, see the
# COPYING file at the top-level directory of this distribution.
if [ -f /run/.containerenv ]; then FLAVOR="podman"
elif [ -f /.dockerenv ]; then FLAVOR="docker"
elif [ -f /singularity ]; then FLAVOR="singularity"
else FLAVOR="unknown";
fi
#set -x # enable debug printouts
# Default config
NEPOCHS=2 # must be >1 as 1st epoch is thrown away
NTRAIN=0 # 0 is None
NTEST=0 # 0 is None
BSIZE=4 # 4 is Default
NDEVICES=0 # 0 is Default
DEBUG=0
resultsDir="/results"
skipSubDir=0
MOP="none"
DESCRIPTION="Machine Learning Particle Flow (MLPF) benchmark"
log() {
case $1 in
error) shift 1; echo -e "\e[31m>>> ERROR:\e[0m $*\n" | tee -a $resultsDir/out.log ; exit 2 ;;
info) shift 1; echo -e "\e[34m$*\e[0m\n" | tee -a $resultsDir/out.log ;;
silent) shift 1; echo "$*" >> $resultsDir/out.log ;;
*) echo "$*" | tee -a $resultsDir/out.log ;;
esac
}
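# Typical calls, for reference: "info" and "error" tee a colored message into
# $resultsDir/out.log ("error" also exits 2); "silent" only appends to the log.
#   log info "Running benchmark MLPF"
#   log silent "Creating /results/build"
#   log error "results directory $resultsDir is not writable"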
#set -e # immediate exit on error
function usage(){
echo ""
echo "Usage: $0 [-w | --resultsdir <resultsDir>] [-W] [-c | --copies <NCOPIES>] [-n | --nepochs <NEPOCHS>] " \
"[-B | --batch_size <BSIZE>] [-D | --num_devices <NDEVICES>] [--ntrain <NTRAIN>] [--ntest <NTEST>] " \
"[-m | --mop <mode>] [-d | --debug] [-h | --help]"
echo " -w --resultsdir <resultsDir> : (path) results directory (default: /results , current: $resultsDir)"
echo " -W : store results in <resultsDir> directly"
echo " -n --nepochs : (int) Number of epochs >1 (default: 2, current: $NEPOCHS)"
echo " -B --batch_size : (int) Batch size per device (default: 4, current: $BSIZE)"
echo " -D --num_devices : (int) Number of devices to use (default: 0, current: $NDEVICES)"
echo " --ntrain : (int) Train steps limit (default: 0, current: $NTRAIN)"
echo " --ntest : (int) Test steps limit (default: 0, current: $NTEST)"
echo " -m --mop : (none|all|custom) clean working directory mode: none/all/custom (current: $MOP)"
echo " -d --debug : debug mode"
echo " -h --help : display this help and exit"
echo ""
echo "Mop mode:
none == do not remove working files,
all == remove all produced files (but summary json),
custom == custom implementation"
echo "Without -W (default): results are stored in a new subdirectory of <resultsDir>:"
echo " <resultsDir>/<uniqueid>/*.json"
echo " <resultsDir>/<uniqueid>/proc_1/*.log"
echo " <resultsDir>/<uniqueid>/proc_.../*.log"
echo " <resultsDir>/<uniqueid>/proc_<COPIES>/*.log"
echo "With -W (e.g. in the CI): results are stored in <resultsDir> directly:"
echo " <resultsDir>/*.json"
echo " <resultsDir>/proc_1/*.log"
echo " <resultsDir>/proc_.../*.log"
echo " <resultsDir>/proc_<NCOPIES>/*.log"
echo ""
echo "Without -w (default) and without -W: <resultsDir> is /results"
echo "Without -w (default) and with -W: <resultsDir> is a tmp directory /tmp/xxxx"
echo ""
if [ "$(type -t usage_detailed)" == "function" ]; then
echo -e "\nDetailed Usage:\n----------------\n"
( usage_detailed ) # as a subprocess, just in case this has a 0 exit code...
fi
echo -e "DESCRIPTION\n"
if [ -e $BMKDIR/DESCRIPTION ]; then
cat $BMKDIR/DESCRIPTION
else
echo "Sorry there is no description included."
fi
echo ""
exit 2 # early termination (help or invalid arguments to benchmark script)
}
parse_args() {
options=$(getopt -a -n cms-mlpf-bmk -o w:Wm:n:dD:B:h --long resultsdir:,nepochs:,ntrain:,ntest:,batch_size:,num_devices:,debug,help,mop: -- "$@")
if [ $? != 0 ]; then echo "Invalid options provided." >&2; usage; fi
eval set -- "$options"
while true; do
case "$1" in
--help | -h ) usage;;
--debug | -d ) DEBUG=1 ;;
-W ) skipSubDir=1;;
--mop | -m ) MOP="$2"; shift;;
--resultsdir | -w ) resultsDir="$2"; shift;;
--ntrain ) NTRAIN="$2"; shift;;
--ntest ) NTEST="$2"; shift;;
--nepochs | -n ) NEPOCHS="$2"; shift;;
--num_devices | -D ) NDEVICES="$2"; shift;;
--batch_size | -B ) BSIZE="$2"; shift;;
-- ) shift; break;;
esac
shift
done
}
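# Example invocations accepted by the getopt spec above (illustrative paths and values):
#   ./cms-mlpf-bmk.sh -w /tmp/results -n 4 -B 8 -D 1
#   ./cms-mlpf-bmk.sh --resultsdir /tmp/results --nepochs 4 --ntrain 1000 --mop all -W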
# Function doOne must be defined in each benchmark
# Input argument $1: process index (between 1 and $NCOPIES)
# Return value: please return 0 if this workload copy was successful, 1 otherwise
# The following variables are guaranteed to be defined and exported: NCOPIES, NTHREADS, NEVENTS_THREAD, BMKDIR, DEBUG
# The function is started in process-specific working directory <basewdir>/proc_$1:
# please store here the individual log files for each of the NCOPIES processes
function doOne(){
if [ "$1" == "" ] || [ "$2" != "" ]; then echo "[doOne] ERROR! Invalid arguments '$@' to doOne"; return 1; fi
echo "[doOne ($1)] $(date) starting in $(pwd)"
echo "[doOne ($1)] $(date) EXTRA_ARGS='$EXTRA_ARGS'"
echo "[doOne ($1)] $(date) resultsDir='$resultsDir'"
# TODO: implement MOP, DEBUG
# parse extra args if any (disabled for now)
# options=$(getopt -a -n cms-mlpf-bmk -o g:D:B: --long nepochs:,ntrain:,ntest:,nvalid:,batch-multiplier:,gpus:,dtype:,nworkers:,train -- "$EXTRA_ARGS")
# eval set -- "$options"
# while [ : ]; do
# case "$1" in
# --ntrain ) NTRAIN="$2"; shift;;
# --ntest ) NTEST="$2"; shift;;
# --nvalid ) NVALID="$2"; shift;;
# --nepochs ) NEPOCHS="$2"; shift;;
# --nworkers ) NWORKERS="$2"; shift;;
# --gpus | -g ) NGPUS="$2"; shift;;
# --batch-multiplier | -B ) BMULT="$2"; shift;;
# --dtype | -D ) DTYPE="$2"; shift;;
# --train ) TRAIN="--train";;
# -- ) shift; break;;
# esac
# shift
# done
# set CUDA_VISIBLE_DEVICES for the workload based on nvidia-smi (dirty nvidia-only check),
# before the training process starts so that it sees all detected GPUs
if type -P "nvidia-smi" &>/dev/null; then
DEVICES=$(nvidia-smi -L | wc -l)
log info "Detected $DEVICES nvidia GPUs"
export CUDA_VISIBLE_DEVICES=$(seq -s, 0 $(($DEVICES-1)))
fi
# Run the workload, capturing its output in the process-specific log file
python3 mlpf/pyg_pipeline.py --train \
--config parameters/pytorch/pyg-clic.yaml \
--benchmark_dir $resultsDir \
${EXTRA_ARGS} > out_$1.log 2>&1
status=${?}
echo "[doOne ($1)] $(date) completed (status=$status)"
# Return 0 if this workload copy was successful, 1 otherwise
return $status
}
parse_args "$@"
if [ -f "$resultsDir"/out.log ]; then rm "$resultsDir"/out.log; fi
log info "Base working directory: $resultsDir"
# create $resultsDir/build to satisfy the common build script (mimic bmk-driver.sh)
log silent "Creating $resultsDir/build"
mkdir -p $resultsDir/build
touch $resultsDir/build/.pointlessfile
# Default values for NCOPIES, NTHREADS, NEVENTS_THREAD must be set in each benchmark
NCOPIES=1 # cannot be changed by user input (single-copy workload)
NTHREADS=1 # cannot be changed by user input (single-threaded single-process WL)
NEVENTS_THREAD=1 # not relevant for GPUs
# specific to MLPF
NEPOCHS=6 # must be >1 as 1st epoch is thrown away
NTRAIN=120000 # 0 is None, events to train on
NTEST=36000 # 0 is None, events to test training
BSIZE=8 # 8 is default; batch size (too small under-loads the device, too large risks OOM)
NGPUS=1 # 0 is Default, GPUs
TRAIN="--train"
log info "Running benchmark MLPF"
log silent "Executing 'python3 mlpf/pipeline.py train \
--config parameters/delphes-benchmark.yaml \
--prefix /tmp/train_ \
--plot-freq 1000000 \
--benchmark_dir $resultsDir \
--num_devices $NDEVICES \
--batch_size $BSIZE \
--nepochs $NEPOCHS \
--ntrain $NTRAIN \
--ntest $NTEST'"
cd /bmk/cms-mlpf/particleflow/
python3 mlpf/pipeline.py train \
--config parameters/delphes-benchmark.yaml \
--prefix /tmp/train_ \
--plot-freq 1000000 \
--benchmark_dir $resultsDir \
--num_devices $NDEVICES \
--batch_size $BSIZE \
--nepochs $NEPOCHS \
--ntrain $NTRAIN \
--ntest $NTEST
function usage_detailed(){
echo ""
echo "Additional MLPF parameters:"
echo " --nepochs : (int) Number of epochs >1 (default: $NEPOCHS)"
echo " --ntrain : (int) Train steps limit (default: $NTRAIN)"
echo " --ntest : (int) Test steps limit (default: $NTEST)"
echo " --nvalid : (int) Validation steps limit (default: $NVALID)"
echo " --batch_size : (int) Batch size (default: $BSIZE)"
echo " --dtype : (string) Data type {float32, float16, bfloat16}(default: $DTYPE)"
echo " -B --batch- : (int) Batch multiplier, 1=16G,5=80G GPU memory (default: $BATCH_MULTIPLIER)"
echo " -g --gpus : (int) Number of gpus to use (default: $NGPUS)"
}
REPORT=$(cat $resultsDir/result.json)
generate_json() {
jq -n \
--argjson nepochs "$NEPOCHS" \
--argjson report "$REPORT" \
--arg containment "$FLAVOR" \
--arg description "$DESCRIPTION" \
'{
"run_info":{
$nepochs
},
$report,
"app":{
$containment,
$description
}
}'
}
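# Illustrative only, with hypothetical values: inside a jq object, {$name}
# expands to "name": $name, so a bare $report becomes a "report" key.
#   jq -n --argjson nepochs 2 --argjson report '{"wl-scores":{"train":0.0}}' \
#      --arg containment docker --arg description demo \
#      '{"run_info":{$nepochs}, $report, "app":{$containment,$description}}'
#   # => {"run_info":{"nepochs":2},"report":{"wl-scores":{"train":0.0}},"app":{"containment":"docker","description":"demo"}}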
mkdir -p $resultsDir/report
if [ $skipSubDir -eq 0 ]; then
REPORT_PATH=$resultsDir/report/cms-mlpf_summary.json
else
REPORT_PATH=$resultsDir/cms-mlpf_summary.json
fi
generate_json > $REPORT_PATH
log info "Finished running MLPF. Final report written to $REPORT_PATH"
# Source the common benchmark driver
if [ -f $(dirname $0)/bmk-driver.sh ]; then
. $(dirname $0)/bmk-driver.sh
else
. $(dirname $0)/../../../common/bmk-driver.sh
fi
#!/bin/bash
# Download the particleflow dataset from Zenodo
set -e
# each split is about 50MB pickled and bzip2-compressed
MIRROR=https://zenodo.org/record/4559324/files
DOWNLOAD_DIR=/bmk/data/zenodo
DELPHES_DIR=$DOWNLOAD_DIR/delphes_pf
echo "Download directory: $DOWNLOAD_DIR"
PF_DIR=/bmk/cms-mlpf/particleflow
DATA_DIR=/bmk/cms-mlpf/tensorflow_datasets
echo "Data directory: $DATA_DIR"
# create the download dir
mkdir -p $DELPHES_DIR
# Train splits
for i in $(seq 0 19) ; do
TARGET=tev14_pythia8_ttbar_0_${i}.pkl.bz2
echo "Downloading train split: $TARGET"
wget -q -P $DELPHES_DIR $MIRROR/$TARGET
done
# Test splits
for i in $(seq 0 1) ; do
TARGET=tev14_pythia8_qcd_10_${i}.pkl.bz2
echo "Downloading test split: $TARGET"
wget -q -P $DELPHES_DIR $MIRROR/$TARGET
done
# build TDFS datasets
cd $PF_DIR
tfds build hep_tfds/heptfds/delphes_pf --download_dir $DOWNLOAD_DIR --data_dir $DATA_DIR
rm -rf $DOWNLOAD_DIR
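# Optional sanity check (a sketch; the TFDS dataset name 'delphes_pf' is an
# assumption based on the heptfds package built above):
#   ls $DATA_DIR
#   python3 -c "import tensorflow_datasets as tfds; print(tfds.builder('delphes_pf', data_dir='/bmk/cms-mlpf/tensorflow_datasets').info.splits)"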
numpy==1.19.5
click==8.0.1
tqdm==4.61.1
seaborn==0.11.2
scikit-optimize
tensorflow==2.6.0
keras==2.6.0
tensorflow-addons==0.13.0
tensorflow-datasets==4.4.0
tf-models-official==2.6.0
tensorflow-estimator==2.6.0
tensorflow-probability==0.14.1
keras-tuner==1.0.3
tf2onnx==1.9.2
onnxruntime==1.8.1
mplhep
ray==1.7.0
ray[tune]==1.7.0
nevergrad==0.4.3.post8
# The packages below are not needed to run cms-mlpf-bmk.sh (they are already provided when using TensorFlow's official Docker image as the base)
# pandas==1.1.5
# scikit-learn==0.24.2
# scipy==1.5.4
# scikit-optimize==0.9.0
# matplotlib==3.2.2
# hpbandster==0.7.4
# hyperopt==0.2.5
# hpbandster
# ConfigSpace==0.4.19
# pyaml==6.0
# onnx==1.10.1
# tensorflow-text==2.6.0
# tensorboard==2.6.0
# tensorflow-estimator==2.6.0
# tensorflow-metadata==1.1.0
# tensorflow-model-optimization==0.7.0
# comet-ml==3.15.4
Subproject commit 9324c9625555addd9367133193a3c491fbc74376