David Southwick · 6474b125 · ca41dcfd · 6474b125
cms/mlpf/cms-mlpf/cms-mlpf-bmk.sh: +100 −35
--- a/cms/mlpf/cms-mlpf/cms-mlpf-bmk.sh
+++ b/cms/mlpf/cms-mlpf/cms-mlpf-bmk.sh
@@ -9,74 +9,134 @@ else FLAVOR="unknown";
 fi

 # Default config
-NEPOCHS=1
+NEPOCHS=2     # number of epochs (-n/--nepochs)
 NTRAIN=0      # 0 is None
 NTEST=0       # 0 is None
-BSIZE=0       # 0 is Default
+BSIZE=4       # batch size per device (-B/--batch_size); 0 is Default
 NDEVICES=0    # 0 is Default
-RESULTSDIR=/results
+DEBUG=0                 # set to 1 by -d/--debug; not acted on yet (see the TODO before the parse_args call)
+resultsDir="/results"   # base results directory (-w/--resultsdir); out.log is written here
+skipSubDir=0            # set to 1 by -W; the summary JSON is then written directly into resultsDir
+MOP="none"              # cleanup mode none|all|custom (-m/--mop); not acted on yet
 DESCRIPTION="Machine Learning Particle Flow (MLPF) benchmark"

 log() {
  case $1 in
-    error)  shift 1; echo -e "\e[31m>>> ERROR:\e[0m $*\n" | tee -a $RESULTSDIR/out.log ; exit 2 ;;
-    info)   shift 1; echo -e "\e[34m$*\e[0m\n" | tee -a $RESULTSDIR/out.log ;;
-    silent) shift 1; echo "$*" >> $RESULTSDIR/out.log ;;
-    *)      echo "$*" | tee -a $RESULTSDIR/out.log ;
+    error)  shift 1; echo -e "\e[31m>>> ERROR:\e[0m $*\n" | tee -a "$resultsDir/out.log" ; exit 2 ;;  # red message to stdout and out.log, then abort with status 2
+    info)   shift 1; echo -e "\e[34m$*\e[0m\n" | tee -a "$resultsDir/out.log" ;;  # blue message to stdout and out.log
+    silent) shift 1; echo "$*" >> "$resultsDir/out.log" ;;  # out.log only; the path is quoted so a results dir with spaces works
+    *)      echo "$*" | tee -a "$resultsDir/out.log" ;  # no level keyword: plain message to stdout and out.log
  esac
 }

-usage() {
-  echo "Usage: $0 [options]"
-  echo "Options:"
-  echo "-h, --help              Prints this message and exit."
-  echo "-w, --resultsdir <str>  Results directory.        Default: $RESULTSDIR"
-  echo "-e, --nepochs <int>     Number of epochs.         Default: $NEPOCHS"
-  echo "-B, --batch_size <int>  Batch size per device.    Default: $BSIZE"
-  echo "-D, --num_devices <int> Number of devices to use. Default: $NDEVICES"
-  echo "    --ntrain <int>      Train steps limit.        Default: $NTRAIN"
-  echo "    --ntest <int>       Test steps limit.         Default: $NTEST"
-  exit 0
+function usage(){  # print the option summary, results layout and description, then exit 2
+  echo ""
+  echo "Usage: $0 [-w | --resultsdir <resultsDir>] [-W] [-n | --nepochs <NEPOCHS>] " \
+                 "[-B | --batch_size <BSIZE>] [-D | --num_devices <NDEVICES>] [--ntrain <NTRAIN>] [--ntest <NTEST>] " \
+                 "[-m | --mop <mode>] [-d | --debug] [-h | --help]"
+  echo "  -w --resultsdir <resultsDir> : (path) results directory (default: /results , current: $resultsDir)"
+  echo "  -W                           : store results in <resultsDir> directly"
+  echo "  -n --nepochs                 : (int) Number of epochs (default: 2, current: $NEPOCHS)"
+  echo "  -B --batch_size              : (int) Batch size per device (default: 4, current: $BSIZE)"
+  echo "  -D --num_devices             : (int) Number of devices to use (default: 0, current: $NDEVICES)"
+  echo "     --ntrain                  : (int) Train steps limit (default: 0, current: $NTRAIN)"
+  echo "     --ntest                   : (int) Test steps limit (default: 0, current: $NTEST)"
+  echo "  -m --mop                     : (none|all|custom) clean working directory mode: none/all/custom (current: $MOP)"
+  echo "  -d --debug                   : debug mode"
+  echo "  -h --help                    : display this help and exit"
+  echo ""
+  echo "Mop mode: 
+          none   == do not remove working files, 
+          all    == remove all produced files (but summary json), 
+          custom == custom implementation"
+  echo "Without -W (default): results are stored in a new subdirectory of <resultsDir>:"
+  echo "  <resultsDir>/<uniqueid>/*.json"
+  echo "  <resultsDir>/<uniqueid>/proc_1/*.log"
+  echo "  <resultsDir>/<uniqueid>/proc_.../*.log"
+  echo "  <resultsDir>/<uniqueid>/proc_<COPIES>/*.log"
+  echo "With -W (e.g. in the CI): results are stored in <resultsDir> directly:"
+  echo "  <resultsDir>/*.json"
+  echo "  <resultsDir>/proc_1/*.log"
+  echo "  <resultsDir>/proc_.../*.log"
+  echo "  <resultsDir>/proc_<NCOPIES>/*.log"
+  echo ""
+  echo "Without -w (default) and without -W: <resultsDir> is /results"
+  echo "Without -w (default) and with -W: <resultsDir> is a tmp directory /tmp/xxxx"
+  echo ""
+  if [ "$(type -t usage_detailed)" == "function" ]; then  # optional hook: only called when such a function is defined
+    echo -e "\nDetailed Usage:\n----------------\n"
+    ( usage_detailed ) # as a subprocess, just in case this has a 0 exit code...
+  fi
+  echo -e "DESCRIPTION\n"
+  if [ -e "$BMKDIR/DESCRIPTION" ]; then  # BMKDIR is not set in the visible part of the script; quoted so an odd value cannot split the test
+      cat "$BMKDIR/DESCRIPTION"
+  else
+      echo "Sorry there is no description included."
+  fi
+  echo ""
+  exit 2 # early termination (help or invalid arguments to benchmark script)
 }

 parse_args() {
-  options=$(getopt --long resultsdir:,nepochs:,ntrain:,ntest:,batch_size:,num_devices:,help -o wWeDB:h -- "$@")
+  options=$(getopt -a -n cms-mlpf-bmk -o w:Wm:n:dD:B:h --long resultsdir:,nepochs:,ntrain:,ntest:,batch_size:,num_devices:,debug,help,mop: -- "$@")  # 'mop:' - --mop takes a value just like -m; without the colon its value was left behind as a positional
  if [ $? != 0 ]; then echo "Invalid options provided." >&2; usage; fi
  eval set -- "$options"
  while true; do
-    case $1 in
+    case "$1" in  # getopt has normalised the list: options first, then '--', then positionals
      --help | -h ) usage; exit 0;;
-      --resultsdir | -w ) RESULTSDIR=$2; shift ;;
-      --ntrain ) NTRAIN=$2; shift ;;
-      --ntest ) NTEST=$2; shift ;;
-      --nepochs | -e ) NEPOCHS=$2; shift ;;
-      --num_devices | -D ) NDEVICES=$2; shift ;;
-      --batch_size | -B ) BSIZE=$2; shift ;;
+      --debug | -d ) DEBUG=1 ;;
+      -W ) skipSubDir=1;;
+      --mop | -m ) MOP="$2"; shift;;  # value options read $2; the extra shift here skips that value
+      --resultsdir | -w ) resultsDir="$2"; shift;;
+      --ntrain ) NTRAIN="$2"; shift;;
+      --ntest ) NTEST="$2"; shift;;
+      --nepochs | -n ) NEPOCHS="$2"; shift;;
+      --num_devices | -D ) NDEVICES="$2"; shift;;
+      --batch_size | -B ) BSIZE="$2"; shift;;
      -- ) shift; break;;
    esac
    shift
  done
 }

+# TODO: implement MOP, DEBUG (both are parsed by parse_args but not acted on yet)
+
-parse_args $*
+parse_args "$@"  # quoted "$@" hands every argument over unsplit (e.g. -w "/my results"); $* would re-split it

-if [ -f "$RESULTSDIR"/out.log ]; then rm "$RESULTSDIR"/out.log; fi
-log info "Base working directory: $RESULTSDIR"
-log info "Running benchmark MLPF"
+if [ -f "$resultsDir"/out.log ]; then rm "$resultsDir"/out.log; fi  # start every run with a fresh out.log
+log info "Base working directory: $resultsDir"
+
+# set CUDA_VISIBLE_DEVICES for tensorflow based on nvidia-smi (dirty nvidia-only check)
+if type -P "nvidia-smi" &>/dev/null; then
+  DEVICES=$(nvidia-smi -L | wc -l)  # counts the lines printed by 'nvidia-smi -L'
+  log info "Detected $DEVICES nvidia GPUs"
+  export CUDA_VISIBLE_DEVICES=$(seq -s, 0 $(($DEVICES-1)))  # comma-separated indices 0..DEVICES-1
+fi

-cd /workspace/particleflow/
+log info "Running benchmark MLPF"
+log silent "Executing 'python3 mlpf/pipeline.py train \
+  --config parameters/delphes-benchmark.yaml \
+  --prefix /tmp/train_ \
+  --plot-freq 1000000 \
+  --benchmark_dir $resultsDir \
+  --num_devices $NDEVICES \
+  --batch_size $BSIZE \
+  --nepochs $NEPOCHS \
+  --ntrain $NTRAIN \
+  --ntest $NTEST'"
+cd /bmk/cms-mlpf/particleflow/ || log error "Cannot change directory to /bmk/cms-mlpf/particleflow/"  # 'log error' exits 2: never run the pipeline from the wrong directory
 python3 mlpf/pipeline.py train \
  --config parameters/delphes-benchmark.yaml \
  --prefix /tmp/train_ \
  --plot-freq 1000000 \
-  --benchmark_dir $RESULTSDIR \
+  --benchmark_dir "$resultsDir" \
  --num_devices $NDEVICES \
  --batch_size $BSIZE \
  --nepochs $NEPOCHS \
  --ntrain $NTRAIN \
-  --ntest $NTEST
+  --ntest $NTEST || log error "MLPF pipeline exited with a non-zero status"

-REPORT=$(cat $RESULTSDIR/result.json)
+REPORT=$(cat "$resultsDir/result.json") || log error "Cannot read $resultsDir/result.json"  # stop here rather than build a report from an empty REPORT

 generate_json() {
  jq -n \
@@ -95,6 +155,11 @@ generate_json() {
      }
    }'
 }
-
-generate_json > $RESULTSDIR/mlpf-report.json
-log info "Finished running MLPF"
+mkdir -p "$resultsDir/report" || log error "Cannot create $resultsDir/report"  # 'log error' exits 2
+if [ "$skipSubDir" -eq 0 ]; then  # skipSubDir is 1 only when -W was given
+  REPORT_PATH="$resultsDir/report/cms-mlpf_summary.json"
+else
+  REPORT_PATH="$resultsDir/cms-mlpf_summary.json"
+fi
+generate_json > "$REPORT_PATH"  # quoted so a results directory containing spaces still yields a single file
+log info "Finished running MLPF. Final report written to $REPORT_PATH"