diff --git a/.gitmodules b/.gitmodules
index aed4fd3e7e9995d169ec516e4ef3eae15e3ff092..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -1,5 +0,0 @@
-[submodule "cms/mlpf/particleflow"]
-	path = cms/mlpf/particleflow
-	url = https://github.com/dcsouthwick/particleflow
-        branch = benchmark_suite
-	fetchRecurseSubmodules = true
diff --git a/WL_list.md b/WL_list.md
index a66f27314cc5553b327bb21b59d26874b6b85361..c00ab08044f088a098f2ec4d5c571c0192e1245a 100644
--- a/WL_list.md
+++ b/WL_list.md
@@ -21,22 +21,22 @@ where:
 
 | Experiment |  WL repo  | SIF image registry | Docker image registry| Latest Built Version | Latest Pipeline status | Unpacked container size | 
 | -------- | -------- | -------- | -------- | -------- | -------- | -------- |
-| alice | [digi-reco-core-run3-ma][alice_digi-reco-core-run3-ma_code] | [click for link][alice_digi-reco-core-run3-ma_sif] | [click for link][alice_digi-reco-core-run3-ma_img] | [v3.0][alice_digi-reco-core-run3-ma_pipelink] | ![ci][alice_digi-reco-core-run3-ma_pipeline]| 16G |
-| atlas | [gen_sherpa-ma][atlas_gen_sherpa-ma_code] | [click for link][atlas_gen_sherpa-ma_sif] | [click for link][atlas_gen_sherpa-ma_img] | [v2.2][atlas_gen_sherpa-ma_pipelink] | ![ci][atlas_gen_sherpa-ma_pipeline]|  22G  |
-| atlas | [reco_mt-ma][atlas_reco_mt-ma_code] | [click for link][atlas_reco_mt-ma_sif] | [click for link][atlas_reco_mt-ma_img] | [v2.3][atlas_reco_mt-ma_pipelink] | ![ci][atlas_reco_mt-ma_pipeline]|       22G       |
-| atlas | [sim_mt-ma][atlas_sim_mt-ma_code] | [click for link][atlas_sim_mt-ma_sif] | [click for link][atlas_sim_mt-ma_img] | [v2.1][atlas_sim_mt-ma_pipelink] | ![ci][atlas_sim_mt-ma_pipeline]|      20G      |
-| belle2 | [gen-sim-reco-ma][belle2_gen-sim-reco-ma_code] | [click for link][belle2_gen-sim-reco-ma_sif] | [click for link][belle2_gen-sim-reco-ma_img] | [v2.2][belle2_gen-sim-reco-ma_pipelink] | ![ci][belle2_gen-sim-reco-ma_pipeline]|                 3.4G                 |
-| cms | [digi-run3-ma][cms_digi-run3-ma_code] | [click for link][cms_digi-run3-ma_sif] | [click for link][cms_digi-run3-ma_img] | [v1.1][cms_digi-run3-ma_pipelink] | ![ci][cms_digi-run3-ma_pipeline]|              5.8G              |
-| cms | [gen-sim-run3-ma][cms_gen-sim-run3-ma_code] | [click for link][cms_gen-sim-run3-ma_sif] | [click for link][cms_gen-sim-run3-ma_img] | [v1.1][cms_gen-sim-run3-ma_pipelink] | ![ci][cms_gen-sim-run3-ma_pipeline]|                            6.2G                            |
-| cms | [hlt-ma][cms_hlt-ma_code] | [click for link][cms_hlt-ma_sif] | [click for link][cms_hlt-ma_img] | [v0.2][cms_hlt-ma_pipelink] | ![ci][cms_hlt-ma_pipeline]|                    19G                    |
-| cms | [mlpf][cms_mlpf_code] | [click for link][cms_mlpf_sif] | [click for link][cms_mlpf_img] | [v0.1][cms_mlpf_pipelink] | ![ci][cms_mlpf_pipeline]|                                                            |
-| cms | [reco-run3-ma][cms_reco-run3-ma_code] | [click for link][cms_reco-run3-ma_sif] | [click for link][cms_reco-run3-ma_img] | [v1.2][cms_reco-run3-ma_pipelink] | ![ci][cms_reco-run3-ma_pipeline]|               6.5G               |
-| hello | [world-c7-ma][hello_world-c7-ma_code] | [click for link][hello_world-c7-ma_sif] | [click for link][hello_world-c7-ma_img] | [v1.0][hello_world-c7-ma_pipelink] | ![ci][hello_world-c7-ma_pipeline]|       759M       |
-| hello | [world-cs8-ma][hello_world-cs8-ma_code] | [click for link][hello_world-cs8-ma_sif] | [click for link][hello_world-cs8-ma_img] | [ci-v1.0][hello_world-cs8-ma_pipelink] | ![ci][hello_world-cs8-ma_pipeline]|                  518M                  |
-| igwn | [pe][igwn_pe_code] | [click for link][igwn_pe_sif] | [click for link][igwn_pe_img] | [v0.5][igwn_pe_pipelink] | ![ci][igwn_pe_pipeline]|         2.9G         |
-| juno | [gen-sim-reco][juno_gen-sim-reco_code] | [click for link][juno_gen-sim-reco_sif] | [click for link][juno_gen-sim-reco_img] | [v3.1][juno_gen-sim-reco_pipelink] | ![ci][juno_gen-sim-reco_pipeline]|    3.3G    |
-| lhcb | [sim-run3-ma][lhcb_sim-run3-ma_code] | [click for link][lhcb_sim-run3-ma_sif] | [click for link][lhcb_sim-run3-ma_img] | [v1.1][lhcb_sim-run3-ma_pipelink] | ![ci][lhcb_sim-run3-ma_pipeline]|                5.4G                |
-| mg5amc | [madgraph4gpu-2022][mg5amc_madgraph4gpu-2022_code] | [click for link][mg5amc_madgraph4gpu-2022_sif] | [click for link][mg5amc_madgraph4gpu-2022_img] | [ci-v0.10][mg5amc_madgraph4gpu-2022_pipelink] | ![ci][mg5amc_madgraph4gpu-2022_pipeline]|                        11G                        |
+| alice | [digi-reco-core-run3-ma][alice_digi-reco-core-run3-ma_code] | [click for link][alice_digi-reco-core-run3-ma_sif] | [click for link][alice_digi-reco-core-run3-ma_img] | [v3.0][alice_digi-reco-core-run3-ma_pipelink] | ![ci][alice_digi-reco-core-run3-ma_pipeline]| 16G |
+| atlas | [gen_sherpa-ma][atlas_gen_sherpa-ma_code] | [click for link][atlas_gen_sherpa-ma_sif] | [click for link][atlas_gen_sherpa-ma_img] | [v2.2][atlas_gen_sherpa-ma_pipelink] | ![ci][atlas_gen_sherpa-ma_pipeline]| 22G |
+| atlas | [reco_mt-ma][atlas_reco_mt-ma_code] | [click for link][atlas_reco_mt-ma_sif] | [click for link][atlas_reco_mt-ma_img] | [v2.3][atlas_reco_mt-ma_pipelink] | ![ci][atlas_reco_mt-ma_pipeline]| 22G |
+| atlas | [sim_mt-ma][atlas_sim_mt-ma_code] | [click for link][atlas_sim_mt-ma_sif] | [click for link][atlas_sim_mt-ma_img] | [v2.1][atlas_sim_mt-ma_pipelink] | ![ci][atlas_sim_mt-ma_pipeline]| 20G |
+| belle2 | [gen-sim-reco-ma][belle2_gen-sim-reco-ma_code] | [click for link][belle2_gen-sim-reco-ma_sif] | [click for link][belle2_gen-sim-reco-ma_img] | [v2.2][belle2_gen-sim-reco-ma_pipelink] | ![ci][belle2_gen-sim-reco-ma_pipeline]| 3.4G |
+| cms | [digi-run3-ma][cms_digi-run3-ma_code] | [click for link][cms_digi-run3-ma_sif] | [click for link][cms_digi-run3-ma_img] | [v1.1][cms_digi-run3-ma_pipelink] | ![ci][cms_digi-run3-ma_pipeline]| 5.8G |
+| cms | [gen-sim-run3-ma][cms_gen-sim-run3-ma_code] | [click for link][cms_gen-sim-run3-ma_sif] | [click for link][cms_gen-sim-run3-ma_img] | [v1.1][cms_gen-sim-run3-ma_pipelink] | ![ci][cms_gen-sim-run3-ma_pipeline]| 6.2G |
+| cms | [hlt-ma][cms_hlt-ma_code] | [click for link][cms_hlt-ma_sif] | [click for link][cms_hlt-ma_img] | [v0.2][cms_hlt-ma_pipelink] | ![ci][cms_hlt-ma_pipeline]| 19G |
+| cms | [mlpf][cms_mlpf_code] | [click for link][cms_mlpf_sif] | [click for link][cms_mlpf_img] | [ci-v0.4][cms_mlpf_pipelink] | ![ci][cms_mlpf_pipeline]| 7.6G |
+| cms | [reco-run3-ma][cms_reco-run3-ma_code] | [click for link][cms_reco-run3-ma_sif] | [click for link][cms_reco-run3-ma_img] | [v1.2][cms_reco-run3-ma_pipelink] | ![ci][cms_reco-run3-ma_pipeline]| 6.5G |
+| hello | [world-c7-ma][hello_world-c7-ma_code] | [click for link][hello_world-c7-ma_sif] | [click for link][hello_world-c7-ma_img] | [v1.0][hello_world-c7-ma_pipelink] | ![ci][hello_world-c7-ma_pipeline]| 759M |
+| hello | [world-cs8-ma][hello_world-cs8-ma_code] | [click for link][hello_world-cs8-ma_sif] | [click for link][hello_world-cs8-ma_img] | [ci-v1.0][hello_world-cs8-ma_pipelink] | ![ci][hello_world-cs8-ma_pipeline]| 518M |
+| igwn | [pe][igwn_pe_code] | [click for link][igwn_pe_sif] | [click for link][igwn_pe_img] | [v0.5][igwn_pe_pipelink] | ![ci][igwn_pe_pipeline]| 2.9G |
+| juno | [gen-sim-reco][juno_gen-sim-reco_code] | [click for link][juno_gen-sim-reco_sif] | [click for link][juno_gen-sim-reco_img] | [v3.1][juno_gen-sim-reco_pipelink] | ![ci][juno_gen-sim-reco_pipeline]| 3.3G |
+| lhcb | [sim-run3-ma][lhcb_sim-run3-ma_code] | [click for link][lhcb_sim-run3-ma_sif] | [click for link][lhcb_sim-run3-ma_img] | [v1.1][lhcb_sim-run3-ma_pipelink] | ![ci][lhcb_sim-run3-ma_pipeline]| 5.4G |
+| mg5amc | [madgraph4gpu-2022][mg5amc_madgraph4gpu-2022_code] | [click for link][mg5amc_madgraph4gpu-2022_sif] | [click for link][mg5amc_madgraph4gpu-2022_img] | [ci-v0.10][mg5amc_madgraph4gpu-2022_pipelink] | ![ci][mg5amc_madgraph4gpu-2022_pipeline]| 11G |
 
 [alice_digi-reco-core-run3-ma_code]: https://gitlab.cern.ch/hep-benchmarks/hep-workloads/-/blob/master/alice/digi-reco-core-run3-ma/alice-digi-reco-core-run3-ma
 [alice_digi-reco-core-run3-ma_sif]: https://gitlab.cern.ch/hep-benchmarks/hep-workloads-sif/container_registry/?search%5B%5D=alice-digi-reco-core-run3-ma-bmk
@@ -89,7 +89,7 @@ where:
 [cms_mlpf_code]: https://gitlab.cern.ch/hep-benchmarks/hep-workloads/-/blob/master/cms/mlpf/cms-mlpf
 [cms_mlpf_sif]: https://gitlab.cern.ch/hep-benchmarks/hep-workloads-sif/container_registry/?search%5B%5D=cms-mlpf-bmk
 [cms_mlpf_img]: https://gitlab.cern.ch/hep-benchmarks/hep-workloads/container_registry/?search%5B%5D=cms-mlpf-bmk
-[cms_mlpf_pipelink]: https://gitlab.cern.ch/hep-benchmarks/hep-workloads/-/pipelines/4175345
+[cms_mlpf_pipelink]: https://gitlab.cern.ch/hep-benchmarks/hep-workloads/-/pipelines/8569540
 [cms_mlpf_pipeline]: https://gitlab.cern.ch/hep-benchmarks/hep-workloads/badges/qa-build-cms-mlpf/pipeline.svg
 
 [cms_reco-run3-ma_code]: https://gitlab.cern.ch/hep-benchmarks/hep-workloads/-/blob/master/cms/reco-run3-ma/cms-reco-run3-ma
diff --git a/cms/mlpf/Dockerfile.append b/cms/mlpf/Dockerfile.append
index dd54f87cee6e20f3fa37da0328aacd377fdc9ac3..1b8e0556a33cd875ba58fd7ea8b8aa331965622d 100644
--- a/cms/mlpf/Dockerfile.append
+++ b/cms/mlpf/Dockerfile.append
@@ -1,14 +1,12 @@
-# Get the source code for MLPF
-COPY particleflow /bmk/cms-mlpf/particleflow
-
-# Copy in the dependencies
-COPY cms-mlpf/requirements.txt cms-mlpf/cms-mlpf-bmk.sh cms-mlpf/prepare-dataset.sh /bmk/cms-mlpf/
-
 # Install dependencies
 RUN \
-    # cd /bmk/cms-mlpf && git submodule update --init --recursive
-    python3 -m pip install --upgrade pip==21.3.1 setuptools==59.5.0 && \
-    python3 -m pip install --no-cache-dir -r /bmk/cms-mlpf/requirements.txt && \
-    python3 -m pip install --no-cache-dir /bmk/cms-mlpf/particleflow/hep_tfds && \
-    # Download and pre-process the dataset
-    bash /bmk/cms-mlpf/prepare-dataset.sh
+    dnf install -y git && \
+    git clone https://gitlab.cern.ch/dsouthwi/particleflow.git --depth 1 -b bmk_torch /bmk/cms-mlpf && \
+    python3 -m ensurepip && \
+    python3 -m pip install torch==2.4.1 torchvision==0.19.1 torchaudio==2.4.1 --index-url https://download.pytorch.org/whl/cu124 && \
+    # NB: do not install unpinned torch/torchvision/torchaudio: torch 2.5 is currently broken for this WL
+    python3 -m pip install --no-cache-dir comet-ml awkward boost-histogram fastjet tqdm scikit-learn pandas mplhep numba tfds-nightly wheel pyyaml tensorboard
+
+# Get dataset
+RUN curl -LO https://dsouthwi.web.cern.ch/files/clic_edm_qq_pf_2.1.0.tar.gz && \
+    tar zxf clic_edm_qq_pf_2.1.0.tar.gz -C /bmk/cms-mlpf && rm clic_edm_qq_pf_2.1.0.tar.gz
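+
+# NB (assumption): the tarball unpacks to a clic_edm_qq_pf dataset tree under
+# /bmk/cms-mlpf, which the training config (parameters/pytorch/pyg-clic-bmk.yaml)
+# resolves at run time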
diff --git a/cms/mlpf/cms-mlpf.spec b/cms/mlpf/cms-mlpf.spec
index 21277af610b5b1c74adae7a8ba8af0dad172fab0..a5bee5f656507c253a0ad92b01d2828bb602a6a1 100644
--- a/cms/mlpf/cms-mlpf.spec
+++ b/cms/mlpf/cms-mlpf.spec
@@ -1,9 +1,12 @@
 HEPWL_BMKEXE=cms-mlpf-bmk.sh
-HEPWL_BMKOPTS=""
+# docker may run out of shared memory for >1000 events in CPU-only runs;
+# critically, reduce the number of convolutions to 1 for the CPU-only CI
+HEPWL_BMKOPTS="-x '--ntrain 300 --nvalid 300 --gpus 1 --num-epochs 2 --num-convs 1'"
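+# example (a sketch; -x passes EXTRA_ARGS through to the workload, cf. usage_detailed):
+#   cms-mlpf-bmk.sh -x '--ntrain 300 --nvalid 300 --gpus 0 --num-epochs 2 --num-convs 1'  # CPU-only variant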
 HEPWL_BMKDIR="cms-mlpf"
-HEPWL_BMKDESCRIPTION=""
+HEPWL_BMKDESCRIPTION="CMS Machine-Learned ParticleFlow (MLPF)"
 HEPWL_DOCKERIMAGENAME=cms-mlpf-bmk
-HEPWL_DOCKERIMAGETAG=v0.1
-HEPWL_CVMFSREPOS=sft.cern.ch
-HEPWL_BMKOS="gitlab-registry.cern.ch/dsouthwi/tensorflow/tensorflow-gpu:latest"
+HEPWL_DOCKERIMAGETAG=ci-v0.4 # NB: ci-vX.Y for CI tests, vX.Y for tagged releases
+HEPWL_CVMFSREPOS=NONE
+HEPWL_BMKOS="gitlab-registry.cern.ch/linuxsupport/alma9-base:latest"
 HEPWL_BMKUSEGPU=1
+HEPWL_BUILDARCH="x86_64"
diff --git a/cms/mlpf/cms-mlpf/DESCRIPTION b/cms/mlpf/cms-mlpf/DESCRIPTION
index c20341cbb18bcba679e508d5ef6181dc353ea1dc..03674f2e805bd65f5287792d0c1abe20946029bb 100644
--- a/cms/mlpf/cms-mlpf/DESCRIPTION
+++ b/cms/mlpf/cms-mlpf/DESCRIPTION
@@ -1 +1,3 @@
-ML-based CMS workload reconstructing events using https://github.com/jpata/particleflow
+ML-based CMS particleflow workload reconstructing events using <https://github.com/jpata/particleflow>
+
+Intended for GPU use; running on CPU only will take much longer.
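+
+Example invocation (a sketch; the image path and tag are assumed from the
+hep-workloads container registry, cf. WL_list.md):
+
+    docker run --rm --gpus all -v /tmp/mlpf:/results gitlab-registry.cern.ch/hep-benchmarks/hep-workloads/cms-mlpf-bmk:ci-v0.4 -W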
diff --git a/cms/mlpf/cms-mlpf/cms-mlpf-bmk.dev.sh b/cms/mlpf/cms-mlpf/cms-mlpf-bmk.dev.sh
new file mode 100644
index 0000000000000000000000000000000000000000..7798b34a9a2bd94e51d97a327dc78d37805e9332
--- /dev/null
+++ b/cms/mlpf/cms-mlpf/cms-mlpf-bmk.dev.sh
@@ -0,0 +1,175 @@
+#!/bin/bash
+
+# Copyright 2019-2020 CERN. See the COPYRIGHT file at the top-level
+# directory of this distribution. For licensing information, see the
+# COPYING file at the top-level directory of this distribution.
+
+#set -x # enable debug printouts
+
+#set -e # immediate exit on error
+
+# Function doOne must be defined in each benchmark
+# Input argument $1: process index (between 1 and $NCOPIES)
+# Return value: please return 0 if this workload copy was successful, 1 otherwise
+# The following variables are guaranteed to be defined and exported: NCOPIES, NTHREADS, NEVENTS_THREAD, BMKDIR, DEBUG
+# The function is started in process-specific working directory <basewdir>/proc_$1:
+# please store here the individual log files for each of the NCOPIES processes
+function doOne(){
+  if [ "$1" == "" ] || [ "$2" != "" ]; then echo "[doOne] ERROR! Invalid arguments '$@' to doOne"; return 1; fi
+  echo "[doOne ($1)] $(date) starting in $(pwd)"
+  echo "Starting ($1) with $EXTRA_ARGS" > out_$1.log
+  echo "[doOne ($1)] $(date) EXTRA_ARGS='$EXTRA_ARGS'"
+
+  # parse extra args if any
+  # NB: $EXTRA_ARGS deliberately unquoted so getopt sees the individual words
+  options=$(getopt -a -n cms-mlpf-bmk -o g:D:B: --long nepochs:,ntrain:,ntest:,nvalid:,batch-multiplier:,gpus:,dtype:,nworkers:,train -- $EXTRA_ARGS)
+  eval set -- "$options"
+  while [ : ]; do
+    case "$1" in
+      --ntrain ) NTRAIN="$2"; shift;;
+      --ntest ) NTEST="$2"; shift;;
+      --nvalid ) NVALID="$2"; shift;;
+      --nepochs ) NEPOCHS="$2"; shift;;
+      --nworkers ) NWORKERS="$2"; shift;;
+      --gpus | -g ) NGPUS="$2"; shift;;
+      --batch-multiplier | -B ) BMULT="$2"; shift;;
+      --dtype | -D ) DTYPE="$2"; shift;;
+      --train ) TRAIN="--train";;
+      -- ) shift; break;;
+    esac
+    shift
+  done
+
+
+  # Run the workload from the workload directory (cf. cms-mlpf-bmk.sh)
+  cd /bmk/cms-mlpf
+  python3 mlpf/pyg_pipeline.py ${TRAIN} \
+  --config parameters/pytorch/pyg-clic.yaml \
+  --benchmark_dir $resultsDir \
+  --gpus $NGPUS \
+  --batch_size $BSIZE \
+  --num-epochs $NEPOCHS \
+  --ntrain $NTRAIN \
+  --ntest $NTEST \
+  --nvalid $NVALID
+  # --prefix /tmp/train_ (disabled)
+  status=${?}
+  echo "[doOne ($1)] $(date) completed (status=$status)"
+  # Return 0 if this workload copy was successful, 1 otherwise
+  return $status
+}
+
+# Default values for NCOPIES, NTHREADS, NEVENTS_THREAD must be set in each benchmark
+NCOPIES=1 # cannot be changed by user input (single-copy WL)
+NTHREADS=1 # cannot be changed by user input (single-threaded single-process WL)
+NEVENTS_THREAD=1 # not relevant for GPUs
+# specific to MLPF
+NEPOCHS=6     # must be >1 as the 1st epoch is thrown away
+NTRAIN=120000 # 0 is None, events to train on
+NTEST=36000   # 0 is None, events to test training
+NVALID=0      # 0 is None, events to validate
+BSIZE=8       # default batch size (too small: device under-loaded; too large: OOM)
+BMULT=1       # batch multiplier (1=16G, 5=80G GPU memory)
+NGPUS=1       # number of GPUs to use (0 = CPU only)
+DTYPE="bfloat16"  # float32, float16, bfloat16
+TRAIN="--train"
+DEBUG=0
+resultsDir="/results"
+
+function usage_detailed(){
+  echo ""
+  echo "Additional MLPF parameters:"
+  echo "     --nepochs                 : (int) Number of epochs >1 (default: $NEPOCHS)"
+  echo "     --ntrain                  : (int) Train steps limit (default: $NTRAIN)"
+  echo "     --ntest                   : (int) Test steps limit (default: $NTEST)"
+  echo "     --nvalid                  : (int) Validation steps limit (default: $NVALID)"
+  echo "     --batch_size              : (int) Batch size (default: $BSIZE)"
+  echo "     --dtype                   : (string) Data type {float32, float16, bfloat16}(default: $DTYPE)"
+  echo "  -B --batch-        : (int) Batch multiplier, 1=16G,5=80G GPU memory (default: $BATCH_MULTIPLIER)"
+  echo "  -g --gpus                    : (int) Number of gpus to use (default: $NGPUS)"
+  
+}
+
+
+if [ -f /run/.containerenv ]; then FLAVOR="podman"
+elif [ -f /.dockerenv ]; then FLAVOR="docker"
+elif [ -f /singularity ]; then FLAVOR="singularity"
+else FLAVOR="unknown";
+fi
+
+# Source the common benchmark driver
+if [ -f $(dirname $0)/bmk-driver.sh ]; then
+  . $(dirname $0)/bmk-driver.sh
+else
+  . $(dirname $0)/../../../common/bmk-driver.sh
+fi
+
+
+##############################
+# Legacy standalone driver kept below for reference while the bmk-driver rework is pending
+
+set -e
+
+
+log() {
+  case $1 in
+    error)  shift 1; echo -e "\e[31m>>> ERROR:\e[0m $*\n" | tee -a $resultsDir/out.log ; exit 2 ;;
+    info)   shift 1; echo -e "\e[34m$*\e[0m\n" | tee -a $resultsDir/out.log ;;
+    silent) shift 1; echo "$*" >> $resultsDir/out.log ;;
+    *)      echo "$*" | tee -a $resultsDir/out.log ;
+  esac
+}
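+# usage sketch: log info "starting run"   -> stdout (blue) + out.log
+#               log silent "details"      -> out.log only
+#               log error "bad input"     -> stdout (red) + out.log, then exits 2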
+
+
+
+# set CUDA_VISIBLE_DEVICES for tensorflow based on nvidia-smi (dirty nvidia-only check)
+if type -P "nvidia-smi" &>/dev/null; then
+  DEVICES=$(nvidia-smi -L | wc -l)
+  log info "Detected $DEVICES nvidia GPUs"
+  export CUDA_VISIBLE_DEVICES=$(seq -s, 0 $(($DEVICES-1)))
+fi
+
+# create /results/build to satisfy common build script (mimic bmk-driver.sh)
+log silent "Creating /results/build"
+mkdir -p $resultsDir/build
+touch $resultsDir/build/.pointlessfile
+
+log info "Running benchmark MLPF"
+log silent "Executing 'python3 mlpf/pipeline.py train \
+  --config parameters/delphes-benchmark.yaml \
+  --prefix /tmp/train_ \
+  --plot-freq 1000000 \
+  --benchmark_dir $resultsDir \
+  --num_devices $NDEVICES \
+  --batch_size $BSIZE \
+  --nepochs $NEPOCHS \
+  --ntrain $NTRAIN \
+  --ntest $NTEST'"
+cd /bmk/cms-mlpf/particleflow/
+
+
+REPORT=$(cat $resultsDir/result.json)
+
+generate_json() {
+  jq -n \
+    --argjson nepochs "$NEPOCHS" \
+    --argjson report "$REPORT" \
+    --arg containment "$FLAVOR" \
+    --arg description "$DESCRIPTION" \
+    '{
+      "run_info":{
+        $nepochs
+      },
+      $report,
+      "app":{
+        $containment,
+        $description
+      }
+    }'
+}
+mkdir -p $resultsDir/report
+if [ $skipSubDir -eq 0 ]; then
+  REPORT_PATH=$resultsDir/report/cms-mlpf_summary.json
+else
+  REPORT_PATH=$resultsDir/cms-mlpf_summary.json
+fi
+generate_json > $REPORT_PATH
+log info "Finished running MLPF. Final report written to $REPORT_PATH"
+
+# sourcing bmk-driver excluded for now pending rework to override common args
diff --git a/cms/mlpf/cms-mlpf/cms-mlpf-bmk.sh b/cms/mlpf/cms-mlpf/cms-mlpf-bmk.sh
index 838513a67de7e75661cc2a68d7a0561e074df5d7..1c1c8a67dfd6db734c1782f1e479c92f8910f9ce 100755
--- a/cms/mlpf/cms-mlpf/cms-mlpf-bmk.sh
+++ b/cms/mlpf/cms-mlpf/cms-mlpf-bmk.sh
@@ -1,172 +1,126 @@
 #!/bin/bash
 
-set -e
+# Copyright 2019-2020 CERN. See the COPYRIGHT file at the top-level
+# directory of this distribution. For licensing information, see the
+# COPYING file at the top-level directory of this distribution.
 
-if [ -f /run/.containerenv ]; then FLAVOR="podman"
-elif [ -f /.dockerenv ]; then FLAVOR="docker"
-elif [ -f /singularity ]; then FLAVOR="singularity"
-else FLAVOR="unknown";
-fi
+#set -x # enable debug printouts
 
-# Default config
-NEPOCHS=2     # must be >1 as 1st epoch is thrown away
-NTRAIN=0      # 0 is None
-NTEST=0       # 0 is None
-BSIZE=4       # 4 is Default
-NDEVICES=0    # 0 is Default
-DEBUG=0
-resultsDir="/results"
-skipSubDir=0
-MOP="none"
-DESCRIPTION="Machine Learning Particle Flow (MLPF) benchmark"
+#set -e # immediate exit on error
 
-log() {
-  case $1 in
-    error)  shift 1; echo -e "\e[31m>>> ERROR:\e[0m $*\n" | tee -a $resultsDir/out.log ; exit 2 ;;
-    info)   shift 1; echo -e "\e[34m$*\e[0m\n" | tee -a $resultsDir/out.log ;;
-    silent) shift 1; echo "$*" >> $resultsDir/out.log ;;
-    *)      echo "$*" | tee -a $resultsDir/out.log ;
-  esac
-}
+# Function doOne must be defined in each benchmark
+# Input argument $1: process index (between 1 and $NCOPIES)
+# Return value: please return 0 if this workload copy was successful, 1 otherwise
+# The following variables are guaranteed to be defined and exported: NCOPIES, NTHREADS, NEVENTS_THREAD, BMKDIR, DEBUG
+# The function is started in process-specific working directory <basewdir>/proc_$1:
+# please store here the individual log files for each of the NCOPIES processes
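+#
+# A rough sketch (an assumption, not the actual bmk-driver.sh code) of how the
+# driver uses this contract:
+#   for i in $(seq 1 $NCOPIES); do
+#     ( mkdir -p <basewdir>/proc_$i && cd <basewdir>/proc_$i && doOne $i ) &
+#   done
+#   wait  # parseResults is then called on the collected per-process logs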
+function doOne(){
+  if [ "$1" == "" ] || [ "$2" != "" ]; then echo "[doOne] ERROR! Invalid arguments '$@' to doOne"; return 1; fi
+  echo "[doOne ($1)] $(date) starting in $(pwd)"
+  echo "[doOne ($1)] $(date) EXTRA_ARGS='$EXTRA_ARGS'"
+  echo "Starting ($1) with $EXTRA_ARGS" > out_$1.log
+  echo "[doOne ($1)] $(date) resultsDir='$resultsDir'"
 
-function usage(){
-  echo ""
-  echo "Usage: $0 [-w | --resultsdir <resultsDir>] [-W] [-c | --copies <NCOPIES>] [-n | --nepochs <NEPOCHS>] " \
-                 "[-B | --batch_size <BSIZE>] [-D | --num_devices <NDEVICES>] [--ntrain <NTRAIN>] [--ntest <NTEST>] " \
-                 "[-m | --mop <mode>] [-d | --debug] [-h | --help]"
-  echo "  -w --resultsdir <resultsDir> : (path) results directory (default: /results , current: $resultsDir)"
-  echo "  -W                           : store results in <resultsDir> directly"
-  echo "  -n --nepochs                 : (int) Number of epochs >1 (default: 2, current: $NEPOCHS)"
-  echo "  -B --batch_size              : (int) Batch size per device (default: 4, current: $BSIZE)"
-  echo "  -D --num_devices             : (int) Number of devices to use (default: 0, current: $NDEVICES)"
-  echo "     --ntrain                  : (int) Train steps limit (default: 0, current: $NTRAIN)"
-  echo "     --ntest                   : (int) Test steps limit (default: 0, current: $NTEST)"
-  echo "  -m --mop                     : (none|all|custom) clean working directory mode: none/all/custom (current: $MOP)"
-  echo "  -d --debug                   : debug mode"
-  echo "  -h --help                    : display this help and exit"
-  echo ""
-  echo "Mop mode: 
-          none   == do not remove working files, 
-          all    == remove all produced files (but summary json), 
-          custom == custom implementation"
-  echo "Without -W (default): results are stored in a new subdirectory of <resultsDir>:"
-  echo "  <resultsDir>/<uniqueid>/*.json"
-  echo "  <resultsDir>/<uniqueid>/proc_1/*.log"
-  echo "  <resultsDir>/<uniqueid>/proc_.../*.log"
-  echo "  <resultsDir>/<uniqueid>/proc_<COPIES>/*.log"
-  echo "With -W (e.g. in the CI): results are stored in <resultsDir> directly:"
-  echo "  <resultsDir>/*.json"
-  echo "  <resultsDir>/proc_1/*.log"
-  echo "  <resultsDir>/proc_.../*.log"
-  echo "  <resultsDir>/proc_<NCOPIES>/*.log"
-  echo ""
-  echo "Without -w (default) and without -W: <resultsDir> is /results"
-  echo "Without -w (default) and with -W: <resultsDir> is a tmp directory /tmp/xxxx"
-  echo ""
-  if [ "$(type -t usage_detailed)" == "function" ]; then
-    echo -e "\nDetailed Usage:\n----------------\n"
-    ( usage_detailed ) # as a subprocess, just in case this has a 0 exit code...
-  fi
-  echo -e "DESCRIPTION\n"
-  if [ -e $BMKDIR/DESCRIPTION ]; then
-      cat $BMKDIR/DESCRIPTION
-  else
-      echo "Sorry there is no description included."
-  fi
-  echo ""
-  exit 2 # early termination (help or invalid arguments to benchmark script)
-}
+  # parse extra args if any
 
-parse_args() {
-  options=$(getopt -a -n cms-mlpf-bmk -o w:Wm:n:dD:B:h --long resultsdir:,nepochs:,ntrain:,ntest:,batch_size:,num_devices:,debug,help,mop -- "$@")
-  if [ $? != 0 ]; then echo "Invalid options provided." >&2; usage; fi
-  eval set -- "$options"
-  while true; do
-    case "$1" in
-      --help | -h ) usage; exit 0;;
-      --debug | -d ) DEBUG=1 ;;
-      -W ) skipSubDir=1;;
-      --mop | -m ) MOP="$2"; shift;;
-      --resultsdir | -w ) resultsDir="$2"; shift;;
-      --ntrain ) NTRAIN="$2"; shift;;
-      --ntest ) NTEST="$2"; shift;;
-      --nepochs | -n ) NEPOCHS="$2"; shift;;
-      --num_devices | -D ) NDEVICES="$2"; shift;;
-      --batch_size | -B ) BSIZE="$2"; shift;;
-      -- ) shift; break;;
-    esac
-    shift
-  done
-}
+  # TODO: handle inherited args as well (-d / --debug should set $DEBUG)
 
-# TODO: implement MOP, DEBUG
+  # options=$(getopt -a -n cms-mlpf-bmk -o g:D:B: --long nepochs:,ntrain:,ntest:,nvalid:,batch-multiplier:,gpus:,dtype:nworkers: -- "$EXTRA_ARGS")
+  # eval set -- "$options"
+  # while [ : ]; do
+  #   case "$1" in
+  #     --ntrain ) NTRAIN="$2"; shift;;
+  #     --ntest ) NTEST="$2"; shift;;
+  #     --nvalid ) NVALID="$2"; shift;;
+  #     --nepochs ) NEPOCHS="$2"; shift;;
+  #     --nworkers ) NWORKERS="$2"; shift;;
+  #     --gpus | -g ) NGPUS="$2"; shift;;
+  #     --gpu-batch-multiplier | -B ) BSIZE="$2"; shift;;
+  #     --dtype | -D ) DTYPE="$2"; shift;;
+  #     --train ) TRAIN="--train";;
+  #     -- ) shift; break;;
+  #   esac
+  #   shift
+  # done
 
-parse_args $*
+  # bmk-driver starts doOne in the per-process working directory, so move to the
+  # workload directory explicitly
+  cd /bmk/cms-mlpf
 
-if [ -f "$resultsDir"/out.log ]; then rm "$resultsDir"/out.log; fi
-log info "Base working directory: $resultsDir"
+  # Run the workload
+  python3 mlpf/pyg_pipeline.py ${TRAIN} \
+  --config parameters/pytorch/pyg-clic-bmk.yaml \
+  --benchmark \
+  --experiments-dir $workDir \
+  ${EXTRA_ARGS}
+  # --gpus $NGPUS \
+  # --gpu-batch-multiplier $BSIZE \
+  # --num-epochs $NEPOCHS \
+  # --ntrain $NTRAIN \
+  # --ntest $NTEST \
+  # --nvalid $NVALID \
+  # --dtype $DTYPE \
+  # $TRAIN
+  
 
-# set CUDA_VISIBLE_DEVICES for tensorflow based on nvidia-smi (dirty nvidia-only check)
-if type -P "nvidia-smi" &>/dev/null; then
-  DEVICES=$(nvidia-smi -L | wc -l)
-  log info "Detected $DEVICES nvidia GPUs"
-  export CUDA_VISIBLE_DEVICES=$(seq -s, 0 $(($DEVICES-1)))
-fi
+  # --prefix /tmp/train_ (disabled)
+  status=${?}
+  echo "[doOne ($1)] $(date) completed (status=$status)"
+  # Return 0 if this workload copy was successful, 1 otherwise
+  return $status
+}
 
-# create /results/build to satisfy common build script (mimic bmk-driver.sh)
-log silent "Creating /results/build"
-mkdir -p $resultsDir/build
-touch $resultsDir/build/.pointlessfile
+function parseResults(){
+  echo "[parseResults] Parsing results from baseWDir=$baseWDir"
+  resJSON=$(python3 /bmk/cms-mlpf/parseResults.py $baseWDir)
+  pystatus=$?
+  if [ "$pystatus" == "0" ]; then
+    echo "$resJSON" > $baseWDir/parser_output.json
+    cat $baseWDir/parser_output.json
+  fi
+  echo "[parseResults] python parser completed (status=$pystatus)"
+  return $pystatus
+}
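+# parseResults.py prints a single JSON document of the form
+#   {"wl-stats": {"gpus": ..., "epoch_times": [...], ...},
+#    "wl-scores": {"mean_throughput": ..., "mean_epoch_time": ...}}
+# which is captured above and saved as parser_output.json in $baseWDir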
+# Default values for NCOPIES, NTHREADS, NEVENTS_THREAD must be set in each benchmark
+NCOPIES=1 # cannot be changed by user input (single-copy WL)
+NTHREADS=1 # cannot be changed by user input (single-threaded single-process WL)
+NEVENTS_THREAD=1 # not relevant for GPUs
+# specific to MLPF
+NEPOCHS=6     # must be >1 as the 1st epoch is thrown away
+NTRAIN=120000 # 0 is None, events to train on
+NTEST=36000   # 0 is None, events to test training
+NVALID=0      # 0 is None, events to validate
+BSIZE=8       # default GPU batch multiplier (too small: device under-loaded; too large: OOM)
+NGPUS=1       # number of GPUs to use (0 = CPU only)
+TRAIN="--train"
+DTYPE="bfloat16"  # float32, float16, bfloat16
+DEBUG=0
+#resultsDir="/results"
 
-log info "Running benchmark MLPF"
-log silent "Executing 'python3 mlpf/pipeline.py train \
-  --config parameters/delphes-benchmark.yaml \
-  --prefix /tmp/train_ \
-  --plot-freq 1000000 \
-  --benchmark_dir $resultsDir \
-  --num_devices $NDEVICES \
-  --batch_size $BSIZE \
-  --nepochs $NEPOCHS \
-  --ntrain $NTRAIN \
-  --ntest $NTEST'"
-cd /bmk/cms-mlpf/particleflow/
-python3 mlpf/pipeline.py train \
-  --config parameters/delphes-benchmark.yaml \
-  --prefix /tmp/train_ \
-  --plot-freq 1000000 \
-  --benchmark_dir $resultsDir \
-  --num_devices $NDEVICES \
-  --batch_size $BSIZE \
-  --nepochs $NEPOCHS \
-  --ntrain $NTRAIN \
-  --ntest $NTEST
+function usage_detailed(){
+  echo ""
+  echo "Additional MLPF parameters: use -x '<EXTRA_ARGS>'"
+  echo "     --num-epochs              : (int) Number of epochs >1 (default: $NEPOCHS)"
+  echo "     --ntrain                  : (int) Train steps limit (default: $NTRAIN)"
+  echo "     --ntest                   : (int) Test steps limit (default: $NTEST)"
+  echo "     --nvalid                  : (int) Validation steps limit (default: $NVALID)"
+  echo "     --gpu-batch-multiplier    : (int) Increases GPU batch size by constant multiplier 1=1G, 8=10G (default: $BSIZE)"
+  echo "     --dtype                   : (string) Data type {float32, float16, bfloat16}(default: $DTYPE)"
+  echo "     --gpus                    : (int) Number of gpus to use (default: $NGPUS)"
+  
+}
 
-REPORT=$(cat $resultsDir/result.json)
 
-generate_json() {
-  jq -n \
-    --argjson nepochs "$NEPOCHS" \
-    --argjson report "$REPORT" \
-    --arg containment "$FLAVOR" \
-    --arg description "$DESCRIPTION" \
-    '{
-      "run_info":{
-        $nepochs
-      },
-      $report,
-      "app":{
-        $containment,
-        $description
-      }
-    }'
-}
-mkdir -p $resultsDir/report
-if [ $skipSubDir -eq 0 ]; then
-  REPORT_PATH=$resultsDir/report/cms-mlpf_summary.json
-else
-  REPORT_PATH=$resultsDir/cms-mlpf_summary.json
+if [ -f /run/.containerenv ]; then FLAVOR="podman"
+elif [ -f /.dockerenv ]; then FLAVOR="docker"
+elif [ -f /singularity ]; then FLAVOR="singularity"
+else FLAVOR="unknown";
 fi
-generate_json > $REPORT_PATH
-log info "Finished running MLPF. Final report written to $REPORT_PATH"
 
-# sourcing bmk-driver excluded for now pending rework to override common args
+# Source the common benchmark driver
+if [ -f $(dirname $0)/bmk-driver.sh ]; then
+  . $(dirname $0)/bmk-driver.sh
+else
+  . $(dirname $0)/../../../common/bmk-driver.sh
+fi
diff --git a/cms/mlpf/cms-mlpf/parseResults.py b/cms/mlpf/cms-mlpf/parseResults.py
new file mode 100644
index 0000000000000000000000000000000000000000..cf6cdf0022a4ff25dfacd6868010c8b2d9159a76
--- /dev/null
+++ b/cms/mlpf/cms-mlpf/parseResults.py
@@ -0,0 +1,69 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+parseResults.py
+
+This is meant to be called from the parseResults() shell function in cms-mlpf-bmk.sh.
+
+"""
+import argparse
+import json
+from pathlib import Path
+import yaml
+
+directory = "history"  # replace with the desired directory
+
+wl_stats = {}
+wl_scores = {}
+epoch_times = []
+
+def parse_results(results_dir):
+    """results_dir is a path-like object to the directory containing a train-config yaml and history folder."""
+    results_dir = Path(results_dir)
+
+    if not results_dir.is_dir():
+        raise ValueError(f"{results_dir} is not a directory")
+    
+    # accept either $baseWDir (which contains proc_1) or $workDir (proc_1 itself)
+    if results_dir.joinpath("proc_1").is_dir():
+        results_dir = results_dir.joinpath("proc_1")
+
+    with results_dir.joinpath("train-config.yaml").open(encoding="utf-8") as file:
+        config = yaml.safe_load(file)
+
+    wl_stats["gpus"] = config["gpus"]
+    wl_stats["dtype"] = config["dtype"]
+    wl_stats["events"] = config["ntrain"]
+    wl_stats["gpu_batch_multiplier"] = config["gpu_batch_multiplier"]
+    wl_stats["batch_size"] = config["train_dataset"][config["dataset"]]["physical"]["batch_size"]
+    wl_stats["events_per_batch"] = (
+        wl_stats["gpu_batch_multiplier"] * wl_stats["batch_size"]
+        if wl_stats["gpus"] > 0
+        else wl_stats["batch_size"]
+    )
+    wl_stats["conv_type"] = config["conv_type"]
+    wl_stats["dataset"] = config["train_dataset"][config["dataset"]]["physical"]["samples"]
+
+    # parse all results json
+    for filepath in results_dir.joinpath(directory).rglob("*.json"):
+        with filepath.open() as file:
+            data = json.load(file)
+            if "epoch_train_time" in data:
+                epoch_times.append(round(data["epoch_train_time"], 4))
+
+    wl_stats["num_epochs"] = len(epoch_times)
+    wl_stats["epoch_times"] = sorted(epoch_times, reverse=True)
+    wl_stats["train_time"] = round(sum(epoch_times), 4)
+    wl_stats["throughput_per_epoch"] = [round(wl_stats["events"] / t, 4) for t in wl_stats["epoch_times"]]
+
+    wl_scores["mean_throughput"] = round(sum(wl_stats["throughput_per_epoch"][1:])/(wl_stats["num_epochs"] -1 ), 4)
+    wl_scores["mean_epoch_time"] = round(sum(wl_stats["epoch_times"][1:])/(wl_stats["num_epochs"] -1 ), 4)
+
+    report = {"wl-stats": wl_stats, "wl-scores": wl_scores}
+    print(json.dumps(report, indent=4))
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("results_dir", type=Path, help="path to results directory")
+    args = parser.parse_args()
+
+    parse_results(args.results_dir)
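+
+# Example (mirrors the parseResults() call in cms-mlpf-bmk.sh):
+#   python3 /bmk/cms-mlpf/parseResults.py $baseWDir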
diff --git a/cms/mlpf/cms-mlpf/prepare-dataset.sh b/cms/mlpf/cms-mlpf/prepare-dataset.sh
deleted file mode 100755
index 0f876447a78f287dad8656a9dfabe903030cbf10..0000000000000000000000000000000000000000
--- a/cms/mlpf/cms-mlpf/prepare-dataset.sh
+++ /dev/null
@@ -1,37 +0,0 @@
-#!/bin/bash
-# Download the particleflow dataset from Zenodo
-set -e
-
-# each split is about 50MB pickled and gzipped
-MIRROR=https://zenodo.org/record/4559324/files
-
-DOWNLOAD_DIR=/bmk/data/zenodo
-DELPHES_DIR=$DOWNLOAD_DIR/delphes_pf
-echo "Download directory: $DOWNLOAD_DIR"
-
-PF_DIR=/bmk/cms-mlpf/particleflow
-DATA_DIR=/bmk/cms-mlpf/tensorflow_datasets
-echo "Data directory: $DATA_DIR"
-
-# create the download dir
-mkdir -p $DELPHES_DIR
-
-# Test splits
-for i in $(seq 0 19) ; do
-    TARGET=tev14_pythia8_ttbar_0_${i}.pkl.bz2
-    echo "Downloading train split: $TARGET"
-    wget -q -P $DELPHES_DIR $MIRROR/$TARGET
-done
-
-# Train splits
-for i in $(seq 0 1) ; do
-    TARGET=tev14_pythia8_qcd_10_${i}.pkl.bz2
-    echo "Downloading test split: $TARGET"
-    wget -q -P $DELPHES_DIR $MIRROR/$TARGET
-done
-
-# build TDFS datasets
-cd $PF_DIR
-tfds build hep_tfds/heptfds/delphes_pf --download_dir $DOWNLOAD_DIR --data_dir $DATA_DIR
-
-rm -rf $DOWNLOAD_DIR
diff --git a/cms/mlpf/cms-mlpf/requirements.txt b/cms/mlpf/cms-mlpf/requirements.txt
deleted file mode 100644
index 0af52078d53cbe94c726ebb2d122cc83e4c37e16..0000000000000000000000000000000000000000
--- a/cms/mlpf/cms-mlpf/requirements.txt
+++ /dev/null
@@ -1,38 +0,0 @@
-numpy==1.19.5
-click==8.0.1
-tqdm==4.61.1
-seaborn==0.11.2
-scikit-optimize
-tensorflow==2.6.0
-keras==2.6.0
-tensorflow-addons==0.13.0
-tensorflow-datasets==4.4.0
-tf-models-official==2.6.0
-tensorflow-estimator==2.6.0
-tensorflow-probability==0.14.1
-keras-tuner==1.0.3
-tf2onnx==1.9.2
-onnxruntime==1.8.1
-mplhep
-ray==1.7.0
-ray[tune]==1.7.0
-nevergrad==0.4.3.post8
-
-# The below is not needed to run mlpf-bmk.sh (when using tensorflow's official docker image as base)
-# pandas==1.1.5
-# scikit-learn==0.24.2
-# scipy==1.5.4
-# scikit-optimize==0.9.0
-# matplotlib==3.2.2
-# hpbandster==0.7.4
-# hyperopt==0.2.5
-# hpbandster
-# ConfigSpace==0.4.19
-# pyaml==6.0
-# onnx==1.10.1
-# tensorflow-text==2.6.0
-# tensorboard==2.6.0
-# tensorflow-estimator==2.6.0
-# tensorflow-metadata==1.1.0
-# tensorflow-model-optimization==0.7.0
-# comet-ml==3.15.4
diff --git a/cms/mlpf/particleflow b/cms/mlpf/particleflow
deleted file mode 160000
index 9324c9625555addd9367133193a3c491fbc74376..0000000000000000000000000000000000000000
--- a/cms/mlpf/particleflow
+++ /dev/null
@@ -1 +0,0 @@
-Subproject commit 9324c9625555addd9367133193a3c491fbc74376