diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 5aeca707a72220689897cff1d3d36147a67d2dd8..f2195624a6addc0b3428c45ada5bdbe931ed98bc 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -1,9 +1,12 @@
 stages:
-- test
-- announce-promoted-image
 - triggers
+- test
+
+#####################################################
+### ATLAS KV (a test of cvmfs functionality)
+#####################################################
 
-job_test_kv:
+.job_test_kv:
   stage: test
   image: gitlab-registry.cern.ch/hep-benchmarks/hep-workloads-builder/dind:qa
   tags:
@@ -33,6 +36,29 @@ job_test_kv:
     expire_in: 1 week
     when: always
 
+#####################################################
+### CMS PATATRACK
+#####################################################
+
+patatrack:
+  stage: triggers
+  trigger:
+    include:
+      - local: cms/cms-patatrack-ci.yml
+    strategy: depend
+  only:
+    variables:
+      - $CI_COMMIT_BRANCH =~ /^qa.*$/
+      - $CI_COMMIT_TAG =~ /^v.*$/
+    changes:
+      - cms/patatrack/*
+      - cms/patatrack/ci-scripts/*
+      - cms/patatrack/cms-patatrack/*
+      - cms/patatrack/cms-patatrack/utility_scripts/*
+
+#####################################################
+### LHC Simple Track
+#####################################################
 
 simpletrack:
   stage: triggers
@@ -41,7 +67,8 @@ simpletrack:
     strategy: depend
   only:
     variables:
-      - $CI_COMMIT_BRANCH == "qa"
+      - $CI_COMMIT_BRANCH =~ /^qa.*$/
+      - $CI_COMMIT_TAG =~ /^v.*$/
     changes:
       - lhc/simpletrack/Dockerfile.*
       - lhc/simpletrack/lhc-simpletrack.*
diff --git a/README.md b/README.md
index d22104fe1a056f4211ef518f8557dc1354f988c3..47824ca2ef899fd82830baab8c759418941f7224 100644
--- a/README.md
+++ b/README.md
@@ -1,3 +1,12 @@
 # hep-workloads-GPU
 
-Build standalone reference HEP workloads for benchmarking purposes on GPUs
\ No newline at end of file
+Build standalone reference HEP workloads for benchmarking purposes on GPUs
+
+The documentation of the individual workloads can be found at the following links:
+
+## Notebooks
+
+| Internal Doc | Note | External Link |
+| :--- | :--- | :--- |
+| [Simple Track](https://gitlab.cern.ch/hep-benchmarks/hep-workloads-gpu/-/blob/master/lhc/simpletrack/README.md) | Simulation of particles turning in the LHC | tbd |
+| [CMS Patatrack](https://gitlab.cern.ch/hep-benchmarks/hep-workloads-gpu/-/blob/master/cms/README.md) | CMS HLT reconstruction code | [CMS Patatrack github project](https://github.com/cms-patatrack) |
diff --git a/cms/README.md b/cms/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..74a5c079b6fa2b3034cb972b453cf281530c353b
--- /dev/null
+++ b/cms/README.md
@@ -0,0 +1,19 @@
+# CMS GPU workloads
+
+The sub-folders contain workloads provided by the CMS experiment that run on CPU+GPU systems.
+The reconstruction package is known as CMS Patatrack and is published at https://github.com/cms-patatrack
+
+We use it to build a CPU+GPU benchmark workload, following the same approach developed for the CPU-targeting [HEP workloads](https://gitlab.cern.ch/hep-benchmarks/hep-workloads).
+The purpose of this hep-workloads-gpu gitlab project is to build a standalone container including the software, data and orchestration procedures needed to run the CMS workload as a benchmark.
+For this purpose, a limited set of events is used to run the reconstruction workload and measure the performance in terms of event throughput.
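+Note that running the workload requires a GPU that is visible from Docker. A minimal sketch of such a check (assuming the NVIDIA container toolkit is installed on the host; the CUDA image below is the same base image used by this build):
+
+```
+# verify that the host driver and the NVIDIA container runtime are working
+nvidia-smi
+docker run --rm --gpus '"device=0"' nvidia/cuda:10.1-devel-centos7 nvidia-smi
+```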
+
+The procedure to build the standalone container is documented in the gitlab CI [yml file](https://gitlab.cern.ch/hep-benchmarks/hep-workloads-gpu/-/blob/qa/cms/cms-patatrack-ci.yml)
+
+In order to run the standalone container follow these steps and look for results in the defined RESULTS_DIR
+
+```
+export RESULTS_DIR=/any_path_you_like
+export IMAGE_NAME=gitlab-registry.cern.ch/hep-benchmarks/hep-workloads-gpu/cms/cms-patatrack-nvidia-bmk:qa
+docker pull ${IMAGE_NAME}
+docker run --rm --gpus '"device=0"' -v ${RESULTS_DIR}:/results ${IMAGE_NAME}
+```
\ No newline at end of file
diff --git a/cms/cms-patatrack-ci.yml b/cms/cms-patatrack-ci.yml
new file mode 100644
index 0000000000000000000000000000000000000000..ab394abe9287933378a2e3967e616557b2c41b26
--- /dev/null
+++ b/cms/cms-patatrack-ci.yml
@@ -0,0 +1,175 @@
+---
+stages:
+  - build_0
+  - build_1
+  - build_2
+  - snapshot
+  - build_standalone
+  - test
+#- publish
+#- announce
+
+##########################
+## Templates #############
+
+# .definition_build_image_kaniko: &template_build_image_kaniko
+#   tags:
+#     - hep-workload-gpu-docker-builder
+#   image:   # NB enable shared runners and do not specify a CI tag
+#     name: gitlab-registry.cern.ch/ci-tools/docker-image-builder # CERN version of the Kaniko image
+#     entrypoint: [""]
+#   script:
+#     - echo "current commit is ${CI_COMMIT_SHA:0:8}"
+#     - echo "current branch is ${CI_COMMIT_BRANCH}"
+#     - echo "current tag is ${CI_COMMIT_TAG}"
+#     - if [[ -z $DOCKERFILE ]]; then echo "ERROR variable DOCKERFILE is not defined "; exit 1; fi
+#     - if [[ -z $CONTEXT ]]; then echo "ERROR variable CONTEXT is not defined "; exit 1; fi
+#     - if [[ -z $IMAGE_NAME ]]; then echo "ERROR variable IMAGE_NAME is not defined "; exit 1; fi
+#     - if [[ -z $IMAGE_TAG ]]; then echo "ERROR variable IMAGE_TAG is not defined "; exit 1; fi
+#     - export DESTINATIONS="--destination $CI_REGISTRY_IMAGE/$IMAGE_NAME:$IMAGE_TAG --destination $CI_REGISTRY_IMAGE/$IMAGE_NAME:ci-${CI_COMMIT_BRANCH}-${CI_COMMIT_SHA:0:8}"
+#     - echo "DESTINATIONS $DESTINATIONS"
+#     # Prepare Kaniko configuration file
+#     - echo "{\"auths\":{\"$CI_REGISTRY\":{\"username\":\"$CI_REGISTRY_USER\",\"password\":\"$CI_REGISTRY_PASSWORD\"}}}" > /kaniko/.docker/config.json
+#     # Build and push the image from the Dockerfile at the root of the project.
+#     # To push to a specific docker tag, amend the --destination parameter, e.g. --destination $CI_REGISTRY_IMAGE:$CI_BUILD_REF_NAME
+#     # See https://docs.gitlab.com/ee/ci/variables/predefined_variables.html#variables-reference for available variables
+#     - /kaniko/executor --context $CONTEXT --dockerfile $DOCKERFILE $DESTINATIONS
+
+.definition_build_image: &template_build_image
+  tags:
+    - hep-workload-gpu-docker-builder
+  image:
+    name: gitlab-registry.cern.ch/hep-benchmarks/hep-workloads-builder/dind:qa # Use instead of kaniko.
FIXME use a prod tag + entrypoint: [""] + script: + - echo "current commit is ${CI_COMMIT_SHA:0:8}" + - echo "current branch is ${CI_COMMIT_BRANCH}" + - echo "current tag is ${CI_COMMIT_TAG}" + - if [[ -z $DOCKERFILE ]]; then echo "ERROR variable DOCKERFILE is not defined "; exit 1; fi + - if [[ -z $CONTEXT ]]; then echo "ERROR variable CONTEXT is not defined "; exit 1; fi + - if [[ -z $IMAGE_NAME ]]; then echo "ERROR variable IMAGE_NAME is not defined "; exit 1; fi + - if [[ -z $IMAGE_TAG ]]; then echo "ERROR variable IMAGE_TAG is not defined "; exit 1; fi + - docker rmi -f $CI_REGISTRY_IMAGE/$IMAGE_NAME:$IMAGE_TAG || echo "image $CI_REGISTRY_IMAGE/$IMAGE_NAME:$IMAGE_TAG does not exist" + - echo $CI_BUILD_TOKEN | docker login -u gitlab-ci-token --password-stdin gitlab-registry.cern.ch + - docker build --no-cache -t $CI_REGISTRY_IMAGE/$IMAGE_NAME:ci-${CI_COMMIT_BRANCH}-${CI_COMMIT_SHA:0:8} -f $DOCKERFILE $CONTEXT + - docker tag $CI_REGISTRY_IMAGE/$IMAGE_NAME:ci-${CI_COMMIT_BRANCH}-${CI_COMMIT_SHA:0:8} $CI_REGISTRY_IMAGE/$IMAGE_NAME:$IMAGE_TAG + - docker push $CI_REGISTRY_IMAGE/$IMAGE_NAME:ci-${CI_COMMIT_BRANCH}-${CI_COMMIT_SHA:0:8} + - docker push $CI_REGISTRY_IMAGE/$IMAGE_NAME:$IMAGE_TAG + - docker rmi $CI_REGISTRY_IMAGE/$IMAGE_NAME:ci-${CI_COMMIT_BRANCH}-${CI_COMMIT_SHA:0:8} + +########################################################### +# docker in docker image: to trigger other docker runs +########################################################### + +job_build_image_step0: + stage: build_0 + before_script: + - export DOCKERFILE=$CI_PROJECT_DIR/cms/patatrack/ci-scripts/nvidia.Dockerfile.0 + - export CONTEXT=$CI_PROJECT_DIR/cms/patatrack + - export IMAGE_NAME=cms/cms-patatrack-nvidia-0 + - export IMAGE_TAG=${CI_COMMIT_TAG:-$CI_COMMIT_BRANCH} + <<: *template_build_image + only: + changes: + - cms/patatrack/ci-scripts/nvidia.Dockerfile.0 + + +job_build_image_step1: + stage: build_1 + before_script: + - export DOCKERFILE=$CI_PROJECT_DIR/cms/patatrack/ci-scripts/nvidia.Dockerfile.1 + - export CONTEXT=$CI_PROJECT_DIR/cms/patatrack + - export IMAGE_NAME=cms/cms-patatrack-nvidia-1 + - export IMAGE_TAG=${CI_COMMIT_TAG:-$CI_COMMIT_BRANCH} + <<: *template_build_image + only: + changes: + - cms/patatrack/ci-scripts/nvidia.Dockerfile.0 + - cms/patatrack/ci-scripts/nvidia.Dockerfile.1 + +job_build_image_step2: + stage: build_2 + before_script: + - export DOCKERFILE=$CI_PROJECT_DIR/cms/patatrack/ci-scripts/nvidia.Dockerfile.2 + - export CONTEXT=$CI_PROJECT_DIR/cms/patatrack + - export IMAGE_NAME=cms/cms-patatrack-nvidia-2 + - export IMAGE_TAG=${CI_COMMIT_TAG:-$CI_COMMIT_BRANCH} + <<: *template_build_image + only: + changes: + - cms/patatrack/ci-scripts/nvidia.Dockerfile.* + - cms/patatrack/cms-patatrack/* + - cms/patatrack/cms-patatrack/utility_scripts/* + +job_snapshot_cvmfs: + stage: snapshot + tags: + - hep-workload-gpu-docker-builder + image: + name: gitlab-registry.cern.ch/hep-benchmarks/hep-workloads-builder/dind:qa + before_script: + - source cms/patatrack/ci-scripts/snapshot_cvmfs.sh + - _before_script + script: + - source cms/patatrack/ci-scripts/snapshot_cvmfs.sh + - _script + after_script: + - source cms/patatrack/ci-scripts/snapshot_cvmfs.sh + - _after_script + only: + variables: + - $CI_COMMIT_BRANCH =~ /^qa.*$/ + - $CI_COMMIT_TAG =~ /^v.*$/ + changes: + - cms/patatrack/ci-scripts/nvidia.Dockerfile.* + - cms/patatrack/ci-scripts/snapshot_cvmfs.sh + - cms/patatrack/cms-patatrack/* + - cms/patatrack/cms-patatrack/utility_scripts/* + artifacts: + paths: + - ${CI_PROJECT_DIR}/traces + - 
${CI_PROJECT_DIR}/cvmfs_export_dir_content + - ${CI_PROJECT_DIR}/cvmfs_export_py2-scipy_content + - ${CI_PROJECT_DIR}/cms/patatrack/cvmfs + expire_in: 1 week + when: always + +job_build_standalone_image: + stage: build_standalone + before_script: + - export DOCKERFILE=$CI_PROJECT_DIR/cms/patatrack/ci-scripts/nvidia.Dockerfile.2 + - export CONTEXT=$CI_PROJECT_DIR/cms/patatrack + - export IMAGE_NAME=cms/cms-patatrack-nvidia-bmk + - export IMAGE_TAG=${CI_COMMIT_TAG:-$CI_COMMIT_BRANCH} + <<: *template_build_image + only: + changes: + - cms/patatrack/ci-scripts/nvidia.Dockerfile.* + - cms/patatrack/ci-scripts/snapshot_cvmfs.sh + - cms/patatrack/cms-patatrack/* + - cms/patatrack/cms-patatrack/utility_scripts/* + +job_test_standalone_image: + stage: test + tags: + - hep-workload-gpu-docker-builder + image: + name: gitlab-registry.cern.ch/hep-benchmarks/hep-workloads-builder/dind:qa + script: + - source cms/patatrack/ci-scripts/test_standalone_image.sh + - _script + only: + variables: + - $CI_COMMIT_BRANCH =~ /^qa.*$/ + - $CI_COMMIT_TAG =~ /^v.*$/ + changes: + - cms/patatrack/ci-scripts/nvidia.Dockerfile.* + - cms/patatrack/ci-scripts/snapshot_cvmfs.sh + - cms/patatrack/ci-scripts/test_standalone_image.sh + - cms/patatrack/cms-patatrack/* + - cms/patatrack/cms-patatrack/utility_scripts/* + artifacts: + paths: + - ${CI_PROJECT_DIR}/${RESULTS_DIR} + expire_in: 1 week + when: always \ No newline at end of file diff --git a/cms/patatrack/ci-scripts/build_2.sh b/cms/patatrack/ci-scripts/build_2.sh new file mode 100755 index 0000000000000000000000000000000000000000..8a78e56332ccd274d0e5635f11b68433ebc6cd38 --- /dev/null +++ b/cms/patatrack/ci-scripts/build_2.sh @@ -0,0 +1,46 @@ +#!/bin/bash + +set -x +set -e + +# First move all folders in the right place +date +mv /stage/cvmfs /cvmfs + +date +mv /stage/cms-patatrack /bmk/./cms-patatrack + +# Make only readable +date +chmod -R 555 /cvmfs + +# FIXME This checksum takes a lot of time. +# Commenting it. Can be substituted by a checksum using cvmfs utilities +#tar -cf /tmp/cvmfs_checksum.tar /cvmfs && md5sum /tmp/cvmfs_checksum.tar | cut -f1 -d" " > /tmp/cvmfs_checksum && rm /tmp/cvmfs_checksum.tar + +# Checksum code in orchestrator dir. +# This MUST happen before linking the data dir +# otherwise will take a lot of time to tar +date +tar -cf /tmp/bmk_checksum.tar /bmk && md5sum /tmp/bmk_checksum.tar | cut -f1 -d" " >/tmp/bmk_checksum && rm /tmp/bmk_checksum.tar #FIXME + +# The data dir has already a checksum in /tmp/bmkdata_checksum +# generated in nvidia.Dockerfile.1 +date +if [ ! 
-d /bmk/./cms-patatrack/data ]; then + mkdir /bmk/./cms-patatrack/data +fi +for file in $(ls /bmk/data); do + ln -sf /bmk/data/$file /bmk/./cms-patatrack/data/$file +done + +date +cvmfs_checksum=$(cat /tmp/cvmfs_checksum || echo "NotAvailable") +bmkdata_checksum=$(cat /tmp/bmkdata_checksum || echo "NotAvailable") +bmk_checksum=$(cat /tmp/bmk_checksum || echo "NotAvailable") +echo '{"version":"v1.3","description":"CMS RECO of ttbar events, based on CMSSW_10_2_9","cvmfs_checksum":"'$cvmfs_checksum'","bmkdata_checksum":"'$bmkdata_checksum'","bmk_checksum":"'$bmk_checksum'"}' >/bmk/./cms-patatrack/version.json #FIXME + +# Add user 'bmkuser' to benchmarks as a non-root user (BMK-166 and BMK-167) +# shoudl not be needed, using cvmfs read only +#groupadd bmkuser +#useradd -g bmkuser --create-home --shell /bin/bash bmkuser diff --git a/cms/patatrack/ci-scripts/nvidia.Dockerfile.0 b/cms/patatrack/ci-scripts/nvidia.Dockerfile.0 new file mode 100644 index 0000000000000000000000000000000000000000..6d661957d01759049c958ae03c577e994433be81 --- /dev/null +++ b/cms/patatrack/ci-scripts/nvidia.Dockerfile.0 @@ -0,0 +1,17 @@ +# FIXME: need to build in gitlab this base image. Was done by hand +FROM nvidia/cuda:10.1-devel-centos7 + +RUN yum install -y \ + which \ + man \ + file \ + util-linux \ + jq \ + gcc \ + wget \ + tar freetype \ + perl perl-Data-Dumper \ + patch git vim; yum clean all + +RUN yum --enablerepo=extras install epel-release -y + diff --git a/cms/patatrack/ci-scripts/nvidia.Dockerfile.1 b/cms/patatrack/ci-scripts/nvidia.Dockerfile.1 new file mode 100644 index 0000000000000000000000000000000000000000..b63ab1a425ccd2ba3027676a6baf73eeb1a89f78 --- /dev/null +++ b/cms/patatrack/ci-scripts/nvidia.Dockerfile.1 @@ -0,0 +1,20 @@ +FROM gitlab-registry.cern.ch/hep-benchmarks/hep-workloads-gpu/cms/cms-patatrack-nvidia-0:qa + + +# Prepare a data directory for downloading large files that should normally be cacheable (BMK-159) +# Its contents should be retrieved in Dockerfile.append, before /bmk/<bmkdir> is copied over +# Each file it contains is then individually symlinked to /bmk/<bmkdir>/data/<file> in Dockerfile.template +RUN mkdir -p /bmk/data + + +# Add here any workload-specific Dockerfile instructions. +# They will be appended to the Dockerfile generated from a common template. + + +RUN echo -e "\nExtracting Patatrack dataset..."; \ + wget -q https://hep-benchmarks.web.cern.ch/hep-benchmarks/hep-workloads/data/cms/patatrack/opendata.tar -O /bmk/data/opendata.tar; \ + cd /bmk/data/; \ + md5sum opendata.tar | cut -f1 -d" " > /tmp/bmkdata_checksum; \ + tar -xvf ./opendata.tar; \ + rm ./opendata.tar + diff --git a/cms/patatrack/ci-scripts/nvidia.Dockerfile.2 b/cms/patatrack/ci-scripts/nvidia.Dockerfile.2 new file mode 100644 index 0000000000000000000000000000000000000000..ecc4be0800513767da41dec2e755505ad1fe37a6 --- /dev/null +++ b/cms/patatrack/ci-scripts/nvidia.Dockerfile.2 @@ -0,0 +1,7 @@ +FROM gitlab-registry.cern.ch/hep-benchmarks/hep-workloads-gpu/cms/cms-patatrack-nvidia-1:qa + +COPY . 
/stage/ +RUN ls -la /stage/* +RUN /stage/ci-scripts/build_2.sh + +ENTRYPOINT ["/bmk/./cms-patatrack/cms-patatrack-bmk.sh"] diff --git a/cms/patatrack/ci-scripts/nvidia.Dockerfile.2_old b/cms/patatrack/ci-scripts/nvidia.Dockerfile.2_old new file mode 100644 index 0000000000000000000000000000000000000000..c14ed615b06b0c7324adaf56c75d138ba630eaf2 --- /dev/null +++ b/cms/patatrack/ci-scripts/nvidia.Dockerfile.2_old @@ -0,0 +1,52 @@ +FROM gitlab-registry.cern.ch/hep-benchmarks/hep-workloads-gpu/cms/cms-patatrack-nvidia-1:qa + +# ********* DOCKERFILE TEMPLATE start ********* +# ******* PLEASE DO NOT EDIT THIS FILE! ******* +# This is the common template for all HEP workloads (BMK-124 and BMK-159). +# Please add workload-specific instructions in Dockerfile.append. + +# Optionally allow disabling the cache only from this point onwards if using +# docker build -t your-image --build-arg CACHEBUST=$(date +%s) . +# See https://github.com/moby/moby/issues/1996#issuecomment-185872769 + +###ARG CACHEBUST=1 + +###RUN echo CACHEBUST=$CACHEBUST + +# This should normally contain always the same files and be cacheable (BMK-159) +COPY ./cvmfs /cvmfs +RUN chmod -R 555 /cvmfs + +# FIXME This checksum takes a lot of time. +# Commenting it. Can be substituted by a checksum using cvmfs utilities +#RUN tar -cf /tmp/cvmfs_checksum.tar /cvmfs && md5sum /tmp/cvmfs_checksum.tar | cut -f1 -d" " > /tmp/cvmfs_checksum && rm /tmp/cvmfs_checksum.tar +RUN touch /tmp/cvmfs_checksum + +# This may also be cacheable in most cases except when /bmk contents change +COPY ./cms-patatrack /bmk/./cms-patatrack + +# FIXME currently there is not common and the driver is in the patatrack folder +#COPY common/bmk-driver.sh /bmk/./cms-patatrack/bmk-driver.sh + +# Checksum code in orchestrator dir. +# This MUST happen before linking the data dir +# otherwise will take a lot of time to tar +RUN tar -cf /tmp/bmk_checksum.tar /bmk && md5sum /tmp/bmk_checksum.tar | cut -f1 -d" " > /tmp/bmk_checksum && rm /tmp/bmk_checksum.tar #FIXME + +# The data dir has already a checksum in /tmp/bmkdata_checksum +# generated in nvidia.Dockerfile.1 +RUN if [ ! 
-d /bmk/./cms-patatrack/data ]; then mkdir /bmk/./cms-patatrack/data; fi +RUN for file in $(cd /bmk/data; ls); do ln -sf /bmk/data/$file /bmk/./cms-patatrack/data/$file; done + +RUN cvmfs_checksum=`cat /tmp/cvmfs_checksum` && bmkdata_checksum=`cat /tmp/bmkdata_checksum` && bmk_checksum=`cat /tmp/bmk_checksum` && rm /tmp/cvmfs_checksum /tmp/bmkdata_checksum /tmp/bmk_checksum && echo '{"version":"v1.3","description":"CMS RECO of ttbar events, based on CMSSW_10_2_9","cvmfs_checksum":"'$cvmfs_checksum'","bmkdata_checksum":"'$bmkdata_checksum'","bmk_checksum":"'$bmk_checksum'"}' > /bmk/./cms-patatrack/version.json #FIXME + +ENTRYPOINT ["/bmk/./cms-patatrack/cms-patatrack-bmk.sh"] + +# This contains provenance data that can never be cached +COPY ./cvmfs/.provenance /cvmfs/.provenance + +# Add user 'bmkuser' to run benchmarks as a non-root user (BMK-166 and BMK-167) +RUN groupadd bmkuser + +RUN useradd -g bmkuser --create-home --shell /bin/bash bmkuser +# ********* DOCKERFILE TEMPLATE end ********* diff --git a/cms/patatrack/ci-scripts/snapshot_cvmfs.sh b/cms/patatrack/ci-scripts/snapshot_cvmfs.sh new file mode 100644 index 0000000000000000000000000000000000000000..4cf653363c44ddf57f4284270f001e2b6a97cb42 --- /dev/null +++ b/cms/patatrack/ci-scripts/snapshot_cvmfs.sh @@ -0,0 +1,39 @@ +#!/bin/bash + +# script used in gitlab CI +# for job job_snapshot_cvmfs +# in file cms/cms-patatrack-ci.yml + +function _before_script() { + docker pull ${CVMFS_IMAGE} + docker run --name cvmfs_${CI_JOB_ID} -d --privileged -v ${CVMFS_EXPORT_DIR}:${CVMFS_EXPORT_DIR} -v ${CIENV_CVMFSVOLUME}:/cvmfs:shared ${CVMFS_IMAGE} -r ${CIENV_CVMFSREPO} -t /tmp/traces +} + +function _script() { + sleep 1m # to give time to cvmfs to start + echo "CVMFS_EXPORT_DIR is $CVMFS_EXPORT_DIR" + # check cvmfs is running + docker exec cvmfs_${CI_JOB_ID} cvmfs_config probe + # Here comes the dry run of the CMS Patatrack container. Arguments are for the time being defaults/hardcoded FIXME + docker pull gitlab-registry.cern.ch/hep-benchmarks/hep-workloads-gpu/cms/cms-patatrack-nvidia-2:qa + docker run --name patatrack_container --gpus '"device=0"' -v ${CIENV_CVMFSVOLUME}:/cvmfs gitlab-registry.cern.ch/hep-benchmarks/hep-workloads-gpu/cms/cms-patatrack-nvidia-2:qa -e 100 -t 8 -c 1 + # run shrinkwrapper + docker exec cvmfs_${CI_JOB_ID} /root/shrinkwrap.sh -t /tmp/traces/ -e ${CVMFS_EXPORT_DIR} + # FIXME this is a dirty patch needed to make scipy running. cvmfs shrinkwrapper alone does not copy all files of that dir. 
To be investigated why + ls -lR ${CVMFS_EXPORT_DIR}/cvmfs/cms.cern.ch/slc7_amd64_gcc820/external/py2-scipy/1.2.3-bcolbf/lib/python2.7 >${CI_PROJECT_DIR}/cvmfs_export_py2-scipy_content + rm -fr ${CVMFS_EXPORT_DIR}/cvmfs/cms.cern.ch/slc7_amd64_gcc820/external/py2-scipy/1.2.3-bcolbf/lib/python2.7/site-packages + docker cp patatrack_container:/cvmfs/cms.cern.ch/slc7_amd64_gcc820/external/py2-scipy/1.2.3-bcolbf/lib/python2.7/site-packages ${CVMFS_EXPORT_DIR}/cvmfs/cms.cern.ch/slc7_amd64_gcc820/external/py2-scipy/1.2.3-bcolbf/lib/python2.7 + # remove duplicated data + rm -rf ${CVMFS_EXPORT_DIR}/cvmfs/.data + ls -R ${CVMFS_EXPORT_DIR} >${CI_PROJECT_DIR}/cvmfs_export_dir_content +} + +function _after_script() { + docker rm -f cvmfs_${CI_JOB_ID} + docker rm -f patatrack_container +} + +export CIENV_CVMFSVOLUME=/scratch/cvmfs_hep/CI-JOB-${CI_JOB_ID} +export CVMFS_EXPORT_DIR=${CI_PROJECT_DIR}/cms/patatrack +export CIENV_CVMFSREPO=cms.cern.ch +export CVMFS_IMAGE=gitlab-registry.cern.ch/hep-benchmarks/hep-workloads-builder/cvmfs-image:${CI_COMMIT_TAG:-$CI_COMMIT_BRANCH} \ No newline at end of file diff --git a/cms/patatrack/ci-scripts/test_standalone_image.sh b/cms/patatrack/ci-scripts/test_standalone_image.sh new file mode 100644 index 0000000000000000000000000000000000000000..217e361bb43648dcd5082f9fa945ad6f120c3269 --- /dev/null +++ b/cms/patatrack/ci-scripts/test_standalone_image.sh @@ -0,0 +1,15 @@ +#!/bin/bash + +# script used in gitlab CI +# for job job_test_standalone_image +# in file cms/cms-patatrack-ci.yml + +function _script() { + docker pull ${IMAGE_NAME} + # Here comes the test run of the CMS Patatrack standalone container. Arguments are for the time being defaults/hardcoded FIXME + docker run --rm --gpus '"device=0"' -v ${RESULTS_DIR}:/results ${IMAGE_NAME} -e 100 -t 8 -c 1 + mv ${RESULTS_DIR} ${CI_PROJECT_DIR}/. +} + +export RESULTS_DIR=/scratch/results/CI-JOB-${CI_JOB_ID} +export IMAGE_NAME=gitlab-registry.cern.ch/hep-benchmarks/hep-workloads-gpu/cms/cms-patatrack-nvidia-bmk:${CI_COMMIT_TAG:-$CI_COMMIT_BRANCH} diff --git a/cms/patatrack/cms-patatrack.spec b/cms/patatrack/cms-patatrack.spec new file mode 100644 index 0000000000000000000000000000000000000000..24be9b6a78d693a7bff8b1fddc2ed5f9add45caf --- /dev/null +++ b/cms/patatrack/cms-patatrack.spec @@ -0,0 +1,7 @@ +HEPWL_BMKEXE=cms-patatrack-bmk.sh +HEPWL_BMKOPTS="-t 4 -e 3" +HEPWL_BMKDIR=cms-patatrack +HEPWL_BMKDESCRIPTION="CMS PATATRACK, based on CMSSW_10_2_9" +HEPWL_DOCKERIMAGENAME=cms-patatrack-bmk +HEPWL_DOCKERIMAGETAG=v0.1 +HEPWL_CVMFSREPOS=cms.cern.ch diff --git a/cms/patatrack/cms-patatrack/DESCRIPTION b/cms/patatrack/cms-patatrack/DESCRIPTION new file mode 100644 index 0000000000000000000000000000000000000000..d3a74fb8a043a66a6469083a6287101837c10aa5 --- /dev/null +++ b/cms/patatrack/cms-patatrack/DESCRIPTION @@ -0,0 +1,4 @@ +THIS IS FOR CMS-RECO. FIX for patatrack +Reconstruction and analysis data creation. +The application is multi-threaded and requires an input data file containing simulated events. +The score consists of throughput (events per second) and CPU (CPU seconds per event). \ No newline at end of file diff --git a/cms/patatrack/cms-patatrack/bmk-driver.sh b/cms/patatrack/cms-patatrack/bmk-driver.sh new file mode 100644 index 0000000000000000000000000000000000000000..e711aeb142892d04b9b1ca8fbe0b435da3285610 --- /dev/null +++ b/cms/patatrack/cms-patatrack/bmk-driver.sh @@ -0,0 +1,443 @@ +if [ "$BASH_SOURCE" = "$0" ]; then echo "ERROR! 
This script ($0) was not sourced"; exit 1; fi +if [ "$BASH_SOURCE" = "" ]; then echo "ERROR! This script was not sourced from bash"; return 1; fi + +bmkDriver=$(basename ${BASH_SOURCE}) +bmkScript=$(basename $0) +BMKDIR=$(cd $(dirname $0); pwd) + +function myecho(){ + echo -e "[${FUNCNAME[1]}] $@" +} + +function advertise_bmkdriver(){ + myecho "\n========================================================================" + myecho "[$bmkDriver] $(date) entering common benchmark driver" + myecho "========================================================================\n" + myecho "[$bmkDriver] entering from $bmkScript\n" + # Dump workload-specific directory + myecho "[$bmkDriver] benchmark directory BMKDIR=${BMKDIR}:\n" + ls -lRt $BMKDIR + if [ -d $BMKDIR/../data ]; then + myecho "\n[$bmkDriver] data directory ${BMKDIR}/../data:\n" + ls -lRt $BMKDIR/../data + fi + echo +} + +# Check that mandatory functions exist or load them otherwise +function check_mandatory_functions(){ + # Check that function doOne has been defined + if [ "$(type -t doOne)" != "function" ]; then + myecho "[$bmkDriver] ERROR! Function 'doOne' must be defined in $bmkScript" # internal error (missing code) + exit 1; + fi + # Check that function parseResults has been defined, otherwise load it from parseResults.sh + if [ "$(type -t parseResults)" != "function" ]; then + myecho "[$bmkDriver] load parseResults.sh (function 'parseResults' is not defined in $bmkScript)" + if [ -f ${BMKDIR}/parseResults.sh ]; then + myecho "[$bmkDriver] sourcing ${BMKDIR}/parseResults.sh\n" + . ${BMKDIR}/parseResults.sh + if [ "$(type -t parseResults)" != "function" ]; then + myecho "[$bmkDriver] ERROR! Function 'parseResults' must be defined in $bmkScript or parseResults.sh" # internal error (missing code) + exit 1; + fi + else + myecho "[$bmkDriver] ERROR! 'parseResults' not defined and ${BMKDIR}/parseResults.sh not found\n" # internal error (missing code) + exit 1 + fi + fi +} + +# Check that mandatory variables have been defined (default values) +function check_mandatory_variables(){ + # Variables NCOPIES, NTHREADS, NEVENTS_THREAD have default values specific to each benchmark + for var in NCOPIES NTHREADS NEVENTS_THREAD; do + if [ "${!var}" == "" ]; then + myecho "[$bmkDriver] ERROR! 
A default value of $var must be set in $bmkScript" # internal error (missing code) + exit 1; + fi + done + echo +} + +# Variables USER_NCOPIES, USER_NTHREADS, USER_NEVENTS_THREAD are empty by default +USER_NCOPIES= +USER_NTHREADS= +USER_NEVENTS_THREADS= + +# Variable resultsDir has default value /results +# Variables skipSubDir and DEBUG are 0 by default +resultsDir=/results +skipSubDir=0 +DEBUG=0 + +function advertise_user_defined_variables(){ + for var in NCOPIES NTHREADS NEVENTS_THREAD; do + myecho "Default (from $bmkScript): $var=${!var}" + done + echo + for var in USER_NCOPIES USER_NTHREADS USER_NEVENTS_THREAD; do + myecho "Default (from $bmkDriver): $var=${!var}" + done + echo + for var in resultsDir skipSubDir DEBUG; do + myecho "Default (from $bmkDriver): $var=${!var}" + done +} + +# Usage function +function usage(){ + echo "" + echo "Usage: $0 [-w <resultsDir>] [-W] [-c <NCOPIES>] [-t <NTHREADS>] [-e <NEVENTS_PER_THREAD>] [-d] [-h]" + echo " -w <resultsDir> : results directory (default: /results , current: $resultsDir)" + echo " -W : store results in <resultsDir> directly (default: 0 , current: $skipSubDir)" + echo " -c <NCOPIES> : # identical copies (default $NCOPIES)" + echo " -t <NTHREADS> : # threads (or processes, or threads*processes) per copy (default $NTHREADS)" + echo " -e <NEVENTS_THREAD> : # events per thread (default $NEVENTS_THREAD)" + echo " -d : debug mode (current: $DEBUG)" + echo " -h : display this help and exit" + echo "" + if [ $NTHREADS -eq 1 ]; then + echo "NTHREADS : the default value NTHREADS=1 of this parameter cannot be changed" + echo " (single-threaded single-process workload application)" + echo "" + fi + echo "Without -W (default): results are stored in a new subdirectory of <resultsDir>:" + echo " <resultsDir>/<uniqueid>/*.json" + echo " <resultsDir>/<uniqueid>/proc_1/*.log" + echo " <resultsDir>/<uniqueid>/proc_.../*.log" + echo " <resultsDir>/<uniqueid>/proc_<COPIES>/*.log" + echo "With -W (e.g. in the CI): results are stored in <resultsDir> directly:" + echo " <resultsDir>/*.json" + echo " <resultsDir>/proc_1/*.log" + echo " <resultsDir>/proc_.../*.log" + echo " <resultsDir>/proc_<NCOPIES>/*.log" + echo "" + echo "Without -w (default) and without -W: <resultsDir> is /results" + echo "Without -w (default) and with -W: <resultsDir> is a tmp directory /tmp/xxxx" + echo "" + if [ "$(type -t usage_detailed)" == "function" ]; then + echo "\nDetailed Usage:\n----------------\n" + ( usage_detailed ) # as a subprocess, just in case this has a 0 exit code... + fi + echo "DESCRIPTION\n" + if [ -e $BMKDIR/DESCRIPTION ]; then + cat $BMKDIR/DESCRIPTION + else + echo "Sorry there is not description included." + fi + echo "" + exit 1 # early termination (help or invalid arguments to benchmark script) +} + +##################### +### HERE MAIN STARTS +##################### + +# Parse the input arguments +callUsage== +while getopts "c:t:e:w:Wdh" o; do + case ${o} in + c) + if [ $OPTARG -gt 0 ]; then + USER_NCOPIES=$OPTARG + else + myecho "[$bmkDriver] ERROR! Invalid argument '-c $OPTARG' (must be > 0)" + exit 1 # early termination (invalid arguments to benchmark script) + fi + ;; + t) + if [ $OPTARG -gt 0 ]; then + USER_NTHREADS=$OPTARG + if [ $NTHREADS -eq 1 ] && [ $USER_NTHREADS -ne 1 ]; then + myecho "[$bmkDriver] ERROR! Invalid argument '-t $OPTARG' (default NTHREADS=1 cannot be changed)" + exit 1 # early termination (invalid arguments to benchmark script) + fi + else + myecho "[$bmkDriver] ERROR! 
Invalid argument '-t $OPTARG' (must be > 0)" + exit 1 # early termination (invalid arguments to benchmark script) + fi + ;; + e) + if [ $OPTARG -gt 0 ]; then + USER_NEVENTS_THREAD=$OPTARG + else + myecho "[$bmkDriver] ERROR! Invalid argument '-e $OPTARG' (must be > 0)" + exit 1 + fi + ;; + w) + resultsDir=$OPTARG + ;; + W) + skipSubDir=1 + ;; + d) + DEBUG=1 + ;; + *) + callUsage=1 # need to do in this way to enable parsing of all arguments (see BMK-258) + ;; + esac +done + +if [ "$DEBUG" == 1 ]; then + myecho "\n[$bmkDriver] Parse input arguments '$@'\n" + advertise_bmkdriver + advertise_user_defined_variables +fi + +# No other input arguments are expected +shift $((OPTIND -1)) +if [ "$1" != "" ]; then usage; fi + +if [ "$callUsage" == "1" ]; then usage; fi + +# Check that mandatory functions exist or load them otherwise +check_mandatory_functions + +# Check that mandatory variables have been defined (default values) +check_mandatory_variables + +# Dump all relevant variables after parsing the input arguments +for var in USER_NCOPIES USER_NTHREADS USER_NEVENTS_THREAD; do + myecho "Current value: $var=${!var}" +done +echo +for var in resultsDir skipSubDir DEBUG; do + myecho "Current value: $var=${!var}" +done +echo + +# Variable resultsDir must be set through command line options +# Backward compatibility: all benchmarks initially hardcoded 'RESULTS_DIR=/results' +if [ "${resultsDir}" == "" ]; then + ###echo "[$bmkDriver] ERROR! resultsDir not specified ('-w' missing)" + ###exit 1 # early termination (invalid arguments to benchmark script) + if [ "$skipSubDir" == "1" ]; then + myecho "[$bmkDriver] WARNING! resultsDir not specified ('-w' missing), but '-W' is present: create a directory in /tmp\n" + resultsDir=$(mktemp -d) + else + myecho "[$bmkDriver] WARNING! resultsDir not specified ('-w' missing) and '-W' is missing: assume '/results'\n" + resultsDir=/results + fi +fi + +# Check that resultsDir is an existing directory +if [ ! -d ${resultsDir} ]; then + mkdir -p ${resultsDir} + if [ "$?" != "0" ]; then + myecho "[$bmkDriver] ERROR! directory '${resultsDir}' not found and could not be created" + exit 1 # early termination (cannot start processing) + fi +fi + +# Status code of the validateInputArguments and doOne steps +# fail<0 : validateInputArguments failed +# fail>0 : doOne failed +# fail=0 : OK +fail=0 + +# Call function validateInputArguments if it exists +if [ "$(type -t validateInputArguments)" != "function" ]; then + myecho "[$bmkDriver] function 'validateInputArguments' not found: use input arguments as given\n" + if [ "$USER_NCOPIES" != "" ]; then NCOPIES=$USER_NCOPIES; fi + if [ "$USER_NTHREADS" != "" ]; then NTHREADS=$USER_NTHREADS; fi # already checked that USER_NTHREADS must be 1 if NTHREADS is 1 + if [ "$USER_NEVENTS_THREAD" != "" ]; then NEVENTS_THREAD=$USER_NEVENTS_THREAD; fi +else + myecho "[$bmkDriver] function 'validateInputArguments' starting\n" + if ! validateInputArguments; then fail=-1; fi + myecho "\n[$bmkDriver] function 'validateInputArguments' completed (status=$fail)\n" +fi + +# Set baseWDir and create it if necessary +if [ "$skipSubDir" == "1" ]; then + baseWDir=${resultsDir} + myecho "[$bmkDriver] base working directory : $baseWDir\n" +else + baseWDir=${resultsDir}/$(basename $0 -bmk.sh)-c${NCOPIES}-e${NEVENTS_THREAD}-$(date +%s)_$(((RANDOM%9000)+1000)) + myecho "[$bmkDriver] base working directory : $baseWDir\n" + if ! mkdir $baseWDir; then + myecho "[$bmkDriver] ERROR! 
directory '${baseWDir}' cannot be created" + exit 1 # early termination (cannot start processing) + fi +fi +baseWDir=$(cd $baseWDir; pwd) + +# Dump all relevant variables after validating the input arguments +# Keep a copy on a separate log too for parser tests on previous logs +touch $baseWDir/inputs.log +for var in NCOPIES NTHREADS NEVENTS_THREAD; do + if [ "${!var}" == "" ] || ! [[ ${!var} =~ ^[0-9]+$ ]] || [ ! ${!var} -gt 0 ]; then + myecho "[$bmkDriver] ERROR! Invalid value $var=${!var}" + exit 1; + fi + myecho "Current value: $var=${!var}" + myecho "$var=${!var}" >> $baseWDir/inputs.log +done +echo + +# Keep a copy of the version.json file for parser tests on previous logs +if [ -f $BMKDIR/version.json ]; then + cp $BMKDIR/version.json $baseWDir +fi + +# Define APP before doOne (BMK-152) and parseResults +APP=$(basename ${BMKDIR}) # or equivalently here $(basename $0 -bmk.sh) +myecho "[$bmkDriver] APP=${APP}\n" + +# Wrapper for the doOne function +function doOneWrapper(){ + if [ "$1" == "" ] || [ "$2" != "" ]; then + myecho "[$bmkDriver] ERROR! Invalid arguments '$@' to doOneWrapper" # internal error (inconsistent code) + return 1 # NB: return or exit are equivalent here because doOneWrapper is executed as a subprocess + fi + myecho "\n[doOneWrapper ($1)] $(date) : process $1 started" + ###sleep 5 # this is not needed if the list of jobs is compiled from all '$!' + workDir=$(pwd)/proc_$1 # current directory is $baseWDir here + myecho "[doOneWrapper ($1)] workdir is ${workDir}" + if ! mkdir -p $workDir || ! cd $workDir; then + myecho "\n[doOneWrapper ($1)] $(date) : process $1 failed (cannot create workdir)\n" + return 1 + fi + log=${workDir}/doOneWrapper_$1.log + myecho "[doOneWrapper ($1)] logfile is $log" + if ! touch $log ; then + myecho "\n[doOneWrapper ($1)] $(date) : process $1 failed (cannot create logfile)\n" + return 1 + fi + myecho "[doOneWrapper ($1)] $(date) : process $1 configured" 2>&1 | tee -a $log # configured means that log exists + mkdir $workDir/HOME + export HOME=$workDir/HOME # avoid writing to /root in read-only docker or to host HOME in singularity (BMK-166) + myecho "[doOneWrapper ($1)] HOME=$HOME" 2>&1 | tee -a $log + cd -P /proc/self && basename $PWD | ( read thispid; \ + myecho "[doOneWrapper ($1)] current process pid is $thispid" 2>&1 | tee -a $log ) # see https://stackoverflow.com/a/15170225 + cd - > /dev/null + local pid=$(cat $log | grep "current process pid is" | sed -e "s/.*current process pid is //") + local parsertest=0 # hardcoded: 0 => doOne (default); 1 => test the parser on old logs and bypass doOne (BMK-152) + if [ $parsertest -eq 0 ]; then + # if [ "$(whoami)" == "root" ] && cat /proc/self/cgroup | cut -d/ -f2 | grep docker > /dev/null; then + # myecho "[doOneWrapper ($1)] inside docker - run doOne as bmkuser\n" 2>&1 | tee -a $log + # export -f doOne + # chown -R bmkuser:bmkuser $workDir 2>&1 | tee -a $log + # su bmkuser -s /bin/bash -c "doOne $1" 2>&1 | tee -a $log + # local status=${PIPESTATUS[0]} # NB do not use $? if you pipe to tee! + # chown -R root:root $workDir 2>&1 | tee -a $log + # else + myecho "[doOneWrapper ($1)] not inside docker - run doOne as $(whoami)\n" 2>&1 | tee -a $log + doOne $1 2>&1 | tee -a $log + local status=${PIPESTATUS[0]} # NB do not use $? if you pipe to tee! + # fi + else + cp -dpr $BMKDIR/jobs/refjob/proc_$1/* . + local status=$? 
+ \rm -f *${APP}*.json + myecho "[doOneWrapper ($1)] DUMMY doOne: copy old logs for parser tests (BMK-152)" + fi + if [ "$status" == "0" ]; then + myecho "\n[doOneWrapper ($1)] $(date) : process $1 (pid=$pid) completed ok\n" 2>&1 | tee -a $log + return 0 + else + myecho "\n[doOneWrapper ($1)] $(date) : process $1 (pid=$pid) failed\n" 2>&1 | tee -a $log + return 1 + fi +} + +# Export variables to the doOne subprocesses +for var in NCOPIES NTHREADS NEVENTS_THREAD BMKDIR DEBUG APP; do + export $var +done + +# Spawn doOne subprocesses (unless validateInputArguments failed) +if [ $fail -eq 0 ]; then + + # Spawn subprocesses (and keep track of their list of them using '$!') + myecho "------------------------------------------------------------------------" + myecho "[$bmkDriver] spawn $NCOPIES processes" + myecho "------------------------------------------------------------------------\n" + jobs="" + for i in $(seq 1 $NCOPIES); do + ( cd $baseWDir; doOneWrapper $i ) & + ipid=$! + [ $DEBUG -gt 0 ] && myecho "[$bmkDriver] spawned process $i with pid $ipid" + jobs="$jobs $ipid" + sleep 0.1 # stagger job creation by 100ms + done + + # Wait for all subprocesses to complete and check their exit codes + # [NB: do not use 'jobs -p': some jobs may be missing if already completed] + [ $DEBUG -gt 0 ] && myecho "\n[$bmkDriver] $(date) ... waiting for spawned processes with pid's$jobs\n" + wait $jobs > /dev/null 2>&1 + fail=0 # unnecessary but harmless (this code is only executed if $fail -eq 0) + for i in $(seq 1 $NCOPIES); do + if [ $(cat $baseWDir/proc_$i/doOneWrapper_$i.log | grep "[doOneWrapper ($i)]" | grep "completed ok" | wc -l) -ne 1 ]; then + let "fail+=1" + fi + done + myecho "\n------------------------------------------------------------------------" + if [ $fail -gt 0 ]; then + myecho "[$bmkDriver] ERROR! $fail processes failed (out of $NCOPIES)" + else + myecho "[$bmkDriver] all $NCOPIES processes completed successfully" + fi + myecho "------------------------------------------------------------------------\n" + +# Skip the doOne step if validateInputArguments failed +else + myecho "[$bmkDriver] validateInputArguments failed: skip doOne processing" +fi + +myecho '''FIXME bmkDriver is forced to exit here, + the parsing of results should be implemented + and this exit point removed + ''' +exit 0 #FIXME + +# Parse results and generate summary using function parseResults +# - parseResults is started in the base working directoy +# - the number of failed jobs is passed to parseResults as input parameter +# - if a separate function generateSummary exists, it must be internally called by parseResults +# - the environment variable APP=<vo>-<workload> defines the name of the json file ${APP}_summary.json +cd $baseWDir +myecho "[$bmkDriver] parse results and generate summary: starting" +myecho "[$bmkDriver] current directory : $(pwd)\n" +parseResults $fail +parse=$? +myecho "\n[$bmkDriver] parse results and generate summary: completed (status=$parse)" + +# Validate json files syntax (BMK-137) +cd $baseWDir +myecho "\n[$bmkDriver] json file validation: starting" +json=0 +jsonFile=$baseWDir/${APP}_summary.json +jsonFile_new=$baseWDir/${APP}_summary_new.json +if [ ! -f ${jsonFile} ]; then + myecho "[$bmkDriver] ERROR! json file '${jsonFile}' not found" + json=1 +else + myecho "[$bmkDriver] lint json file '${jsonFile}' syntax using jq" + if ! jq '.' 
-c < ${jsonFile}; then + myecho "[$bmkDriver] json file '${jsonFile}' lint validation failed" + json=1 + fi +fi +if [ -f ${jsonFile_new} ]; then + myecho "[$bmkDriver] lint json file '${jsonFile_new}' syntax using jq" + if ! jq '.' -c < ${jsonFile_new}; then + myecho "[$bmkDriver] json file '${jsonFile_new}' lint validation failed" + json=1 + fi +fi +myecho "[$bmkDriver] json file validation: completed (status=$json)\n" + +# NB: This script is meant to be sourced, it does not return or exit at the end +if [ $parse -ne 0 ] || [ $fail -ne 0 ] || [ $json -ne 0 ]; then + bmkStatus=1 +else + bmkStatus=0 +fi +myecho "[$bmkDriver] exiting back to $bmkScript" +myecho "\n========================================================================" +myecho "[$bmkDriver] $(date) exiting common benchmark driver (status=$bmkStatus)" +myecho "========================================================================\n" +exit $bmkStatus diff --git a/cms/patatrack/cms-patatrack/cms-patatrack-bmk.sh b/cms/patatrack/cms-patatrack/cms-patatrack-bmk.sh new file mode 100755 index 0000000000000000000000000000000000000000..eed2c5c4b55b633c34d7d09d5bbae2977b246279 --- /dev/null +++ b/cms/patatrack/cms-patatrack/cms-patatrack-bmk.sh @@ -0,0 +1,99 @@ +#!/bin/env bash +# Wrapper script based on work from https://github.com/sciaba/patatrack-tests +# 2020.06 David Southwick <david.southwick@cern.ch> - include newer workflow for pre8 patatrack, singularity support + +#set -x # enable debug printouts + +set -e # immediate exit on error + + +function myecho(){ + echo -e "[${FUNCNAME[1]}] $@" +} + +# Function doOne must be defined in each benchmark +# Input argument $1: process index (between 1 and $NCOPIES) +# Return value: please return 0 if this workload copy was successful, 1 otherwise +# The following variables are guaranteed to be defined and exported: NCOPIES, NTHREADS, NEVENTS_THREAD, BMKDIR, DEBUG +# The function is started in process-specific working directory <basewdir>/proc_$1: +# please store here the individual log files for each of the NCOPIES processes +function doOne(){ + if [ "$1" == "" ] || [ "$2" != "" ]; then myecho "ERROR! 
Invalid arguments '$@' to doOne"; return 1; fi + myecho "($1) $(date) starting in $(pwd)" + # Extra CMS-PATATRACK-specific setup + + ####################################### + # This needs to be fixed + + myecho "current dir is `pwd`" + myecho "files in `pwd` are" + ls -l + ${BMKDIR}/utility_scripts/benchmark.py ${BMKDIR}/cmssw_config.py #>>$LOG 2>&1 3>&1 + ####################################### + + status=${?} + myecho "($1) $(date) completed (status=$status)" + # Return 0 if this workload copy was successful, 1 otherwise + return $status +} + +# FIXME +# Using validateInputArguments for another purpose +# It woudl be useful to have a preparation function called by the driver + +# Optional function validateInputArguments may be defined in each benchmark +# If it exists, it is expected to set NCOPIES, NTHREADS, NEVENTS_THREAD +# (based on previous defaults and on user inputs USER_NCOPIES, USER_NTHREADS, USER_NEVENTS_THREADS) +# Input arguments: none +# Return value: please return 0 if input arguments are valid, 1 otherwise +# The following variables are guaranteed to be defined: NCOPIES, NTHREADS, NEVENTS_THREAD +# (benchmark defaults) and USER_NCOPIES, USER_NTHREADS, USER_NEVENTS_THREADS (user inputs) +function validateInputArguments(){ + + export CMSSW_RELEASE=CMSSW_11_1_0_pre8_Patatrack + export VO_CMS_SW_DIR=/cvmfs/cms.cern.ch + export LC_ALL=en_US.UTF-8 + + source $VO_CMS_SW_DIR/cmsset_default.sh + [[ ! -e ${CMSSW_RELEASE} ]] && scram project CMSSW ${CMSSW_RELEASE} + cd ${CMSSW_RELEASE}/src; + eval `scramv1 runtime -sh`; + cd - + + env | grep LD_LIBRARY_PATH + env | grep SRT_LD_LIBRARY_PATH_SCRAMRT + + # FIXME: so far, after having snapshotted cvmfs the LD_LIBRARY_PATH + # FIXME: does not contain all path needed as when cvmfs is bind mounted + # FIXME: therefore I'm forcing it to be as the correct one + export LD_LIBRARY_PATH=/bmk/cms-patatrack/CMSSW_11_1_0_pre8_Patatrack/biglib/slc7_amd64_gcc820:/bmk/cms-patatrack/CMSSW_11_1_0_pre8_Patatrack/lib/slc7_amd64_gcc820:/bmk/cms-patatrack/CMSSW_11_1_0_pre8_Patatrack/external/slc7_amd64_gcc820/lib:/cvmfs/cms.cern.ch/slc7_amd64_gcc820/cms/cmssw/CMSSW_11_1_0_pre8_Patatrack/biglib/slc7_amd64_gcc820:/cvmfs/cms.cern.ch/slc7_amd64_gcc820/cms/cmssw/CMSSW_11_1_0_pre8_Patatrack/lib/slc7_amd64_gcc820:/cvmfs/cms.cern.ch/slc7_amd64_gcc820/cms/cmssw/CMSSW_11_1_0_pre8_Patatrack/external/slc7_amd64_gcc820/lib:/cvmfs/cms.cern.ch/slc7_amd64_gcc820/external/llvm/9.0.1-pfdnen/lib64:/cvmfs/cms.cern.ch/slc7_amd64_gcc820/external/gcc/8.2.0-bcolbf/lib64:/cvmfs/cms.cern.ch/slc7_amd64_gcc820/external/gcc/8.2.0-bcolbf/lib:/usr/local/nvidia/lib:/usr/local/nvidia/lib64 + + export SRT_LD_LIBRARY_PATH_SCRAMRT=/bmk/cms-patatrack/CMSSW_11_1_0_pre8_Patatrack/biglib/slc7_amd64_gcc820:/bmk/cms-patatrack/CMSSW_11_1_0_pre8_Patatrack/lib/slc7_amd64_gcc820:/bmk/cms-patatrack/CMSSW_11_1_0_pre8_Patatrack/external/slc7_amd64_gcc820/lib:/cvmfs/cms.cern.ch/slc7_amd64_gcc820/cms/cmssw/CMSSW_11_1_0_pre8_Patatrack/biglib/slc7_amd64_gcc820:/cvmfs/cms.cern.ch/slc7_amd64_gcc820/cms/cmssw/CMSSW_11_1_0_pre8_Patatrack/lib/slc7_amd64_gcc820:/cvmfs/cms.cern.ch/slc7_amd64_gcc820/cms/cmssw/CMSSW_11_1_0_pre8_Patatrack/external/slc7_amd64_gcc820/lib:/cvmfs/cms.cern.ch/slc7_amd64_gcc820/external/llvm/9.0.1-pfdnen/lib64:/cvmfs/cms.cern.ch/slc7_amd64_gcc820/external/gcc/8.2.0-bcolbf/lib64:/cvmfs/cms.cern.ch/slc7_amd64_gcc820/external/gcc/8.2.0-bcolbf/lib + + # Configure WL copy + myecho "info about python and tests" + python --version + which python + python -c 'import scipy; print(scipy.__path__)' + 
python -c 'import numpy; print(numpy.__path__)' + python -c 'from scipy import stats; import numpy as np; x=np.array([1,2,3]); y=np.array([1.1,2,2.9]); print(stats.linregress(x,y).slope)' + return 0 +} + +# Default values for NCOPIES, NTHREADS, NEVENTS_THREAD must be set in each benchmark +NTHREADS=8 +NCOPIES=1 +NEVENTS_THREAD=10 +if [ "$NCOPIES" -lt 1 ]; then # when $NTHREADS > nproc + NCOPIES=1 + NTHREADS=`nproc` +fi + +export LC_ALL=en_US.UTF-8 + +# Source the common benchmark driver +if [ -f $(dirname $0)/bmk-driver.sh ]; then + . $(dirname $0)/bmk-driver.sh +else + . $(dirname $0)/../../../common/bmk-driver.sh +fi diff --git a/cms/patatrack/cms-patatrack/cms-reco-bmk.sh b/cms/patatrack/cms-patatrack/cms-reco-bmk.sh new file mode 100755 index 0000000000000000000000000000000000000000..6a91e8f4f7c98029df1d85a43fb20afa6505415f --- /dev/null +++ b/cms/patatrack/cms-patatrack/cms-reco-bmk.sh @@ -0,0 +1,93 @@ +#!/bin/bash + +#set -x # enable debug printouts + +#set -e # immediate exit on error + +# Function doOne must be defined in each benchmark +# Input argument $1: process index (between 1 and $NCOPIES) +# Return value: please return 0 if this workload copy was successful, 1 otherwise +# The following variables are guaranteed to be defined and exported: NCOPIES, NTHREADS, NEVENTS_THREAD, BMKDIR, DEBUG +# The function is started in process-specific working directory <basewdir>/proc_$1: +# please store here the individual log files for each of the NCOPIES processes +function doOne(){ + if [ "$1" == "" ] || [ "$2" != "" ]; then echo "[doOne] ERROR! Invalid arguments '$@' to doOne"; return 1; fi + echo "[doOne ($1)] $(date) starting in $(pwd)" + # Extra CMS-RECO-specific setup + export CMSSW_RELEASE=CMSSW_10_2_9 + export VO_CMS_SW_DIR=/cvmfs/cms.cern.ch + source $VO_CMS_SW_DIR/cmsset_default.sh + export SCRAM_ARCH=slc6_amd64_gcc700 + [[ ! -e ${CMSSW_RELEASE} ]] && scram project CMSSW ${CMSSW_RELEASE} + pushd ${CMSSW_RELEASE}; eval `scramv1 runtime -sh`; popd + # Configure WL copy + ln -s ${BMKDIR}/data/GlobalTag.db ./GlobalTag.db + ln -s ${BMKDIR}/data/*.root . + CMSSW_CONF=step3_RAW2DIGI_L1Reco_RECO_EI_PAT_DQM.py + JOB_EVENTS=$(( NEVENTS_THREAD * NTHREADS )) # bash shell arithmetic, may use var instead of $var + cp ${BMKDIR}/${CMSSW_CONF}_template ./${CMSSW_CONF} + sed -e "s@_NEVENTS_@${JOB_EVENTS}@g" -e "s@_NTHREADS_@$NTHREADS@g" -i ./${CMSSW_CONF} + # Execute WL copy + LOG=out_$1.log + cmsRun ./${CMSSW_CONF} >>$LOG 2>&1 3>&1 + status=${?} + echo "[doOne ($1)] $(date) completed (status=$status)" + # Return 0 if this workload copy was successful, 1 otherwise + return $status +} + +# Optional function validateInputArguments may be defined in each benchmark +# If it exists, it is expected to set NCOPIES, NTHREADS, NEVENTS_THREAD +# (based on previous defaults and on user inputs USER_NCOPIES, USER_NTHREADS, USER_NEVENTS_THREADS) +# Input arguments: none +# Return value: please return 0 if input arguments are valid, 1 otherwise +# The following variables are guaranteed to be defined: NCOPIES, NTHREADS, NEVENTS_THREAD +# (benchmark defaults) and USER_NCOPIES, USER_NTHREADS, USER_NEVENTS_THREADS (user inputs) +function validateInputArguments(){ + if [ "$1" != "" ]; then echo "[validateInputArguments] ERROR! 
Invalid arguments '$@' to validateInputArguments"; return 1; fi + echo "[validateInputArguments] validate input arguments" + # Number of copies and number of threads per copy + if [ "$USER_NTHREADS" != "" ] && [ "$USER_NCOPIES" != "" ]; then + NCOPIES=$USER_NCOPIES + NTHREADS=$USER_NTHREADS + elif [ "$USER_NTHREADS" != "" ]; then + NTHREADS=$USER_NTHREADS + NCOPIES=$((`nproc`/$NTHREADS)) + elif [ "$USER_NCOPIES" != "" ]; then + NCOPIES=$USER_NCOPIES + NTHREADS=$((`nproc`/$NCOPIES)) + fi + # Number of events per thread + if [ "$USER_NEVENTS_THREAD" != "" ]; then NEVENTS_THREAD=$USER_NEVENTS_THREAD; fi + # Return 0 if input arguments are valid, 1 otherwise + # Report any issues to parseResults via s_msg + export s_msg="ok" + tot_load=$(($NCOPIES*$NTHREADS)) + if [ $tot_load -gt `nproc` ]; then + s_msg="[ERROR] NCOPIES*NTHREADS=$NCOPIES*$NTHREADS=$tot_load > number of available cores (`nproc`)" + return 1 + elif [ $tot_load -eq 0 ]; then + s_msg="[ERROR] NCOPIES*NTHREADS=$NCOPIES*$NTHREADS=$tot_load. Please fix it" + return 1 + elif [ $tot_load -ne `nproc` ]; + then s_msg="[WARNING] NCOPIES*NTHREADS ($NCOPIES*$NTHREADS=$tot_load) != `nproc` (number of available cores nproc)" + echo $s_msg + fi + return 0 +} + +# Default values for NCOPIES, NTHREADS, NEVENTS_THREAD must be set in each benchmark +NTHREADS=4 +NCOPIES=$(( `nproc` / $NTHREADS )) +NEVENTS_THREAD=100 +if [ "$NCOPIES" -lt 1 ]; then # when $NTHREADS > nproc + NCOPIES=1 + NTHREADS=`nproc` +fi + +# Source the common benchmark driver +if [ -f $(dirname $0)/bmk-driver.sh ]; then + . $(dirname $0)/bmk-driver.sh +else + . $(dirname $0)/../../../common/bmk-driver.sh +fi diff --git a/cms/patatrack/cms-patatrack/cmssw_config.py b/cms/patatrack/cms-patatrack/cmssw_config.py new file mode 100644 index 0000000000000000000000000000000000000000..4fd5c3bde294afc0ca15697402dd516b1dda0cfc --- /dev/null +++ b/cms/patatrack/cms-patatrack/cmssw_config.py @@ -0,0 +1,141 @@ +# Auto generated configuration file +# using: +# Revision: 1.19 +# Source: /local/reps/CMSSW/CMSSW/Configuration/Applications/python/ConfigBuilder.py,v +# with command line options: profile --data --era Run2_2018 --geometry DB:Extended --conditions 102X_dataRun2_HLT_v2 -s RAW2DIGI:RawToDigi_pixelOnly,RECO:reconstruction_pixelTrackingOnly,DQM:@pixelTrackingOnlyDQM --procModifiers gpu --customise RecoPixelVertexing/Configuration/customizePixelTracksForProfiling.customizePixelTracksForProfilingGPUOnly -n 4200 --nThreads 8 --runUnscheduled --filein file:step2.root --fileout file:step3.root --datatier GEN-SIM-RECO,DQMIO --eventcontent RECOSIM,DQM --python_filename profile.py --no_exec +import FWCore.ParameterSet.Config as cms + +from Configuration.Eras.Era_Run2_2018_cff import Run2_2018 +from Configuration.ProcessModifiers.gpu_cff import gpu + +process = cms.Process('RECO',Run2_2018,gpu) + +# import of standard configurations +process.load('Configuration.StandardSequences.Services_cff') +process.load('SimGeneral.HepPDTESSource.pythiapdt_cfi') +process.load('FWCore.MessageService.MessageLogger_cfi') +process.load('Configuration.EventContent.EventContent_cff') +process.load('Configuration.StandardSequences.GeometryRecoDB_cff') +process.load('Configuration.StandardSequences.MagneticField_AutoFromDBCurrent_cff') +process.load('Configuration.StandardSequences.RawToDigi_Data_cff') +process.load('Configuration.StandardSequences.Reconstruction_Data_cff') +process.load('DQMServices.Core.DQMStoreNonLegacy_cff') +process.load('DQMOffline.Configuration.DQMOffline_cff') 
+process.load('Configuration.StandardSequences.FrontierConditions_GlobalTag_cff') + +process.maxEvents = cms.untracked.PSet( + input = cms.untracked.int32(4200), + output = cms.optional.untracked.allowed(cms.int32,cms.PSet) +) + +process.options = cms.untracked.PSet( + FailPath = cms.untracked.vstring(), + IgnoreCompletely = cms.untracked.vstring(), + Rethrow = cms.untracked.vstring(), + SkipEvent = cms.untracked.vstring(), + allowUnscheduled = cms.obsolete.untracked.bool, + canDeleteEarly = cms.untracked.vstring(), + emptyRunLumiMode = cms.obsolete.untracked.string, + eventSetup = cms.untracked.PSet( + forceNumberOfConcurrentIOVs = cms.untracked.PSet( + + ), + numberOfConcurrentIOVs = cms.untracked.uint32(1) + ), + fileMode = cms.untracked.string('FULLMERGE'), + forceEventSetupCacheClearOnNewRun = cms.untracked.bool(False), + makeTriggerResults = cms.obsolete.untracked.bool, + numberOfConcurrentLuminosityBlocks = cms.untracked.uint32(1), + numberOfConcurrentRuns = cms.untracked.uint32(1), + numberOfStreams = cms.untracked.uint32(0), + numberOfThreads = cms.untracked.uint32(1), + printDependencies = cms.untracked.bool(False), + sizeOfStackForThreadsInKB = cms.optional.untracked.uint32, + throwIfIllegalParameter = cms.untracked.bool(True), + wantSummary = cms.untracked.bool(False) +) + +# Production Info +process.configurationMetadata = cms.untracked.PSet( + annotation = cms.untracked.string('profile nevts:4200'), + name = cms.untracked.string('Applications'), + version = cms.untracked.string('$Revision: 1.19 $') +) + +# Output definition + +process.RECOSIMoutput = cms.OutputModule("PoolOutputModule", + dataset = cms.untracked.PSet( + dataTier = cms.untracked.string('GEN-SIM-RECO'), + filterName = cms.untracked.string('') + ), + fileName = cms.untracked.string('file:step3.root'), + outputCommands = process.RECOSIMEventContent.outputCommands, + splitLevel = cms.untracked.int32(0) +) + +process.DQMoutput = cms.OutputModule("DQMRootOutputModule", + dataset = cms.untracked.PSet( + dataTier = cms.untracked.string('DQMIO'), + filterName = cms.untracked.string('') + ), + fileName = cms.untracked.string('file:step3_inDQM.root'), + outputCommands = process.DQMEventContent.outputCommands, + splitLevel = cms.untracked.int32(0) +) + +# Additional output definition + +# Other statements +from Configuration.AlCa.GlobalTag import GlobalTag +process.GlobalTag = GlobalTag(process.GlobalTag, '102X_upgrade2018_design_v9', '') + +# Path and EndPath definitions +process.raw2digi_step = cms.Path(process.RawToDigi_pixelOnly) +process.reconstruction_step = cms.Path(process.reconstruction_pixelTrackingOnly) +process.dqmoffline_step = cms.EndPath(process.DQMOfflinePixelTracking) +process.dqmofflineOnPAT_step = cms.EndPath(process.PostDQMOffline) +process.RECOSIMoutput_step = cms.EndPath(process.RECOSIMoutput) +process.DQMoutput_step = cms.EndPath(process.DQMoutput) + +# Schedule definition +process.schedule = cms.Schedule(process.raw2digi_step,process.reconstruction_step,process.dqmoffline_step,process.dqmofflineOnPAT_step,process.RECOSIMoutput_step,process.DQMoutput_step) +from PhysicsTools.PatAlgos.tools.helpers import associatePatAlgosToolsTask +associatePatAlgosToolsTask(process) + +#Setup FWK for multithreaded +process.options.numberOfThreads=cms.untracked.uint32(8) +process.options.numberOfStreams=cms.untracked.uint32(0) +process.options.numberOfConcurrentLuminosityBlocks=cms.untracked.uint32(1) + +# customisation of the process. 
+ +# Automatic addition of the customisation function from RecoPixelVertexing.Configuration.customizePixelTracksForProfiling +from RecoPixelVertexing.Configuration.customizePixelTracksForProfiling import customizePixelTracksForProfilingGPUOnly + +#call to customisation function customizePixelTracksForProfilingGPUOnly imported from RecoPixelVertexing.Configuration.customizePixelTracksForProfiling +process = customizePixelTracksForProfilingGPUOnly(process) + +# End of customisation functions +#do not add changes to your config after this point (unless you know what you are doing) + + +# Customisation from command line + +#Have logErrorHarvester wait for the same EDProducers to finish as those providing data for the OutputModule +from FWCore.Modules.logErrorHarvester_cff import customiseLogErrorHarvesterUsingOutputCommands +process = customiseLogErrorHarvesterUsingOutputCommands(process) + +# Add early deletion of temporary data products to reduce peak memory need +from Configuration.StandardSequences.earlyDeleteSettings_cff import customiseEarlyDelete +process = customiseEarlyDelete(process) +# End adding early deletion + +# load data using the DAQ source +process.load('sourceFromPixelRaw_cff') + +# report CUDAService messages +process.MessageLogger.categories.append("CUDAService") + +# print the summary +process.options.wantSummary = cms.untracked.bool( True ) diff --git a/cms/patatrack/cms-patatrack/data/.keepme b/cms/patatrack/cms-patatrack/data/.keepme new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/cms/patatrack/cms-patatrack/parseResults.sh b/cms/patatrack/cms-patatrack/parseResults.sh new file mode 100644 index 0000000000000000000000000000000000000000..2af853aac8b43aa43590ca6075fdc86f15c17570 --- /dev/null +++ b/cms/patatrack/cms-patatrack/parseResults.sh @@ -0,0 +1,67 @@ +function generateSummary(){ + echo -e "{\"copies\":$NCOPIES , \"threads_per_copy\":$NTHREADS , \"events_per_thread\" : $NEVENTS_THREAD , \"wl-scores\": $res_score, \"wl-stats\": {\"throughput_score\": $res_thr , \"CPU_score\": $res_cpu }, \"log\": \"${s_msg}\", \"app\": `cat $BMKDIR/version.json` }" > ${APP}_summary.json + cat ${APP}_summary.json +} + +# Function parseResults must be defined in each benchmark (or in a separate file parseResults.sh) +# [NB: if a separate function generateSummary exists, it must be internally called by parseResults] +# Input argument $1: status code <fail> from validateInputArguments and doOne steps: +# - <fail> < 0: validateInputArguments failed +# - <fail> > 0: doOne failed (<fail> processes failed out of $NCOPIES) +# - <fail> = 0: OK +# Return value: please return 0 if parsing was successful, 1 otherwise +# The following variables are guaranteed to be defined and exported: NCOPIES, NTHREADS, NEVENTS_THREAD, BMKDIR, DEBUG, APP +# The environment variable APP=<vo>-<workload> defines the name of the json file ${APP}_summary.json +# Logfiles have been stored in process-specific working directories <basewdir>/proc_<1...NCOPIES> +# The function is started in the base working directory <basewdir>: +# please store here the overall json summary file for all NCOPIES processes combined +function parseResults(){ + if [ "$1" == "" ] || [ "$2" != "" ]; then echo "[parseresults] ERROR! 
Invalid arguments '$@' to parseResults"; return 1; fi + echo "[parseResults] parse results and generate summary (previous status: $1)" + echo "[parseResults] current directory: $(pwd)" + export res_cpu='""' + export res_thr='""' + export res_score='""' + export s_msg="ok" + if [ "$1" -ne 0 ]; then + echo "Previous steps failed: skip parsing, go to generateSummary" + generateSummary # this has no return code + return 1 + else + #----------------------- + # Parse results + #----------------------- + echo "[parseResults] parsing results from" proc_*/out_*.log + # Documentation of cmssw time report at https://github.com/cms-sw/cmssw/blob/09c3fce6626f70fd04223e7dacebf0b485f73f54/FWCore/Services/plugins/Timing.cc#L240 + # Parsing Event Throughput: xxxx ev/s + res_thr=`grep -H "Event Throughput" proc_*/out_*.log | sed -e "s@[^:]*: Event Throughput: \([ 0-9\.]*\) ev/s@\1@" | awk 'BEGIN{amin=1000000;amax=0;count=0;} { val=$1; a[count]=val; count+=1; sum+=val; if(amax<val) amax=val; if(amin>val) amin=val} END{n = asort(a); if (n % 2) { median=a[(n + 1) / 2]; } else {median=(a[(n / 2)] + a[(n / 2) + 1]) / 2.0;}; +printf "{\"score\": %.4f, \"avg\": %.4f, \"median\": %.4f, \"min\": %.4f, \"max\": %.4f}", sum, sum/count, median, amin, amax +}' nevt=$NEVENTS_THREAD nthread=$NTHREADS || (echo "{}"; return 1)` + STATUS_1=$? + + #Duplicating above parsing, as quick and dirty. SHoudl be replaced by a python parser + res_score=`grep -H "Event Throughput" proc_*/out_*.log | sed -e "s@[^:]*: Event Throughput: \([ 0-9\.]*\) ev/s@\1@" | awk 'BEGIN{amin=1000000;amax=0;count=0;} { val=$1; a[count]=val; count+=1; sum+=val; if(amax<val) amax=val; if(amin>val) amin=val} END{n = asort(a); if (n % 2) { median=a[(n + 1) / 2]; } else {median=(a[(n / 2)] + a[(n / 2) + 1]) / 2.0;}; +printf "{\"reco\": %.4f}", sum +}' nevt=$NEVENTS_THREAD nthread=$NTHREADS || (echo "{}"; return 1)` + + # Parsing CPU Summary: \n- Total loop:: xxxx seconds of all CPUs + res_cpu=`grep -H -A2 "CPU Summary" proc_*/out_*.log | grep "Total loop" | sed -e "s@.*\sTotal loop: \([ 0-9\.]*\)@\1@" | awk 'BEGIN{amin=1000000;amax=0;count=0;} { val=nevt*nthread/$1; a[count]=val; count+=1; sum+=val; if(amax<val) amax=val; if(amin>val) amin=val} END{n = asort(a); if (n % 2) {median=a[(n + 1) / 2]; } else {median=(a[(n / 2)] + a[(n / 2) + 1]) / 2.0;}; +printf "{\"score\": %.4f, \"avg\": %.4f, \"median\": %.4f, \"min\": %.4f, \"max\": %.4f}", sum, sum/count, median, amin, amax +}' nevt=$NEVENTS_THREAD nthread=$NTHREADS || (echo "{}"; return 1)` + STATUS_2=$? + [[ "$STATUS_1" == "0" ]] && [[ "$STATUS_2" == "0" ]] + STATUS=$? 
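+    # STATUS is 0 only if both awk parsing steps above (event throughput and CPU summary) succeeded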
+ [[ "$STATUS" != "0" ]] && export s_msg="ERROR" + echo "[parseResults] parsing completed (status=$STATUS)" + #----------------------- + # Generate summary + #----------------------- + echo "[parseResults] generate summary" + generateSummary # this has no return code + #----------------------- + # Return status + #----------------------- + # Return 0 if result parsing and json generation were successful, 1 otherwise + return $STATUS + fi +} diff --git a/cms/patatrack/cms-patatrack/prepare-patch.sh b/cms/patatrack/cms-patatrack/prepare-patch.sh new file mode 100755 index 0000000000000000000000000000000000000000..bf2fbac7516457c3bba311711944b2d0e744d093 --- /dev/null +++ b/cms/patatrack/cms-patatrack/prepare-patch.sh @@ -0,0 +1,30 @@ +#!/bin/env bash + +# FIXME: THIS set of replaces should simply go in the hep-worklaods-gpu repo +# the dependency from the sciaba repo should go away, and possibly also the onte from patatrack-scripts +# or at least use a specific branch of patatrack-scripts + +install_dir="/tmp/install" +echo -e "\nCloning Patatrack repos into ${install_dir}..." + +ls -l ${install_dir} + +cd $install_dir + +# Clone software repos +git clone https://github.com/cms-patatrack/patatrack-scripts +git clone https://github.com/sciaba/patatrack-tests + +echo -e "\nSet up Patatrack Scripts..." +# Prepare scripts +cp ${install_dir}/patatrack-tests/*/*.patch \ + ${install_dir}/patatrack-tests/config/sourceFromPixelRaw_cff.py \ + ${install_dir}/patatrack-scripts/ + +cd ${install_dir}/patatrack-scripts/ +patch -b --forward workflow.sh workflow.patch + +ls -l + +[ ! -d /bmk/cms-patatrack ] && mkdir -p /bmk/cms-patatrack +cp -r ${install_dir}/patatrack-scripts /bmk/cms-patatrack \ No newline at end of file diff --git a/cms/patatrack/cms-patatrack/test_parser.sh b/cms/patatrack/cms-patatrack/test_parser.sh new file mode 100755 index 0000000000000000000000000000000000000000..4778af6a08d28bf4f3c3d0c761ba37a1aa447243 --- /dev/null +++ b/cms/patatrack/cms-patatrack/test_parser.sh @@ -0,0 +1,2 @@ +#!/bin/bash +$(dirname $0)/../../../common/parsertest.sh $(dirname $0) diff --git a/cms/patatrack/cms-patatrack/utility_scripts/benchmark.py b/cms/patatrack/cms-patatrack/utility_scripts/benchmark.py new file mode 100755 index 0000000000000000000000000000000000000000..e3347b89b84c58b47b2ac67342c36b7400da8436 --- /dev/null +++ b/cms/patatrack/cms-patatrack/utility_scripts/benchmark.py @@ -0,0 +1,66 @@ +#! /usr/bin/env python + +import sys +import os +import copy + +from multirun import * +import FWCore.ParameterSet.Config as cms + + +if __name__ == "__main__": + if not 'CMSSW_BASE' in os.environ: + # FIXME print a meaningful error message + sys.exit(1) + + if len(sys.argv) == 1: + # FIXME print a meaningful error message + sys.exit(1) + + # TODO parse arguments and options from the command line + options = { + 'verbose' : False, + 'plumbing' : False, + 'warmup' : True, + 'events' : 4200, + 'repeats' : 4, + 'jobs' : 1, + 'threads' : 8, # per job + 'streams' : 8, # per job + 'gpus_per_job' : 1, # per job + 'allow_hyperthreading': False, # this has no effect if set_cpu_affinity is False + 'set_cpu_affinity' : True, + 'set_gpu_affinity' : True, + 'logdir' : None, # relative or absolute path, or None to disable storing the logs + 'keep' : [], # output files to be kept + } + + +#### FIXME: Not clear if for GPU benchamring purposes we need to +#### FIXME: run before io benchmark. 
Skipping for the itme being +#### FIXME: setting flag to False + run_io_benchmark = False + #run_io_benchmark = True + + info() + + for config in sys.argv[1:]: + process = parseProcess(config) + + if run_io_benchmark: + print 'Benchmarking only I/O' + io = copy.deepcopy(process) + io.hltGetRaw = cms.EDAnalyzer("HLTGetRaw", RawDataCollection = cms.InputTag("rawDataCollector")) + io.path = cms.Path(io.hltGetRaw) + io.schedule = cms.Schedule(io.path) + if 'PrescaleService' in io.__dict__: + del io.PrescaleService + io_options = copy.deepcopy(options) + io_options['logdir'] = None + io_options['keep'] = [] + multiCmsRun(io, **io_options) + run_io_benchmark = False + print + + print 'Benchmarking %s' % config + multiCmsRun(process, **options) diff --git a/cms/patatrack/cms-patatrack/utility_scripts/cpuinfo.py b/cms/patatrack/cms-patatrack/utility_scripts/cpuinfo.py new file mode 100755 index 0000000000000000000000000000000000000000..77c16ec8607c3b8cb41f10472fb9bd1dd27c2382 --- /dev/null +++ b/cms/patatrack/cms-patatrack/utility_scripts/cpuinfo.py @@ -0,0 +1,80 @@ +#! /usr/bin/env python + +import sys +import subprocess +import re +import collections + + +class CPUInfo(object): + def __init__(self, socket = None, model = None): + self.socket = socket + self.model = model + self.cores = {} + self.hardware_threads = [] + self.physical_processors = [] + + def add_core(self, core, thread): + if core in self.cores: + self.cores[core].append(thread) + else: + self.cores[core] = [ thread ] + + def finalise(self): + for core in self.cores.values(): + self.physical_processors.append(core[0]) + self.hardware_threads.extend(core) + self.physical_processors.sort() + self.hardware_threads.sort() + + +# cache results across calls +__cache = None + + +# return a mapping between sockets and CPUInfo objects +def get_cpu_info(cache = True): + global __cache + if cache and __cache: + return __cache + + cpus = collections.OrderedDict() + + model = 'Unknown CPU' + description = subprocess.Popen(['lscpu', ], stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()[0] + for line in description.splitlines(): + if 'Model name:' in line: + model = line.split(':')[1].strip() + break + + devices = subprocess.Popen(['lscpu', '-b', '-p=SOCKET,NODE,CORE,CPU'], stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()[0] + for line in devices.splitlines(): + if '#' in line: + continue + + sock, numa, core, proc = line.split(',') + sock = int(sock) if sock else 0 + numa = int(numa) if numa else sock # currently unused + core = int(core) if core else 0 + proc = int(proc) if proc else 0 + + if not sock in cpus: + cpus[sock] = CPUInfo(sock, model) + cpus[sock].add_core(core, proc) + + for cpu in cpus.values(): + cpu.finalise() + + if cache: + __cache = cpus + + return cpus + + +if __name__ == "__main__": + cpus = get_cpu_info() + print '%d CPUs:' % len(cpus) + for cpu in cpus.values(): + print ' %d: %s (%d cores, %d threads)' % (cpu.socket, cpu.model, len(cpu.physical_processors), len(cpu.hardware_threads)) + print ' cores: %s' % ', '.join(map(str, cpu.physical_processors)) + print ' HT\'s: %s' % ', '.join(map(str, cpu.hardware_threads)) diff --git a/cms/patatrack/cms-patatrack/utility_scripts/gpuinfo.py b/cms/patatrack/cms-patatrack/utility_scripts/gpuinfo.py new file mode 100755 index 0000000000000000000000000000000000000000..7abd837f98b6d2da6ceb402eaba392a3910d23af --- /dev/null +++ b/cms/patatrack/cms-patatrack/utility_scripts/gpuinfo.py @@ -0,0 +1,58 @@ +#! 
/usr/bin/env python + +import sys +import os +import subprocess +import re +import collections + + +class GPUInfo(object): + def __init__(self, device = None, model = None): + self.device = device + self.model = model + + +# cache results across calls +__cache = None + + +# return a mapping between devices and GPUInfo objects +def get_gpu_info(cache = True): + global __cache + if cache and __cache: + return __cache + + gpus = collections.OrderedDict() + + visible = None + if 'CUDA_VISIBLE_DEVICES' in os.environ: + if os.environ['CUDA_VISIBLE_DEVICES'] == '': + visible = [] + else: + visible = [int(device) for device in os.environ['CUDA_VISIBLE_DEVICES'].split(',')] + + devices = subprocess.Popen(['cudaComputeCapabilities', ], stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()[0] + for line in devices.splitlines(): + matches = re.match(r' *([0-9]+) +([0-9]+\.[0-9]) +(.*)', line) + if matches: + device = int(matches.group(1)) + if visible: + device = visible[device] + model = matches.group(3).strip() + gpus[device] = GPUInfo(device, model) + + if cache: + __cache = gpus + + return gpus + + +if __name__ == "__main__": + gpus = get_gpu_info() + if gpus: + print '%d visible NVIDIA GPUs:' % len(gpus) + for gpu in gpus.values(): + print ' %d: %s' % (gpu.device, gpu.model) + else: + print 'No visible NVIDIA GPUs' diff --git a/cms/patatrack/cms-patatrack/utility_scripts/multirun.py b/cms/patatrack/cms-patatrack/utility_scripts/multirun.py new file mode 100755 index 0000000000000000000000000000000000000000..d1281a5f83d39166944f53511fd08f8b13b0f17e --- /dev/null +++ b/cms/patatrack/cms-patatrack/utility_scripts/multirun.py @@ -0,0 +1,455 @@ +#! /usr/bin/env python + +import sys +import os +import copy +import imp +import itertools +import math +import shutil +import subprocess +import tempfile +from collections import defaultdict +from datetime import datetime +import numpy as np +from scipy import stats + +# FIXME check that CMSSW_BASE is set +import FWCore.ParameterSet.Config as cms + +# set the output encoding to UTF-8 for pipes and redirects +from set_output_encoding import * +set_output_encoding(encoding='utf-8', force=True) + +from cpuinfo import * +from gpuinfo import * +from threaded import threaded + +cpus = get_cpu_info() +gpus = get_gpu_info() + +epoch = datetime.now() + +@threaded +def singleCmsRun(filename, workdir, logdir = None, keep = [], verbose = False, cpus = None, gpus = None, *args): + # optionally set CPU affinity + command = ('cmsRun', filename) + args + if cpus is not None: + command = ('taskset', '-c', cpus) + command + cmdline = ' '.join(command) + + # optionally set GPU affinity + environment = os.environ.copy() + if gpus is not None: + environment['CUDA_VISIBLE_DEVICES'] = gpus + cmdline = 'CUDA_VISIBLE_DEVICES=' + gpus + ' ' + cmdline + + if verbose: + print cmdline + + # run a cmsRun job, redirecting standard output and error to files + lognames = ('stdout', 'stderr') + logfiles = tuple('%s/%s' % (workdir, name) for name in ('stdout', 'stderr')) + stdout = open(logfiles[0], 'w') + stderr = open(logfiles[1], 'w') + job = subprocess.Popen(command, cwd = workdir, env = environment, stdout = stdout, stderr = stderr) + job.communicate() + stdout.close() + stderr.close() + + # if requested, move the logs and any additional artifacts to the log directory + if logdir: + for name in list(keep) + list(lognames): + if os.path.isfile(workdir + '/' + name): + shutil.move(workdir + '/' + name, '%s/cmsRun%06d.%s' % (logdir, job.pid, name)) + logfiles = 
tuple('%s/cmsRun%06d.%s' % (logdir, job.pid, name) for name in lognames) + + stderr = open(logfiles[1], 'r') + + if (job.returncode < 0): + print "The underlying cmsRun job was killed by signal %d" % -job.returncode + print + print "The last lines of the error log are:" + print "".join(stderr.readlines()[-10:]) + print + print "See %s and %s for the full logs" % logfiles + stderr.close() + return None + + elif (job.returncode > 0): + print "The underlying cmsRun job failed with return code %d" % job.returncode + print + print "The last lines of the error log are:" + print "".join(stderr.readlines()[-10:]) + print + print "See %s and %s for the full logs" % logfiles + stderr.close() + return None + + if verbose: + print "The underlying cmsRun job completed successfully" + + # analyse the output + date_format = '%d-%b-%Y %H:%M:%S.%f' + # expected format + # 100, 18-Mar-2020 12:16:39.172836 CET + begin_pattern = re.compile(r'%MSG-i ThroughputService: *AfterModEndJob') + line_pattern = re.compile(r' *(\d+), (\d+-...-\d\d\d\d \d\d:\d\d:\d\d.\d\d\d\d\d\d) .*') + + events = [] + times = [] + matching = False + for line in stderr: + # look for the begin marker + if not matching: + if begin_pattern.match(line): + matching = True + continue + + matches = line_pattern.match(line) + # check for the end of the events list + if not matches: + break + + # read the matching lines + event = int(matches.group(1)) + time = datetime.strptime(matches.group(2), date_format) + events.append(event) + times.append((time - epoch).total_seconds()) + + stderr.close() + return (tuple(events), tuple(times)) + + +def parseProcess(filename): + # parse the given configuration file and return the `process` object it define + # the import logic is taken from edmConfigDump + try: + handle = open(filename, 'r') + except: + print "Failed to open %s: %s" % (filename, sys.exc_info()[1]) + sys.exit(1) + + # make the behaviour consistent with 'cmsRun file.py' + sys.path.append(os.getcwd()) + try: + pycfg = imp.load_source('pycfg', filename, handle) + process = pycfg.process + except: + print "Failed to parse %s: %s" % (filename, sys.exc_info()[1]) + sys.exit(1) + + handle.close() + return process + + +def multiCmsRun( + process, # the cms.Process object to run + data = None, # a file-like object for storing performance measurements + header = True, # write a header before the measurements + warmup = True, # whether to run an extra warm-up job + logdir = None, # a relative or absolute path where to store individual jobs' log files, or None + keep = [], # additional output files to be kept + verbose = False, # whether to print extra messages + plumbing = False, # print output in a machine-readable format + events = -1, # number of events to process (default: unlimited) + repeats = 1, # number of times to repeat each job (default: 1) + jobs = 1, # number of jobs to run in parallel (default: 1) + threads = 1, # number of CPU threads per job (default: 1) + streams = 1, # number of EDM streams per job (default: 1) + gpus_per_job = 1, # number of GPUs per job (default: 1) + allow_hyperthreading = True, # whether to use extra CPU cores from HyperThreading + set_cpu_affinity = False, # whether to set CPU affinity + set_gpu_affinity = False, # whether yo set GPU affinity + *args): # additional arguments passed to cmsRun + # set the number of streams and threads + process.options.numberOfThreads = cms.untracked.uint32( threads ) + process.options.numberOfStreams = cms.untracked.uint32( streams ) + + # set the number of events to process + 
process.maxEvents.input = cms.untracked.int32( events ) + + # print a message every 100 events + if not 'ThroughputService' in process.__dict__: + process.ThroughputService = cms.Service('ThroughputService', + enableDQM = cms.untracked.bool(False), + ) + process.ThroughputService.printEventSummary = cms.untracked.bool(True) + process.ThroughputService.eventResolution = cms.untracked.uint32(100) + if events > -1: + process.ThroughputService.eventRange = cms.untracked.uint32(events) + + if not 'MessageLogger' in process.__dict__: + process.load('FWCore.MessageService.MessageLogger_cfi') + if not 'ThroughputService' in process.MessageLogger.categories: + process.MessageLogger.categories.append('ThroughputService') + process.MessageLogger.cerr.ThroughputService = cms.untracked.PSet( + limit = cms.untracked.int32(10000000), + reportEvery = cms.untracked.int32(1) + ) + + # make a full dump of the configuration, to make changes to the number of threads, streams, etc. + workdir = tempfile.mkdtemp(prefix = 'cmsRun') + config = open(os.path.join(workdir, 'process.py'), 'w') + config.write(process.dumpPython()) + config.close() + + cpu_assignment = [ None ] * jobs + if set_cpu_affinity: + # build the list of CPUs for each job: + # - build a list of all "processors", grouped by sockets, cores and hardware threads, e.g. + # [ 0,2,4,6,8,10,12,14,16,18,20,22,24,26,1,3,5,7,9,11,13,15,17,19,21,23,25,27 ] + # - split the list by the number of jobs; if the number of jobs is a multiple of the number of sockets + # the jobs should automatically be split on socket boundaries + # - otherwise some jobs may span multiple sockets, e.g. + # [ 0,2,4,6 ], [ 8,10,12,14 ], [ 16,18,20,22 ], [ 24,26,1,3 ], [ 5,7,9,11 ], [ 13,15,17,19 ], [ 21,23,25,27 ] + # TODO: set the processor assignment as an argument, to support arbitrary splitting + if allow_hyperthreading: + cpu_list = list(itertools.chain(*(map(str, cpu.hardware_threads) for cpu in cpus.values()))) + else: + cpu_list = list(itertools.chain(*(map(str, cpu.physical_processors) for cpu in cpus.values()))) + + # if all the jobs fit within individual sockets, assing jobs to sockets in a round-robin + if len(cpu_list) // len(cpus) // threads * len(cpus) >= jobs: + cpu_assignment = [ list() for i in range(jobs) ] + if allow_hyperthreading: + available_cpus = [ copy.copy(cpu.hardware_threads) for cpu in cpus.values() ] + else: + available_cpus = [ copy.copy(cpu.physical_processors) for cpu in cpus.values() ] + for job in range(jobs): + socket = job % len(cpus) + cpu_assignment[job] = ','.join(map(str, available_cpus[socket][0:threads])) + del available_cpus[socket][0:threads] + + # otherwise, split the list by the number of jobs, and possibly overcommit + else: + if len(cpu_list) >= jobs * threads: + # split the list by the number of jobs + index = [ i * threads for i in range(jobs+1) ] + else: + # fill all cpus and overcommit + index = [ i * len(cpu_list) // jobs for i in range(jobs+1) ] + + cpu_assignment = [ ','.join(cpu_list[index[i]:index[i+1]]) for i in range(jobs) ] + + gpu_assignment = [ None ] * jobs + if set_gpu_affinity: + # build the list of GPUs for each job: + # - if the number of GPUs per job is greater than or equal to the number of GPUs in the system, + # run each job on all GPUs + # - otherwise, assign GPUs to jobs in a round-robin fashon + # TODO: set the GPU assignment as an argument, to support arbitrary splitting + if gpus_per_job >= len(gpus): + gpu_assignment = [ ','.join(map(str, gpus.keys())) for i in range(jobs) ] + else: + gpu_repeated = 
map(str, itertools.islice(itertools.cycle(gpus.keys()), jobs * gpus_per_job)) + gpu_assignment = [ ','.join(gpu_repeated[i*gpus_per_job:(i+1)*gpus_per_job]) for i in range(jobs) ] + + if warmup: + # warm up to cache the binaries, data and conditions + jobdir = os.path.join(workdir, "warmup") + os.mkdir(jobdir) + # recreate logs' directory + if logdir is not None: + thislogdir = logdir + '/warmup' + shutil.rmtree(thislogdir, True) + os.makedirs(thislogdir) + else: + thislogdir = None + print 'Warming up' + thread = singleCmsRun(config.name, jobdir, thislogdir, [], verbose, cpu_assignment[0], gpu_assignment[0], *args) + thread.start() + thread.join() + shutil.rmtree(jobdir) + print + + if repeats > 1: + n_times = '%d times' % repeats + elif repeats == 1: + n_times = 'once' + else: + n_times = 'indefinitely' + + if events >= 0: + n_events = str(events) + else: + n_events = 'all' + + print 'Running %s over %s events with %d jobs, each with %d threads, %d streams and %d GPUs' % (n_times, n_events, jobs, threads, streams, gpus_per_job) + + # store the values to compute the average throughput over the repetitions + failed = [ False ] * repeats + if repeats > 1 and not plumbing: + throughputs = [ None ] * repeats + overlaps = [ None ] * repeats + + # store performance points for later analysis + if data and header: + data.write('%s, %s, %s, %s, %s, %s, %s, %s\n' % ('jobs', 'overlap', 'CPU threads per job', 'EDM streams per job', 'GPUs per jobs', 'number of events', 'average throughput (ev/s)', 'uncertainty (ev/s)')) + + iterations = xrange(repeats) if repeats > 0 else itertools.count() + for repeat in iterations: + # run the jobs reading the output to extract the event throughput + events = [ None ] * jobs + times = [ None ] * jobs + fits = [ None ] * jobs + job_threads = [ None ] * jobs + # recreate logs' directory + if logdir is not None: + thislogdir = logdir + '/step%04d' % repeat + shutil.rmtree(thislogdir, True) + os.makedirs(thislogdir) + else: + thislogdir = None + # create work threads + for job in range(jobs): + jobdir = os.path.join(workdir, "step%02d_part%02d" % (repeat, job)) + os.mkdir(jobdir) + job_threads[job] = singleCmsRun(config.name, jobdir, thislogdir, keep, verbose, cpu_assignment[job], gpu_assignment[job], *args) + + # start all threads + for thread in job_threads: + thread.start() + + # join all threads + failed_jobs = [ False ] * jobs + consistent_events = defaultdict(int) + for job, thread in enumerate(job_threads): + # implicitly wait for the thread to complete + result = thread.result.get() + if result is None or not(all(result)): + failed_jobs[job] = True + continue + (e, t) = result + consistent_events[tuple(e)] += 1 + events[job] = np.array(e) + times[job] = np.array(t) + print('job %s , thread %s' % (job, thread)) + print ('events %s' %events[job]) + print ('times %s' %times[job]) + fits[job] = stats.linregress(times[job], events[job]) + print ('fits %s' %fits[job].slope) + + # if any jobs failed, skip the whole measurement + if any(failed_jobs): + print '%d %s failed, this measurement will be ignored' % (sum(failed_jobs), 'jobs' if sum(failed_jobs) > 1 else 'job') + failed[repeat] = True + continue + + # if all jobs were successful, delete the temporary directories + for job in range(jobs): + jobdir = os.path.join(workdir, "step%02d_part%02d" % (repeat, job)) + shutil.rmtree(jobdir) + + reference_events = np.array(sorted(consistent_events, key = consistent_events.get, reverse = True)[0]) + + # check for jobs with inconsistent events + inconsistent = [ False ] * 
jobs + for job in range(jobs): + if (len(events[job]) != len(reference_events)) or any(events[job] != reference_events): + print 'Inconsistent measurement points for job %d, will be skipped' % job + inconsistent[job] = True + + # delete data from inconsistent jobs + for job in range(jobs-1, -1, -1): + if inconsistent[job]: + del times[job] + del fits[job] + del inconsistent[job] + jobs -= 1 + + # measure the average throughput + used_events = reference_events[-1] - reference_events[0] + print('fit slope: %s' % [fit.slope for fit in fits]) + + throughput = sum(fit.slope for fit in fits) + error = math.sqrt(sum(fit.stderr * fit.stderr for fit in fits)) + if jobs > 1: + # if running more than on job in parallel, estimate and print the overlap among them + overlap = (min(t[-1] for t in times) - max(t[0] for t in times)) / sum(t[-1] - t[0] for t in times) * len(times) + if overlap < 0.: + overlap = 0. + # machine- or human-readable formatting + formatting = '%8.1f\t%8.1f\t%d\t%0.1f%%' if plumbing else u'%8.1f \u00b1 %5.1f ev/s (%d events, %0.1f%% overlap)' + print formatting % (throughput, error, used_events, overlap * 100.) + else: + overlap = 1. + # machine- or human-readable formatting + formatting = '%8.1f\t%8.1f\t%d' if plumbing else u'%8.1f \u00b1 %5.1f ev/s (%d events)' + print formatting % (throughput, error, used_events) + + # store the values to compute the average throughput over the repetitions + if repeats > 1 and not plumbing: + throughputs[repeat] = throughput + overlaps[repeat] = overlap + + # store performance points for later analysis + if data: + data.write('%d, %f, %d, %d, %d, %d, %f, %f\n' % (jobs, overlap, threads, streams, gpus_per_job, used_events, throughput, error)) + + + # compute the average throughput over the repetitions + if repeats > 1 and not plumbing: + # filter out the jobs with an overlap lower than 95% + values = [ throughputs[i] for i in range(repeats) if overlaps[i] >= 0.95 ] + n = len(values) + if n > 0: + value = np.average(values) + error = np.std(values, ddof=1) + else: + # no jobs with an overlap > 95%, use the "best" one + value = throughputs[overlaps.index(max(overlaps))] + error = float('nan') + print ' --------------------' + if n == repeats: + formatting = u'%8.1f \u00b1 %5.1f ev/s' + print formatting % (value, error) + elif n > 0: + formatting = u'%8.1f \u00b1 %5.1f ev/s (based on %d measurements)' + print formatting % (value, error, n) + else: + formatting = u'%8.1f (single measurement with the highest overlap)' + print formatting % (value, ) + print + + # delete the temporary work dir + shutil.rmtree(workdir) + + +def info(): + print '%d CPUs:' % len(cpus) + for cpu in cpus.values(): + print ' %d: %s (%d cores, %d threads)' % (cpu.socket, cpu.model, len(cpu.physical_processors), len(cpu.hardware_threads)) + print + + print '%d visible NVIDIA GPUs:' % len(gpus) + for gpu in gpus.values(): + print ' %d: %s' % (gpu.device, gpu.model) + print + + +if __name__ == "__main__": + options = { + 'verbose' : False, + 'plumbing' : False, + 'warmup' : True, + 'events' : 4200, + 'repeats' : 4, + 'jobs' : 2, + 'threads' :16, # per job + 'streams' : 8, # per job + 'gpus_per_job' : 2, # per job + 'allow_hyperthreading': True, + 'set_cpu_affinity' : True, + 'set_gpu_affinity' : True, + } + + # TODO parse arguments and options from the command line + + if options['verbose']: + info() + + if len(sys.argv) > 1: + process = parseProcess(sys.argv[1]) + multiCmsRun(process, **options) + diff --git a/cms/patatrack/cms-patatrack/utility_scripts/plot_scan.py 
b/cms/patatrack/cms-patatrack/utility_scripts/plot_scan.py new file mode 100755 index 0000000000000000000000000000000000000000..183ce627210ed283f073b2c4ec5d3934033c6bce --- /dev/null +++ b/cms/patatrack/cms-patatrack/utility_scripts/plot_scan.py @@ -0,0 +1,116 @@ +#! /usr/bin/env python + +import sys +import os.path + +import numpy as np +import pandas as pd +import matplotlib as mpl +mpl.use('agg') +import seaborn as sns + +# plot content options +options = { + 'normalise': False, # True: plot the average throughput per job, False: plot the total throughput + 'x axis': 'EDM streams', # 'CPU threads per job', 'CPU threads', 'EDM streams per job', 'EDM streams' +} + +# workaround for seaborn 0.9.0 +def fix_plot_range(plot, zoom = False): + data = plot.data[plot._x_var] + xmin = min(data) + xmax = max(data) + step = (xmax - xmin) * 0.05 + plot.set(xlim=(xmin - step, xmax + step)) + if not zoom: + plot.set(ylim=(0, None)) + + +sns.set(style={ # based on 'whitegrid' + 'axes.axisbelow': True, + 'axes.edgecolor': '.15', # .8 + 'axes.facecolor': 'white', + 'axes.grid': True, + 'axes.labelcolor': '.15', + 'axes.linewidth': 1, + 'figure.facecolor': 'white', + 'font.family': ['sans-serif'], + 'font.sans-serif': ['Arial', 'DejaVu Sans', 'Liberation Sans', 'Bitstream Vera Sans', 'sans-serif'], + 'grid.color': '.8', + 'grid.linestyle': '-', + 'image.cmap': 'rocket', + 'legend.frameon': False, + 'legend.numpoints': 1, + 'legend.scatterpoints': 1, + 'lines.solid_capstyle': 'round', + 'text.color': '.15', + 'xtick.color': '.15', + 'xtick.direction': 'out', + 'xtick.major.size': 0, + 'xtick.minor.size': 0, + 'ytick.color': '.15', + 'ytick.direction': 'out', + 'ytick.major.size': 0, + 'ytick.minor.size': 0, +}) + +sns.set_palette([ + (0., 0., 1.), # ROOT kBlue + (1., 0., 0.), # ROOT kRed + (0., 0., 0.), # ROOT kBlack + (1., 0.4, 0.), # ROOT kOrange +7 + (0.8, 0.2, 0.8), # ROOT kMagenta -3 +], 5) + +data = [] + +for filename in sys.argv[1:]: + # expected file format: + # jobs, overlap, CPU threads per job, EDM streams per job, GPUs per jobs, number of events, average throughput (ev/s), uncertainty (ev/s) + # 2, 0.994863, 6, 6, 1, 4000, 3591.314398, 1.665309 + # ... + values = pd.read_csv(filename).rename(columns=lambda x: x.strip()) + + # if the data does not have a name, build it from the file name + if not 'name' in values: + name = os.path.basename(filename) + if '.' 
in name: + i = name.rindex('.') + name = name[:i] + values.insert(0, 'name', [ name ] * len(values), True) + data.append(values) + +df = pd.concat(data, ignore_index = True) +del data + +# normalise to the number of jobs +if options['normalise']: + df['average throughput (ev/s)'] /= df['jobs'] + df['uncertainty (ev/s)'] /= df['jobs'] + +# compute the total numer of CPU threads and EDM streams +df['CPU threads'] = df['CPU threads per job'] * df['jobs'] +df['EDM streams'] = df['EDM streams per job'] * df['jobs'] + +plot = sns.lmplot( + data = df, + x = options['x axis'], + y = 'average throughput (ev/s)', + fit_reg = True, # estimate and plot a regression model + order = 4, # polynomial fit + hue = 'name', # different categories + height = 5.4, # plot height in inches, at 100 dpi + aspect = 16./9., # plot aspect ratio + legend = True, + legend_out = True, # show the legend to the right of the plot + truncate = False, + ci = 95., + ) + +# zoomed-in version of the plot +fix_plot_range(plot, zoom = True) # workaround for seaborn 0.9.0 +plot.savefig('zoom.png') + +# full Y axis +fix_plot_range(plot) # workaround for seaborn 0.9.0 +plot.savefig('plot.png') diff --git a/cms/patatrack/cms-patatrack/utility_scripts/set_output_encoding.py b/cms/patatrack/cms-patatrack/utility_scripts/set_output_encoding.py new file mode 100644 index 0000000000000000000000000000000000000000..fa5c837ecc6d4e0108143564e195a1b4fe077a7c --- /dev/null +++ b/cms/patatrack/cms-patatrack/utility_scripts/set_output_encoding.py @@ -0,0 +1,18 @@ +# -*- coding: utf-8 -*- + +# see https://stackoverflow.com/a/19700891/2050986 + +def set_output_encoding(encoding='utf-8', force=False): + import sys + import codecs + '''When piping to the terminal, python knows the encoding needed, and + sets it automatically. But when piping to another program (for example, + | less), python can not check the output encoding. In that case, it + is None. 
What I am doing here is to catch this situation for both + stdout and stderr and force the encoding''' + current = sys.stdout.encoding + if current is None or force: + sys.stdout = codecs.getwriter(encoding)(sys.stdout) + current = sys.stderr.encoding + if current is None or force: + sys.stderr = codecs.getwriter(encoding)(sys.stderr) diff --git a/cms/patatrack/cms-patatrack/utility_scripts/sourceFromPixelRaw_cff.py b/cms/patatrack/cms-patatrack/utility_scripts/sourceFromPixelRaw_cff.py new file mode 100644 index 0000000000000000000000000000000000000000..d2c5f704c406301fc1bfab8da1ac60a9b05968fd --- /dev/null +++ b/cms/patatrack/cms-patatrack/utility_scripts/sourceFromPixelRaw_cff.py @@ -0,0 +1,51 @@ +import FWCore.ParameterSet.Config as cms + +import glob +fed_prefix = '/bmk/data/store/opendata/cms' +fed_path = 'MonteCarloUpgrade/RunIIAutumn18DR/TTToHadronic_TuneCP5_13TeV-powheg-pythia8/run000001' +fed_basedir = fed_prefix + '/' + fed_path +fed_files = glob.glob(fed_basedir + '/*.raw') + +# input +FastMonitoringService = cms.Service( "FastMonitoringService", + filePerFwkStream = cms.untracked.bool( False ), + fastMonIntervals = cms.untracked.uint32( 2 ), + sleepTime = cms.untracked.int32( 1 ) +) + +EvFDaqDirector = cms.Service( "EvFDaqDirector", + runNumber = cms.untracked.uint32( 1 ), + + baseDir = cms.untracked.string( "tmp" ), + buBaseDir = cms.untracked.string( "tmp" ), + + useFileBroker = cms.untracked.bool( False ), + fileBrokerKeepAlive = cms.untracked.bool( True ), + fileBrokerPort = cms.untracked.string( "8080" ), + fileBrokerUseLocalLock = cms.untracked.bool( True ), + fuLockPollInterval = cms.untracked.uint32( 2000 ), + + requireTransfersPSet = cms.untracked.bool( False ), + selectedTransferMode = cms.untracked.string( "" ), + mergingPset = cms.untracked.string( "" ), + + outputAdler32Recheck = cms.untracked.bool( False ), +) + +source = cms.Source( "FedRawDataInputSource", + runNumber = cms.untracked.uint32( 1 ), + getLSFromFilename = cms.untracked.bool(True), + testModeNoBuilderUnit = cms.untracked.bool(False), + verifyAdler32 = cms.untracked.bool( True ), + verifyChecksum = cms.untracked.bool( True ), + alwaysStartFromfirstLS = cms.untracked.uint32( 0 ), + + useL1EventID = cms.untracked.bool( True ), # True for MC, True/False for data + eventChunkBlock = cms.untracked.uint32( 240 ), # 32 + eventChunkSize = cms.untracked.uint32( 240), # 32 + maxBufferedFiles = cms.untracked.uint32( 8 ), # 2 + numBuffers = cms.untracked.uint32( 8 ), # 2 + + fileListMode = cms.untracked.bool( True ), # False + fileNames = cms.untracked.vstring(*fed_files), +) diff --git a/cms/patatrack/cms-patatrack/utility_scripts/threaded.py b/cms/patatrack/cms-patatrack/utility_scripts/threaded.py new file mode 100644 index 0000000000000000000000000000000000000000..8418aedd46d51cfcbca63047d5ad3894574a0a5c --- /dev/null +++ b/cms/patatrack/cms-patatrack/utility_scripts/threaded.py @@ -0,0 +1,21 @@ +# see https://stackoverflow.com/questions/6893968/how-to-get-the-return-value-from-a-thread-in-python/14331755#14331755 + +def threaded(f, daemon=False): + import threading + import Queue + + def wrapper(q, *args, **kwargs): + '''this function calls the decorated function and puts the result in a queue''' + ret = f(*args, **kwargs) + q.put(ret) + + def wrap(*args, **kwargs): + '''this is the function returned from the decorator. 
It fires off wrapper
+        in a new thread and returns the thread object with the result queue attached'''
+        q = Queue.Queue()
+        t = threading.Thread(target=wrapper, args = (q,) + args, kwargs = kwargs)
+        t.daemon = daemon
+        t.result = q
+        return t
+
+    return wrap
diff --git a/cms/patatrack/cvmfs/.provenance/.keepme b/cms/patatrack/cvmfs/.provenance/.keepme
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/cms/patatrack/traces/cms.cern.ch_spec_custom.txt b/cms/patatrack/traces/cms.cern.ch_spec_custom.txt
new file mode 100644
index 0000000000000000000000000000000000000000..e1badbcfe7e2791c18c36773c0a3321a621e3366
--- /dev/null
+++ b/cms/patatrack/traces/cms.cern.ch_spec_custom.txt
@@ -0,0 +1,3 @@
+/slc7_amd64_gcc820/external/py2-future/*
+/slc7_amd64_gcc820/external/py2-numpy/*
+/slc7_amd64_gcc820/external/py2-scipy/*
\ No newline at end of file
diff --git a/lhc/simpletrack/CHANGELOG.md b/lhc/simpletrack/CHANGELOG.md
index 15f96362a5357aefcda4bb4f4b1b03080bd159db..5f76ff05ea93bd87961f767ef916c3169cd42b47 100644
--- a/lhc/simpletrack/CHANGELOG.md
+++ b/lhc/simpletrack/CHANGELOG.md
@@ -1,32 +1,26 @@
-# 0.3.0 (June 28th 2020)
+# QA
+
+FIXES:
+* For unprivileged Singularity runs, switched to /tmp/jobs as the output folder.
+
+# Master
 
 UPDATES:
 * intel: NEO version updated to 20.25.17111 and oneAPI DPC++ to 2020.10.6.0.4 (June releases).
 * ROCm container added
 
+CHANGES:
+* Tagged build based on the spec definition.
+* Using trigger-based build to rebuild only on simpletrack changes.
+* CI/CD basic functionality test added for the CPU-based container builds, i.e. intel and pocl
+
 FEATURES:
 * Switched to argument for benchmark setup instead of environment variables.
-
-# 0.2.1 (June 25th 2020)
+* Added "benchmark" mode to run and generate json output for the runs.
+* Generate yaml alongside the json summary.
+* Standalone execution of the simpletrack benchmark without an orchestrator.
 
 FIXES:
 * ocl-icd-dev package explicitely installed now to avoid build failures.
-
-# 0.2.0 (June 16th 2020)
-
-FIXES:
 * Using simpletrack device lists instead of clinfo.
 
-FEATURES:
-* Added "benchmark" mode to run and generate json output for the runs.
-* Generate yaml alongside the json summary.
-
-# 0.1.0 (June 13th 2020)
-
-FEATURES:
-* Standalone execution of the simpletrack benchmark without an orchestrator.
-
-CHANGES:
-* Tagged build based on the spec definition.
-* Using trigger-based build to rebuild only on simpletrack changes.
-* CI/CD basic functionality test added for the CPU-based container builds i.e. intel and pocl
diff --git a/lhc/simpletrack/README.md b/lhc/simpletrack/README.md
index a05d7ab2e674f58c6685e12eb4d695af0c742934..bfbb3e9c59b442d2268ace6dd723f7ddcef9d9c0 100644
--- a/lhc/simpletrack/README.md
+++ b/lhc/simpletrack/README.md
@@ -10,7 +10,7 @@ Docker images containing OpenCL-oriented Simpletrack benchmark built for a selec
 | | __intel__ | __rocm__ | __nvidia__ | __pocl__ |
 |--------------|:-----------:|:-----------:|:--------:|:----------:|
 | __GPU__ | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | |
-| __CPU__ | :heavy_check_mark: | :grey_question: | | :heavy_check_mark: |
+| __CPU__ | :heavy_check_mark: | | | :heavy_check_mark: |
 
 # Usage
 
@@ -33,15 +33,15 @@ Options:
 - Use the benchmark option "all" to execute runs on all available devices: ```~$ docker run --rm <image:tag> -b "all"```
 - To discover available platforms use the show option: ```~$ docker run --rm <image:tag> -s```
 
-The benchmark mode allows to generate logs and output files in a default location (/simpletrack/examples/lhc/jobs or $CI_PROJECT_DIR) for either a single or all available devices.
+The benchmark mode allows generating logs and output files in a default location (/tmp/jobs or $CI_PROJECT_DIR) for either a single device or all available devices.
 
-## Docker GPU Passthrough
+## GPU Passthrough
 
 To passthrough the device to the container, use the following options:
 
-| Target | Passthrough option |
-|:------------|:-------------------|
-| __Nvidia__ | ```--gpus all``` |
-| __AMD__ | ```--device /dev/kfd --device /dev/dri``` |
-| __Intel__ | ```--device /dev/dri``` |
+| Target | Docker | Singularity |
+|:------------|:-------------------|:------------|
+| __Nvidia__ | ```--gpus all``` | ```--nv``` |
+| __AMD__ | ```--device /dev/kfd --device /dev/dri``` | ```--rocm``` |
+| __Intel__ | ```--device /dev/dri``` | |
 
diff --git a/lhc/simpletrack/lhc-simpletrack.sh b/lhc/simpletrack/lhc-simpletrack.sh
index 8bf0ca1d6bc3acc4b15a515eeaf950ca2b183925..5ff77760f4b41cbcc15cd9d8357511e588208ed6 100755
--- a/lhc/simpletrack/lhc-simpletrack.sh
+++ b/lhc/simpletrack/lhc-simpletrack.sh
@@ -73,7 +73,7 @@ get_json() {
 
 ###################################
 ####### Main ######################
-if [ ! "$CI_PROJECT_DIR" == "" ]; then WORK_DIR=$CI_PROJECT_DIR/jobs; else WORK_DIR=`pwd`/jobs; fi
+if [ ! "$CI_PROJECT_DIR" == "" ]; then WORK_DIR=$CI_PROJECT_DIR/jobs; else WORK_DIR="/tmp/jobs"; fi
 if [ ! -d $WORK_DIR ]; then mkdir -p $WORK_DIR; fi
 if [ -f $WORK_DIR/out.log ]; then rm $WORK_DIR/out.log; fi