diff --git a/cms/patatrack/ci-scripts/Dockerfile.nvidia.full b/cms/patatrack/ci-scripts/Dockerfile.nvidia.full
index c422efc4cf25ee5bc4bf2cf5cfea8c8559556bb2..f88d388df1e94f6a7a82670954845565ea4c942d 100644
--- a/cms/patatrack/ci-scripts/Dockerfile.nvidia.full
+++ b/cms/patatrack/ci-scripts/Dockerfile.nvidia.full
@@ -3,5 +3,5 @@ FROM gitlab-registry.cern.ch/hep-benchmarks/hep-workloads-gpu/cms/cms-patatrack-
 COPY . /stage/
 RUN ls -la /stage/*
 RUN /stage/ci-scripts/build_final.sh
-
+USER 1001
 ENTRYPOINT ["/bmk/./cms-patatrack/cms-patatrack-bmk.sh"]
diff --git a/cms/patatrack/ci-scripts/build_final.sh b/cms/patatrack/ci-scripts/build_final.sh
index afef513f39249df8fa79cb968341dfff4f38f888..608f2147f27c3b209b2c063b36b8f7ea07eeb068 100755
--- a/cms/patatrack/ci-scripts/build_final.sh
+++ b/cms/patatrack/ci-scripts/build_final.sh
@@ -10,10 +10,6 @@ mv /stage/cvmfs /cvmfs
 date
 mv /stage/cms-patatrack /bmk/./cms-patatrack
 
-# Make only readable
-date
-chmod -R 555 /cvmfs
-
 # FIXME This checksum takes a lot of time.
 # Commenting it. Can be substituted by a checksum using cvmfs utilities
 #tar -cf /tmp/cvmfs_checksum.tar /cvmfs && md5sum /tmp/cvmfs_checksum.tar | cut -f1 -d" " > /tmp/cvmfs_checksum && rm /tmp/cvmfs_checksum.tar
@@ -42,5 +38,11 @@ echo '{"version":"v1.3","description":"CMS RECO of ttbar events, based on CMSSW_
 
 # Add user 'bmkuser' to benchmarks as a non-root user (BMK-166 and BMK-167)
 # shoudl not be needed, using cvmfs read only
-#groupadd bmkuser
-#useradd -g bmkuser --create-home --shell /bin/bash bmkuser
+date
+useradd -u 1001 -r -g 0 -d /bmk -s /sbin/nologin -c "Default Application User" bmkuser; 
+
+# remove write permissions from /cvmfs, then give the group read/traverse access
+chmod -R a-w /cvmfs; 
+chmod -R g+rX /cvmfs; 
+
+chown -R 1001:0 /bmk /tmp /scratch /results
diff --git a/cms/patatrack/ci-scripts/snapshot_cvmfs.sh b/cms/patatrack/ci-scripts/snapshot_cvmfs.sh
index b885ab198f56a965d9ecdfd4aec899c8827c4d77..0386d3cdaab62e9b202352ab8da51312ea92e0fd 100644
--- a/cms/patatrack/ci-scripts/snapshot_cvmfs.sh
+++ b/cms/patatrack/ci-scripts/snapshot_cvmfs.sh
@@ -39,9 +39,13 @@ function _after_script() {
     #sigkill and remove containers, images remain.
     docker rm -f cvmfs_${CI_JOB_ID}
     docker rm -f patatrack_container
+    rm -rf -- "${CIENV_CVMFSVOLUME:?}"  # assumes the volume is a directory (plain rm -f cannot remove one); :? aborts on an empty path
 }
 
-# TODO: clean up $CIENV_CVMFSVOLUME, clean up docker image cache
+export CI_PROJECT_DIR=${CI_PROJECT_DIR:-$(pwd)}
+export CI_JOB_ID=${CI_JOB_ID:-noci}
+export CI_COMMIT_BRANCH=${CI_COMMIT_BRANCH:-qa}
+
 export CIENV_CVMFSVOLUME=/scratch/cvmfs_hep/CI-JOB-${CI_JOB_ID}
 export CVMFS_EXPORT_DIR=${CI_PROJECT_DIR}/cms/patatrack
 export CIENV_CVMFSREPO=cms.cern.ch
diff --git a/cms/patatrack/cms-patatrack/cms-patatrack-bmk.sh b/cms/patatrack/cms-patatrack/cms-patatrack-bmk.sh
index 1f1aac4b235ff26106b7f786f84c392b138e5d10..ed7bd58bb767b217009c82bddbcc304c1513b6da 100755
--- a/cms/patatrack/cms-patatrack/cms-patatrack-bmk.sh
+++ b/cms/patatrack/cms-patatrack/cms-patatrack-bmk.sh
@@ -28,7 +28,7 @@ function doOne(){
   myecho "current dir is `pwd`"
   myecho "files in `pwd` are"
   ls -l
-  ${BMKDIR}/utility_scripts/benchmark.py ${BMKDIR}/cmssw_config.py #>>$LOG 2>&1 3>&1
+  ${BMKDIR}/utility_scripts/benchmark.py ${BMKDIR}/profile_pixel-only_GPU.py #>>$LOG 2>&1 3>&1
   #######################################
 
   status=${?}
diff --git a/cms/patatrack/cms-patatrack/parseResults.sh b/cms/patatrack/cms-patatrack/parseResults.sh
index 2af853aac8b43aa43590ca6075fdc86f15c17570..d9b1b2bd91b3b86063dac16868ee60805d07aa4a 100644
--- a/cms/patatrack/cms-patatrack/parseResults.sh
+++ b/cms/patatrack/cms-patatrack/parseResults.sh
@@ -35,7 +35,7 @@ function parseResults(){
     # Documentation of cmssw time report at https://github.com/cms-sw/cmssw/blob/09c3fce6626f70fd04223e7dacebf0b485f73f54/FWCore/Services/plugins/Timing.cc#L240
     # Parsing  Event Throughput: xxxx ev/s
     res_thr=`grep -H "Event Throughput" proc_*/out_*.log | sed -e "s@[^:]*: Event Throughput: \([ 0-9\.]*\) ev/s@\1@" | awk 'BEGIN{amin=1000000;amax=0;count=0;}  { val=$1; a[count]=val; count+=1; sum+=val; if(amax<val) amax=val; if(amin>val) amin=val} END{n = asort(a); if (n % 2) {   median=a[(n + 1) / 2]; } else {median=(a[(n / 2)] + a[(n / 2) + 1]) / 2.0;};
-printf "{\"score\": %.4f, \"avg\": %.4f, \"median\": %.4f, \"min\": %.4f, \"max\": %.4f}", sum, sum/count, median, amin, amax
+printf "{\"avg\": %.4f, \"median\": %.4f, \"min\": %.4f, \"max\": %.4f}", sum/count, median, amin, amax
 }'  nevt=$NEVENTS_THREAD nthread=$NTHREADS || (echo "{}"; return 1)`
     STATUS_1=$?
 
@@ -46,7 +46,7 @@ printf "{\"reco\": %.4f}", sum
 
     # Parsing  CPU Summary: \n- Total loop:: xxxx seconds of all CPUs
     res_cpu=`grep -H -A2 "CPU Summary" proc_*/out_*.log | grep "Total loop" | sed -e "s@.*\sTotal loop: \([ 0-9\.]*\)@\1@" | awk 'BEGIN{amin=1000000;amax=0;count=0;}  { val=nevt*nthread/$1; a[count]=val; count+=1; sum+=val; if(amax<val) amax=val; if(amin>val) amin=val} END{n = asort(a); if (n % 2) {median=a[(n + 1) / 2]; } else {median=(a[(n / 2)] + a[(n / 2) + 1]) / 2.0;};
-printf "{\"score\": %.4f, \"avg\": %.4f, \"median\": %.4f, \"min\": %.4f, \"max\": %.4f}", sum, sum/count, median, amin, amax
+printf "{\"avg\": %.4f, \"median\": %.4f, \"min\": %.4f, \"max\": %.4f}", sum/count, median, amin, amax
 }' nevt=$NEVENTS_THREAD nthread=$NTHREADS || (echo "{}"; return 1)`
     STATUS_2=$?
     [[ "$STATUS_1" == "0" ]] && [[ "$STATUS_2" == "0" ]]
diff --git a/cms/patatrack/cms-patatrack/profile_pixel-only_CPU.py b/cms/patatrack/cms-patatrack/profile_pixel-only_CPU.py
new file mode 100644
index 0000000000000000000000000000000000000000..5974734d2145ba8f8b73fc7d7e82ab7814a1508d
--- /dev/null
+++ b/cms/patatrack/cms-patatrack/profile_pixel-only_CPU.py
@@ -0,0 +1,186 @@
+# Auto generated configuration file
+# using: 
+# Revision: 1.19 
+# Source: /local/reps/CMSSW/CMSSW/Configuration/Applications/python/ConfigBuilder.py,v 
+# with command line options: step3 --conditions auto:phase1_2018_realistic -n 10 --era Run2_2018 --eventcontent RECOSIM,DQM --runUnscheduled -s RAW2DIGI:RawToDigi_pixelOnly,RECO:reconstruction_pixelTrackingOnly,VALIDATION:@pixelTrackingOnlyValidation,DQM:@pixelTrackingOnlyDQM --datatier GEN-SIM-RECO,DQMIO --geometry DB:Extended --conditions auto:phase1_2018_design --no_exec --filein file:step2.root --fileout file:step3.root --nThreads 16
+import FWCore.ParameterSet.Config as cms
+
+from Configuration.Eras.Era_Run2_2018_cff import Run2_2018
+
+process = cms.Process('RECO',Run2_2018)
+
+# import of standard configurations
+process.load('Configuration.StandardSequences.Services_cff')
+process.load('SimGeneral.HepPDTESSource.pythiapdt_cfi')
+process.load('FWCore.MessageService.MessageLogger_cfi')
+process.load('Configuration.EventContent.EventContent_cff')
+process.load('SimGeneral.MixingModule.mixNoPU_cfi')
+process.load('Configuration.StandardSequences.GeometryRecoDB_cff')
+process.load('Configuration.StandardSequences.MagneticField_cff')
+process.load('Configuration.StandardSequences.RawToDigi_cff')
+process.load('Configuration.StandardSequences.Reconstruction_cff')
+process.load('Configuration.StandardSequences.Validation_cff')
+process.load('DQMServices.Core.DQMStoreNonLegacy_cff')
+process.load('DQMOffline.Configuration.DQMOfflineMC_cff')
+process.load('Configuration.StandardSequences.FrontierConditions_GlobalTag_cff')
+
+process.maxEvents = cms.untracked.PSet(
+    input = cms.untracked.int32(10),
+    output = cms.optional.untracked.allowed(cms.int32,cms.PSet)
+)
+
+# Input source
+process.source = cms.Source("PoolSource",
+    fileNames = cms.untracked.vstring('file:step2.root'),
+    secondaryFileNames = cms.untracked.vstring()
+)
+
+process.options = cms.untracked.PSet(
+    FailPath = cms.untracked.vstring(),
+    IgnoreCompletely = cms.untracked.vstring(),
+    Rethrow = cms.untracked.vstring(),
+    SkipEvent = cms.untracked.vstring(),
+    allowUnscheduled = cms.obsolete.untracked.bool,
+    canDeleteEarly = cms.untracked.vstring(),
+    emptyRunLumiMode = cms.obsolete.untracked.string,
+    eventSetup = cms.untracked.PSet(
+        forceNumberOfConcurrentIOVs = cms.untracked.PSet(
+
+        ),
+        numberOfConcurrentIOVs = cms.untracked.uint32(1)
+    ),
+    fileMode = cms.untracked.string('FULLMERGE'),
+    forceEventSetupCacheClearOnNewRun = cms.untracked.bool(False),
+    makeTriggerResults = cms.obsolete.untracked.bool,
+    numberOfConcurrentLuminosityBlocks = cms.untracked.uint32(1),
+    numberOfConcurrentRuns = cms.untracked.uint32(1),
+    numberOfStreams = cms.untracked.uint32(0),
+    numberOfThreads = cms.untracked.uint32(1),
+    printDependencies = cms.untracked.bool(False),
+    sizeOfStackForThreadsInKB = cms.optional.untracked.uint32,
+    throwIfIllegalParameter = cms.untracked.bool(True),
+    wantSummary = cms.untracked.bool(False)
+)
+
+# Production Info
+process.configurationMetadata = cms.untracked.PSet(
+    annotation = cms.untracked.string('step3 nevts:10'),
+    name = cms.untracked.string('Applications'),
+    version = cms.untracked.string('$Revision: 1.19 $')
+)
+
+# Output definition
+
+process.RECOSIMoutput = cms.OutputModule("PoolOutputModule",
+    dataset = cms.untracked.PSet(
+        dataTier = cms.untracked.string('GEN-SIM-RECO'),
+        filterName = cms.untracked.string('')
+    ),
+    fileName = cms.untracked.string('file:step3.root'),
+    outputCommands = process.RECOSIMEventContent.outputCommands,
+    splitLevel = cms.untracked.int32(0)
+)
+
+process.DQMoutput = cms.OutputModule("DQMRootOutputModule",
+    dataset = cms.untracked.PSet(
+        dataTier = cms.untracked.string('DQMIO'),
+        filterName = cms.untracked.string('')
+    ),
+    fileName = cms.untracked.string('file:step3_inDQM.root'),
+    outputCommands = process.DQMEventContent.outputCommands,
+    splitLevel = cms.untracked.int32(0)
+)
+
+# Additional output definition
+
+# Other statements
+process.mix.playback = True
+process.mix.digitizers = cms.PSet()
+for a in process.aliases: delattr(process, a)
+process.RandomNumberGeneratorService.restoreStateLabel=cms.untracked.string("randomEngineStateProducer")
+from Configuration.AlCa.GlobalTag import GlobalTag
+process.GlobalTag = GlobalTag(process.GlobalTag, 'auto:phase1_2018_design', '')
+
+# Path and EndPath definitions
+process.raw2digi_step = cms.Path(process.RawToDigi_pixelOnly)
+process.reconstruction_step = cms.Path(process.reconstruction_pixelTrackingOnly)
+process.prevalidation_step = cms.Path(process.globalPrevalidationPixelTrackingOnly)
+process.pL1TkElectronsEllipticMatchHGC = cms.Path(process.L1TkElectronsEllipticMatchHGC)
+process.pL1TkMuon = cms.Path(process.L1TkMuons+process.L1TkMuonsTP)
+process.pL1TkIsoElectronsHGC = cms.Path(process.L1TkIsoElectronsHGC)
+process.pL1TkIsoElectronsCrystal = cms.Path(process.L1TkIsoElectronsCrystal)
+process.pL1TkPrimaryVertex = cms.Path(process.L1TkPrimaryVertex)
+process.pL1TkElectronsLooseHGC = cms.Path(process.L1TkElectronsLooseHGC)
+process.pL1TkPhotonsCrystal = cms.Path(process.L1TkPhotonsCrystal)
+process.pL1TkElectronsEllipticMatchCrystal = cms.Path(process.L1TkElectronsEllipticMatchCrystal)
+process.pL1TkElectronsHGC = cms.Path(process.L1TkElectronsHGC)
+process.pL1TkPhotonsHGC = cms.Path(process.L1TkPhotonsHGC)
+process.pL1TkElectronsCrystal = cms.Path(process.L1TkElectronsCrystal)
+process.pL1TkElectronsLooseCrystal = cms.Path(process.L1TkElectronsLooseCrystal)
+process.validation_step = cms.EndPath(process.globalValidationPixelTrackingOnly)
+process.dqmoffline_step = cms.EndPath(process.DQMOfflinePixelTracking)
+process.dqmofflineOnPAT_step = cms.EndPath(process.PostDQMOffline)
+process.RECOSIMoutput_step = cms.EndPath(process.RECOSIMoutput)
+process.DQMoutput_step = cms.EndPath(process.DQMoutput)
+
+# Schedule definition
+process.schedule = cms.Schedule(process.raw2digi_step,process.reconstruction_step,process.prevalidation_step,process.validation_step,process.dqmoffline_step,process.dqmofflineOnPAT_step,process.RECOSIMoutput_step,process.DQMoutput_step)
+from PhysicsTools.PatAlgos.tools.helpers import associatePatAlgosToolsTask
+associatePatAlgosToolsTask(process)
+
+#Setup FWK for multithreaded
+process.options.numberOfThreads=cms.untracked.uint32(16)
+process.options.numberOfStreams=cms.untracked.uint32(0)
+process.options.numberOfConcurrentLuminosityBlocks=cms.untracked.uint32(1)
+
+# customisation of the process.
+
+# Automatic addition of the customisation function from SimGeneral.MixingModule.fullMixCustomize_cff
+from SimGeneral.MixingModule.fullMixCustomize_cff import setCrossingFrameOn 
+
+#call to customisation function setCrossingFrameOn imported from SimGeneral.MixingModule.fullMixCustomize_cff
+process = setCrossingFrameOn(process)
+
+# End of customisation functions
+#do not add changes to your config after this point (unless you know what you are doing)
+from FWCore.ParameterSet.Utilities import convertToUnscheduled
+process=convertToUnscheduled(process)
+
+
+# Customisation from command line
+
+#Have logErrorHarvester wait for the same EDProducers to finish as those providing data for the OutputModule
+from FWCore.Modules.logErrorHarvester_cff import customiseLogErrorHarvesterUsingOutputCommands
+process = customiseLogErrorHarvesterUsingOutputCommands(process)
+
+# Add early deletion of temporary data products to reduce peak memory need
+from Configuration.StandardSequences.earlyDeleteSettings_cff import customiseEarlyDelete
+process = customiseEarlyDelete(process)
+# End adding early deletion
+
+# load the CUDA service, but disable it for running on CPU
+process.CUDAService = cms.Service("CUDAService",
+    enabled = cms.untracked.bool(False)
+)
+
+# customise the configuration for profiling the Pixel-only workflow on CPU
+from RecoPixelVertexing.Configuration.customizePixelTracksSoAonCPU import customizePixelTracksSoAonCPUForProfiling
+process = customizePixelTracksSoAonCPUForProfiling(process)
+
+# load data using the DAQ source
+del process.source
+process.load('sourceFromPixelRaw_cff')
+
+# the raw data do not have the random number state
+del process.RandomNumberGeneratorService.restoreStateLabel
+
+# build triplets and run the broken line fit
+process.caHitNtupletCUDA.minHitsPerNtuplet = 3
+process.caHitNtupletCUDA.includeJumpingForwardDoublets = True
+process.caHitNtupletCUDA.useRiemannFit = False
+
+# report CUDAService messages
+process.MessageLogger.categories.append("CUDAService")
+
+# print the summary
+process.options.wantSummary = cms.untracked.bool( True )
diff --git a/cms/patatrack/cms-patatrack/profile_pixel-only_GPU.py b/cms/patatrack/cms-patatrack/profile_pixel-only_GPU.py
new file mode 100644
index 0000000000000000000000000000000000000000..8a39d3938e805e137844dae0f9fc8671a1fd9d28
--- /dev/null
+++ b/cms/patatrack/cms-patatrack/profile_pixel-only_GPU.py
@@ -0,0 +1,182 @@
+# Auto generated configuration file
+# using: 
+# Revision: 1.19 
+# Source: /local/reps/CMSSW/CMSSW/Configuration/Applications/python/ConfigBuilder.py,v 
+# with command line options: step3 --conditions auto:phase1_2018_realistic -n 10 --era Run2_2018 --eventcontent RECOSIM,DQM --runUnscheduled --procModifiers gpu -s RAW2DIGI:RawToDigi_pixelOnly,RECO:reconstruction_pixelTrackingOnly,VALIDATION:@pixelTrackingOnlyValidation,DQM:@pixelTrackingOnlyDQM --datatier GEN-SIM-RECO,DQMIO --geometry DB:Extended --conditions auto:phase1_2018_design --no_exec --filein file:step2.root --fileout file:step3.root --nThreads 16
+import FWCore.ParameterSet.Config as cms
+
+from Configuration.Eras.Era_Run2_2018_cff import Run2_2018
+from Configuration.ProcessModifiers.gpu_cff import gpu
+
+process = cms.Process('RECO',Run2_2018,gpu)
+
+# import of standard configurations
+process.load('Configuration.StandardSequences.Services_cff')
+process.load('SimGeneral.HepPDTESSource.pythiapdt_cfi')
+process.load('FWCore.MessageService.MessageLogger_cfi')
+process.load('Configuration.EventContent.EventContent_cff')
+process.load('SimGeneral.MixingModule.mixNoPU_cfi')
+process.load('Configuration.StandardSequences.GeometryRecoDB_cff')
+process.load('Configuration.StandardSequences.MagneticField_cff')
+process.load('Configuration.StandardSequences.RawToDigi_cff')
+process.load('Configuration.StandardSequences.Reconstruction_cff')
+process.load('Configuration.StandardSequences.Validation_cff')
+process.load('DQMServices.Core.DQMStoreNonLegacy_cff')
+process.load('DQMOffline.Configuration.DQMOfflineMC_cff')
+process.load('Configuration.StandardSequences.FrontierConditions_GlobalTag_cff')
+
+process.maxEvents = cms.untracked.PSet(
+    input = cms.untracked.int32(4200),  # NOTE(review): the generation command in the header says "-n 10" and the annotation below says 'nevts:10' -- presumably raised by hand for the benchmark; confirm
+    output = cms.optional.untracked.allowed(cms.int32,cms.PSet)
+)
+
+# Input source
+process.source = cms.Source("PoolSource",
+    fileNames = cms.untracked.vstring('file:step2.root'),
+    secondaryFileNames = cms.untracked.vstring()
+)
+
+process.options = cms.untracked.PSet(
+    FailPath = cms.untracked.vstring(),
+    IgnoreCompletely = cms.untracked.vstring(),
+    Rethrow = cms.untracked.vstring(),
+    SkipEvent = cms.untracked.vstring(),
+    allowUnscheduled = cms.obsolete.untracked.bool,
+    canDeleteEarly = cms.untracked.vstring(),
+    emptyRunLumiMode = cms.obsolete.untracked.string,
+    eventSetup = cms.untracked.PSet(
+        forceNumberOfConcurrentIOVs = cms.untracked.PSet(
+
+        ),
+        numberOfConcurrentIOVs = cms.untracked.uint32(1)
+    ),
+    fileMode = cms.untracked.string('FULLMERGE'),
+    forceEventSetupCacheClearOnNewRun = cms.untracked.bool(False),
+    makeTriggerResults = cms.obsolete.untracked.bool,
+    numberOfConcurrentLuminosityBlocks = cms.untracked.uint32(1),
+    numberOfConcurrentRuns = cms.untracked.uint32(1),
+    numberOfStreams = cms.untracked.uint32(0),
+    numberOfThreads = cms.untracked.uint32(1),
+    printDependencies = cms.untracked.bool(False),
+    sizeOfStackForThreadsInKB = cms.optional.untracked.uint32,
+    throwIfIllegalParameter = cms.untracked.bool(True),
+    wantSummary = cms.untracked.bool(False)
+)
+
+# Production Info
+process.configurationMetadata = cms.untracked.PSet(
+    annotation = cms.untracked.string('step3 nevts:10'),
+    name = cms.untracked.string('Applications'),
+    version = cms.untracked.string('$Revision: 1.19 $')
+)
+
+# Output definition
+
+process.RECOSIMoutput = cms.OutputModule("PoolOutputModule",
+    dataset = cms.untracked.PSet(
+        dataTier = cms.untracked.string('GEN-SIM-RECO'),
+        filterName = cms.untracked.string('')
+    ),
+    fileName = cms.untracked.string('file:step3.root'),
+    outputCommands = process.RECOSIMEventContent.outputCommands,
+    splitLevel = cms.untracked.int32(0)
+)
+
+process.DQMoutput = cms.OutputModule("DQMRootOutputModule",
+    dataset = cms.untracked.PSet(
+        dataTier = cms.untracked.string('DQMIO'),
+        filterName = cms.untracked.string('')
+    ),
+    fileName = cms.untracked.string('file:step3_inDQM.root'),
+    outputCommands = process.DQMEventContent.outputCommands,
+    splitLevel = cms.untracked.int32(0)
+)
+
+# Additional output definition
+
+# Other statements
+process.mix.playback = True
+process.mix.digitizers = cms.PSet()
+for a in process.aliases: delattr(process, a)
+process.RandomNumberGeneratorService.restoreStateLabel=cms.untracked.string("randomEngineStateProducer")
+from Configuration.AlCa.GlobalTag import GlobalTag
+process.GlobalTag = GlobalTag(process.GlobalTag, 'auto:phase1_2018_design', '')
+
+# Path and EndPath definitions
+process.raw2digi_step = cms.Path(process.RawToDigi_pixelOnly)
+process.reconstruction_step = cms.Path(process.reconstruction_pixelTrackingOnly)
+process.prevalidation_step = cms.Path(process.globalPrevalidationPixelTrackingOnly)
+process.pL1TkElectronsEllipticMatchHGC = cms.Path(process.L1TkElectronsEllipticMatchHGC)
+process.pL1TkMuon = cms.Path(process.L1TkMuons+process.L1TkMuonsTP)
+process.pL1TkIsoElectronsHGC = cms.Path(process.L1TkIsoElectronsHGC)
+process.pL1TkIsoElectronsCrystal = cms.Path(process.L1TkIsoElectronsCrystal)
+process.pL1TkPrimaryVertex = cms.Path(process.L1TkPrimaryVertex)
+process.pL1TkElectronsLooseHGC = cms.Path(process.L1TkElectronsLooseHGC)
+process.pL1TkPhotonsCrystal = cms.Path(process.L1TkPhotonsCrystal)
+process.pL1TkElectronsEllipticMatchCrystal = cms.Path(process.L1TkElectronsEllipticMatchCrystal)
+process.pL1TkElectronsHGC = cms.Path(process.L1TkElectronsHGC)
+process.pL1TkPhotonsHGC = cms.Path(process.L1TkPhotonsHGC)
+process.pL1TkElectronsCrystal = cms.Path(process.L1TkElectronsCrystal)
+process.pL1TkElectronsLooseCrystal = cms.Path(process.L1TkElectronsLooseCrystal)
+process.validation_step = cms.EndPath(process.globalValidationPixelTrackingOnly)
+process.dqmoffline_step = cms.EndPath(process.DQMOfflinePixelTracking)
+process.dqmofflineOnPAT_step = cms.EndPath(process.PostDQMOffline)
+process.RECOSIMoutput_step = cms.EndPath(process.RECOSIMoutput)
+process.DQMoutput_step = cms.EndPath(process.DQMoutput)
+
+# Schedule definition
+process.schedule = cms.Schedule(process.raw2digi_step,process.reconstruction_step,process.prevalidation_step,process.validation_step,process.dqmoffline_step,process.dqmofflineOnPAT_step,process.RECOSIMoutput_step,process.DQMoutput_step)
+from PhysicsTools.PatAlgos.tools.helpers import associatePatAlgosToolsTask
+associatePatAlgosToolsTask(process)
+
+#Setup FWK for multithreaded
+process.options.numberOfThreads=cms.untracked.uint32(16)
+process.options.numberOfStreams=cms.untracked.uint32(0)
+process.options.numberOfConcurrentLuminosityBlocks=cms.untracked.uint32(1)
+
+# customisation of the process.
+
+# Automatic addition of the customisation function from SimGeneral.MixingModule.fullMixCustomize_cff
+from SimGeneral.MixingModule.fullMixCustomize_cff import setCrossingFrameOn 
+
+#call to customisation function setCrossingFrameOn imported from SimGeneral.MixingModule.fullMixCustomize_cff
+process = setCrossingFrameOn(process)
+
+# End of customisation functions
+#do not add changes to your config after this point (unless you know what you are doing)
+from FWCore.ParameterSet.Utilities import convertToUnscheduled
+process=convertToUnscheduled(process)
+
+
+# Customisation from command line
+
+#Have logErrorHarvester wait for the same EDProducers to finish as those providing data for the OutputModule
+from FWCore.Modules.logErrorHarvester_cff import customiseLogErrorHarvesterUsingOutputCommands
+process = customiseLogErrorHarvesterUsingOutputCommands(process)
+
+# Add early deletion of temporary data products to reduce peak memory need
+from Configuration.StandardSequences.earlyDeleteSettings_cff import customiseEarlyDelete
+process = customiseEarlyDelete(process)
+# End adding early deletion
+
+# customise the configuration for profiling the Pixel-only workflow on GPU
+from RecoPixelVertexing.Configuration.customizePixelTracksForProfiling import customizePixelTracksForProfilingGPUOnly
+process = customizePixelTracksForProfilingGPUOnly(process)
+
+# load data using the DAQ source
+del process.source
+process.load('sourceFromPixelRaw_cff')
+
+# the raw data do not have the random number state
+del process.RandomNumberGeneratorService.restoreStateLabel
+
+# build triplets and run the broken line fit
+process.caHitNtupletCUDA.minHitsPerNtuplet = 3
+process.caHitNtupletCUDA.includeJumpingForwardDoublets = True
+process.caHitNtupletCUDA.useRiemannFit = False
+
+# report CUDAService messages
+process.MessageLogger.categories.append("CUDAService")
+
+# print the summary
+process.options.wantSummary = cms.untracked.bool( True )
diff --git a/cms/patatrack/cms-patatrack/utility_scripts/benchmark.py b/cms/patatrack/cms-patatrack/utility_scripts/benchmark.py
index e3347b89b84c58b47b2ac67342c36b7400da8436..5a3edd0e138c79c0831c6b9c4ca774d7634f2099 100755
--- a/cms/patatrack/cms-patatrack/utility_scripts/benchmark.py
+++ b/cms/patatrack/cms-patatrack/utility_scripts/benchmark.py
@@ -31,8 +31,8 @@ if __name__ == "__main__":
     'allow_hyperthreading': False,      # this has no effect if set_cpu_affinity is False
     'set_cpu_affinity'    : True,
     'set_gpu_affinity'    : True,
-    'logdir'              : None,       # relative or absolute path, or None to disable storing the logs
-    'keep'                : [],         # output files to be kept
+    'logdir'              : 'logs',       # relative or absolute path, or None to disable storing the logs
+    'keep'                : [ 'resources.json' ],       # additional output files to be kept, along with the logs
   }
 
 
diff --git a/cms/patatrack/cms-patatrack/utility_scripts/multirun.py b/cms/patatrack/cms-patatrack/utility_scripts/multirun.py
index d1281a5f83d39166944f53511fd08f8b13b0f17e..fbe6d57283d946d868a854dd476d2d0168c44359 100755
--- a/cms/patatrack/cms-patatrack/utility_scripts/multirun.py
+++ b/cms/patatrack/cms-patatrack/utility_scripts/multirun.py
@@ -46,6 +46,7 @@ def singleCmsRun(filename, workdir, logdir = None, keep = [], verbose = False, c
 
   if verbose:
     print cmdline
+    sys.stdout.flush()
 
   # run a cmsRun job, redirecting standard output and error to files
   lognames = ('stdout', 'stderr')
@@ -73,6 +74,7 @@ def singleCmsRun(filename, workdir, logdir = None, keep = [], verbose = False, c
     print "".join(stderr.readlines()[-10:])
     print
     print "See %s and %s for the full logs" % logfiles
+    sys.stdout.flush()
     stderr.close()
     return None
 
@@ -83,11 +85,13 @@ def singleCmsRun(filename, workdir, logdir = None, keep = [], verbose = False, c
     print "".join(stderr.readlines()[-10:])
     print
     print "See %s and %s for the full logs" % logfiles
+    sys.stdout.flush()
     stderr.close()
     return None
 
   if verbose:
     print "The underlying cmsRun job completed successfully"
+    sys.stdout.flush()
 
   # analyse the output
   date_format  = '%d-%b-%Y %H:%M:%S.%f'
@@ -257,6 +261,7 @@ def multiCmsRun(
     else:
       thislogdir = None
     print 'Warming up'
+    sys.stdout.flush()
     thread = singleCmsRun(config.name, jobdir, thislogdir, [], verbose, cpu_assignment[0], gpu_assignment[0], *args)
     thread.start()
     thread.join()
@@ -276,6 +281,7 @@ def multiCmsRun(
     n_events = 'all'
 
   print 'Running %s over %s events with %d jobs, each with %d threads, %d streams and %d GPUs' % (n_times, n_events, jobs, threads, streams, gpus_per_job)
+  sys.stdout.flush()
 
   # store the values to compute the average throughput over the repetitions
   failed = [ False ] * repeats
@@ -333,6 +339,7 @@ def multiCmsRun(
     # if any jobs failed, skip the whole measurement
     if any(failed_jobs):
       print '%d %s failed, this measurement will be ignored' % (sum(failed_jobs), 'jobs' if sum(failed_jobs) > 1 else 'job')
+      sys.stdout.flush()
       failed[repeat] = True
       continue
 
@@ -348,6 +355,7 @@ def multiCmsRun(
     for job in range(jobs):
       if (len(events[job]) != len(reference_events)) or any(events[job] != reference_events):
         print 'Inconsistent measurement points for job %d, will be skipped' % job
+        sys.stdout.flush()
         inconsistent[job] = True
 
     # delete data from inconsistent jobs
@@ -377,6 +385,7 @@ def multiCmsRun(
       # machine- or human-readable formatting
       formatting = '%8.1f\t%8.1f\t%d' if plumbing else u'%8.1f \u00b1 %5.1f ev/s (%d events)'
       print formatting % (throughput, error, used_events)
+    sys.stdout.flush()
 
     # store the values to compute the average throughput over the repetitions
     if repeats > 1 and not plumbing:
@@ -410,7 +419,10 @@ def multiCmsRun(
     else:
       formatting = u'%8.1f (single measurement with the highest overlap)'
       print formatting % (value, )
+
+  if not plumbing:
     print
+    sys.stdout.flush()
 
   # delete the temporary work dir
   shutil.rmtree(workdir)
@@ -426,6 +438,7 @@ def info():
   for gpu in gpus.values():
     print '  %d: %s' % (gpu.device, gpu.model)
   print
+  sys.stdout.flush()
 
 
 if __name__ == "__main__":
diff --git a/cms/patatrack/cms-patatrack/utility_scripts/workflow.sh b/cms/patatrack/cms-patatrack/utility_scripts/workflow.sh
new file mode 100644
index 0000000000000000000000000000000000000000..1cdf04163de769fb60fc62b08c9841af298967e4
--- /dev/null
+++ b/cms/patatrack/cms-patatrack/utility_scripts/workflow.sh
@@ -0,0 +1,77 @@
+#! /bin/bash
+#
+# Regenerate the Pixel-only profiling configurations for CPU and GPU.
+# For each workflow, runTheMatrix.py generates the step3 configuration, which is
+# copied to profile_pixel-only_{CPU,GPU}.py in the current directory and extended
+# with the profiling customisation appended below.
+# Requires runTheMatrix.py in PATH.
+
+# Stop at the first failing command: otherwise a failed runTheMatrix.py or cp
+# would leave a config file containing only the appended customisation.
+set -euo pipefail
+
+# create the Pixel-only workflow for running on CPU over 2018 MC samples with "ideal" conditions
+runTheMatrix.py -j 0 -t 16 --command='--conditions auto:phase1_2018_design' -l 10824.5
+
+cp 10824.5_*/step3_*.py profile_pixel-only_CPU.py
+# the quoted delimiter appends the Python text verbatim, with no shell expansion
+cat >> profile_pixel-only_CPU.py << '@EOF'
+
+# load the CUDA service, but disable it for running on CPU
+process.CUDAService = cms.Service("CUDAService",
+    enabled = cms.untracked.bool(False)
+)
+
+# customise the configuration for profiling the Pixel-only workflow on CPU
+from RecoPixelVertexing.Configuration.customizePixelTracksSoAonCPU import customizePixelTracksSoAonCPUForProfiling
+process = customizePixelTracksSoAonCPUForProfiling(process)
+
+# load data using the DAQ source
+del process.source
+process.load('sourceFromPixelRaw_cff')
+
+# the raw data do not have the random number state
+del process.RandomNumberGeneratorService.restoreStateLabel
+
+# build triplets and run the broken line fit
+process.caHitNtupletCUDA.minHitsPerNtuplet = 3
+process.caHitNtupletCUDA.includeJumpingForwardDoublets = True
+process.caHitNtupletCUDA.useRiemannFit = False
+
+# report CUDAService messages
+process.MessageLogger.categories.append("CUDAService")
+
+# print the summary
+process.options.wantSummary = cms.untracked.bool( True )
+@EOF
+
+
+# create the Pixel-only workflow for running on GPU over 2018 MC samples with "ideal" conditions:
+runTheMatrix.py -j 0 -t 16 --command='--conditions auto:phase1_2018_design' -l 10824.502
+
+cp 10824.502_*/step3_*.py profile_pixel-only_GPU.py
+# the quoted delimiter appends the Python text verbatim, with no shell expansion
+cat >> profile_pixel-only_GPU.py << '@EOF'
+
+# customise the configuration for profiling the Pixel-only workflow on GPU
+from RecoPixelVertexing.Configuration.customizePixelTracksForProfiling import customizePixelTracksForProfilingGPUOnly
+process = customizePixelTracksForProfilingGPUOnly(process)
+
+# load data using the DAQ source
+del process.source
+process.load('sourceFromPixelRaw_cff')
+
+# the raw data do not have the random number state
+del process.RandomNumberGeneratorService.restoreStateLabel
+
+# build triplets and run the broken line fit
+process.caHitNtupletCUDA.minHitsPerNtuplet = 3
+process.caHitNtupletCUDA.includeJumpingForwardDoublets = True
+process.caHitNtupletCUDA.useRiemannFit = False
+
+# report CUDAService messages
+process.MessageLogger.categories.append("CUDAService")
+
+# print the summary
+process.options.wantSummary = cms.untracked.bool( True )
+@EOF