Skip to content
Snippets Groups Projects
Commit e2d2e199 authored by Robin Hofsaess's avatar Robin Hofsaess
Browse files

[BMK-1696] WIP -- Update parser (minimal version)

parent 975e955e
No related branches found
No related tags found
No related merge requests found
......@@ -2,6 +2,9 @@
# directory of this distribution. For licensing information, see the
# COPYING file at the top-level directory of this distribution.
# TODO: add info on MPS used
# TODO: add HLT config version
parseResultsDir=$(cd $(dirname ${BASH_SOURCE}); pwd) # needed to locate parseResults.py
......@@ -11,71 +14,145 @@ parseResultsDir=$(cd $(dirname ${BASH_SOURCE}); pwd) # needed to locate parseRes
# The function is started in the base working directory <basewdir>:
# please store here the overall json summary file for all NCOPIES processes combined
function parseResults(){
echo "[parseResults] current directory: $(pwd)"
export nevt_thr='-1'
export res_cpu='{}'
export res_thr='{}'
echo "[parseResults] INFO -- Current directory: $(pwd)"
echo "[parseResults] INFO -- Collecting results for run mode >>$MODE<<"
export output_file="cms-hlt-gpu.score"
export res_score='{}'
#-----------------------
# Parse results
#-----------------------
echo "[parseResults] parsing results from" proc_*/out_*.log
# Documentation of cmssw time report at https://github.com/cms-sw/cmssw/blob/09c3fce6626f70fd04223e7dacebf0b485f73f54/FWCore/Services/plugins/Timing.cc#L240
# Parsing Number of Events
nevt_thr=`grep -h "Number of Events" proc_*/out_gpu_*.log | sed -e "s@.*:\([ 0-9\.]*\).*@\1@" | awk 'BEGIN{n=0; count=0;} {n+=$1; count+=1} END{print n/count/nthreads}' nthreads=$NTHREADS || (echo "-1"; return 1)`
# Log file structure:
# -----------------------
# |-CPU
# |---proc_0
# |-----cpu_1.log
# |---proc_1
# |-----cpu_2.log
# |...
# |---proc_N
# |-----cpu_N.log
# -----------------------
#|-GPU
#|---proc_0 # WL copy on GPU 0
#|-----gpu0_0.log # cmsRun 1
#|-----gpu0_1.log # cmsRun 2
#|...
#|-----gpu0_5.log # 6 copies are default
#|---proc_1 # WL copy on GPU 1
#|-----gpu1_0.log # cmsRun 1
#|-----gpu1_1.log # cmsRun 2
#|...
#|-----gpu1_5.log # 6 copies are default
# -----------------------
log_files=($(ls proc_*/*))
echo "[parseResults] INFO -- parsing results from:" "${log_files[@]}"
# Get actual number of processed events
# This is necessary, because sometimes, CMSSW fails
# If a sufficient large number was processed, the results are still valid
declare -a processed_events
processed_events=($(for f in proc_*/*.log; do awk 'NR>2 && /Average throughput/ {sub(/,/, "", a[NR-2]); print a[NR-2]} {a[NR]=$1}' "$f"; done))
STATUS_1=$?
# Parsing Event Throughput: xxxx ev/s
res_thr=`grep -H "Event Throughput" proc_*/out_gpu_*.log | sed -e "s@[^:]*: Event Throughput: \([ 0-9\.]*\) ev/s@\1@" | awk 'BEGIN{amin=1000000;amax=0;count=0;} { val=$1; a[count]=val; count+=1; sum+=val; if(amax<val) amax=val; if(amin>val) amin=val} END{n = asort(a); if (n % 2) { median=a[(n + 1) / 2]; } else {median=(a[(n / 2)] + a[(n / 2) + 1]) / 2.0;}; printf "{\"avg\": %.4f, \"median\": %.4f, \"min\": %.4f, \"max\": %.4f, \"count\": %d}", sum/count, median, amin, amax, count}' || (echo "{}"; return 1)`
# Check if sufficient events were processed to guarantee reliable results
for i in "${!processed_events[@]}"; do
ev=${processed_events[$i]}
ratio=$(awk "BEGIN {printf \"%.4f\", $ev / $NEVENTS}")
if awk "BEGIN {exit ($ratio < 0.8) ? 0 : 1}"; then
echo "[parseResults] ERROR -- run $((i + 1 )) was not successful. Only $ev of $NEVENTS events processed."
fi
done
# Start times
declare -a start_times
while IFS= read -r line; do
# Extract datetime from line
datetime=$(echo "$line" | awk '{print $(NF-3) " " $(NF-2)}')
# Convert CEST (UTC+2) to UTC, then to Unix time
start_time=$(TZ=UTC date -d "$datetime -2 hours" +%s)
# Append to array
start_times+=("$start_time")
done < <(grep -nir "MSG-i ThreadStreamSetup: (NoModuleName)" proc_*/*.log)
STATUS_2=$?
# printf "Start times:\n%s\n" "${start_times[@]}" # DEBUG
# End times
declare -a end_times
while IFS= read -r line; do
# Extract datetime from line
datetime=$(echo "$line" | awk '{print $(NF-3) " " $(NF-2)}')
#Duplicating above parsing, as quick and dirty. Should be replaced by a python parser
res_score=`grep -H "Event Throughput" proc_*/out_gpu_*.log | sed -e "s@[^:]*: Event Throughput: \([ 0-9\.]*\) ev/s@\1@" | awk 'BEGIN{amin=1000000;amax=0;count=0;} { val=$1; a[count]=val; count+=1; sum+=val; if(amax<val) amax=val; if(amin>val) amin=val} END{n = asort(a); if (n % 2) { median=a[(n + 1) / 2]; } else {median=(a[(n / 2)] + a[(n / 2) + 1]) / 2.0;}; printf "{\"hlt\": %.4f}", sum}' || (echo "{}"; return 1)`
# Convert CEST (UTC+2) to UTC, then to Unix time
end_time=$(TZ=UTC date -d "$datetime -2 hours" +%s)
end_times+=("$end_time")
done < <(grep -nir "ThroughputService: AfterModEndJob" proc_*/*.log)
STATUS_3=$?
# printf "End times:\n%s\n" "${end_times[@]}" # DEBUG
# Write detailed score to output file # TODO: tbd
echo "LogFile,StartTime,EndTime,Duration,ProcessedEvents,Throughput(events/sec)" > "$output_file" # Header
# Calculate throughput
declare -a throughputs
for i in "${!start_times[@]}"; do
file=${log_files[$i]}
start=${start_times[$i]}
end=${end_times[$i]}
events=${processed_events[$i]}
# Parsing CPU Summary: \n- Total loop:: xxxx seconds of all CPUs
res_cpu=`grep -H -A2 "CPU Summary" proc_*/out_gpu_*.log | grep "Total loop" | sed -e "s@.*\sTotal loop: \([ 0-9\.]*\)@\1@" | awk 'BEGIN{amin=1000000;amax=0;count=0;} { val=nevt*nthreads/$1; a[count]=val; count+=1; sum+=val; if(amax<val) amax=val; if(amin>val) amin=val} END{n = asort(a); if (n % 2) {median=a[(n + 1) / 2]; } else {median=(a[(n / 2)] + a[(n / 2) + 1]) / 2.0;}; printf "{\"avg\": %.4f, \"median\": %.4f, \"min\": %.4f, \"max\": %.4f, \"count\": %d}", sum/count, median, amin, amax, count}' nevt=$nevt_thr nthreads=$NTHREADS || (echo "{}"; return 1)`
STATUS_4=$?
duration=$((end - start))
# Parsing the log to see if GPUs where used
count_gpu=`grep -c 'CUDA device' proc_1/out_gpu_*.log`
if (( duration > 0 )); then
throughput=$(awk "BEGIN {printf \"%.2f\", $events / $duration}")
throughputs+=("$throughput")
echo "[parseResults] INFO --Job $(( i + 1 )): $events events in $duration sec → $throughput events/sec"
else
echo "[parseResults] ERROR -- Job $(( i + 1)): Invalid duration ($start to $end)"
fi
echo "$file,$start,$end,$duration,$events,$throughput" >> "$output_file"
done
# printf "%s\n" "${throughputs[@]}" # DEBUG
# Total throughput == Score
res_score=0
for run in "${throughputs[@]}"; do
res_score=$(echo "$res_score + $run" | bc)
done
echo "Total throughput: $res_score events/sec"
# TODO: to be reviewed:
# ----------------------------------------------------------
# Extracting the GPU utilization from prmon
gpu_num=-1
gpu_util=-1
gpu_mem=-1
PRMON_FILE=$baseWDir/prmon_res/prmon_1/prmon.json
if [ -e ${PRMON_FILE} ]; then
gpu_num=`cat ${PRMON_FILE} | jq '.["Avg"]["ngpus"]'`
gpu_util=`cat ${PRMON_FILE} | jq '.["Avg"]["gpusmpct"]'`
gpu_mem=`cat ${PRMON_FILE} | jq '.["Avg"]["gpumempct"]'`
fi
# Measuring the speedup due to GPUs. The number of processed events must match, the throughputs must be extracted and the CPU-only throughput must not be zero, otherwise the value -1 is produced
nevt2_thr=`grep -h "Number of Events" proc_*/out_cpu_*.log | sed -e "s@.*:\([ 0-9\.]*\).*@\1@" | awk 'BEGIN{n=0; count=0;} {n+=$1; count+=1} END{print n/count/nthreads}' nthreads=$NTHREADS || (echo "-1"; return 1)`
[[ "$?" == 0 && "$nevt2_thr" == "$nevt_thr" ]]
STATUS_5=$?
tp_gpu=`grep -H "Event Throughput" proc_*/out_gpu_*.log | sed -e "s@[^:]*: Event Throughput: \([ 0-9\.]*\) ev/s@\1@" | awk 'BEGIN{count=0;} { val=$1; count+=1; sum+=val; } END{printf "%.4f", sum/count}' || (echo ""; return 1)`
STATUS_6=$?
tp_cpu=`grep -H "Event Throughput" proc_*/out_cpu_*.log | sed -e "s@[^:]*: Event Throughput: \([ 0-9\.]*\) ev/s@\1@" | awk 'BEGIN{count=0;} { val=$1; count+=1; sum+=val; } END{printf "%.4f", sum/count}' || (echo ""; return 1)`
STATUS_7=$?
if [[ "$STATUS_5" == "0" && "$STATUS_6" == "0" && "$STATUS_7" == "0" && $tp_cpu != "0" ]] ; then
speedup=`awk "BEGIN {print $tp_gpu / $tp_cpu}"`
else
speedup="-1"
gpu_num_pr=`cat ${PRMON_FILE} | jq '.["Avg"]["ngpus"]'`
gpu_util_pr=`cat ${PRMON_FILE} | jq '.["Avg"]["gpusmpct"]'`
gpu_mem_pr=`cat ${PRMON_FILE} | jq '.["Avg"]["gpumempct"]'`
fi
# ----------------------------------------------------------
res_gpu="{\"num_GPUs\": $count_gpu , \"speedup\": $speedup , \"gpu_num\": $gpu_num , \"gpu_util\": $gpu_util , \"gpu_mem\": $gpu_mem }"
res_gpu="{\"num_GPUs\": $NGPUS , \"gpu_num_pr\": $gpu_num_pr , \"gpu_util_pr\": $gpu_util_pr , \"gpu_mem_pr\": $gpu_mem_pr }"
[[ "$STATUS_1" == "0" ]] && [[ "$STATUS_2" == "0" ]] && [[ "$STATUS_3" == "0" ]] && [[ "$STATUS_4" == "0" ]]
[[ "$STATUS_1" == "0" ]] && [[ "$STATUS_2" == "0" ]] && [[ "$STATUS_3" == "0" ]]
STATUS=$?
echo "[parseResults] parsing completed (status=$STATUS)"
echo "[parseResults] INFO -- Parsing completed (status=$STATUS)"
[[ "$STATUS" != "0" ]] && return $STATUS
#-----------------------
# Generate summary
#-----------------------
echo "[parseResults] generate summary"
NEVENTS_THREAD=${nevt_thr} # assign to NEVENTS_THREAD that goes to generateSummary the effective number of events per thread processed
resJSON="{\"wl-scores\": $res_score, \"wl-stats\": {\"throughput_score\": $res_thr , \"CPU_score\": $res_cpu , \"GPU_stats\": $res_gpu }}"
echo "[parseResults] INFO -- Generate summary"
resJSON="{\"wl-scores\": $res_score, \"wl-stats\": {\"GPU_stats_pr\": $res_gpu }}" # TODO: Add configuration?
echo $resJSON > $baseWDir/parser_output.json
#-----------------------
# Return status
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment