diff --git a/cms/hlt-ma/cms-hlt-ma/parseResults.sh b/cms/hlt-ma/cms-hlt-ma/parseResults.sh index e7c1016bd07c707642512c0510cb451de3b98ba0..dca22b75723223e572be8317af6209836dc8c328 100755 --- a/cms/hlt-ma/cms-hlt-ma/parseResults.sh +++ b/cms/hlt-ma/cms-hlt-ma/parseResults.sh @@ -2,6 +2,9 @@ # directory of this distribution. For licensing information, see the # COPYING file at the top-level directory of this distribution. +# TODO: add info on MPS used +# TODO: add HLT config version + parseResultsDir=$(cd $(dirname ${BASH_SOURCE}); pwd) # needed to locate parseResults.py @@ -11,71 +14,145 @@ parseResultsDir=$(cd $(dirname ${BASH_SOURCE}); pwd) # needed to locate parseRes # The function is started in the base working directory <basewdir>: # please store here the overall json summary file for all NCOPIES processes combined function parseResults(){ - echo "[parseResults] current directory: $(pwd)" - export nevt_thr='-1' - export res_cpu='{}' - export res_thr='{}' + echo "[parseResults] INFO -- Current directory: $(pwd)" + echo "[parseResults] INFO -- Collecting results for run mode >>$MODE<<" + export output_file="cms-hlt-gpu.score" export res_score='{}' + #----------------------- # Parse results #----------------------- - echo "[parseResults] parsing results from" proc_*/out_*.log - # Documentation of cmssw time report at https://github.com/cms-sw/cmssw/blob/09c3fce6626f70fd04223e7dacebf0b485f73f54/FWCore/Services/plugins/Timing.cc#L240 - # Parsing Number of Events - nevt_thr=`grep -h "Number of Events" proc_*/out_gpu_*.log | sed -e "s@.*:\([ 0-9\.]*\).*@\1@" | awk 'BEGIN{n=0; count=0;} {n+=$1; count+=1} END{print n/count/nthreads}' nthreads=$NTHREADS || (echo "-1"; return 1)` + + # Log file structure: + # ----------------------- + # |-CPU + # |---proc_0 + # |-----cpu_1.log + # |---proc_1 + # |-----cpu_2.log + # |... + # |---proc_N + # |-----cpu_N.log + # ----------------------- + #|-GPU + #|---proc_0 # WL copy on GPU 0 + #|-----gpu0_0.log # cmsRun 1 + #|-----gpu0_1.log # cmsRun 2 + #|... + #|-----gpu0_5.log # 6 copies are default + #|---proc_1 # WL copy on GPU 1 + #|-----gpu1_0.log # cmsRun 1 + #|-----gpu1_1.log # cmsRun 2 + #|... + #|-----gpu1_5.log # 6 copies are default + # ----------------------- + + log_files=($(ls proc_*/*)) + echo "[parseResults] INFO -- parsing results from:" "${log_files[@]}" + + # Get actual number of processed events + # This is necessary, because sometimes, CMSSW fails + # If a sufficient large number was processed, the results are still valid + declare -a processed_events + processed_events=($(for f in proc_*/*.log; do awk 'NR>2 && /Average throughput/ {sub(/,/, "", a[NR-2]); print a[NR-2]} {a[NR]=$1}' "$f"; done)) STATUS_1=$? - # Parsing Event Throughput: xxxx ev/s - res_thr=`grep -H "Event Throughput" proc_*/out_gpu_*.log | sed -e "s@[^:]*: Event Throughput: \([ 0-9\.]*\) ev/s@\1@" | awk 'BEGIN{amin=1000000;amax=0;count=0;} { val=$1; a[count]=val; count+=1; sum+=val; if(amax<val) amax=val; if(amin>val) amin=val} END{n = asort(a); if (n % 2) { median=a[(n + 1) / 2]; } else {median=(a[(n / 2)] + a[(n / 2) + 1]) / 2.0;}; printf "{\"avg\": %.4f, \"median\": %.4f, \"min\": %.4f, \"max\": %.4f, \"count\": %d}", sum/count, median, amin, amax, count}' || (echo "{}"; return 1)` + # Check if sufficient events were processed to guarantee reliable results + for i in "${!processed_events[@]}"; do + ev=${processed_events[$i]} + ratio=$(awk "BEGIN {printf \"%.4f\", $ev / $NEVENTS}") + + if awk "BEGIN {exit ($ratio < 0.8) ? 0 : 1}"; then + echo "[parseResults] ERROR -- run $((i + 1 )) was not successful. Only $ev of $NEVENTS events processed." + fi + done + + # Start times + declare -a start_times + while IFS= read -r line; do + # Extract datetime from line + datetime=$(echo "$line" | awk '{print $(NF-3) " " $(NF-2)}') + + # Convert CEST (UTC+2) to UTC, then to Unix time + start_time=$(TZ=UTC date -d "$datetime -2 hours" +%s) + + # Append to array + start_times+=("$start_time") + done < <(grep -nir "MSG-i ThreadStreamSetup: (NoModuleName)" proc_*/*.log) STATUS_2=$? + # printf "Start times:\n%s\n" "${start_times[@]}" # DEBUG + + # End times + declare -a end_times + while IFS= read -r line; do + # Extract datetime from line + datetime=$(echo "$line" | awk '{print $(NF-3) " " $(NF-2)}') - #Duplicating above parsing, as quick and dirty. Should be replaced by a python parser - res_score=`grep -H "Event Throughput" proc_*/out_gpu_*.log | sed -e "s@[^:]*: Event Throughput: \([ 0-9\.]*\) ev/s@\1@" | awk 'BEGIN{amin=1000000;amax=0;count=0;} { val=$1; a[count]=val; count+=1; sum+=val; if(amax<val) amax=val; if(amin>val) amin=val} END{n = asort(a); if (n % 2) { median=a[(n + 1) / 2]; } else {median=(a[(n / 2)] + a[(n / 2) + 1]) / 2.0;}; printf "{\"hlt\": %.4f}", sum}' || (echo "{}"; return 1)` + # Convert CEST (UTC+2) to UTC, then to Unix time + end_time=$(TZ=UTC date -d "$datetime -2 hours" +%s) + end_times+=("$end_time") + done < <(grep -nir "ThroughputService: AfterModEndJob" proc_*/*.log) STATUS_3=$? + # printf "End times:\n%s\n" "${end_times[@]}" # DEBUG + + # Write detailed score to output file # TODO: tbd + echo "LogFile,StartTime,EndTime,Duration,ProcessedEvents,Throughput(events/sec)" > "$output_file" # Header + + # Calculate throughput + declare -a throughputs + for i in "${!start_times[@]}"; do + file=${log_files[$i]} + start=${start_times[$i]} + end=${end_times[$i]} + events=${processed_events[$i]} - # Parsing CPU Summary: \n- Total loop:: xxxx seconds of all CPUs - res_cpu=`grep -H -A2 "CPU Summary" proc_*/out_gpu_*.log | grep "Total loop" | sed -e "s@.*\sTotal loop: \([ 0-9\.]*\)@\1@" | awk 'BEGIN{amin=1000000;amax=0;count=0;} { val=nevt*nthreads/$1; a[count]=val; count+=1; sum+=val; if(amax<val) amax=val; if(amin>val) amin=val} END{n = asort(a); if (n % 2) {median=a[(n + 1) / 2]; } else {median=(a[(n / 2)] + a[(n / 2) + 1]) / 2.0;}; printf "{\"avg\": %.4f, \"median\": %.4f, \"min\": %.4f, \"max\": %.4f, \"count\": %d}", sum/count, median, amin, amax, count}' nevt=$nevt_thr nthreads=$NTHREADS || (echo "{}"; return 1)` - STATUS_4=$? + duration=$((end - start)) - # Parsing the log to see if GPUs where used - count_gpu=`grep -c 'CUDA device' proc_1/out_gpu_*.log` + if (( duration > 0 )); then + throughput=$(awk "BEGIN {printf \"%.2f\", $events / $duration}") + throughputs+=("$throughput") + echo "[parseResults] INFO --Job $(( i + 1 )): $events events in $duration sec → $throughput events/sec" + else + echo "[parseResults] ERROR -- Job $(( i + 1)): Invalid duration ($start to $end)" + fi + echo "$file,$start,$end,$duration,$events,$throughput" >> "$output_file" + done + # printf "%s\n" "${throughputs[@]}" # DEBUG + # Total throughput == Score + res_score=0 + for run in "${throughputs[@]}"; do + res_score=$(echo "$res_score + $run" | bc) + done + echo "Total throughput: $res_score events/sec" + + # TODO: to be reviewed: + # ---------------------------------------------------------- # Extracting the GPU utilization from prmon gpu_num=-1 gpu_util=-1 gpu_mem=-1 PRMON_FILE=$baseWDir/prmon_res/prmon_1/prmon.json if [ -e ${PRMON_FILE} ]; then - gpu_num=`cat ${PRMON_FILE} | jq '.["Avg"]["ngpus"]'` - gpu_util=`cat ${PRMON_FILE} | jq '.["Avg"]["gpusmpct"]'` - gpu_mem=`cat ${PRMON_FILE} | jq '.["Avg"]["gpumempct"]'` - fi - # Measuring the speedup due to GPUs. The number of processed events must match, the throughputs must be extracted and the CPU-only throughput must not be zero, otherwise the value -1 is produced - nevt2_thr=`grep -h "Number of Events" proc_*/out_cpu_*.log | sed -e "s@.*:\([ 0-9\.]*\).*@\1@" | awk 'BEGIN{n=0; count=0;} {n+=$1; count+=1} END{print n/count/nthreads}' nthreads=$NTHREADS || (echo "-1"; return 1)` - [[ "$?" == 0 && "$nevt2_thr" == "$nevt_thr" ]] - STATUS_5=$? - tp_gpu=`grep -H "Event Throughput" proc_*/out_gpu_*.log | sed -e "s@[^:]*: Event Throughput: \([ 0-9\.]*\) ev/s@\1@" | awk 'BEGIN{count=0;} { val=$1; count+=1; sum+=val; } END{printf "%.4f", sum/count}' || (echo ""; return 1)` - STATUS_6=$? - tp_cpu=`grep -H "Event Throughput" proc_*/out_cpu_*.log | sed -e "s@[^:]*: Event Throughput: \([ 0-9\.]*\) ev/s@\1@" | awk 'BEGIN{count=0;} { val=$1; count+=1; sum+=val; } END{printf "%.4f", sum/count}' || (echo ""; return 1)` - STATUS_7=$? - if [[ "$STATUS_5" == "0" && "$STATUS_6" == "0" && "$STATUS_7" == "0" && $tp_cpu != "0" ]] ; then - speedup=`awk "BEGIN {print $tp_gpu / $tp_cpu}"` - else - speedup="-1" + gpu_num_pr=`cat ${PRMON_FILE} | jq '.["Avg"]["ngpus"]'` + gpu_util_pr=`cat ${PRMON_FILE} | jq '.["Avg"]["gpusmpct"]'` + gpu_mem_pr=`cat ${PRMON_FILE} | jq '.["Avg"]["gpumempct"]'` fi + # ---------------------------------------------------------- - res_gpu="{\"num_GPUs\": $count_gpu , \"speedup\": $speedup , \"gpu_num\": $gpu_num , \"gpu_util\": $gpu_util , \"gpu_mem\": $gpu_mem }" + res_gpu="{\"num_GPUs\": $NGPUS , \"gpu_num_pr\": $gpu_num_pr , \"gpu_util_pr\": $gpu_util_pr , \"gpu_mem_pr\": $gpu_mem_pr }" - [[ "$STATUS_1" == "0" ]] && [[ "$STATUS_2" == "0" ]] && [[ "$STATUS_3" == "0" ]] && [[ "$STATUS_4" == "0" ]] + [[ "$STATUS_1" == "0" ]] && [[ "$STATUS_2" == "0" ]] && [[ "$STATUS_3" == "0" ]] STATUS=$? - echo "[parseResults] parsing completed (status=$STATUS)" + echo "[parseResults] INFO -- Parsing completed (status=$STATUS)" [[ "$STATUS" != "0" ]] && return $STATUS + + #----------------------- # Generate summary #----------------------- - echo "[parseResults] generate summary" - NEVENTS_THREAD=${nevt_thr} # assign to NEVENTS_THREAD that goes to generateSummary the effective number of events per thread processed - resJSON="{\"wl-scores\": $res_score, \"wl-stats\": {\"throughput_score\": $res_thr , \"CPU_score\": $res_cpu , \"GPU_stats\": $res_gpu }}" + echo "[parseResults] INFO -- Generate summary" + resJSON="{\"wl-scores\": $res_score, \"wl-stats\": {\"GPU_stats_pr\": $res_gpu }}" # TODO: Add configuration? echo $resJSON > $baseWDir/parser_output.json #----------------------- # Return status