From e2330b1a6191662294eb4552bf06c7ed43bcb0bd Mon Sep 17 00:00:00 2001
From: Rafal Bielski <rafal.bielski@cern.ch>
Date: Thu, 5 Nov 2020 20:33:01 +0100
Subject: [PATCH] SchedulerMonSvc updates for online HLT framework monitoring

* Rename "sampling period" property to "monitoring interval" because the service does not collect samples but takes snapshots in fixed intervals
* Change default interval to 100 ms
* Add new histograms with variables divided by number of slots
* Improve python configuration to allow more options steered by ConfigFlags
* Enable SchedulerMonSvc by default in runHLT_standalone job options (for athenaHLT)
---
 .../python/TrigSteerMonitorConfig.py          | 41 +++++++++++++------
 .../TrigSteerMonitor/src/SchedulerMonSvc.cxx  | 25 +++++++----
 .../TrigSteerMonitor/src/SchedulerMonSvc.h    |  4 +-
 .../TriggerJobOpts/python/Modifiers.py        |  6 ++-
 .../TriggerJobOpts/share/runHLT_standalone.py |  1 +
 5 files changed, 54 insertions(+), 23 deletions(-)

diff --git a/Trigger/TrigMonitoring/TrigSteerMonitor/python/TrigSteerMonitorConfig.py b/Trigger/TrigMonitoring/TrigSteerMonitor/python/TrigSteerMonitorConfig.py
index 23be91a7d8f..86a3b8fe3e1 100644
--- a/Trigger/TrigMonitoring/TrigSteerMonitor/python/TrigSteerMonitorConfig.py
+++ b/Trigger/TrigMonitoring/TrigSteerMonitor/python/TrigSteerMonitorConfig.py
@@ -8,18 +8,13 @@ def SchedulerMonSvcCfg(flags, name='SchedulerMonSvc'):
     monsvc = CompFactory.SchedulerMonSvc(name)
     monsvc.MonTool = GenericMonitoringTool('MonTool', HistPath='HLTFramework/'+name)
 
-    # Plots vs snapshot number can be enabled with flags.SchedulerMonSvc.enablePlotsVsSnapNumber
-    # They are disabled by default as they don't give much information on top of the plots vs walltime
-    enablePlotsVsSnapNumber = False
-    if flags.hasCategory('SchedulerMonSvc') and flags.SchedulerMonSvc.hasFlag('enablePlotsVsSnapNumber'):
-        enablePlotsVsSnapNumber = flags.SchedulerMonSvc.enablePlotsVsSnapNumber
-
-    # From GaudiHive AlgsExecutionStates::State enum
-    stateNames = ['INITIAL', 'CONTROLREADY', 'DATAREADY', 'RESOURCELESS',
-                  'SCHEDULED', 'EVTACCEPTED', 'EVTREJECTED', 'ERROR']
-    activeStates = ['CONTROLREADY', 'DATAREADY', 'RESOURCELESS', 'SCHEDULED']
-
     # Helper functions
+    def getFlag(flagName, defaultValue):
+        if flags.hasCategory('SchedulerMonSvc') and flags.SchedulerMonSvc.hasFlag(flagName):
+            return eval('flags.SchedulerMonSvc.'+flagName)
+        else:
+            return defaultValue
+
     def defineHist1D(varName, varLabel, labels=None, **kwargs):
         monsvc.MonTool.defineHistogram(varName, path='EXPERT', type='TH1D',
                                 title=varLabel+';'+varLabel+';Snapshots',
@@ -43,14 +38,34 @@ def SchedulerMonSvcCfg(flags, name='SchedulerMonSvc'):
         if enablePlotsVsSnapNumber:
             defineHistVsSnapNumber(varName, varLabel, type2D, labels, ybins=nbins, ymin=min, ymax=max, **kwargs)
 
+    # Flags propagated to SchedulerMonSvc properties, can be set with flags.SchedulerMonSvc.<flagName>
+    monsvc.SchedulerName = getFlag('SchedulerName', 'AvalancheSchedulerSvc')
+    monsvc.MonIntervalMillisec = getFlag('MonIntervalMillisec', 100)
+
+    # Flags enabling/disabling histogram categories, can be set with flags.SchedulerMonSvc.<flagName>
+    enablePlotsVsSnapNumber = getFlag('enablePlotsVsSnapNumber', False)
+    enablePlotsOverThreads = getFlag('enablePlotsOverThreads', True)
+    enablePlotsOverSlots = getFlag('enablePlotsOverSlots', True)
+    enablePlotsOverActive = getFlag('enablePlotsOverActive', True)
+
+    # From GaudiHive AlgsExecutionStates::State enum
+    stateNames = ['INITIAL', 'CONTROLREADY', 'DATAREADY', 'RESOURCELESS',
+                  'SCHEDULED', 'EVTACCEPTED', 'EVTREJECTED', 'ERROR']
+    activeStates = ['CONTROLREADY', 'DATAREADY', 'RESOURCELESS', 'SCHEDULED']
+
     # Histogram definitions
     defineStandardHistogramSet('AlgStates', 'Algorithm state', 8, -0.5, 7.5, labels=stateNames, type2D='TH2D', weight='StateTotalCounts')
     defineStandardHistogramSet('FreeSlots', 'Number of free slots', 10, 0, 10)
+    defineStandardHistogramSet('FreeSlotsFraction', 'Fraction of free slots', 100, 0, 1)
     for state in stateNames:
         defineStandardHistogramSet(state, 'N algs in '+state+' state', 100, 0, 100)
     for state in activeStates:
-        defineStandardHistogramSet(state+'_Over_Threads', 'N '+state+' / N threads', 100, 0, 10)
-        defineStandardHistogramSet(state+'_Over_Active', 'N '+state+' / N active states', 100, 0, 1)
+        if enablePlotsOverThreads:
+            defineStandardHistogramSet(state+'_Over_Threads', 'N '+state+' / N threads', 100, 0, 10)
+        if enablePlotsOverSlots:
+            defineStandardHistogramSet(state+'_Over_Slots', 'N '+state+' / N slots', 100, 0, 10)
+        if enablePlotsOverActive:
+            defineStandardHistogramSet(state+'_Over_Active', 'N '+state+' / N active states', 100, 0, 1)
 
     monsvc.MonTool.defineHistogram('TIME_monCallback', path='EXPERT', type='TH1D',
                                     title='Time of callback calls;Time [us];Calls',
diff --git a/Trigger/TrigMonitoring/TrigSteerMonitor/src/SchedulerMonSvc.cxx b/Trigger/TrigMonitoring/TrigSteerMonitor/src/SchedulerMonSvc.cxx
index 25d83390386..fbb6dd4e74f 100644
--- a/Trigger/TrigMonitoring/TrigSteerMonitor/src/SchedulerMonSvc.cxx
+++ b/Trigger/TrigMonitoring/TrigSteerMonitor/src/SchedulerMonSvc.cxx
@@ -60,8 +60,9 @@ StatusCode SchedulerMonSvc::startMonitoring() {
     }
   }
 
-  // Get the number of threads
+  // Get the number of threads and slots
   int numThreads = std::stoi( SmartIF<IProperty>(m_scheduler)->getProperty("ThreadPoolSize").toString() );
+  int numSlots = std::stoi( serviceLocator()->service("EventDataSvc").as<IProperty>()->getProperty("NSlots").toString() );
 
   // Flag the monitoring as running (prevents going past this point twice)
   if (bool expected = false; not m_running.compare_exchange_strong(expected, true)) {
@@ -70,11 +71,11 @@ StatusCode SchedulerMonSvc::startMonitoring() {
   }
 
   // Construct the callback and pass it to the scheduler monitoring API
-  auto monCallback = [this, &numThreads](IScheduler::OccupancySnapshot snap) -> void {
+  auto monCallback = [this, &numThreads, &numSlots](IScheduler::OccupancySnapshot snap) -> void {
     auto monTime = Monitored::Timer("TIME_monCallback");
     // Calculate and update snap counters
     const ClockType::duration wallTime = snap.time - m_startTime;
-    const size_t thisSnapCounter = std::chrono::duration_cast<std::chrono::milliseconds>(wallTime).count() / m_samplingPeriodMillisec.value();
+    const size_t thisSnapCounter = std::chrono::duration_cast<std::chrono::milliseconds>(wallTime).count() / m_monIntervalMillisec.value();
     const size_t lastSnapCounter = m_lastSnapCounter.exchange(thisSnapCounter);
     const int periodsSinceLastSnap = thisSnapCounter - lastSnapCounter;
 
@@ -97,9 +98,11 @@ StatusCode SchedulerMonSvc::startMonitoring() {
     // Monitor alg state counts absolute numbers and ratios to N threads and N active states
     std::vector<Monitored::Scalar<int>> mon_stateCounts;
     std::vector<Monitored::Scalar<double>> mon_stateCountsOverThreads;
+    std::vector<Monitored::Scalar<double>> mon_stateCountsOverSlots;
     std::vector<Monitored::Scalar<double>> mon_stateCountsOverActive;
     mon_stateCounts.reserve(static_cast<size_t>(AlgState::MAXVALUE));
     mon_stateCountsOverThreads.reserve(static_cast<size_t>(AlgState::MAXVALUE));
+    mon_stateCountsOverSlots.reserve(static_cast<size_t>(AlgState::MAXVALUE));
     mon_stateCountsOverActive.reserve(static_cast<size_t>(AlgState::MAXVALUE));
     int activeCount = 0;
     for (size_t i : s_activeAlgStateNumbers) {
@@ -108,26 +111,34 @@ StatusCode SchedulerMonSvc::startMonitoring() {
     for (size_t i : s_algStateNumbers) {
       mon_stateCounts.emplace_back(s_algStateNames[i].data(), stateTotalCounts[i]);
       mon_stateCountsOverThreads.emplace_back(s_algStateNames[i].data()+"_Over_Threads"s, divAsDouble(stateTotalCounts[i], numThreads));
+      mon_stateCountsOverSlots.emplace_back(s_algStateNames[i].data()+"_Over_Slots"s, divAsDouble(stateTotalCounts[i], numSlots));
       double toActive = (activeCount > 0) ? divAsDouble(stateTotalCounts[i], activeCount) : 0;
       mon_stateCountsOverActive.emplace_back(s_algStateNames[i].data()+"_Over_Active"s, toActive);
     }
 
     // Monitor number of free slots
     auto mon_freeSlots = Monitored::Scalar("FreeSlots", m_scheduler->freeSlots());
+    auto mon_freeSlotsFrac = Monitored::Scalar("FreeSlotsFraction", divAsDouble(m_scheduler->freeSlots(), numSlots));
 
     // Reserve vector of references with size equal to the number of variables added into the vector in the loop below
     std::vector<std::reference_wrapper<Monitored::IMonitoredVariable>> allMonVars;
-    allMonVars.reserve(5 + mon_stateCounts.size() + mon_stateCountsOverThreads.size() + mon_stateCountsOverActive.size());
+    allMonVars.reserve(6 +
+                       mon_stateCounts.size() +
+                       mon_stateCountsOverThreads.size() +
+                       mon_stateCountsOverSlots.size() +
+                       mon_stateCountsOverActive.size());
     // Fill monitoring histograms once for each sampling period passed since the last fill
     // If multiple sampling periods passed, it means the scheduler state didn't change during that time
     for (size_t snapNumber=lastSnapCounter+1; snapNumber<=thisSnapCounter; ++snapNumber) {
       auto mon_snapNumber = Monitored::Scalar("SnapNumber", snapNumber);
-      auto mon_wallTimeSec = Monitored::Scalar("WallTimeSeconds", snapNumber*m_samplingPeriodMillisec.value()*1e-3);
+      auto mon_wallTimeSec = Monitored::Scalar("WallTimeSeconds", snapNumber*m_monIntervalMillisec.value()*1e-3);
       allMonVars.clear();
       allMonVars.insert(allMonVars.end(), mon_stateCounts.begin(), mon_stateCounts.end());
       allMonVars.insert(allMonVars.end(), mon_stateCountsOverThreads.begin(), mon_stateCountsOverThreads.end());
+      allMonVars.insert(allMonVars.end(), mon_stateCountsOverSlots.begin(), mon_stateCountsOverSlots.end());
       allMonVars.insert(allMonVars.end(), mon_stateCountsOverActive.begin(), mon_stateCountsOverActive.end());
-      allMonVars.insert(allMonVars.end(), {mon_stateNumber, mon_stateTotalCounts, mon_freeSlots, mon_snapNumber, mon_wallTimeSec});
+      allMonVars.insert(allMonVars.end(), {mon_stateNumber, mon_stateTotalCounts, mon_freeSlots, mon_freeSlotsFrac,
+                                           mon_snapNumber, mon_wallTimeSec});
       Monitored::Group(m_monTool, allMonVars);
     }
     monTime.stop();
@@ -136,7 +147,7 @@ StatusCode SchedulerMonSvc::startMonitoring() {
 
   // Start monitoring
   m_startTime = ClockType::now();
-  m_scheduler->recordOccupancy(m_samplingPeriodMillisec.value(), std::move(monCallback));
+  m_scheduler->recordOccupancy(m_monIntervalMillisec.value(), std::move(monCallback));
 
   ATH_MSG_INFO("Scheduler monitoring started");
 
diff --git a/Trigger/TrigMonitoring/TrigSteerMonitor/src/SchedulerMonSvc.h b/Trigger/TrigMonitoring/TrigSteerMonitor/src/SchedulerMonSvc.h
index 16e1849b3cb..27abd7089c4 100644
--- a/Trigger/TrigMonitoring/TrigSteerMonitor/src/SchedulerMonSvc.h
+++ b/Trigger/TrigMonitoring/TrigSteerMonitor/src/SchedulerMonSvc.h
@@ -45,8 +45,8 @@ private:
   // Properties and handles
   Gaudi::Property<std::string> m_schedulerName {
     this, "SchedulerName", "AvalancheSchedulerSvc", "Name of the scheduler"};
-  Gaudi::Property<unsigned int> m_samplingPeriodMillisec {
-    this, "SamplingPeriodMillisec", 5, "Target sampling period in milliseconds"};
+  Gaudi::Property<unsigned int> m_monIntervalMillisec {
+    this, "MonIntervalMillisec", 100, "Monitoring snapshot interval in milliseconds"};
   ToolHandle<GenericMonitoringTool> m_monTool {
     this, "MonTool", "", "Monitoring tool"};
 
diff --git a/Trigger/TriggerCommon/TriggerJobOpts/python/Modifiers.py b/Trigger/TriggerCommon/TriggerJobOpts/python/Modifiers.py
index 1d20e1eb6f9..32c6aea48c6 100644
--- a/Trigger/TriggerCommon/TriggerJobOpts/python/Modifiers.py
+++ b/Trigger/TriggerCommon/TriggerJobOpts/python/Modifiers.py
@@ -1117,8 +1117,12 @@ class enableSchedulerMon(_modifier):
     Enable SchedulerMonSvc
     """
     def preSetup(self):
-        from AthenaConfiguration.ComponentAccumulator import CAtoGlobalWrapper
         from AthenaConfiguration.AllConfigFlags import ConfigFlags as flags
+        if not flags.Trigger.Online.isPartition:
+            log.debug('SchedulerMonSvc currently only works with athenaHLT / online partition. Skipping setup.')
+            return
+
+        from AthenaConfiguration.ComponentAccumulator import CAtoGlobalWrapper
         from TrigSteerMonitor.TrigSteerMonitorConfig import SchedulerMonSvcCfg
         CAtoGlobalWrapper(SchedulerMonSvcCfg, flags)
     
diff --git a/Trigger/TriggerCommon/TriggerJobOpts/share/runHLT_standalone.py b/Trigger/TriggerCommon/TriggerJobOpts/share/runHLT_standalone.py
index 5b5c3d7e805..abe44297404 100644
--- a/Trigger/TriggerCommon/TriggerJobOpts/share/runHLT_standalone.py
+++ b/Trigger/TriggerCommon/TriggerJobOpts/share/runHLT_standalone.py
@@ -243,6 +243,7 @@ else:           # More data modifiers
                      #Monitoring L1Topo at ROB level
                      #'L1TopoCheck',
                      'forceTileRODMap',
+                     'enableSchedulerMon'
     ]
 
 TriggerFlags.doID = ConfigFlags.Trigger.doID = opt.doID
-- 
GitLab