From e2330b1a6191662294eb4552bf06c7ed43bcb0bd Mon Sep 17 00:00:00 2001 From: Rafal Bielski <rafal.bielski@cern.ch> Date: Thu, 5 Nov 2020 20:33:01 +0100 Subject: [PATCH] SchedulerMonSvc updates for online HLT framework monitoring * Rename "sampling period" property to "monitoring interval" because the service does not collect samples but takes snapshots in fixed intervals * Change default interval to 100 ms * Add new histograms with variables divided by number of slots * Improve python configuration to allow more options steered by ConfigFlags * Enable SchedulerMonSvc by default in runHLT_standalone job options (for athenaHLT) --- .../python/TrigSteerMonitorConfig.py | 41 +++++++++++++------ .../TrigSteerMonitor/src/SchedulerMonSvc.cxx | 25 +++++++---- .../TrigSteerMonitor/src/SchedulerMonSvc.h | 4 +- .../TriggerJobOpts/python/Modifiers.py | 6 ++- .../TriggerJobOpts/share/runHLT_standalone.py | 1 + 5 files changed, 54 insertions(+), 23 deletions(-) diff --git a/Trigger/TrigMonitoring/TrigSteerMonitor/python/TrigSteerMonitorConfig.py b/Trigger/TrigMonitoring/TrigSteerMonitor/python/TrigSteerMonitorConfig.py index 23be91a7d8f..86a3b8fe3e1 100644 --- a/Trigger/TrigMonitoring/TrigSteerMonitor/python/TrigSteerMonitorConfig.py +++ b/Trigger/TrigMonitoring/TrigSteerMonitor/python/TrigSteerMonitorConfig.py @@ -8,18 +8,13 @@ def SchedulerMonSvcCfg(flags, name='SchedulerMonSvc'): monsvc = CompFactory.SchedulerMonSvc(name) monsvc.MonTool = GenericMonitoringTool('MonTool', HistPath='HLTFramework/'+name) - # Plots vs snapshot number can be enabled with flags.SchedulerMonSvc.enablePlotsVsSnapNumber - # They are disabled by default as they don't give much information on top of the plots vs walltime - enablePlotsVsSnapNumber = False - if flags.hasCategory('SchedulerMonSvc') and flags.SchedulerMonSvc.hasFlag('enablePlotsVsSnapNumber'): - enablePlotsVsSnapNumber = flags.SchedulerMonSvc.enablePlotsVsSnapNumber - - # From GaudiHive AlgsExecutionStates::State enum - stateNames = ['INITIAL', 'CONTROLREADY', 'DATAREADY', 'RESOURCELESS', - 'SCHEDULED', 'EVTACCEPTED', 'EVTREJECTED', 'ERROR'] - activeStates = ['CONTROLREADY', 'DATAREADY', 'RESOURCELESS', 'SCHEDULED'] - # Helper functions + def getFlag(flagName, defaultValue): + if flags.hasCategory('SchedulerMonSvc') and flags.SchedulerMonSvc.hasFlag(flagName): + return eval('flags.SchedulerMonSvc.'+flagName) + else: + return defaultValue + def defineHist1D(varName, varLabel, labels=None, **kwargs): monsvc.MonTool.defineHistogram(varName, path='EXPERT', type='TH1D', title=varLabel+';'+varLabel+';Snapshots', @@ -43,14 +38,34 @@ def SchedulerMonSvcCfg(flags, name='SchedulerMonSvc'): if enablePlotsVsSnapNumber: defineHistVsSnapNumber(varName, varLabel, type2D, labels, ybins=nbins, ymin=min, ymax=max, **kwargs) + # Flags propagated to SchedulerMonSvc properties, can be set with flags.SchedulerMonSvc.<flagName> + monsvc.SchedulerName = getFlag('SchedulerName', 'AvalancheSchedulerSvc') + monsvc.MonIntervalMillisec = getFlag('MonIntervalMillisec', 100) + + # Flags enabling/disabling histogram categories, can be set with flags.SchedulerMonSvc.<flagName> + enablePlotsVsSnapNumber = getFlag('enablePlotsVsSnapNumber', False) + enablePlotsOverThreads = getFlag('enablePlotsOverThreads', True) + enablePlotsOverSlots = getFlag('enablePlotsOverSlots', True) + enablePlotsOverActive = getFlag('enablePlotsOverActive', True) + + # From GaudiHive AlgsExecutionStates::State enum + stateNames = ['INITIAL', 'CONTROLREADY', 'DATAREADY', 'RESOURCELESS', + 'SCHEDULED', 'EVTACCEPTED', 'EVTREJECTED', 'ERROR'] + activeStates = ['CONTROLREADY', 'DATAREADY', 'RESOURCELESS', 'SCHEDULED'] + # Histogram definitions defineStandardHistogramSet('AlgStates', 'Algorithm state', 8, -0.5, 7.5, labels=stateNames, type2D='TH2D', weight='StateTotalCounts') defineStandardHistogramSet('FreeSlots', 'Number of free slots', 10, 0, 10) + defineStandardHistogramSet('FreeSlotsFraction', 'Fraction of free slots', 100, 0, 1) for state in stateNames: defineStandardHistogramSet(state, 'N algs in '+state+' state', 100, 0, 100) for state in activeStates: - defineStandardHistogramSet(state+'_Over_Threads', 'N '+state+' / N threads', 100, 0, 10) - defineStandardHistogramSet(state+'_Over_Active', 'N '+state+' / N active states', 100, 0, 1) + if enablePlotsOverThreads: + defineStandardHistogramSet(state+'_Over_Threads', 'N '+state+' / N threads', 100, 0, 10) + if enablePlotsOverSlots: + defineStandardHistogramSet(state+'_Over_Slots', 'N '+state+' / N slots', 100, 0, 10) + if enablePlotsOverActive: + defineStandardHistogramSet(state+'_Over_Active', 'N '+state+' / N active states', 100, 0, 1) monsvc.MonTool.defineHistogram('TIME_monCallback', path='EXPERT', type='TH1D', title='Time of callback calls;Time [us];Calls', diff --git a/Trigger/TrigMonitoring/TrigSteerMonitor/src/SchedulerMonSvc.cxx b/Trigger/TrigMonitoring/TrigSteerMonitor/src/SchedulerMonSvc.cxx index 25d83390386..fbb6dd4e74f 100644 --- a/Trigger/TrigMonitoring/TrigSteerMonitor/src/SchedulerMonSvc.cxx +++ b/Trigger/TrigMonitoring/TrigSteerMonitor/src/SchedulerMonSvc.cxx @@ -60,8 +60,9 @@ StatusCode SchedulerMonSvc::startMonitoring() { } } - // Get the number of threads + // Get the number of threads and slots int numThreads = std::stoi( SmartIF<IProperty>(m_scheduler)->getProperty("ThreadPoolSize").toString() ); + int numSlots = std::stoi( serviceLocator()->service("EventDataSvc").as<IProperty>()->getProperty("NSlots").toString() ); // Flag the monitoring as running (prevents going past this point twice) if (bool expected = false; not m_running.compare_exchange_strong(expected, true)) { @@ -70,11 +71,11 @@ StatusCode SchedulerMonSvc::startMonitoring() { } // Construct the callback and pass it to the scheduler monitoring API - auto monCallback = [this, &numThreads](IScheduler::OccupancySnapshot snap) -> void { + auto monCallback = [this, &numThreads, &numSlots](IScheduler::OccupancySnapshot snap) -> void { auto monTime = Monitored::Timer("TIME_monCallback"); // Calculate and update snap counters const ClockType::duration wallTime = snap.time - m_startTime; - const size_t thisSnapCounter = std::chrono::duration_cast<std::chrono::milliseconds>(wallTime).count() / m_samplingPeriodMillisec.value(); + const size_t thisSnapCounter = std::chrono::duration_cast<std::chrono::milliseconds>(wallTime).count() / m_monIntervalMillisec.value(); const size_t lastSnapCounter = m_lastSnapCounter.exchange(thisSnapCounter); const int periodsSinceLastSnap = thisSnapCounter - lastSnapCounter; @@ -97,9 +98,11 @@ StatusCode SchedulerMonSvc::startMonitoring() { // Monitor alg state counts absolute numbers and ratios to N threads and N active states std::vector<Monitored::Scalar<int>> mon_stateCounts; std::vector<Monitored::Scalar<double>> mon_stateCountsOverThreads; + std::vector<Monitored::Scalar<double>> mon_stateCountsOverSlots; std::vector<Monitored::Scalar<double>> mon_stateCountsOverActive; mon_stateCounts.reserve(static_cast<size_t>(AlgState::MAXVALUE)); mon_stateCountsOverThreads.reserve(static_cast<size_t>(AlgState::MAXVALUE)); + mon_stateCountsOverSlots.reserve(static_cast<size_t>(AlgState::MAXVALUE)); mon_stateCountsOverActive.reserve(static_cast<size_t>(AlgState::MAXVALUE)); int activeCount = 0; for (size_t i : s_activeAlgStateNumbers) { @@ -108,26 +111,34 @@ StatusCode SchedulerMonSvc::startMonitoring() { for (size_t i : s_algStateNumbers) { mon_stateCounts.emplace_back(s_algStateNames[i].data(), stateTotalCounts[i]); mon_stateCountsOverThreads.emplace_back(s_algStateNames[i].data()+"_Over_Threads"s, divAsDouble(stateTotalCounts[i], numThreads)); + mon_stateCountsOverSlots.emplace_back(s_algStateNames[i].data()+"_Over_Slots"s, divAsDouble(stateTotalCounts[i], numSlots)); double toActive = (activeCount > 0) ? divAsDouble(stateTotalCounts[i], activeCount) : 0; mon_stateCountsOverActive.emplace_back(s_algStateNames[i].data()+"_Over_Active"s, toActive); } // Monitor number of free slots auto mon_freeSlots = Monitored::Scalar("FreeSlots", m_scheduler->freeSlots()); + auto mon_freeSlotsFrac = Monitored::Scalar("FreeSlotsFraction", divAsDouble(m_scheduler->freeSlots(), numSlots)); // Reserve vector of references with size equal to the number of variables added into the vector in the loop below std::vector<std::reference_wrapper<Monitored::IMonitoredVariable>> allMonVars; - allMonVars.reserve(5 + mon_stateCounts.size() + mon_stateCountsOverThreads.size() + mon_stateCountsOverActive.size()); + allMonVars.reserve(6 + + mon_stateCounts.size() + + mon_stateCountsOverThreads.size() + + mon_stateCountsOverSlots.size() + + mon_stateCountsOverActive.size()); // Fill monitoring histograms once for each sampling period passed since the last fill // If multiple sampling periods passed, it means the scheduler state didn't change during that time for (size_t snapNumber=lastSnapCounter+1; snapNumber<=thisSnapCounter; ++snapNumber) { auto mon_snapNumber = Monitored::Scalar("SnapNumber", snapNumber); - auto mon_wallTimeSec = Monitored::Scalar("WallTimeSeconds", snapNumber*m_samplingPeriodMillisec.value()*1e-3); + auto mon_wallTimeSec = Monitored::Scalar("WallTimeSeconds", snapNumber*m_monIntervalMillisec.value()*1e-3); allMonVars.clear(); allMonVars.insert(allMonVars.end(), mon_stateCounts.begin(), mon_stateCounts.end()); allMonVars.insert(allMonVars.end(), mon_stateCountsOverThreads.begin(), mon_stateCountsOverThreads.end()); + allMonVars.insert(allMonVars.end(), mon_stateCountsOverSlots.begin(), mon_stateCountsOverSlots.end()); allMonVars.insert(allMonVars.end(), mon_stateCountsOverActive.begin(), mon_stateCountsOverActive.end()); - allMonVars.insert(allMonVars.end(), {mon_stateNumber, mon_stateTotalCounts, mon_freeSlots, mon_snapNumber, mon_wallTimeSec}); + allMonVars.insert(allMonVars.end(), {mon_stateNumber, mon_stateTotalCounts, mon_freeSlots, mon_freeSlotsFrac, + mon_snapNumber, mon_wallTimeSec}); Monitored::Group(m_monTool, allMonVars); } monTime.stop(); @@ -136,7 +147,7 @@ StatusCode SchedulerMonSvc::startMonitoring() { // Start monitoring m_startTime = ClockType::now(); - m_scheduler->recordOccupancy(m_samplingPeriodMillisec.value(), std::move(monCallback)); + m_scheduler->recordOccupancy(m_monIntervalMillisec.value(), std::move(monCallback)); ATH_MSG_INFO("Scheduler monitoring started"); diff --git a/Trigger/TrigMonitoring/TrigSteerMonitor/src/SchedulerMonSvc.h b/Trigger/TrigMonitoring/TrigSteerMonitor/src/SchedulerMonSvc.h index 16e1849b3cb..27abd7089c4 100644 --- a/Trigger/TrigMonitoring/TrigSteerMonitor/src/SchedulerMonSvc.h +++ b/Trigger/TrigMonitoring/TrigSteerMonitor/src/SchedulerMonSvc.h @@ -45,8 +45,8 @@ private: // Properties and handles Gaudi::Property<std::string> m_schedulerName { this, "SchedulerName", "AvalancheSchedulerSvc", "Name of the scheduler"}; - Gaudi::Property<unsigned int> m_samplingPeriodMillisec { - this, "SamplingPeriodMillisec", 5, "Target sampling period in milliseconds"}; + Gaudi::Property<unsigned int> m_monIntervalMillisec { + this, "MonIntervalMillisec", 100, "Monitoring snapshot interval in milliseconds"}; ToolHandle<GenericMonitoringTool> m_monTool { this, "MonTool", "", "Monitoring tool"}; diff --git a/Trigger/TriggerCommon/TriggerJobOpts/python/Modifiers.py b/Trigger/TriggerCommon/TriggerJobOpts/python/Modifiers.py index 1d20e1eb6f9..32c6aea48c6 100644 --- a/Trigger/TriggerCommon/TriggerJobOpts/python/Modifiers.py +++ b/Trigger/TriggerCommon/TriggerJobOpts/python/Modifiers.py @@ -1117,8 +1117,12 @@ class enableSchedulerMon(_modifier): Enable SchedulerMonSvc """ def preSetup(self): - from AthenaConfiguration.ComponentAccumulator import CAtoGlobalWrapper from AthenaConfiguration.AllConfigFlags import ConfigFlags as flags + if not flags.Trigger.Online.isPartition: + log.debug('SchedulerMonSvc currently only works with athenaHLT / online partition. Skipping setup.') + return + + from AthenaConfiguration.ComponentAccumulator import CAtoGlobalWrapper from TrigSteerMonitor.TrigSteerMonitorConfig import SchedulerMonSvcCfg CAtoGlobalWrapper(SchedulerMonSvcCfg, flags) diff --git a/Trigger/TriggerCommon/TriggerJobOpts/share/runHLT_standalone.py b/Trigger/TriggerCommon/TriggerJobOpts/share/runHLT_standalone.py index 5b5c3d7e805..abe44297404 100644 --- a/Trigger/TriggerCommon/TriggerJobOpts/share/runHLT_standalone.py +++ b/Trigger/TriggerCommon/TriggerJobOpts/share/runHLT_standalone.py @@ -243,6 +243,7 @@ else: # More data modifiers #Monitoring L1Topo at ROB level #'L1TopoCheck', 'forceTileRODMap', + 'enableSchedulerMon' ] TriggerFlags.doID = ConfigFlags.Trigger.doID = opt.doID -- GitLab