From 9842472947e19a66f21cc409bdd4bc21f48a5c82 Mon Sep 17 00:00:00 2001 From: Rafal Bielski <rafal.bielski@cern.ch> Date: Fri, 28 Aug 2020 21:42:59 +0000 Subject: [PATCH] Add SchedulerMonSvc to monitor Scheduler status in online histograms --- .../TrigControl/TrigServices/CMakeLists.txt | 2 +- .../TrigServices/src/HltEventLoopMgr.cxx | 11 +- .../TrigServices/src/HltEventLoopMgr.h | 5 + .../TrigSteerMonitor/CMakeLists.txt | 8 +- .../TrigSteerMonitor/ISchedulerMonSvc.h | 25 +++ .../python/TrigSteerMonitorConfig.py | 206 ++++-------------- .../python/TrigSteerMonitorLegacyConfig.py | 159 ++++++++++++++ .../TrigSteerMonitor/python/__init__.py | 6 +- .../TrigSteerMonitor/src/SchedulerMonSvc.cxx | 97 +++++++++ .../TrigSteerMonitor/src/SchedulerMonSvc.h | 60 +++++ .../components/TrigSteerMonitor_entries.cxx | 2 + .../python/TestingTrigSteeringConfig.py | 6 +- .../TrigSteering/python/TrigSteeringConfig.py | 2 +- .../TriggerJobOpts/python/Modifiers.py | 16 ++ 14 files changed, 434 insertions(+), 171 deletions(-) create mode 100644 Trigger/TrigMonitoring/TrigSteerMonitor/TrigSteerMonitor/ISchedulerMonSvc.h create mode 100644 Trigger/TrigMonitoring/TrigSteerMonitor/python/TrigSteerMonitorLegacyConfig.py create mode 100644 Trigger/TrigMonitoring/TrigSteerMonitor/src/SchedulerMonSvc.cxx create mode 100644 Trigger/TrigMonitoring/TrigSteerMonitor/src/SchedulerMonSvc.h diff --git a/HLT/Trigger/TrigControl/TrigServices/CMakeLists.txt b/HLT/Trigger/TrigControl/TrigServices/CMakeLists.txt index 2dd9082a709..3bcc59c935c 100644 --- a/HLT/Trigger/TrigControl/TrigServices/CMakeLists.txt +++ b/HLT/Trigger/TrigControl/TrigServices/CMakeLists.txt @@ -22,7 +22,7 @@ atlas_add_component( TrigServices ${TDAQ-COMMON_LIBRARIES} ${TDAQ_LIBRARIES} ${CORAL_LIBRARIES} AthenaBaseComps AthenaInterprocess AthenaKernel AthenaMonitoringKernelLib AthenaPoolUtilities ByteStreamCnvSvcBaseLib ByteStreamData EventInfoUtils GaudiKernel RDBAccessSvcLib StoreGateLib TrigKernel - TrigOutputHandlingLib 
TrigSteeringEvent xAODEventInfo xAODTrigger ) + TrigOutputHandlingLib TrigSteeringEvent TrigSteerMonitorLib xAODEventInfo xAODTrigger ) # Install files from the package: atlas_install_python_modules( python/*.py diff --git a/HLT/Trigger/TrigControl/TrigServices/src/HltEventLoopMgr.cxx b/HLT/Trigger/TrigControl/TrigServices/src/HltEventLoopMgr.cxx index 50c589da8c7..82df74fdf77 100644 --- a/HLT/Trigger/TrigControl/TrigServices/src/HltEventLoopMgr.cxx +++ b/HLT/Trigger/TrigControl/TrigServices/src/HltEventLoopMgr.cxx @@ -186,6 +186,9 @@ StatusCode HltEventLoopMgr::initialize() ATH_CHECK(m_evtSelector->createContext(m_evtSelContext)); // create an EvtSelectorContext ATH_CHECK(m_outputCnvSvc.retrieve()); ATH_CHECK(m_ioCompMgr.retrieve()); + if (m_monitorScheduler) { + ATH_CHECK(m_schedulerMonSvc.retrieve()); + } //---------------------------------------------------------------------------- // Initialise tools @@ -250,7 +253,8 @@ StatusCode HltEventLoopMgr::finalize() m_detectorStore, m_inputMetaDataStore, m_evtSelector, - m_outputCnvSvc); + m_outputCnvSvc, + m_schedulerMonSvc); releaseTool(m_coolHelper, m_hltResultMaker, @@ -440,6 +444,9 @@ StatusCode HltEventLoopMgr::hltUpdateAfterFork(const ptree& /*pt*/) StatusCode HltEventLoopMgr::executeRun(int maxevt) { ATH_MSG_VERBOSE("start of " << __FUNCTION__); + + if (m_monitorScheduler) ATH_CHECK(m_schedulerMonSvc->startMonitoring()); + StatusCode sc = StatusCode::SUCCESS; try { sc = nextEvent(maxevt); @@ -454,6 +461,8 @@ StatusCode HltEventLoopMgr::executeRun(int maxevt) sc = StatusCode::FAILURE; } + if (m_monitorScheduler) ATH_CHECK(m_schedulerMonSvc->stopMonitoring()); + // Stop the timer thread { ATH_MSG_DEBUG("Stopping the timeout thread"); diff --git a/HLT/Trigger/TrigControl/TrigServices/src/HltEventLoopMgr.h b/HLT/Trigger/TrigControl/TrigServices/src/HltEventLoopMgr.h index 6a6360f476e..835fc498563 100644 --- a/HLT/Trigger/TrigControl/TrigServices/src/HltEventLoopMgr.h +++ 
b/HLT/Trigger/TrigControl/TrigServices/src/HltEventLoopMgr.h @@ -9,6 +9,7 @@ #include "TrigKernel/ITrigEventLoopMgr.h" #include "TrigOutputHandling/HLTResultMTMaker.h" #include "TrigSteeringEvent/OnlineErrorCode.h" +#include "TrigSteerMonitor/ISchedulerMonSvc.h" // Athena includes #include "AthenaBaseComps/AthService.h" @@ -181,6 +182,7 @@ private: ServiceHandle<IIoComponentMgr> m_ioCompMgr; ServiceHandle<IEvtSelector> m_evtSelector{this, "EvtSel", "EvtSel"}; ServiceHandle<IConversionSvc> m_outputCnvSvc{this, "OutputCnvSvc", "OutputCnvSvc"}; + ServiceHandle<ISchedulerMonSvc> m_schedulerMonSvc{this, "SchedulerMonSvc", "SchedulerMonSvc"}; ToolHandle<TrigCOOLUpdateHelper> m_coolHelper{this, "CoolUpdateTool", "TrigCOOLUpdateHelper"}; ToolHandle<HLTResultMTMaker> m_hltResultMaker{this, "ResultMaker", "HLTResultMTMaker"}; ToolHandle<GenericMonitoringTool> m_monTool{this, "MonTool", "", "Monitoring tool"}; @@ -244,6 +246,9 @@ private: this, "RewriteLVL1", false, "Encode L1 results to ByteStream and write to the output. 
Possible only with athenaHLT, not online."}; + Gaudi::Property<bool> m_monitorScheduler{ + this, "MonitorScheduler", false, "Enable SchedulerMonSvc to collect scheduler status data in online histograms"}; + SG::WriteHandleKey<EventContext> m_eventContextWHKey{ this, "EventContextWHKey", "EventContext", "StoreGate key for recording EventContext"}; diff --git a/Trigger/TrigMonitoring/TrigSteerMonitor/CMakeLists.txt b/Trigger/TrigMonitoring/TrigSteerMonitor/CMakeLists.txt index 7314ac388b2..034f4b7ee3e 100644 --- a/Trigger/TrigMonitoring/TrigSteerMonitor/CMakeLists.txt +++ b/Trigger/TrigMonitoring/TrigSteerMonitor/CMakeLists.txt @@ -8,11 +8,17 @@ find_package( Boost COMPONENTS filesystem thread system ) find_package( ROOT COMPONENTS Core Tree MathCore Hist ) find_package( tdaq-common ) +# Interface library +atlas_add_library( TrigSteerMonitorLib + INTERFACE + PUBLIC_HEADERS TrigSteerMonitor + LINK_LIBRARIES GaudiKernel ) + # Component(s) in the package: atlas_add_component( TrigSteerMonitor src/*.h src/*.cxx src/components/*.cxx INCLUDE_DIRS ${Boost_INCLUDE_DIRS} ${ROOT_INCLUDE_DIRS} ${TDAQ-COMMON_INCLUDE_DIRS} - LINK_LIBRARIES ${Boost_LIBRARIES} ${ROOT_LIBRARIES} ${TDAQ-COMMON_LIBRARIES} AthenaBaseComps AthenaInterprocess AthenaKernel AthenaMonitoringKernelLib AthenaMonitoringLib EventInfo GaudiKernel StoreGateLib TrigCompositeUtilsLib TrigConfData TrigConfHLTData TrigConfInterfaces TrigConfL1Data TrigDataAccessMonitoringLib TrigInterfacesLib TrigMonitorBaseLib TrigNavigationLib TrigSteeringEvent TrigSteeringLib TrigT1Interfaces TrigT1Result xAODEventInfo xAODTrigger ) + LINK_LIBRARIES ${Boost_LIBRARIES} ${ROOT_LIBRARIES} ${TDAQ-COMMON_LIBRARIES} TrigSteerMonitorLib AthenaBaseComps AthenaInterprocess AthenaKernel AthenaMonitoringKernelLib AthenaMonitoringLib EventInfo GaudiKernel StoreGateLib TrigCompositeUtilsLib TrigConfData TrigConfHLTData TrigConfInterfaces TrigConfL1Data TrigDataAccessMonitoringLib TrigInterfacesLib TrigMonitorBaseLib TrigNavigationLib 
TrigSteeringEvent TrigSteeringLib TrigT1Interfaces TrigT1Result xAODEventInfo xAODTrigger ) # Install files from the package: atlas_install_python_modules( python/*.py ) diff --git a/Trigger/TrigMonitoring/TrigSteerMonitor/TrigSteerMonitor/ISchedulerMonSvc.h b/Trigger/TrigMonitoring/TrigSteerMonitor/TrigSteerMonitor/ISchedulerMonSvc.h new file mode 100644 index 00000000000..9d77764b803 --- /dev/null +++ b/Trigger/TrigMonitoring/TrigSteerMonitor/TrigSteerMonitor/ISchedulerMonSvc.h @@ -0,0 +1,25 @@ +/* + Copyright (C) 2002-2020 CERN for the benefit of the ATLAS collaboration +*/ + +#ifndef TRIGSTEERMONITOR_ISCHEDULERMONSVC_H +#define TRIGSTEERMONITOR_ISCHEDULERMONSVC_H + +#include "GaudiKernel/IInterface.h" + +/**@class ISchedulerMonSvc + * @brief Defines interface for interaction with a service monitoring the Scheduler status + */ +class ISchedulerMonSvc: virtual public IInterface { +public: + /// Interface ID + DeclareInterfaceID(ISchedulerMonSvc, 1, 0); + + /// Start querying and monitoring Scheduler status + virtual StatusCode startMonitoring() = 0; + + /// Stop querying and monitoring Scheduler status + virtual StatusCode stopMonitoring() = 0; +}; + +#endif // TRIGSTEERMONITOR_ISCHEDULERMONSVC_H diff --git a/Trigger/TrigMonitoring/TrigSteerMonitor/python/TrigSteerMonitorConfig.py b/Trigger/TrigMonitoring/TrigSteerMonitor/python/TrigSteerMonitorConfig.py index 98720c1c1ff..3b6a9030b49 100644 --- a/Trigger/TrigMonitoring/TrigSteerMonitor/python/TrigSteerMonitorConfig.py +++ b/Trigger/TrigMonitoring/TrigSteerMonitor/python/TrigSteerMonitorConfig.py @@ -1,159 +1,47 @@ -# Copyright (C) 2002-2019 CERN for the benefit of the ATLAS collaboration - -from TrigSteerMonitor.TrigSteerMonitorConf import * - - -class TrigErrorMonConfigValidation(TrigErrorMon): - """ HLT Error Code monitoring """ - __slots__ = [ ] - def __init__(self,name="TrigErrorMonValidation"): - super(TrigErrorMonConfigValidation, self).__init__(name) - self.LBNHistoryDepth=0 - - def target(self): - return 
[ "Validation" ] - -class TrigRoIMoniConfigValidation(TrigRoIMoni): - """ RoI monitoring """ - __slots__ = [ ] - def __init__(self,name="TrigRoIMoniValidation"): - super(TrigRoIMoniConfigValidation, self).__init__(name) - self.LBNHistoryDepth=0 - self.ThreshMultiMax=18 - - def target(self): - return [ "Validation" ] - -class TrigSignatureMoniConfigValidation(TrigSignatureMoni): - """ Signature monitoring """ - __slots__ = [ ] - def __init__(self,name="TrigSignatureMoniValidation"): - super(TrigSignatureMoniConfigValidation, self).__init__(name) - self.LBNHistoryDepth=0 - - def target(self): - return [ "Validation" ] - -class TrigTEMoniConfig(TrigTEMoni): - """ TriggerElement monitoring """ - __slots__ = [ ] - def __init__(self,name="TrigTEMoni"): - super(TrigTEMoniConfig, self).__init__(name) - self.LBNHistoryDepth=0 - - def target(self): - return [ "Validation" ] - - -class TrigChainMoniConfig(TrigChainMoni): - """ Chains monitoring """ - __slots__ = [ ] - def __init__(self,name="TrigChainMoni"): - super(TrigChainMoniConfig, self).__init__(name) - self.LBNHistoryDepth=0 - - def target(self): - return [ "Online", "Validation" ] - -class TrigErrorMonitor(TrigErrorMon): - """ HLT Error Code monitoring """ - __slots__ = [ ] - def __init__(self,name="TrigErrorMonitor"): - super(TrigErrorMonitor, self).__init__(name) - self.LBNHistoryGroup=10 - self.expertMode=False - - def target(self): - return [ "Online", "Validation" ] - -class TrigErrorExpertMonitor(TrigErrorMon): - """ HLT Error Code monitoring for experts (all errorcodes) """ - __slots__ = [ ] - def __init__(self,name="TrigErrorExpertMonitor"): - super(TrigErrorExpertMonitor, self).__init__(name) - self.LBNHistoryGroup=10 - self.expertMode=True - - def target(self): - return [ "Online", "Validation" ] - -class TrigRoIMoniConfigOnline(TrigRoIMoni): - """ RoI monitoring """ - __slots__ = [ ] - def __init__(self,name="TrigRoIMoniOnline"): - super(TrigRoIMoniConfigOnline, self).__init__(name) - self.LBNHistoryDepth=2 
- self.ThreshMultiMax=30 - - def target(self): - return [ "Online" ] - -class TrigSignatureMoniConfigOnline(TrigSignatureMoni): - """ Signature monitoring """ - __slots__ = [ ] - def __init__(self,name="TrigSignatureMoniOnline"): - super(TrigSignatureMoniConfigOnline, self).__init__(name) - self.LBNHistoryDepth=5 - - def target(self): - return [ "Online" ] - -class TrigRateMoniConfig20s(TrigRateMoni): - """ Rates monitor for online use only """ - def __init__(self,name="TrigRate20s"): - super(TrigRateMoniConfig20s, self).__init__(name) - self.IntervalDuration = 20 - self.NumberOfIntervals = 3 - self.doChains=True - self.doStreams=True - self.StreamSets = [ - 'recording_physics_prompt:Main', - 'recording_physics_delayed:BphysDelayed,ExoDelayed', - 'recording_physics_other:' - ] - - def target(self): - return [ "Online" ] - -class TrigMemMonitor(TrigMemMoni): - """ Memory monitor """ - def __init__(self,name="TrigMemMonitor"): - super(TrigMemMonitor, self).__init__(name) - - from AthenaCommon.AppMgr import ServiceMgr as svcMgr,theApp - if not hasattr(svcMgr.AuditorSvc,"TrigMemAuditor"): - from TrigSteerMonitor.TrigSteerMonitorConf import TrigMemAuditor - svcMgr.AuditorSvc += TrigMemAuditor() - theApp.AuditAlgorithms = True - - def target(self): - return [ "Online" ] - -class TrigROBMoniConfig(TrigROBMoni): - """ ROB request monitor for online use """ - def __init__(self,name="TrigROBMoni"): - super(TrigROBMoniConfig, self).__init__(name) - - def target(self): - return [ "OnlineDetail" ] - -class TrigCorMonitor(TrigCorMoni): - """ Trigger L1 and HLT correlation monitor """ - def __init__(self,name="TrigCorMonitor"): - super(TrigCorMonitor, self).__init__(name) - - def target(self): - return [ "Online", "Validation" ] - -TrigSteerMonitorToolList = [ TrigRateMoniConfig20s(), # leave first(!) 
so it gets finalized first (ATDSUPPORT-223) - TrigErrorMonitor(), - TrigErrorExpertMonitor(), - TrigRoIMoniConfigValidation(), TrigRoIMoniConfigOnline(), - TrigSignatureMoniConfigValidation(), TrigSignatureMoniConfigOnline(), - TrigTEMoniConfig(), - TrigChainMoniConfig(), - TrigMemMonitor(), - TrigROBMoniConfig(), - TrigCorMonitor() ] - - +# Copyright (C) 2002-2020 CERN for the benefit of the ATLAS collaboration + +from AthenaConfiguration.ComponentAccumulator import ComponentAccumulator +from AthenaConfiguration.ComponentFactory import CompFactory +from AthenaMonitoringKernel.GenericMonitoringTool import GenericMonitoringTool + +def SchedulerMonSvcCfg(flags, name='SchedulerMonSvc'): + monsvc = CompFactory.SchedulerMonSvc(name) + monsvc.MonTool = GenericMonitoringTool('MonTool', HistPath='HLTFramework/'+name) + + # From GaudiHive AlgsExecutionStates::State enum + stateNames = ['INITIAL', 'CONTROLREADY', 'DATAREADY', 'RESOURCELESS', + 'SCHEDULED', 'EVTACCEPTED', 'EVTREJECTED', 'ERROR'] + + monsvc.MonTool.defineHistogram('SnapNumber,AlgStates', weight='StateTotalCounts', path='EXPERT', type='TH2D', + title='Scheduler algorithm states vs time (snap number);Snap number;Algorithm state', + xbins=1000, xmin=0, xmax=1000, + ybins=8, ymin=-0.5, ymax=7.5, ylabels=stateNames, + opt='kCanRebin') + monsvc.MonTool.defineHistogram('WallTimeSeconds,AlgStates', weight='StateTotalCounts', path='EXPERT', type='TH2D', + title='Scheduler algorithm states vs time;Time [s];Algorithm state', + xbins=600, xmin=0, xmax=60, + ybins=8, ymin=-0.5, ymax=7.5, ylabels=stateNames, + opt='kCanRebin') + monsvc.MonTool.defineHistogram('SnapNumber,FreeSlots', path='EXPERT', type='TProfile', + title='Number of free slots vs time (snap number);Snap number;Number of free slots', + xbins=1000, xmin=0, xmax=1000, + ybins=10, ymin=0, ymax=10, + opt='kCanRebin') + monsvc.MonTool.defineHistogram('WallTimeSeconds,FreeSlots', path='EXPERT', type='TProfile', + title='Number of free slots vs time ;Time 
[s];Number of free slots', + xbins=600, xmin=0, xmax=60, + ybins=10, ymin=0, ymax=10, + opt='kCanRebin') + monsvc.MonTool.defineHistogram('AlgStates', weight='StateTotalCounts', path='EXPERT', type='TH1D', + title='Scheduler algorithm states;Algorithm states;Snapshots', + xbins=8, xmin=-0.5, xmax=7.5, xlabels=stateNames) + monsvc.MonTool.defineHistogram('FreeSlots', path='EXPERT', type='TH1D', + title='Number of free slots;Free slots;Snapshots', + xbins=10, xmin=0, xmax=10, opt='kCanRebin') + monsvc.MonTool.defineHistogram('TIME_monCallback', path='EXPERT', type='TH1D', + title='Time of callback calls;Time [us];Calls', + xbins=500, xmin=0, xmax=5000) + + acc = ComponentAccumulator() + acc.addService(monsvc) + return acc diff --git a/Trigger/TrigMonitoring/TrigSteerMonitor/python/TrigSteerMonitorLegacyConfig.py b/Trigger/TrigMonitoring/TrigSteerMonitor/python/TrigSteerMonitorLegacyConfig.py new file mode 100644 index 00000000000..98720c1c1ff --- /dev/null +++ b/Trigger/TrigMonitoring/TrigSteerMonitor/python/TrigSteerMonitorLegacyConfig.py @@ -0,0 +1,159 @@ +# Copyright (C) 2002-2019 CERN for the benefit of the ATLAS collaboration + +from TrigSteerMonitor.TrigSteerMonitorConf import * + + +class TrigErrorMonConfigValidation(TrigErrorMon): + """ HLT Error Code monitoring """ + __slots__ = [ ] + def __init__(self,name="TrigErrorMonValidation"): + super(TrigErrorMonConfigValidation, self).__init__(name) + self.LBNHistoryDepth=0 + + def target(self): + return [ "Validation" ] + +class TrigRoIMoniConfigValidation(TrigRoIMoni): + """ RoI monitoring """ + __slots__ = [ ] + def __init__(self,name="TrigRoIMoniValidation"): + super(TrigRoIMoniConfigValidation, self).__init__(name) + self.LBNHistoryDepth=0 + self.ThreshMultiMax=18 + + def target(self): + return [ "Validation" ] + +class TrigSignatureMoniConfigValidation(TrigSignatureMoni): + """ Signature monitoring """ + __slots__ = [ ] + def __init__(self,name="TrigSignatureMoniValidation"): + 
super(TrigSignatureMoniConfigValidation, self).__init__(name) + self.LBNHistoryDepth=0 + + def target(self): + return [ "Validation" ] + +class TrigTEMoniConfig(TrigTEMoni): + """ TriggerElement monitoring """ + __slots__ = [ ] + def __init__(self,name="TrigTEMoni"): + super(TrigTEMoniConfig, self).__init__(name) + self.LBNHistoryDepth=0 + + def target(self): + return [ "Validation" ] + + +class TrigChainMoniConfig(TrigChainMoni): + """ Chains monitoring """ + __slots__ = [ ] + def __init__(self,name="TrigChainMoni"): + super(TrigChainMoniConfig, self).__init__(name) + self.LBNHistoryDepth=0 + + def target(self): + return [ "Online", "Validation" ] + +class TrigErrorMonitor(TrigErrorMon): + """ HLT Error Code monitoring """ + __slots__ = [ ] + def __init__(self,name="TrigErrorMonitor"): + super(TrigErrorMonitor, self).__init__(name) + self.LBNHistoryGroup=10 + self.expertMode=False + + def target(self): + return [ "Online", "Validation" ] + +class TrigErrorExpertMonitor(TrigErrorMon): + """ HLT Error Code monitoring for experts (all errorcodes) """ + __slots__ = [ ] + def __init__(self,name="TrigErrorExpertMonitor"): + super(TrigErrorExpertMonitor, self).__init__(name) + self.LBNHistoryGroup=10 + self.expertMode=True + + def target(self): + return [ "Online", "Validation" ] + +class TrigRoIMoniConfigOnline(TrigRoIMoni): + """ RoI monitoring """ + __slots__ = [ ] + def __init__(self,name="TrigRoIMoniOnline"): + super(TrigRoIMoniConfigOnline, self).__init__(name) + self.LBNHistoryDepth=2 + self.ThreshMultiMax=30 + + def target(self): + return [ "Online" ] + +class TrigSignatureMoniConfigOnline(TrigSignatureMoni): + """ Signature monitoring """ + __slots__ = [ ] + def __init__(self,name="TrigSignatureMoniOnline"): + super(TrigSignatureMoniConfigOnline, self).__init__(name) + self.LBNHistoryDepth=5 + + def target(self): + return [ "Online" ] + +class TrigRateMoniConfig20s(TrigRateMoni): + """ Rates monitor for online use only """ + def 
__init__(self,name="TrigRate20s"): + super(TrigRateMoniConfig20s, self).__init__(name) + self.IntervalDuration = 20 + self.NumberOfIntervals = 3 + self.doChains=True + self.doStreams=True + self.StreamSets = [ + 'recording_physics_prompt:Main', + 'recording_physics_delayed:BphysDelayed,ExoDelayed', + 'recording_physics_other:' + ] + + def target(self): + return [ "Online" ] + +class TrigMemMonitor(TrigMemMoni): + """ Memory monitor """ + def __init__(self,name="TrigMemMonitor"): + super(TrigMemMonitor, self).__init__(name) + + from AthenaCommon.AppMgr import ServiceMgr as svcMgr,theApp + if not hasattr(svcMgr.AuditorSvc,"TrigMemAuditor"): + from TrigSteerMonitor.TrigSteerMonitorConf import TrigMemAuditor + svcMgr.AuditorSvc += TrigMemAuditor() + theApp.AuditAlgorithms = True + + def target(self): + return [ "Online" ] + +class TrigROBMoniConfig(TrigROBMoni): + """ ROB request monitor for online use """ + def __init__(self,name="TrigROBMoni"): + super(TrigROBMoniConfig, self).__init__(name) + + def target(self): + return [ "OnlineDetail" ] + +class TrigCorMonitor(TrigCorMoni): + """ Trigger L1 and HLT correlation monitor """ + def __init__(self,name="TrigCorMonitor"): + super(TrigCorMonitor, self).__init__(name) + + def target(self): + return [ "Online", "Validation" ] + +TrigSteerMonitorToolList = [ TrigRateMoniConfig20s(), # leave first(!) 
so it gets finalized first (ATDSUPPORT-223) + TrigErrorMonitor(), + TrigErrorExpertMonitor(), + TrigRoIMoniConfigValidation(), TrigRoIMoniConfigOnline(), + TrigSignatureMoniConfigValidation(), TrigSignatureMoniConfigOnline(), + TrigTEMoniConfig(), + TrigChainMoniConfig(), + TrigMemMonitor(), + TrigROBMoniConfig(), + TrigCorMonitor() ] + + diff --git a/Trigger/TrigMonitoring/TrigSteerMonitor/python/__init__.py b/Trigger/TrigMonitoring/TrigSteerMonitor/python/__init__.py index 28c6e9c65eb..21b3beea83c 100644 --- a/Trigger/TrigMonitoring/TrigSteerMonitor/python/__init__.py +++ b/Trigger/TrigMonitoring/TrigSteerMonitor/python/__init__.py @@ -1,7 +1,3 @@ -# Copyright (C) 2002-2017 CERN for the benefit of the ATLAS collaboration +# Copyright (C) 2002-2020 CERN for the benefit of the ATLAS collaboration # File: /__init__.py - -__version__ = '1.0.0' -__author__ = '' -__all__ = [ 'TrigSteerMonitorConfig' ] diff --git a/Trigger/TrigMonitoring/TrigSteerMonitor/src/SchedulerMonSvc.cxx b/Trigger/TrigMonitoring/TrigSteerMonitor/src/SchedulerMonSvc.cxx new file mode 100644 index 00000000000..e6665eb8afa --- /dev/null +++ b/Trigger/TrigMonitoring/TrigSteerMonitor/src/SchedulerMonSvc.cxx @@ -0,0 +1,97 @@ +/* + Copyright (C) 2002-2020 CERN for the benefit of the ATLAS collaboration +*/ + +#include "SchedulerMonSvc.h" + +namespace { + /// This is AlgsExecutionStates::State::MAXVALUE which cannot be used here directly because it is not in a public header + static constexpr size_t s_numAlgStates = 8; + static constexpr std::array<size_t,8> s_algStateNumbers = {0,1,2,3,4,5,6,7}; +} + +// ============================================================================= +SchedulerMonSvc::SchedulerMonSvc(const std::string& name, ISvcLocator* svcLoc) +: base_class(name, svcLoc) {} + +// ============================================================================= +StatusCode SchedulerMonSvc::initialize() { + if (!m_monTool.empty()) ATH_CHECK(m_monTool.retrieve()); + return StatusCode::SUCCESS; 
+}
+
+// =============================================================================
+StatusCode SchedulerMonSvc::startMonitoring() {
+  // Get a handle to the scheduler (lazily, on first use; the handle is kept for later start/stop cycles)
+  if (!m_scheduler.isValid()) {
+    m_scheduler = serviceLocator()->service<IScheduler>(m_schedulerName, false);
+    if (!m_scheduler.isValid()) {
+      ATH_MSG_ERROR("Failed to retrieve the Scheduler service with name " << m_schedulerName);
+      return StatusCode::FAILURE;
+    }
+  }
+
+  // Flag the monitoring as running (prevents going past this point twice)
+  if (bool expected = false; not m_running.compare_exchange_strong(expected, true)) {
+    ATH_MSG_ERROR("startMonitoring called but it is already running");
+    return StatusCode::FAILURE;
+  }
+
+  // Construct the callback and pass it to the scheduler monitoring API
+  auto monCallback = [this](IScheduler::OccupancySnapshot snap) -> void {
+    auto monTime = Monitored::Timer("TIME_monCallback");
+    // Calculate and update snap counters (snap counter = number of whole sampling periods since m_startTime)
+    const ClockType::duration wallTime = snap.time - m_startTime;
+    const size_t thisSnapCounter = std::chrono::duration_cast<std::chrono::milliseconds>(wallTime).count() / m_samplingPeriodMillisec.value();
+    const size_t lastSnapCounter = m_lastSnapCounter.exchange(thisSnapCounter);
+    const long periodsSinceLastSnap = static_cast<long>(thisSnapCounter) - static_cast<long>(lastSnapCounter); // signed on purpose: may be <= 0
+
+    // If new snap comes before next sampling point, discard it
+    if (periodsSinceLastSnap <= 0) {
+      ATH_MSG_DEBUG("Discarding snap because periodsSinceLastSnap=" << periodsSinceLastSnap << " is not positive");
+      return;
+    }
+
+    // Monitor total state counts across all slots
+    std::vector<int> stateTotalCounts(s_numAlgStates, 0);
+    for (size_t slot=0; slot < snap.states.size(); ++slot) {
+      for (size_t state=0; state < snap.states[slot].size() && state < s_numAlgStates; ++state) { // never index past stateTotalCounts
+        stateTotalCounts[state] += snap.states[slot][state];
+      }
+    }
+    auto mon_stateNumber = Monitored::Collection("AlgStates", s_algStateNumbers);
+    auto mon_stateTotalCounts = Monitored::Collection("StateTotalCounts", stateTotalCounts);
+
+    // Monitor number of free slots
+    auto mon_freeSlots = Monitored::Scalar("FreeSlots", m_scheduler->freeSlots());
+
+    // Fill monitoring histograms once for each sampling period passed since the last fill
+    // If multiple sampling periods passed, it means the scheduler state didn't change during that time
+    for (size_t snapNumber=lastSnapCounter+1; snapNumber<=thisSnapCounter; ++snapNumber) {
+      auto mon_snapNumber = Monitored::Scalar("SnapNumber", snapNumber);
+      auto mon_wallTimeSec = Monitored::Scalar("WallTimeSeconds", snapNumber*m_samplingPeriodMillisec.value()*1e-3);
+      Monitored::Group(m_monTool, mon_snapNumber, mon_wallTimeSec, mon_freeSlots, mon_stateNumber, mon_stateTotalCounts);
+    }
+    monTime.stop();
+    Monitored::Group(m_monTool, monTime);
+  };
+
+  // Start monitoring; reset the snap counter together with the start time so a restart begins again from snap 0
+  m_startTime = ClockType::now();
+  m_lastSnapCounter = 0;
+  m_scheduler->recordOccupancy(m_samplingPeriodMillisec.value(), std::move(monCallback));
+  ATH_MSG_INFO("Scheduler monitoring started");
+
+  return StatusCode::SUCCESS;
+}
+
+// =============================================================================
+StatusCode SchedulerMonSvc::stopMonitoring() {
+  if (bool expected = true; not m_running.compare_exchange_strong(expected, false)) { // stopping when not running is only a warning
+    ATH_MSG_WARNING("stopMonitoring called but it was not running");
+    return StatusCode::SUCCESS;
+  }
+  m_scheduler->recordOccupancy(-1, {}); // same API as in startMonitoring, here with a negative period and an empty callback
+  ATH_MSG_INFO("Scheduler monitoring stopped");
+  return StatusCode::SUCCESS;
+}
diff --git a/Trigger/TrigMonitoring/TrigSteerMonitor/src/SchedulerMonSvc.h b/Trigger/TrigMonitoring/TrigSteerMonitor/src/SchedulerMonSvc.h
new file mode 100644
index 00000000000..16e1849b3cb
--- /dev/null
+++ b/Trigger/TrigMonitoring/TrigSteerMonitor/src/SchedulerMonSvc.h
@@ -0,0 +1,60 @@
+/*
+  Copyright (C) 2002-2020 CERN for the benefit of the ATLAS collaboration
+*/
+#ifndef TRIGSTEERMONITOR_SCHEDULERMONSVC_H
+#define TRIGSTEERMONITOR_SCHEDULERMONSVC_H
+
+// Local includes
+#include "TrigSteerMonitor/ISchedulerMonSvc.h"
+
+// Athena includes
+#include 
"AthenaBaseComps/AthService.h" +#include "AthenaMonitoringKernel/Monitored.h" + +// Gaudi includes +#include "GaudiKernel/SmartIF.h" +#include "GaudiKernel/IScheduler.h" + +// System includes +#include <atomic> +#include <chrono> + +/** + * @class SchedulerMonSvc + * @brief Service monitoring the Scheduler status and producing relevant online histograms + **/ +class SchedulerMonSvc : public extends<AthService, ISchedulerMonSvc> { +public: + + // Clock type used in IScheduler::OccupancySnapshot::time + using ClockType = decltype(IScheduler::OccupancySnapshot::time)::clock; + + SchedulerMonSvc(const std::string& name, ISvcLocator* svcLoc); + virtual ~SchedulerMonSvc() override = default; + + // IService methods + virtual StatusCode initialize() override; + + // ISchedulerMonSvc methods + /// Start querying and monitoring Scheduler status + virtual StatusCode startMonitoring() override; + /// Stop querying and monitoring Scheduler status + virtual StatusCode stopMonitoring() override; + +private: + // Properties and handles + Gaudi::Property<std::string> m_schedulerName { + this, "SchedulerName", "AvalancheSchedulerSvc", "Name of the scheduler"}; + Gaudi::Property<unsigned int> m_samplingPeriodMillisec { + this, "SamplingPeriodMillisec", 5, "Target sampling period in milliseconds"}; + ToolHandle<GenericMonitoringTool> m_monTool { + this, "MonTool", "", "Monitoring tool"}; + + // Other private members + SmartIF<IScheduler> m_scheduler {nullptr}; + std::atomic_bool m_running {false}; + ClockType::time_point m_startTime {}; + std::atomic_size_t m_lastSnapCounter {0}; +}; + +#endif // TRIGSTEERMONITOR_SCHEDULERMONSVC_H diff --git a/Trigger/TrigMonitoring/TrigSteerMonitor/src/components/TrigSteerMonitor_entries.cxx b/Trigger/TrigMonitoring/TrigSteerMonitor/src/components/TrigSteerMonitor_entries.cxx index 063703df903..9e4c016cad1 100644 --- a/Trigger/TrigMonitoring/TrigSteerMonitor/src/components/TrigSteerMonitor_entries.cxx +++ 
b/Trigger/TrigMonitoring/TrigSteerMonitor/src/components/TrigSteerMonitor_entries.cxx @@ -11,6 +11,7 @@ #include "../TrigMemAuditor.h" #include "../TrigSignatureMoniMT.h" #include "../DecisionCollectorTool.h" +#include "../SchedulerMonSvc.h" DECLARE_COMPONENT( TrigSteerMoni ) DECLARE_COMPONENT( TrigChainMoni ) @@ -25,3 +26,4 @@ DECLARE_COMPONENT( TrigCorMoni ) DECLARE_COMPONENT( TrigMemAuditor ) DECLARE_COMPONENT( TrigSignatureMoniMT ) DECLARE_COMPONENT( DecisionCollectorTool ) +DECLARE_COMPONENT( SchedulerMonSvc ) diff --git a/Trigger/TrigSteer/TrigSteering/python/TestingTrigSteeringConfig.py b/Trigger/TrigSteer/TrigSteering/python/TestingTrigSteeringConfig.py index 7a873ddb987..5a22642523e 100755 --- a/Trigger/TrigSteer/TrigSteering/python/TestingTrigSteeringConfig.py +++ b/Trigger/TrigSteer/TrigSteering/python/TestingTrigSteeringConfig.py @@ -166,7 +166,7 @@ class TestingTrigSteer( TrigSteer_baseClass ): # adding the steering monitoring tools try: - from TrigSteerMonitor.TrigSteerMonitorConfig import TrigSteerMonitorToolList + from TrigSteerMonitor.TrigSteerMonitorLegacyConfig import TrigSteerMonitorToolList self.MonTools += TrigSteerMonitorToolList except: log.warning("Attempt to add monitoring tools failed, will continue without them") @@ -233,7 +233,7 @@ class TestingTrigSteer_L2( TrigSteer_baseClass ): self += instance log.info( "added algorithm to the configuration: "+alg) try: - from TrigSteerMonitor.TrigSteerMonitorConfig import TrigSteerMonitorToolList + from TrigSteerMonitor.TrigSteerMonitorLegacyConfig import TrigSteerMonitorToolList self.MonTools += TrigSteerMonitorToolList except: log.warning("Attempt to add monitoring tools failed, will continue without them") @@ -295,7 +295,7 @@ class TestingTrigSteer_EF( TrigSteer_baseClass ) : try: - from TrigSteerMonitor.TrigSteerMonitorConfig import TrigSteerMonitorToolList + from TrigSteerMonitor.TrigSteerMonitorLegacyConfig import TrigSteerMonitorToolList self.MonTools += TrigSteerMonitorToolList except: 
log.warning("Attempt to add monitoring tools failed, will continue without them") diff --git a/Trigger/TrigSteer/TrigSteering/python/TrigSteeringConfig.py b/Trigger/TrigSteer/TrigSteering/python/TrigSteeringConfig.py index 691fac87732..b7afd009118 100755 --- a/Trigger/TrigSteer/TrigSteering/python/TrigSteeringConfig.py +++ b/Trigger/TrigSteer/TrigSteering/python/TrigSteeringConfig.py @@ -298,7 +298,7 @@ class TrigSteer_HLT( TrigSteer_baseClass ): log.info( "added algorithm to the configuration: "+alg) try: - from TrigSteerMonitor.TrigSteerMonitorConfig import TrigSteerMonitorToolList + from TrigSteerMonitor.TrigSteerMonitorLegacyConfig import TrigSteerMonitorToolList self.MonTools += TrigSteerMonitorToolList except Exception as e: log.warning("Exception while adding monitoring tools ('%s'). Will continue without them." % e) diff --git a/Trigger/TriggerCommon/TriggerJobOpts/python/Modifiers.py b/Trigger/TriggerCommon/TriggerJobOpts/python/Modifiers.py index 16f7c4c7e6e..1d7ffb3ae5f 100644 --- a/Trigger/TriggerCommon/TriggerJobOpts/python/Modifiers.py +++ b/Trigger/TriggerCommon/TriggerJobOpts/python/Modifiers.py @@ -1127,6 +1127,22 @@ class perfmon(_modifier): jobproperties.PerfMonFlags.doMonitoring = True jobproperties.PerfMonFlags.doPersistencyMonitoring = False +class enableSchedulerMon(_modifier): + """ + Enable SchedulerMonSvc + """ + def preSetup(self): + from AthenaConfiguration.ComponentAccumulator import CAtoGlobalWrapper + from AthenaConfiguration.AllConfigFlags import ConfigFlags as flags + from TrigSteerMonitor.TrigSteerMonitorConfig import SchedulerMonSvcCfg + CAtoGlobalWrapper(SchedulerMonSvcCfg, flags) + + def postSetup(self): + from AthenaConfiguration.AllConfigFlags import ConfigFlags as flags + if flags.Trigger.Online.isPartition: + from AthenaCommon.AppMgr import ServiceMgr as svcMgr + svcMgr.HltEventLoopMgr.MonitorScheduler = True + class memMon(_modifier): """ Enable TrigMemMonitor printout -- GitLab