From 21b3e346fd77c6a62e8176a988fd571d2e3a3093 Mon Sep 17 00:00:00 2001
From: Rafal Bielski <rafal.bielski@cern.ch>
Date: Fri, 2 Oct 2020 18:57:38 +0200
Subject: [PATCH] Move online error monitoring from event loop manager to a dedicated tool

---
NOTE(review): this copy was re-flowed from a whitespace-collapsed version of
the patch. The number of context/added/removed lines in every hunk was checked
against its @@ header and against the diffstat below. Leading indentation and
column alignment were NOT recoverable and are reconstructed by convention -
confirm against the target tree, or apply with `git apply --ignore-whitespace`
and re-check the indentation of the added Python lines.

 .../TrigServices/python/TrigServicesConfig.py |  7 +--
 .../TrigServices/src/HltEventLoopMgr.cxx      | 21 ++-------
 .../TrigServices/src/HltEventLoopMgr.h        |  5 +-
 .../TrigSteerMonitor/ITrigErrorMonTool.h      | 22 +++++++++
 .../python/TrigSteerMonitorConfig.py          | 11 +++++
 .../TrigSteerMonitor/src/TrigErrorMonTool.cxx | 47 +++++++++++++++++++
 .../TrigSteerMonitor/src/TrigErrorMonTool.h   | 36 ++++++++++++++
 .../components/TrigSteerMonitor_entries.cxx   |  2 +
 8 files changed, 127 insertions(+), 24 deletions(-)
 create mode 100644 Trigger/TrigMonitoring/TrigSteerMonitor/TrigSteerMonitor/ITrigErrorMonTool.h
 create mode 100644 Trigger/TrigMonitoring/TrigSteerMonitor/src/TrigErrorMonTool.cxx
 create mode 100644 Trigger/TrigMonitoring/TrigSteerMonitor/src/TrigErrorMonTool.h

diff --git a/HLT/Trigger/TrigControl/TrigServices/python/TrigServicesConfig.py b/HLT/Trigger/TrigControl/TrigServices/python/TrigServicesConfig.py
index 9e363ea071c..068d32ec5d9 100644
--- a/HLT/Trigger/TrigControl/TrigServices/python/TrigServicesConfig.py
+++ b/HLT/Trigger/TrigControl/TrigServices/python/TrigServicesConfig.py
@@ -127,9 +127,6 @@ class HltEventLoopMgr(_HltEventLoopMgr):
         super(HltEventLoopMgr, self).__init__(name)
         from AthenaMonitoringKernel.GenericMonitoringTool import GenericMonitoringTool
         self.MonTool = GenericMonitoringTool('MonTool', HistPath='HLTFramework/'+name)
-        self.MonTool.defineHistogram('ErrorAlgName,ErrorCode', path='EXPERT', type='TH2I',
-                                     title='Error StatusCodes per algorithm;Algorithm name;StatusCode',
-                                     xbins=1, xmin=0, xmax=1, ybins=1, ymin=0, ymax=1)
         self.MonTool.defineHistogram('TotalTime', path='EXPERT', type='TH1F',
                                      title='Total event processing time (all events);Time [ms];Events',
                                      xbins=200, xmin=0, xmax=10000)
@@ -142,4 +139,8 @@ class HltEventLoopMgr(_HltEventLoopMgr):
         self.MonTool.defineHistogram('SlotIdleTime', path='EXPERT', type='TH1F',
                                      title='Time between freeing and assigning a scheduler slot;Time [ms];Events',
                                      xbins=400, xmin=0, xmax=400)
+
+        from TrigSteerMonitor.TrigSteerMonitorConfig import getTrigErrorMonTool
+        self.TrigErrorMonTool = getTrigErrorMonTool()
+
         return
diff --git a/HLT/Trigger/TrigControl/TrigServices/src/HltEventLoopMgr.cxx b/HLT/Trigger/TrigControl/TrigServices/src/HltEventLoopMgr.cxx
index c619cb671b5..8718714de1e 100644
--- a/HLT/Trigger/TrigControl/TrigServices/src/HltEventLoopMgr.cxx
+++ b/HLT/Trigger/TrigControl/TrigServices/src/HltEventLoopMgr.cxx
@@ -196,8 +196,9 @@ StatusCode HltEventLoopMgr::initialize()
   ATH_CHECK(m_coolHelper.retrieve());
   // HLT result builder
   ATH_CHECK(m_hltResultMaker.retrieve());
-  // Monitoring tool
+  // Monitoring tools
   if (!m_monTool.empty()) ATH_CHECK(m_monTool.retrieve());
+  ATH_CHECK(m_errorMonTool.retrieve());
 
   //----------------------------------------------------------------------------
   // Initialise data handle keys
@@ -1183,22 +1184,6 @@ void HltEventLoopMgr::runEventTimer()
   ATH_MSG_VERBOSE("end of " << __FUNCTION__);
 }
 
-// =============================================================================
-std::unordered_map<std::string_view,StatusCode> HltEventLoopMgr::algExecErrors(const EventContext& eventContext) const {
-  std::unordered_map<std::string_view,StatusCode> algErrors;
-  for (const auto& [key, state] : m_aess->algExecStates(eventContext)) {
-    if (!state.execStatus().isSuccess()) {
-      ATH_MSG_DEBUG("Algorithm " << key << " returned StatusCode " << state.execStatus().message()
-                    << " in event " << eventContext.eventID());
-      algErrors[key.str()] = state.execStatus();
-      auto monErrorAlgName = Monitored::Scalar<std::string>("ErrorAlgName", key.str());
-      auto monErrorCode = Monitored::Scalar<std::string>("ErrorCode", state.execStatus().message());
-      auto mon = Monitored::Group(m_monTool, monErrorAlgName, monErrorCode);
-    }
-  }
-  return algErrors;
-}
-
 // =============================================================================
 /**
  * @brief Retrieves finished events from the scheduler, processes their output and cleans up the slots
@@ -1258,7 +1243,7 @@ HltEventLoopMgr::DrainSchedulerStatusCode HltEventLoopMgr::drainScheduler()
     // Check the event processing status
     if (m_aess->eventStatus(*thisFinishedEvtContext) != EventStatus::Success) {
       markFailed();
-      auto algErrors = algExecErrors(*thisFinishedEvtContext);
+      auto algErrors = m_errorMonTool->algExecErrors(*thisFinishedEvtContext);
       HLT::OnlineErrorCode errCode = isTimedOut(algErrors) ?
                                      HLT::OnlineErrorCode::TIMEOUT : HLT::OnlineErrorCode::PROCESSING_FAILURE;
       HLT_DRAINSCHED_CHECK(sc, "Processing event with context " << *thisFinishedEvtContext
diff --git a/HLT/Trigger/TrigControl/TrigServices/src/HltEventLoopMgr.h b/HLT/Trigger/TrigControl/TrigServices/src/HltEventLoopMgr.h
index 163ffa1e83c..f3ff73ae52c 100644
--- a/HLT/Trigger/TrigControl/TrigServices/src/HltEventLoopMgr.h
+++ b/HLT/Trigger/TrigControl/TrigServices/src/HltEventLoopMgr.h
@@ -10,6 +10,7 @@
 #include "TrigOutputHandling/HLTResultMTMaker.h"
 #include "TrigSteeringEvent/OnlineErrorCode.h"
 #include "TrigSteerMonitor/ISchedulerMonSvc.h"
+#include "TrigSteerMonitor/ITrigErrorMonTool.h"
 
 // Athena includes
 #include "AthenaBaseComps/AthService.h"
@@ -156,9 +157,6 @@ private:
   /// The method executed by the event timeout monitoring thread
   void runEventTimer();
 
-  /// Produce a subset of IAlgExecStateSvc::algExecStates with only non-success StatusCodes
-  std::unordered_map<std::string_view,StatusCode> algExecErrors(const EventContext& eventContext) const;
-
   /// Drain the scheduler from all actions that may be queued
   DrainSchedulerStatusCode drainScheduler();
 
@@ -186,6 +184,7 @@ private:
   ToolHandle<TrigCOOLUpdateHelper> m_coolHelper{this, "CoolUpdateTool", "TrigCOOLUpdateHelper"};
   ToolHandle<HLTResultMTMaker> m_hltResultMaker{this, "ResultMaker", "HLTResultMTMaker"};
   ToolHandle<GenericMonitoringTool> m_monTool{this, "MonTool", "", "Monitoring tool"};
+  ToolHandle<ITrigErrorMonTool> m_errorMonTool{this, "TrigErrorMonTool", "TrigErrorMonTool", "Error monitoring tool"};
 
   SmartIF<IHiveWhiteBoard> m_whiteboard;
   SmartIF<IAlgResourcePool> m_algResourcePool;
diff --git a/Trigger/TrigMonitoring/TrigSteerMonitor/TrigSteerMonitor/ITrigErrorMonTool.h b/Trigger/TrigMonitoring/TrigSteerMonitor/TrigSteerMonitor/ITrigErrorMonTool.h
new file mode 100644
index 00000000000..2a372954a71
--- /dev/null
+++ b/Trigger/TrigMonitoring/TrigSteerMonitor/TrigSteerMonitor/ITrigErrorMonTool.h
@@ -0,0 +1,22 @@
+/*
+  Copyright (C) 2002-2020 CERN for the benefit of the ATLAS collaboration
+*/
+#ifndef TRIGSTEERMONITOR_ITRIGERRORMONTOOL_H
+#define TRIGSTEERMONITOR_ITRIGERRORMONTOOL_H
+
+#include "GaudiKernel/IAlgTool.h"
+#include "GaudiKernel/EventContext.h"
+
+/**
+ * @class ITrigErrorMonTool
+ * @brief Interface of a tool which retrieves and monitors all non-success status codes returned by algorithms
+ **/
+class ITrigErrorMonTool : virtual public IAlgTool {
+public:
+  DeclareInterfaceID(ITrigErrorMonTool, 1, 0);
+
+  /// Produce a subset of IAlgExecStateSvc::algExecStates with only non-success StatusCodes and fill relevant histograms
+  virtual std::unordered_map<std::string_view, StatusCode> algExecErrors(const EventContext& eventContext) const = 0;
+};
+
+#endif // TRIGSTEERMONITOR_ITRIGERRORMONTOOL_H
diff --git a/Trigger/TrigMonitoring/TrigSteerMonitor/python/TrigSteerMonitorConfig.py b/Trigger/TrigMonitoring/TrigSteerMonitor/python/TrigSteerMonitorConfig.py
index 99c55a62589..23be91a7d8f 100644
--- a/Trigger/TrigMonitoring/TrigSteerMonitor/python/TrigSteerMonitorConfig.py
+++ b/Trigger/TrigMonitoring/TrigSteerMonitor/python/TrigSteerMonitorConfig.py
@@ -59,3 +59,14 @@ def SchedulerMonSvcCfg(flags, name='SchedulerMonSvc'):
     acc = ComponentAccumulator()
     acc.addService(monsvc)
     return acc
+
+def getTrigErrorMonTool(name='TrigErrorMonTool'):
+    errorMonTool = CompFactory.TrigErrorMonTool(name)
+    errorMonTool.MonTool = GenericMonitoringTool('MonTool', HistPath='HLTFramework/'+name)
+
+    errorMonTool.MonTool.defineHistogram(
+        'ErrorAlgName,ErrorCode', path='EXPERT', type='TH2I',
+        title='Error StatusCodes per algorithm;Algorithm name;StatusCode',
+        xbins=1, xmin=0, xmax=1, ybins=1, ymin=0, ymax=1)
+
+    return errorMonTool
diff --git a/Trigger/TrigMonitoring/TrigSteerMonitor/src/TrigErrorMonTool.cxx b/Trigger/TrigMonitoring/TrigSteerMonitor/src/TrigErrorMonTool.cxx
new file mode 100644
index 00000000000..52263624a12
--- /dev/null
+++ b/Trigger/TrigMonitoring/TrigSteerMonitor/src/TrigErrorMonTool.cxx
@@ -0,0 +1,47 @@
+/*
+  Copyright (C) 2002-2020 CERN for the benefit of the ATLAS collaboration
+*/
+#include "TrigErrorMonTool.h"
+#include "GaudiKernel/IAlgExecStateSvc.h"
+
+// =============================================================================
+// Standard constructor
+// =============================================================================
+TrigErrorMonTool::TrigErrorMonTool(const std::string& type, const std::string& name, const IInterface* parent)
+: base_class(type, name, parent) {}
+
+// =============================================================================
+// Implementation of IStateful::initialize
+// =============================================================================
+StatusCode TrigErrorMonTool::initialize() {
+  ATH_CHECK(m_monTool.retrieve(DisableTool{m_monTool.name().empty()}));
+  ATH_CHECK(m_aess.retrieve());
+  return StatusCode::SUCCESS;
+}
+
+// =============================================================================
+// Implementation of IStateful::finalize
+// =============================================================================
+StatusCode TrigErrorMonTool::finalize() {
+  ATH_CHECK(m_monTool.release());
+  ATH_CHECK(m_aess.release());
+  return StatusCode::SUCCESS;
+}
+
+// =============================================================================
+// The main method of the tool, ITrigErrorMonTool::algExecErrors
+// =============================================================================
+std::unordered_map<std::string_view, StatusCode> TrigErrorMonTool::algExecErrors(const EventContext& eventContext) const {
+  std::unordered_map<std::string_view, StatusCode> algErrors;
+  for (const auto& [key, state] : m_aess->algExecStates(eventContext)) {
+    if (!state.execStatus().isSuccess()) {
+      ATH_MSG_DEBUG("Algorithm " << key << " returned StatusCode " << state.execStatus().message()
+                    << " in event " << eventContext.eventID());
+      algErrors[key.str()] = state.execStatus();
+      auto monErrorAlgName = Monitored::Scalar<std::string>("ErrorAlgName", key.str());
+      auto monErrorCode = Monitored::Scalar<std::string>("ErrorCode", state.execStatus().message());
+      auto mon = Monitored::Group(m_monTool, monErrorAlgName, monErrorCode);
+    }
+  }
+  return algErrors;
+}
diff --git a/Trigger/TrigMonitoring/TrigSteerMonitor/src/TrigErrorMonTool.h b/Trigger/TrigMonitoring/TrigSteerMonitor/src/TrigErrorMonTool.h
new file mode 100644
index 00000000000..ebbbe1ddd40
--- /dev/null
+++ b/Trigger/TrigMonitoring/TrigSteerMonitor/src/TrigErrorMonTool.h
@@ -0,0 +1,36 @@
+/*
+  Copyright (C) 2002-2020 CERN for the benefit of the ATLAS collaboration
+*/
+#ifndef TRIGSTEERMONITOR_TRIGERRORMONTOOL_H
+#define TRIGSTEERMONITOR_TRIGERRORMONTOOL_H
+
+#include "TrigSteerMonitor/ITrigErrorMonTool.h"
+#include "AthenaBaseComps/AthAlgTool.h"
+#include "AthenaMonitoringKernel/Monitored.h"
+#include "GaudiKernel/EventContext.h"
+
+class IAlgExecStateSvc;
+
+/**
+ * @class TrigErrorMonTool
+ * @brief Retrieves and monitors all non-success status codes returned by algorithms
+ **/
+class TrigErrorMonTool : public extends<AthAlgTool, ITrigErrorMonTool> {
+public:
+  TrigErrorMonTool(const std::string& type, const std::string& name, const IInterface* parent);
+
+  // ------------------------- IStateful methods -------------------------------
+  virtual StatusCode initialize() override;
+  virtual StatusCode finalize() override;
+
+  // ------------------------- ITrigErrorMonTool methods -----------------------
+  /// Produce a subset of IAlgExecStateSvc::algExecStates with only non-success StatusCodes and fill relevant histograms
+  virtual std::unordered_map<std::string_view, StatusCode> algExecErrors(const EventContext& eventContext) const override;
+
+private:
+  ServiceHandle<IAlgExecStateSvc> m_aess{this, "AlgExecStateSvc", "AlgExecStateSvc"};
+  ToolHandle<GenericMonitoringTool> m_monTool{this, "MonTool", "", "Monitoring tool"};
+};
+
+
+#endif // TRIGSTEERMONITOR_TRIGERRORMONTOOL_H
diff --git a/Trigger/TrigMonitoring/TrigSteerMonitor/src/components/TrigSteerMonitor_entries.cxx b/Trigger/TrigMonitoring/TrigSteerMonitor/src/components/TrigSteerMonitor_entries.cxx
index 9e4c016cad1..99609106c35 100644
--- a/Trigger/TrigMonitoring/TrigSteerMonitor/src/components/TrigSteerMonitor_entries.cxx
+++ b/Trigger/TrigMonitoring/TrigSteerMonitor/src/components/TrigSteerMonitor_entries.cxx
@@ -12,6 +12,7 @@
 #include "../TrigSignatureMoniMT.h"
 #include "../DecisionCollectorTool.h"
 #include "../SchedulerMonSvc.h"
+#include "../TrigErrorMonTool.h"
 
 DECLARE_COMPONENT( TrigSteerMoni )
 DECLARE_COMPONENT( TrigChainMoni )
@@ -27,3 +28,4 @@ DECLARE_COMPONENT( TrigMemAuditor )
 DECLARE_COMPONENT( TrigSignatureMoniMT )
 DECLARE_COMPONENT( DecisionCollectorTool )
 DECLARE_COMPONENT( SchedulerMonSvc )
+DECLARE_COMPONENT( TrigErrorMonTool )
-- 
GitLab