From 21b3e346fd77c6a62e8176a988fd571d2e3a3093 Mon Sep 17 00:00:00 2001
From: Rafal Bielski <rafal.bielski@cern.ch>
Date: Fri, 2 Oct 2020 18:57:38 +0200
Subject: [PATCH] Move online error monitoring from event loop manager to a
 dedicated tool

---
 .../TrigServices/python/TrigServicesConfig.py |  7 +--
 .../TrigServices/src/HltEventLoopMgr.cxx      | 21 ++-------
 .../TrigServices/src/HltEventLoopMgr.h        |  5 +-
 .../TrigSteerMonitor/ITrigErrorMonTool.h      | 22 +++++++++
 .../python/TrigSteerMonitorConfig.py          | 11 +++++
 .../TrigSteerMonitor/src/TrigErrorMonTool.cxx | 47 +++++++++++++++++++
 .../TrigSteerMonitor/src/TrigErrorMonTool.h   | 36 ++++++++++++++
 .../components/TrigSteerMonitor_entries.cxx   |  2 +
 8 files changed, 127 insertions(+), 24 deletions(-)
 create mode 100644 Trigger/TrigMonitoring/TrigSteerMonitor/TrigSteerMonitor/ITrigErrorMonTool.h
 create mode 100644 Trigger/TrigMonitoring/TrigSteerMonitor/src/TrigErrorMonTool.cxx
 create mode 100644 Trigger/TrigMonitoring/TrigSteerMonitor/src/TrigErrorMonTool.h

diff --git a/HLT/Trigger/TrigControl/TrigServices/python/TrigServicesConfig.py b/HLT/Trigger/TrigControl/TrigServices/python/TrigServicesConfig.py
index 9e363ea071c..068d32ec5d9 100644
--- a/HLT/Trigger/TrigControl/TrigServices/python/TrigServicesConfig.py
+++ b/HLT/Trigger/TrigControl/TrigServices/python/TrigServicesConfig.py
@@ -127,9 +127,6 @@ class HltEventLoopMgr(_HltEventLoopMgr):
       super(HltEventLoopMgr, self).__init__(name)
       from AthenaMonitoringKernel.GenericMonitoringTool import GenericMonitoringTool
       self.MonTool = GenericMonitoringTool('MonTool', HistPath='HLTFramework/'+name)
-      self.MonTool.defineHistogram('ErrorAlgName,ErrorCode', path='EXPERT', type='TH2I',
-                                   title='Error StatusCodes per algorithm;Algorithm name;StatusCode',
-                                   xbins=1, xmin=0, xmax=1, ybins=1, ymin=0, ymax=1)
       self.MonTool.defineHistogram('TotalTime', path='EXPERT', type='TH1F',
                                    title='Total event processing time (all events);Time [ms];Events',
                                    xbins=200, xmin=0, xmax=10000)
@@ -142,4 +139,8 @@ class HltEventLoopMgr(_HltEventLoopMgr):
       self.MonTool.defineHistogram('SlotIdleTime', path='EXPERT', type='TH1F',
                                    title='Time between freeing and assigning a scheduler slot;Time [ms];Events',
                                    xbins=400, xmin=0, xmax=400)
+
+      from TrigSteerMonitor.TrigSteerMonitorConfig import getTrigErrorMonTool
+      self.TrigErrorMonTool = getTrigErrorMonTool()
+
       return
diff --git a/HLT/Trigger/TrigControl/TrigServices/src/HltEventLoopMgr.cxx b/HLT/Trigger/TrigControl/TrigServices/src/HltEventLoopMgr.cxx
index c619cb671b5..8718714de1e 100644
--- a/HLT/Trigger/TrigControl/TrigServices/src/HltEventLoopMgr.cxx
+++ b/HLT/Trigger/TrigControl/TrigServices/src/HltEventLoopMgr.cxx
@@ -196,8 +196,9 @@ StatusCode HltEventLoopMgr::initialize()
   ATH_CHECK(m_coolHelper.retrieve());
   // HLT result builder
   ATH_CHECK(m_hltResultMaker.retrieve());
-  // Monitoring tool
+  // Monitoring tools
   if (!m_monTool.empty()) ATH_CHECK(m_monTool.retrieve());
+  ATH_CHECK(m_errorMonTool.retrieve());
 
   //----------------------------------------------------------------------------
   // Initialise data handle keys
@@ -1183,22 +1184,6 @@ void HltEventLoopMgr::runEventTimer()
   ATH_MSG_VERBOSE("end of " << __FUNCTION__);
 }
 
-// =============================================================================
-std::unordered_map<std::string_view,StatusCode> HltEventLoopMgr::algExecErrors(const EventContext& eventContext) const {
-  std::unordered_map<std::string_view,StatusCode> algErrors;
-  for (const auto& [key, state] : m_aess->algExecStates(eventContext)) {
-    if (!state.execStatus().isSuccess()) {
-      ATH_MSG_DEBUG("Algorithm " << key << " returned StatusCode " << state.execStatus().message()
-                    << " in event " << eventContext.eventID());
-      algErrors[key.str()] = state.execStatus();
-      auto monErrorAlgName = Monitored::Scalar<std::string>("ErrorAlgName", key.str());
-      auto monErrorCode = Monitored::Scalar<std::string>("ErrorCode", state.execStatus().message());
-      auto mon = Monitored::Group(m_monTool, monErrorAlgName, monErrorCode);
-    }
-  }
-  return algErrors;
-}
-
 // =============================================================================
 /**
  * @brief Retrieves finished events from the scheduler, processes their output and cleans up the slots
@@ -1258,7 +1243,7 @@ HltEventLoopMgr::DrainSchedulerStatusCode HltEventLoopMgr::drainScheduler()
     // Check the event processing status
     if (m_aess->eventStatus(*thisFinishedEvtContext) != EventStatus::Success) {
       markFailed();
-      auto algErrors = algExecErrors(*thisFinishedEvtContext);
+      auto algErrors = m_errorMonTool->algExecErrors(*thisFinishedEvtContext);
       HLT::OnlineErrorCode errCode = isTimedOut(algErrors) ?
                                      HLT::OnlineErrorCode::TIMEOUT : HLT::OnlineErrorCode::PROCESSING_FAILURE;
       HLT_DRAINSCHED_CHECK(sc, "Processing event with context " << *thisFinishedEvtContext
diff --git a/HLT/Trigger/TrigControl/TrigServices/src/HltEventLoopMgr.h b/HLT/Trigger/TrigControl/TrigServices/src/HltEventLoopMgr.h
index 163ffa1e83c..f3ff73ae52c 100644
--- a/HLT/Trigger/TrigControl/TrigServices/src/HltEventLoopMgr.h
+++ b/HLT/Trigger/TrigControl/TrigServices/src/HltEventLoopMgr.h
@@ -10,6 +10,7 @@
 #include "TrigOutputHandling/HLTResultMTMaker.h"
 #include "TrigSteeringEvent/OnlineErrorCode.h"
 #include "TrigSteerMonitor/ISchedulerMonSvc.h"
+#include "TrigSteerMonitor/ITrigErrorMonTool.h"
 
 // Athena includes
 #include "AthenaBaseComps/AthService.h"
@@ -156,9 +157,6 @@ private:
   /// The method executed by the event timeout monitoring thread
   void runEventTimer();
 
-  /// Produce a subset of IAlgExecStateSvc::algExecStates with only non-success StatusCodes
-  std::unordered_map<std::string_view,StatusCode> algExecErrors(const EventContext& eventContext) const;
-
   /// Drain the scheduler from all actions that may be queued
   DrainSchedulerStatusCode drainScheduler();
 
@@ -186,6 +184,7 @@ private:
   ToolHandle<TrigCOOLUpdateHelper>   m_coolHelper{this, "CoolUpdateTool", "TrigCOOLUpdateHelper"};
   ToolHandle<HLTResultMTMaker>       m_hltResultMaker{this, "ResultMaker", "HLTResultMTMaker"};
   ToolHandle<GenericMonitoringTool>  m_monTool{this, "MonTool", "", "Monitoring tool"};
+  ToolHandle<ITrigErrorMonTool>      m_errorMonTool{this, "TrigErrorMonTool", "TrigErrorMonTool", "Error monitoring tool"};
 
   SmartIF<IHiveWhiteBoard> m_whiteboard;
   SmartIF<IAlgResourcePool> m_algResourcePool;
diff --git a/Trigger/TrigMonitoring/TrigSteerMonitor/TrigSteerMonitor/ITrigErrorMonTool.h b/Trigger/TrigMonitoring/TrigSteerMonitor/TrigSteerMonitor/ITrigErrorMonTool.h
new file mode 100644
index 00000000000..2a372954a71
--- /dev/null
+++ b/Trigger/TrigMonitoring/TrigSteerMonitor/TrigSteerMonitor/ITrigErrorMonTool.h
@@ -0,0 +1,32 @@
+/*
+  Copyright (C) 2002-2020 CERN for the benefit of the ATLAS collaboration
+*/
+#ifndef TRIGSTEERMONITOR_ITRIGERRORMONTOOL_H
+#define TRIGSTEERMONITOR_ITRIGERRORMONTOOL_H
+
+#include "GaudiKernel/IAlgTool.h"
+#include "GaudiKernel/EventContext.h"
+#include "GaudiKernel/StatusCode.h"
+
+#include <string_view>
+#include <unordered_map>
+
+/**
+ * @class ITrigErrorMonTool
+ * @brief Interface of a tool which retrieves and monitors all non-success status codes returned by algorithms
+ **/
+class ITrigErrorMonTool : virtual public IAlgTool {
+public:
+  DeclareInterfaceID(ITrigErrorMonTool, 1, 0);
+
+  /**
+   * @brief Produce a subset of IAlgExecStateSvc::algExecStates with only non-success StatusCodes
+   *        and fill relevant histograms
+   * @param eventContext Context of the event whose algorithm execution states are inspected
+   * @return Map of non-success StatusCodes keyed by algorithm name. NOTE(review): the keys are
+   *         non-owning string_views, so the map presumably must not outlive the name strings - confirm
+   **/
+  virtual std::unordered_map<std::string_view, StatusCode> algExecErrors(const EventContext& eventContext) const = 0;
+};
+
+#endif // TRIGSTEERMONITOR_ITRIGERRORMONTOOL_H
diff --git a/Trigger/TrigMonitoring/TrigSteerMonitor/python/TrigSteerMonitorConfig.py b/Trigger/TrigMonitoring/TrigSteerMonitor/python/TrigSteerMonitorConfig.py
index 99c55a62589..23be91a7d8f 100644
--- a/Trigger/TrigMonitoring/TrigSteerMonitor/python/TrigSteerMonitorConfig.py
+++ b/Trigger/TrigMonitoring/TrigSteerMonitor/python/TrigSteerMonitorConfig.py
@@ -59,3 +59,14 @@ def SchedulerMonSvcCfg(flags, name='SchedulerMonSvc'):
     acc = ComponentAccumulator()
     acc.addService(monsvc)
     return acc
+
+def getTrigErrorMonTool(name='TrigErrorMonTool'):  # Build a TrigErrorMonTool configurable with its monitoring tool attached
+    errorMonTool = CompFactory.TrigErrorMonTool(name)
+    errorMonTool.MonTool = GenericMonitoringTool('MonTool', HistPath='HLTFramework/'+name)
+    # The variable names below must match the Monitored::Scalar names filled in TrigErrorMonTool::algExecErrors
+    errorMonTool.MonTool.defineHistogram(
+        'ErrorAlgName,ErrorCode', path='EXPERT', type='TH2I',
+        title='Error StatusCodes per algorithm;Algorithm name;StatusCode',
+        xbins=1, xmin=0, xmax=1, ybins=1, ymin=0, ymax=1)
+
+    return errorMonTool
diff --git a/Trigger/TrigMonitoring/TrigSteerMonitor/src/TrigErrorMonTool.cxx b/Trigger/TrigMonitoring/TrigSteerMonitor/src/TrigErrorMonTool.cxx
new file mode 100644
index 00000000000..52263624a12
--- /dev/null
+++ b/Trigger/TrigMonitoring/TrigSteerMonitor/src/TrigErrorMonTool.cxx
@@ -0,0 +1,47 @@
+/*
+  Copyright (C) 2002-2020 CERN for the benefit of the ATLAS collaboration
+*/
+#include "TrigErrorMonTool.h"
+#include "GaudiKernel/IAlgExecStateSvc.h"
+
+// =============================================================================
+// Standard constructor: only forwards type, name and parent to the base class
+// =============================================================================
+TrigErrorMonTool::TrigErrorMonTool(const std::string& type, const std::string& name, const IInterface* parent)
+: base_class(type, name, parent) {}
+
+// =============================================================================
+// Implementation of IStateful::initialize: retrieve the monitoring tool and the AlgExecStateSvc handles
+// =============================================================================
+StatusCode TrigErrorMonTool::initialize() {
+  ATH_CHECK(m_monTool.retrieve(DisableTool{m_monTool.name().empty()})); // handle is disabled when no tool name is configured
+  ATH_CHECK(m_aess.retrieve());
+  return StatusCode::SUCCESS;
+}
+
+// =============================================================================
+// Implementation of IStateful::finalize: release the handles retrieved in initialize
+// =============================================================================
+StatusCode TrigErrorMonTool::finalize() {
+  ATH_CHECK(m_monTool.release());
+  ATH_CHECK(m_aess.release());
+  return StatusCode::SUCCESS;
+}
+
+// =============================================================================
+// The main method of the tool, ITrigErrorMonTool::algExecErrors: collect, log and monitor non-success StatusCodes
+// =============================================================================
+std::unordered_map<std::string_view, StatusCode> TrigErrorMonTool::algExecErrors(const EventContext& eventContext) const {
+  std::unordered_map<std::string_view, StatusCode> algErrors;
+  for (const auto& [key, state] : m_aess->algExecStates(eventContext)) {
+    if (!state.execStatus().isSuccess()) {
+      ATH_MSG_DEBUG("Algorithm " << key << " returned StatusCode " << state.execStatus().message()
+                    << " in event " << eventContext.eventID());
+      algErrors[key.str()] = state.execStatus(); // NOTE(review): map key is a view of key.str(); confirm that string outlives the returned map
+      auto monErrorAlgName = Monitored::Scalar<std::string>("ErrorAlgName", key.str());
+      auto monErrorCode = Monitored::Scalar<std::string>("ErrorCode", state.execStatus().message());
+      auto mon = Monitored::Group(m_monTool, monErrorAlgName, monErrorCode); // presumably fills when 'mon' leaves scope - confirm
+    }
+  }
+  return algErrors;
+}
diff --git a/Trigger/TrigMonitoring/TrigSteerMonitor/src/TrigErrorMonTool.h b/Trigger/TrigMonitoring/TrigSteerMonitor/src/TrigErrorMonTool.h
new file mode 100644
index 00000000000..ebbbe1ddd40
--- /dev/null
+++ b/Trigger/TrigMonitoring/TrigSteerMonitor/src/TrigErrorMonTool.h
@@ -0,0 +1,36 @@
+/*
+  Copyright (C) 2002-2020 CERN for the benefit of the ATLAS collaboration
+*/
+#ifndef TRIGSTEERMONITOR_TRIGERRORMONTOOL_H
+#define TRIGSTEERMONITOR_TRIGERRORMONTOOL_H
+
+#include "TrigSteerMonitor/ITrigErrorMonTool.h"
+#include "AthenaBaseComps/AthAlgTool.h"
+#include "AthenaMonitoringKernel/Monitored.h"
+#include "GaudiKernel/EventContext.h"
+
+class IAlgExecStateSvc;
+
+/**
+ * @class TrigErrorMonTool
+ * @brief Implementation of ITrigErrorMonTool; retrieves and monitors all non-success status codes returned by algorithms
+ **/
+class TrigErrorMonTool : public extends<AthAlgTool, ITrigErrorMonTool> {
+public:
+  TrigErrorMonTool(const std::string& type, const std::string& name, const IInterface* parent);
+
+  // ------------------------- IStateful methods -------------------------------
+  virtual StatusCode initialize() override;
+  virtual StatusCode finalize() override;
+
+  // ------------------------- ITrigErrorMonTool methods -----------------------
+  /// Produce a subset of IAlgExecStateSvc::algExecStates with only non-success StatusCodes and fill relevant histograms
+  virtual std::unordered_map<std::string_view, StatusCode> algExecErrors(const EventContext& eventContext) const override;
+
+private:
+  ServiceHandle<IAlgExecStateSvc> m_aess{this, "AlgExecStateSvc", "AlgExecStateSvc"}; ///< Source of per-algorithm execution states
+  ToolHandle<GenericMonitoringTool> m_monTool{this, "MonTool", "", "Monitoring tool"}; ///< Empty by default; set via the MonTool property
+};
+
+
+#endif // TRIGSTEERMONITOR_TRIGERRORMONTOOL_H
diff --git a/Trigger/TrigMonitoring/TrigSteerMonitor/src/components/TrigSteerMonitor_entries.cxx b/Trigger/TrigMonitoring/TrigSteerMonitor/src/components/TrigSteerMonitor_entries.cxx
index 9e4c016cad1..99609106c35 100644
--- a/Trigger/TrigMonitoring/TrigSteerMonitor/src/components/TrigSteerMonitor_entries.cxx
+++ b/Trigger/TrigMonitoring/TrigSteerMonitor/src/components/TrigSteerMonitor_entries.cxx
@@ -12,6 +12,7 @@
 #include "../TrigSignatureMoniMT.h"
 #include "../DecisionCollectorTool.h"
 #include "../SchedulerMonSvc.h"
+#include "../TrigErrorMonTool.h"
 
 DECLARE_COMPONENT( TrigSteerMoni )
 DECLARE_COMPONENT( TrigChainMoni )
@@ -27,3 +28,4 @@ DECLARE_COMPONENT( TrigMemAuditor )
 DECLARE_COMPONENT( TrigSignatureMoniMT )
 DECLARE_COMPONENT( DecisionCollectorTool )
 DECLARE_COMPONENT( SchedulerMonSvc )
+DECLARE_COMPONENT( TrigErrorMonTool )
-- 
GitLab