diff --git a/Online/Hlt2Monitoring/CMakeLists.txt b/Online/Hlt2Monitoring/CMakeLists.txt index fbfe97b1bf928d24adfe8935b30e09a975455579..377d2faef7ebf5e584b085b6fe8615487fc76ec0 100644 --- a/Online/Hlt2Monitoring/CMakeLists.txt +++ b/Online/Hlt2Monitoring/CMakeLists.txt @@ -1,5 +1,5 @@ ################################################################################ -# Package: Gaucho +# Package: Hlt2Monitoring ################################################################################ gaudi_subdir(Hlt2Monitoring v1r7) @@ -8,19 +8,37 @@ gaudi_depends_on_subdirs(GaudiKernel GaudiUtils Online/ZeroMQ) -find_package(Boost COMPONENTS system regex serialization filesystem) +find_package(Boost COMPONENTS system regex serialization filesystem iostreams program_options) find_package(ROOT COMPONENTS Core RIO Hist Thread) find_package(AIDA) include_directories(SYSTEM ${Boost_INCLUDE_DIRS} ${ROOT_INCLUDE_DIRS}) +find_path(RANGES_V3_INCLUDE_DIR NAMES range/v3/all.hpp) +if(NOT RANGES_V3_INCLUDE_DIR) + message(FATAL_ERROR "required headers from range-v3 missing") +endif() + gaudi_add_library(Hlt2Monitoring src/lib/*.cpp - INCLUDE_DIRS Boost AIDA + INCLUDE_DIRS Boost AIDA ${RANGES_V3_INCLUDE_DIR} PUBLIC_HEADERS Hlt2Monitoring LINK_LIBRARIES GaudiKernel Boost ROOT zmq ZMQ) gaudi_add_module(Hlt2MonitoringLib src/component/*.cpp - INCLUDE_DIRS Boost AIDA + INCLUDE_DIRS Boost AIDA ${RANGES_V3_INCLUDE_DIR} LINK_LIBRARIES GaudiKernel Boost ROOT zmq ZMQ Hlt2Monitoring GaudiUtilsLib) +# C++ utilities useful for testing or as examples +gaudi_add_executable(dump_info + test/dump_info.cpp + INCLUDE_DIRS Hlt2Monitoring ROOT Boost ${RANGES_V3_INCLUDE_DIR} + LINK_LIBRARIES Boost ROOT zmq ZMQ Hlt2Monitoring) +target_compile_definitions(dump_info PUBLIC STANDALONE) + +gaudi_add_executable(test_registrar + test/test_registrar.cpp + INCLUDE_DIRS Hlt2Monitoring ROOT Boost ${RANGES_V3_INCLUDE_DIR} + LINK_LIBRARIES Boost ROOT zmq ZMQ Hlt2Monitoring) +target_compile_definitions(test_registrar PUBLIC STANDALONE) + gaudi_install_python_modules()
diff --git a/Online/Hlt2Monitoring/Hlt2Monitoring/Histo1DDef.h b/Online/Hlt2Monitoring/Hlt2Monitoring/Histo1DDef.h index 40e6afb4864ce3a1281bf064908056652990f805..edd41b3dd34511219ae7a7ad5b724cf8d85403d4 100644 --- a/Online/Hlt2Monitoring/Hlt2Monitoring/Histo1DDef.h +++ b/Online/Hlt2Monitoring/Hlt2Monitoring/Histo1DDef.h @@ -36,6 +36,27 @@ struct Histo1DDef { xlabels = getLabels(axis); labels = !xlabels.empty(); } + + bool operator==(const Histo1DDef& other) const { + using namespace ranges; + + if (title != other.title) { + return false; + } else if (variable != other.variable) { + return false; + } else if (!variable && !same_bins(xlow, xhigh, xbins, + other.xlow, other.xhigh, other.xbins)) { + return false; + } else if (labels != other.labels) { + return false; + } else if (labels && !same_labels(xlabels, other.xlabels)) { + return false; + } else if (variable && !same_edges(xedges, other.xedges)) { + return false; + } else { + return true; + } + } std::string title;
diff --git a/Online/Hlt2Monitoring/Hlt2Monitoring/Histo2DDef.h b/Online/Hlt2Monitoring/Hlt2Monitoring/Histo2DDef.h index 3db24f8269d484a4e0db35c00a3e70ce140e58ba..ebd04a134c17a22dc8c276d45fcde5274b0c01a4 100644 --- a/Online/Hlt2Monitoring/Hlt2Monitoring/Histo2DDef.h +++ b/Online/Hlt2Monitoring/Hlt2Monitoring/Histo2DDef.h @@ -21,15 +21,45 @@ struct Histo2DDef { : title{std::move(t)} { std::tie(xbins, xlow, xhigh, xedges) = axisDefinition(xaxis); - std::tie(ybins, ylow, yhigh, yedges) = axisDefinition(xaxis); +
std::tie(ybins, ylow, yhigh, yedges) = axisDefinition(yaxis); xvariable = !xedges.empty(); - xvariable = !yedges.empty(); + yvariable = !yedges.empty(); xlabels = getLabels(xaxis); ylabels = getLabels(yaxis); labels = (!xlabels.empty() || !ylabels.empty()); } - + + bool operator==(const Histo2DDef& other) const { + using namespace ranges; + + if (title != other.title) { + return false; + } else if (xvariable != other.xvariable) { + return false; + } else if (yvariable != other.yvariable) { + return false; + } else if (!xvariable && !same_bins(xlow, xhigh, xbins, + other.xlow, other.xhigh, other.xbins)) { + return false; + } else if (!yvariable && !same_bins(ylow, yhigh, ybins, + other.ylow, other.yhigh, other.ybins)) { + return false; + } else if (labels != other.labels) { + return false; + } else if (labels && !same_labels(xlabels, other.xlabels)) { + return false; + } else if (labels && !same_labels(ylabels, other.ylabels)) { + return false; + } else if (xvariable && !same_edges(xedges, other.xedges)) { + return false; + } else if (yvariable && !same_edges(yedges, other.yedges)) { + return false; + } else { + return true; + } + } + std::string title; bool xvariable = false; @@ -73,7 +103,7 @@ void serialize(Archive& archive, Monitoring::Histo2DDef& def, const unsigned int archive & def.yhigh; archive & def.ybins; } - + archive & def.labels; if (def.labels) { archive & def.xlabels; diff --git a/Online/Hlt2Monitoring/Hlt2Monitoring/HistoUtils.h b/Online/Hlt2Monitoring/Hlt2Monitoring/HistoUtils.h index e14423748a71f5d17a8bdb55dfd5346406db96df..6b9e68c506eb7db15f176513068e5ce3c27ba5be 100644 --- a/Online/Hlt2Monitoring/Hlt2Monitoring/HistoUtils.h +++ b/Online/Hlt2Monitoring/Hlt2Monitoring/HistoUtils.h @@ -2,13 +2,51 @@ #define HISTOUTILS_H #include <vector> +#include <string> #include <tuple> +#include <boost/math/special_functions/relative_difference.hpp> +#include <boost/math/special_functions/sign.hpp> +#include <boost/functional/hash.hpp> + +#include <range/v3/algorithm.hpp> +#include <range/v3/view.hpp> + namespace Gaudi { class Axis; } std::tuple<int, double, double, std::vector<double>> axisDefinition(const Gaudi::Axis& axis); + std::vector<std::string> getLabels(const Gaudi::Axis& axis); +template<typename T, typename std::enable_if<std::is_floating_point<T>::value, T>::type* = nullptr> +bool same_edges(const std::vector<T>& l, const std::vector<T>& r) { + using boost::math::sign; + using boost::math::epsilon_difference; + if (l.size() != r.size()) { + return false; + } else { + return ranges::all_of(ranges::view::zip(l, r), [](const std::tuple<T, T>& t) { + return (boost::math::sign(std::get<0>(t)) == boost::math::sign(std::get<1>(t)) + && boost::math::epsilon_difference(std::get<0>(t), std::get<1>(t)) < 2); + }); + } +} + +bool same_bins(double ll, double lh, int lb, + double rl, double rh, int rb); + +bool same_labels(const std::vector<std::string>& ll, + const std::vector<std::string>& rl); + +namespace Monitoring { +struct Histo1DDef; +struct Histo2DDef; + + // Hash a Histo1DDef +size_t hash_value(const Histo1DDef& def); +size_t hash_value(const Histo2DDef& def); +} + #endif diff --git a/Online/Hlt2Monitoring/Hlt2Monitoring/ITransmitterSvc.h b/Online/Hlt2Monitoring/Hlt2Monitoring/ITransmitterSvc.h index c92b9ae0e462a102bf3501f6bff3b926c73cb40b..dec5745b1505ad7937a312b125c2211321804a9c 100644 --- a/Online/Hlt2Monitoring/Hlt2Monitoring/ITransmitterSvc.h +++ b/Online/Hlt2Monitoring/Hlt2Monitoring/ITransmitterSvc.h @@ -45,6 +45,8 @@ public: virtual void trigger(size_t id) = 0; 
virtual std::pair<std::string, std::string> application() const = 0; + + virtual bool ok() const = 0; }; #endif // HLT2MONITORING_ITRANSMITTERSVC_H diff --git a/Online/Hlt2Monitoring/Hlt2Monitoring/InfoUtils.h b/Online/Hlt2Monitoring/Hlt2Monitoring/InfoUtils.h new file mode 100644 index 0000000000000000000000000000000000000000..9bb982ce754c5c014c2eb75f51a73ad4077dce16 --- /dev/null +++ b/Online/Hlt2Monitoring/Hlt2Monitoring/InfoUtils.h @@ -0,0 +1,18 @@ +#ifndef INFOUTILS_H +#define INFOUTILS_H + +#include <string> + +#include "Types.h" + +// Add a histogram to a histogram container +std::pair<Monitoring::HistoMap::const_iterator, bool> +addHistogram(Monitoring::HistoMap& histograms, + const Monitoring::HistoKey& key, + const std::string& type, + MonInfo::HistoVariant variant); + +// Load histograms from a compressed file. +size_t loadHistoInfo(Monitoring::HistoMap& histograms, std::string filename); + +#endif diff --git a/Online/Hlt2Monitoring/Hlt2Monitoring/RunInfo.h b/Online/Hlt2Monitoring/Hlt2Monitoring/RunInfo.h index d929d44c47933abb798aad93f37ba1bde1158ccc..228be41cd77aa2977b6f64c3458fcacea61f0321 100644 --- a/Online/Hlt2Monitoring/Hlt2Monitoring/RunInfo.h +++ b/Online/Hlt2Monitoring/Hlt2Monitoring/RunInfo.h @@ -1,4 +1,3 @@ - #ifndef HLT2MONITORING_RUNINFO_H #define HLT2MONITORING_RUNINFO_H 1 diff --git a/Online/Hlt2Monitoring/Hlt2Monitoring/SaverUtilities.h b/Online/Hlt2Monitoring/Hlt2Monitoring/SaverUtilities.h index cc723127a2259e53d13eef4ac47ffe2894f2a31e..f193cd0e003bd4330e8d138166dd312f4af92a7c 100644 --- a/Online/Hlt2Monitoring/Hlt2Monitoring/SaverUtilities.h +++ b/Online/Hlt2Monitoring/Hlt2Monitoring/SaverUtilities.h @@ -13,6 +13,7 @@ #include <boost/functional/hash.hpp> #include <boost/unordered_map.hpp> #include <boost/multi_index_container.hpp> +#include <boost/multi_index/ordered_index.hpp> #include <boost/multi_index/hashed_index.hpp> #include <boost/multi_index/member.hpp> #include <boost/multi_index/mem_fun.hpp> @@ -31,7 +32,8 @@ struct ByRun{ }; struct ByWorker{ }; struct ByDir{ }; struct ByName{ }; - +struct Sorted{ }; + using HistoKey = std::pair<Monitoring::RunNumber, Monitoring::HistId>; struct HistoEntry { @@ -55,6 +57,18 @@ struct HistoEntry { bool add = true; }; +struct SortHistos { + bool operator()(const HistoEntry& lhs, const HistoEntry& rhs) const { + if (lhs.run < rhs.run) { + return true; + } else if (rhs.run < lhs.run) { + return false; + } else { + return lhs.name() < rhs.name(); + } + } +}; + // Multi index container to hold the items. 
using SaverHistos = boost::multi_index_container< HistoEntry, @@ -67,6 +81,11 @@ using SaverHistos = boost::multi_index_container< boost::multi_index::tag<ByName>, boost::multi_index::const_mem_fun<HistoEntry, std::string, &HistoEntry::name> >, + boost::multi_index::ordered_unique< + boost::multi_index::tag<Sorted>, + boost::multi_index::identity<HistoEntry>, + SortHistos + >, boost::multi_index::hashed_non_unique< boost::multi_index::tag<ByDir>, boost::multi_index::member<HistoEntry, std::string, &HistoEntry::dir> diff --git a/Online/Hlt2Monitoring/Hlt2Monitoring/Types.h b/Online/Hlt2Monitoring/Hlt2Monitoring/Types.h index 4f382b389f8f21ae818170894fb337c1cce93922..a1e85aa72f29a94fd93ed5e69673d1e83bed2d34 100644 --- a/Online/Hlt2Monitoring/Hlt2Monitoring/Types.h +++ b/Online/Hlt2Monitoring/Hlt2Monitoring/Types.h @@ -8,30 +8,148 @@ #include <unordered_set> #include <boost/variant.hpp> +#include <boost/functional/hash.hpp> +#include <boost/multi_index_container.hpp> +#include <boost/multi_index/hashed_index.hpp> +#include <boost/multi_index/ordered_index.hpp> +#include <boost/multi_index/member.hpp> +#include <boost/multi_index/mem_fun.hpp> + +#include <boost/serialization/shared_ptr.hpp> +#include <boost/serialization/serialization.hpp> +#include <boost/serialization/set.hpp> +#include <boost/serialization/map.hpp> +#include <boost/serialization/string.hpp> +#include <boost/serialization/vector.hpp> +#include <boost/serialization/shared_ptr.hpp> +#include <boost/serialization/variant.hpp> + +#include <boost/multi_index_container.hpp> +#include <boost/multi_index/hashed_index.hpp> +#include <boost/multi_index/ordered_index.hpp> +#include <boost/multi_index/member.hpp> +#include <boost/multi_index/mem_fun.hpp> + +#include <boost/serialization/shared_ptr.hpp> +#include <boost/serialization/serialization.hpp> +#include <boost/serialization/set.hpp> +#include <boost/serialization/map.hpp> +#include <boost/serialization/string.hpp> +#include <boost/serialization/vector.hpp> +#include <boost/serialization/shared_ptr.hpp> +#include <boost/serialization/variant.hpp> #include "Common.h" #include "Histo1DDef.h" #include "Histo2DDef.h" #include "CounterDef.h" +#include "RunInfo.h" + +namespace Monitoring { + using HistoKey = std::pair<RunNumber, HistId>; +} + +namespace MonInfo { + + struct ByKey{}; + struct ByContent{}; + + // Variant to wrap the different types we want to store as histo info. + using HistoVariant = boost::variant<std::string, + Monitoring::Histo1DDef, + Monitoring::Histo2DDef>; + + // Visitor to print different types in an info message + class Printer : public boost::static_visitor<std::string> { + public: + + std::string operator()(const std::string& eval) { + return eval; + } + + std::string operator()(const Monitoring::Histo1DDef& def) { + return def.title; + } + + std::string operator()(const Monitoring::Histo2DDef& def) { + return def.title; + } + + std::string operator()(const Monitoring::CounterDef& def) { + return def.name + " " + def.description; + } + }; + + // Node for the multi-index container. Use shared_ptr to save memory. 
+ struct HistoEntry { + HistoEntry() = default; + + HistoEntry(Monitoring::HistoKey k, std::string t, + size_t h, std::shared_ptr<HistoVariant> c) + : key{std::move(k)}, + type{std::move(t)}, + hash{h}, + cnt{std::move(c)} {} + + Monitoring::HistoKey key = {0, 0}; + std::string type; + size_t hash = 0; + std::shared_ptr<HistoVariant> cnt; + + const HistoVariant& content() const { + return *cnt; + } + }; +} + +namespace boost { +namespace serialization { + +// Serialize HistoEntry +template <typename Archive> +auto serialize(Archive& archive, MonInfo::HistoEntry& entry, + const unsigned int) -> void { + archive& entry.key; + archive& entry.type; + archive& entry.hash; + archive& entry.cnt; +} +} +} namespace Monitoring { // Types used when communicating and storing information about objects. -using HistoVariant = boost::variant<std::string, Histo1DDef, Histo2DDef>; -using HistoKey = std::pair<RunNumber, HistId>; +// Multi index container to hold the items. +using HistoMap = boost::multi_index_container< + MonInfo::HistoEntry, + boost::multi_index::indexed_by< + boost::multi_index::hashed_unique< + boost::multi_index::tag<MonInfo::ByKey>, + boost::multi_index::member<MonInfo::HistoEntry, HistoKey, &MonInfo::HistoEntry::key>, + boost::hash<HistoKey> + >, + boost::multi_index::ordered_non_unique< + boost::multi_index::tag<MonInfo::ByContent>, + boost::multi_index::member<MonInfo::HistoEntry, size_t, &MonInfo::HistoEntry::hash> + > + > + >; + using HistoKeys = std::unordered_set<HistoKey, boost::hash<HistoKey>>; -using HistoPub = std::vector<std::tuple<RunNumber, HistId, std::string, std::string>>; -using HistoMap = boost::unordered_map<HistoKey, std::pair<std::string, HistoVariant>>; +using HistoPub = std::vector<std::tuple<std::string, std::string, std::vector<HistoKey>>>; + +using KeyHash = boost::hash<Monitoring::HistoKey>; using CounterKey = HistId; using CounterKeys = std::unordered_set<CounterKey>; using CounterPub = std::vector<std::tuple<HistId, CounterDef>>; -using CounterMap = boost::unordered_map<CounterKey, CounterDef>; +using CounterMap = std::unordered_map<CounterKey, CounterDef, boost::hash<CounterKey>>; // Run Info using RunInfoKey = std::pair<RunNumber, std::string>; using RunInfoKeys = std::unordered_set<RunInfoKey, boost::hash<RunInfoKey>>; -using RunInfoMap = boost::unordered_map<RunInfoKey, std::pair<bool, RunInfo>>; +using RunInfoMap = std::unordered_map<RunInfoKey, std::pair<bool, RunInfo>, boost::hash<RunInfoKey>>; using RunInfoPub = std::vector<std::pair<std::string, RunInfo>>; } #endif diff --git a/Online/Hlt2Monitoring/Hlt2Monitoring/Utilities.h b/Online/Hlt2Monitoring/Hlt2Monitoring/Utilities.h index 5198101be9913df990904e285df29090df71f69e..a830e2e8bdee4a7a815f067070e818b515faf379 100644 --- a/Online/Hlt2Monitoring/Hlt2Monitoring/Utilities.h +++ b/Online/Hlt2Monitoring/Hlt2Monitoring/Utilities.h @@ -18,21 +18,15 @@ #include <boost/regex.hpp> // ZeroMQ and local -#ifdef STANDALONE -#include "zmq.hpp" -#include "IZeroMQSvc.h" -#define endmsg endl -#define MsgStream std::ostream -#else #include <zmq/zmq.hpp> #include <ZeroMQ/IZeroMQSvc.h> -#include <GaudiKernel/MsgStream.h> -#include <GaudiKernel/ParsersFactory.h> -#endif #include "Common.h" #ifndef STANDALONE +#include <GaudiKernel/MsgStream.h> +#include <GaudiKernel/ParsersFactory.h> + namespace Gaudi { namespace Parsers { // Parser grammar and parse function for CorrectMap @@ -45,10 +39,6 @@ namespace Gaudi { } #endif -#ifdef STANDALONE -IZeroMQSvc* zmqSvc(); -#endif - namespace Monitoring { unsigned int 
sourceID(boost::regex regex, std::string host); diff --git a/Online/Hlt2Monitoring/python/Hlt2Monitoring/Hlt2Adder.py b/Online/Hlt2Monitoring/python/Hlt2Monitoring/Hlt2Adder.py index 22624008ef97b06378e52bd11c925fe0643eb2b1..fad50d3eefd0375e5270fbc5288ef04f32dcbfaa 100644 --- a/Online/Hlt2Monitoring/python/Hlt2Monitoring/Hlt2Adder.py +++ b/Online/Hlt2Monitoring/python/Hlt2Monitoring/Hlt2Adder.py @@ -2,11 +2,12 @@ import os import socket import re -from Utilities import importOnline, configOnline +from Utilities import importOnline, configOnline, connectionDirectory __ports = {'Transmitter': 31348, 'Adder': {'in': 31347, 'out': 31351}, 'InfoSvc': {'in': 31349, 'out': 31352}} +__host_regex = r"^hlt(0[12]|(?P<subfarm>[a-f]{1}[0-9]{2})(?P<node>[0-9]{2})?).*" def extraConf(svcs, extra): @@ -51,7 +52,7 @@ def configureTop(node_info): infoSvc.InfoConnection = "ipc:///run/HLT2/MonInfo_0" infoSvc.OutPort = ports['InfoSvc']['out'] infoSvc.RunDBConnection = runDBCon - infoSvc.OutputLevel = node_info.get('OutputLevel', 3) + infoSvc.OutputLevel = node_info.get('OutputLevel', 2) # The histogram adder service from Configurables import Hlt2AdderSvc @@ -65,7 +66,8 @@ def configureTop(node_info): adderSvc.BackConnection = "ipc:///run/HLT2/Hlt2MonData_0" adderSvc.ConnectBack = False adderSvc.SendInterval = 210 - adderSvc.OutputLevel = node_info.get('OutputLevel', 3) + adderSvc.ReceiveHighWaterMark = 200000 + adderSvc.OutputLevel = node_info.get('OutputLevel', 2) # The root conversion service from Configurables import Hlt2RootPublishSvc @@ -74,7 +76,7 @@ def configureTop(node_info): rootSvc.FrontConnection = adderSvc.BackConnection rootSvc.BackConnection = "ipc:///run/HLT2/MonData_2" rootSvc.InfoConnection = infoSvc.InfoConnection - rootSvc.OutputLevel = node_info.get('OutputLevel', 3) + rootSvc.OutputLevel = node_info.get('OutputLevel', 2) # The saver svc from Configurables import Hlt2SaverSvc @@ -84,9 +86,10 @@ def configureTop(node_info): saverSvc.RunInfoType = "Moore2" saverSvc.DataConnection = rootSvc.BackConnection saverSvc.InfoConnection = infoSvc.InfoConnection + saverSvc.RegistrarConnection = 'tcp://hist01:31360' if 'HistogramDirectory' in node_info: saverSvc.BaseDirectory = node_info['HistogramDirectory'] - saverSvc.OutputLevel = node_info.get('OutputLevel', 3) + saverSvc.OutputLevel = node_info.get('OutputLevel', 2) from Configurables import ZmqTransmitterSvc transmitter = ZmqTransmitterSvc() @@ -97,7 +100,7 @@ def configureTop(node_info): for svc in svcs: svc.ForceTop = node_info['forced'] - confs = svcs + confs = svcs + [transmitter] if runDB: confs += [runDBSvc] return extraConf(confs, node_info.get('extra', {})) @@ -108,6 +111,8 @@ def configureSubfarm(node_info): ports = node_info['ports'] connections = node_info.get('connections', {}) + con_dir = connectionDirectory() + # The info svc from Configurables import Hlt2MonInfoSvc infoSvc = Hlt2MonInfoSvc() @@ -120,6 +125,7 @@ def configureSubfarm(node_info): infoSvc.InPort = ports['InfoSvc']['in'] infoSvc.OutPort = ports['InfoSvc']['out'] infoSvc.OutputLevel = node_info.get('OutputLevel', 3) + infoSvc.IPCConnectionPath = con_dir # The histogram adder service from Configurables import Hlt2AdderSvc @@ -131,13 +137,19 @@ def configureSubfarm(node_info): else: adderSvc.FrontConnection = "tcp://*:%d" % ports['Adder']['in'] adderSvc.BackConnection = 'tcp://hlt02:%d' % ports['Adder']['in'] - adderSvc.SendInterval = 60 + adderSvc.ReceiveHighWaterMark = 200000 + adderSvc.SendInterval = 67 adderSvc.OutputLevel = node_info.get('OutputLevel', 3) from 
Configurables import ZmqTransmitterSvc - ZmqTransmitterSvc().InfoPort = ports['Transmitter'] + transmitter = ZmqTransmitterSvc() + transmitter.InfoPort = ports['Transmitter'] + transmitter.OutputLevel = node_info.get('OutputLevel', 3) + + transmitter.IPCConnectionPath = con_dir - return extraConf((infoSvc, adderSvc), node_info.get('extra', {})) + return extraConf((infoSvc, adderSvc, transmitter), + node_info.get('extra', {})) def configureNode(node_info): @@ -147,8 +159,8 @@ def configureNode(node_info): # Cleanup old connection sockets. # If there's no running PID that corresponds to the socket, delete it. + con_dir = connectionDirectory() pids = set([pid for pid in os.listdir('/proc') if pid.isdigit()]) - con_dir = '/run/HLT2' cons = [c for c in os.listdir(con_dir)] if os.path.exists(con_dir) else [] for con in cons: if con.split('_')[-1] in pids: @@ -169,6 +181,7 @@ def configureNode(node_info): infoSvc.InPort = ports['InfoSvc']['in'] infoSvc.OutPort = ports['InfoSvc']['out'] infoSvc.OutputLevel = node_info.get('OutputLevel', 3) + infoSvc.IPCConnectionPath = con_dir # The histogram adder service from Configurables import Hlt2AdderSvc @@ -178,19 +191,21 @@ def configureNode(node_info): for c, v in cons.iteritems(): setattr(adderSvc, c + 'Connection', v) else: - adderSvc.FrontConnection = "ipc:///run/HLT2/MonData_0" + adderSvc.FrontConnection = "ipc://%s/MonData_0" % con_dir adderSvc.BackConnection = 'tcp://hlt%s:%d' % ( node_info['subfarm'], ports['Adder']['in']) - adderSvc.SendInterval = 60 + adderSvc.ReceiveHighWaterMark = 50000 + adderSvc.SendInterval = 61 adderSvc.OutputLevel = node_info.get('OutputLevel', 3) from Configurables import ZmqTransmitterSvc transmitter = ZmqTransmitterSvc() transmitter.InfoPort = ports['Transmitter'] transmitter.OutputLevel = node_info.get('OutputLevel', 3) + transmitter.IPCConnectionPath = con_dir - return extraConf((infoSvc, adderSvc), node_info.get('extra', {})) + return extraConf((infoSvc, adderSvc, transmitter), node_info.get('extra', {})) def configure(host_type=None, directory=None, ports=None, @@ -218,8 +233,7 @@ def configure(host_type=None, directory=None, ports=None, node_info['connections'] = connections hostname = socket.gethostname() - host_regex = re.compile( - r"hlt(0[12]|(?P<subfarm>[a-f]{1}[0-9]{2})(?P<node>[0-9]{2})?)") + host_regex = re.compile(__host_regex) r = host_regex.match(hostname) ht = '' if host_type and host_type in configs: diff --git a/Online/Hlt2Monitoring/python/Hlt2Monitoring/Utilities.py b/Online/Hlt2Monitoring/python/Hlt2Monitoring/Utilities.py index 9ebc8d4c5a775de3af4959e6d3f3f1d9d8052df1..5bcc585ff343b4835c791f3097717bcd3059db54 100644 --- a/Online/Hlt2Monitoring/python/Hlt2Monitoring/Utilities.py +++ b/Online/Hlt2Monitoring/python/Hlt2Monitoring/Utilities.py @@ -35,9 +35,34 @@ connections = {'Hlt2RootPublishSvc': {'back': "ipc:///run/HLT2/MonData_2"}, 'Hlt2MonInfoSvc': {'back': "ipc:///run/HLT2/MonInfo_1"}, 'Hlt2AdderSvc': {'back': "ipc:///run/HLT2/MonData_1"}} -# The next few lines make the OnlineNodeEnv do nothig, we don't need it anyway. 
+ +# Function to create the connection directory at configuration time +def connectionDirectory(): + con_dirs = ['/run/HLT2', '/tmp/HLT2'] + con_dir = None + for cd in con_dirs: + if os.path.exists(cd): + con_dir = cd + break + else: + try: + os.makedirs(cd) + con_dir = cd + break + except OSError as e: + print '[WARNING]:', e + err = ('[WARNING]: Could not create {0}, ' + 'falling back to {1}'.format(cd, con_dirs[-1])) + print err + continue + if not con_dir: + msg = ("Could not create any directory " + "to store connections: %s" % con_dirs) + raise RuntimeError(msg) + return con_dir +# The next few lines make the OnlineNodeEnv do nothig, we don't need it anyway. class EmptyNodeEnv(object): def load_node_info(self): @@ -107,6 +132,7 @@ def configOnline(appMgr, level): from Configurables import AuditorSvc AuditorSvc().Auditors = [] - configMsgSvc(appMgr, 2 if level == 'top' else 3) + # configMsgSvc(appMgr, 2 if level == 'top' else 3) + configMsgSvc(appMgr, 2) OnlineEnv = importOnline() OnlineEnv.end_config(False) diff --git a/Online/Hlt2Monitoring/scripts/test_node_adder.py b/Online/Hlt2Monitoring/scripts/test_node_adder.py index 83739b3e2c8d49a00831097ccd3d74adfccf6200..08d4afddb532f0610568f43abc76635a646fe863 100644 --- a/Online/Hlt2Monitoring/scripts/test_node_adder.py +++ b/Online/Hlt2Monitoring/scripts/test_node_adder.py @@ -14,18 +14,19 @@ Hlt2Adder.configure('node', 'Info' : 'ipc:///run/HLT2/MonInfoNode'}, 'Adder' : {'Front': 'ipc:///run/HLT2/MonData_0', 'Back': 'ipc:///run/HLT2/MonData_1'}}, - extra = {'Hlt2MonInfoSvc' : {'SyncConnections' : ['ipc:///run/HLT2/MonInfoOtherNode'], 'OutputLevel' : 2, 'SyncInterval' : 60}, 'Hlt2AdderSvc' : {'OutputLevel' : 2}})""" + extra = {'Hlt2MonInfoSvc' : {'SyncConnections' : ['ipc:///run/HLT2/MonInfoOtherNode'], + 'OutputLevel' : 1, + 'SyncInterval' : 60}, + 'Hlt2AdderSvc' : {'OutputLevel' : 2}, + 'ZmqTransmitterSvc' : {'OutputLevel' : 2}})""" # cmd = """import GaudiKernel.ProcessJobOptions # from Gaudi.Configuration import importOptions # GaudiKernel.ProcessJobOptions.printing_level=3 # from Hlt2Monitoring import Hlt2Adder -# Hlt2Adder.configure('node', ports = {'Adder' : {'in' : '41347', 'out' :41351}, 'InfoSvc' : {'in' : 41348, 'out' : 41352}}, -# connections = {'Info' : {'Front' : 'ipc:///run/HLT2/MonInfo_0', 'Info' : 'tcp://*:41350', 'Back' : 'tcp://hltd04:41348'}, -# 'Adder' : {'Front' : 'ipc:///run/HLT2/MonData_0', 'Back' : 'tcp://hltd04:41347', 'Info' : 'tcp://*:41351'}}, -# extra = {'Hlt2MonInfoSvc' : {'SyncConnections' : ['tcp://hltd04:41350'], -# 'OutputLevel' : 2, 'SyncInterval' : 60}, 'Hlt2AdderSvc' : {'OutputLevel' -# : 2}})""" +# Hlt2Adder.configure(extra = {'Hlt2MonInfoSvc' : {'OutputLevel' : 2}, +# 'Hlt2AdderSvc' : {'OutputLevel' : 2}, +# 'ZmqTransmitterSvc' : {'OutputLevel' : 2}})""" mon_root = os.environ['HLT2MONITORINGROOT'] os.environ['DIM_DNS_NODE'] = 'localhost' diff --git a/Online/Hlt2Monitoring/scripts/test_online_adder.py b/Online/Hlt2Monitoring/scripts/test_online_adder.py index 6f624e82754e3f2495fc3b583660e5091c5ad513..b05dc87b7bf4dfc166506ccdadec70c3373c6a67 100644 --- a/Online/Hlt2Monitoring/scripts/test_online_adder.py +++ b/Online/Hlt2Monitoring/scripts/test_online_adder.py @@ -14,13 +14,17 @@ Hlt2Adder.configure('top', '/tmp/histograms', 'Info' : 'ipc:///run/HLT2/MonInfoTop'}, 'Adder' : {'Front' : 'ipc:///run/HLT2/MonData_1', 'Back' : 'ipc:///run/HLT2/AddData_2'}, - 'RunDB' : 'ipc:///tmp/testRunDB'}, + 'RunDB' : 'ipc:///run/HLT2/testRunDB'}, extra = {'Hlt2MonInfoSvc' : {'SyncConnections' : 
['ipc:///run/HLT2/MonInfoNode', 'ipc:///run/HLT2/MonInfoOtherNode'], 'SyncInterval' : 20, 'OutputLevel' : 2}, 'Hlt2AdderSvc' : {'OutputLevel' : 2}, - 'Hlt2RootPublishSvc' : {'OutputLevel' : 2}, + 'Hlt2RootPublishSvc' : {'OutputLevel' : 2, + 'BatchSize' : 50, + 'HighWaterMark' : 200, + 'PublishInterval' : 50}, 'Hlt2SaverSvc' : {'OutputLevel' : 2, + 'RunInfoPollTimeout' : 1, 'RunInfoType' : 'Moore2', 'SaveInterval' : 30, 'NWorkers' : 5}})""" @@ -29,13 +33,17 @@ Hlt2Adder.configure('top', '/tmp/histograms', # from Gaudi.Configuration import importOptions # GaudiKernel.ProcessJobOptions.printing_level=3 # from Hlt2Monitoring import Hlt2Adder -# Hlt2Adder.configure('top', '/tmp/histograms', -# ports = {'Adder' : {'in' : 41347, 'out' :41349}, 'InfoSvc' : {'in' : 41348, 'out' : 41349}}, -# connections = {'Info' : {'Front' : 'tcp://*:41348', 'Info' : 'ipc:///run/HLT2/MonInfoTop'}, -# 'Adder' : {'Front' : 'tcp://*:41347', 'Back' : 'ipc:///run/HLT2/AddData_2', 'Info' : 'ipc:///run/HLT2/AdderInfo'}, -# 'RunDB' : 'ipc:///tmp/HLT2/TestRunDB'}, -# extra = {'Hlt2MonInfoSvc' : {'SyncConnections' : ['tcp://hltd04:41350'], -# 'SyncInterval' : 20}})""" +# Hlt2Adder.configure('top', '/tmp/hlt2histograms', +# connections = {'RunDB' : 'ipc:///run/HLT2/testRunDB'}, +# extra = {'Hlt2MonInfoSvc' : {'OutputLevel' : 2}, +# 'Hlt2AdderSvc' : {'OutputLevel' : 2}, +# 'Hlt2RootPublishSvc' : {'OutputLevel' : 2}, +# 'Hlt2SaverSvc' : {'OutputLevel' : 2, +# 'RunInfoPollTimeout' : 1, +# 'RunInfoType' : 'Moore2', +# 'SaveInterval' : 50, +# 'NWorkers' : 5}})""" + os.environ['DIM_DNS_NODE'] = 'hlt01' env = {'LC_ALL': 'C', 'UTGID': utgid, 'TEMPDIR': '/tmp/testAdder', 'PARTITION': 'LHCb2', 'PARTITION_NAME': 'LHCb2', 'RUNINFO': '%s/scripts/OnlineEnvBase.py' % os.environ['HLT2MONITORINGROOT']} diff --git a/Online/Hlt2Monitoring/scripts/test_subfarm_adder.py b/Online/Hlt2Monitoring/scripts/test_subfarm_adder.py index 8d6c5ad26a936424e8d5a5c7c8fe31252934430d..a13e22579c016dc1fa026ffcfeff5c36cf5eeaea 100644 --- a/Online/Hlt2Monitoring/scripts/test_subfarm_adder.py +++ b/Online/Hlt2Monitoring/scripts/test_subfarm_adder.py @@ -1,5 +1,5 @@ import os -from Manager import Manager +from Hlt2Monitoring.Manager import Manager utgid = 'TEST_SUBFARMMADDER_00' @@ -7,9 +7,9 @@ cmd = """import GaudiKernel.ProcessJobOptions from Gaudi.Configuration import importOptions GaudiKernel.ProcessJobOptions.printing_level=3 from Hlt2Monitoring import Hlt2Adder -Hlt2Adder.configure(ports = {'Adder' : {'in' : 41347, 'out' :41349}, - 'InfoSvc' : {'in' : 41351, 'out' : 41352}, - 'Transmitter' : 41339}) +Hlt2Adder.configure(extra = {'Hlt2MonInfoSvc' : {'OutputLevel' : 2}, + 'Hlt2AdderSvc' : {'OutputLevel' : 2}, + 'ZmqTransmitterSvc' : {'OutputLevel' : 2}}) """ """import GaudiKernel.ProcessJobOptions diff --git a/Online/Hlt2Monitoring/scripts/test_sync_adder.py b/Online/Hlt2Monitoring/scripts/test_sync_adder.py index 5874ef1eacc9b2e4fb3ae944b6d602dc75d14a8d..35984aba15722b69a6dcb8f1722b28277e24123d 100644 --- a/Online/Hlt2Monitoring/scripts/test_sync_adder.py +++ b/Online/Hlt2Monitoring/scripts/test_sync_adder.py @@ -18,9 +18,10 @@ Hlt2Adder.configure('subfarm', 'Back' : 'ipc:///run/HLT2/MonInfoOther_1'}, 'Adder' : {'Front' : 'ipc:///run/HLT2/MonDataOther_0', 'Back' : 'ipc:///run/HLT2/MonDataOther_1'}}, - extra = {'Hlt2MonInfoSvc' : {'SyncConnections' : ['ipc:///run/HLT2/MonInfoNode'], - 'OutputLevel' : 2, - 'SyncInterval' : 60}})""" + extra = {'Hlt2MonInfoSvc' : {'OutputLevel' : 2, + 'OutPort' : 31352, +# 'SyncConnections' : ['ipc:///run/HLT2/MonInfoNode'], + 
'SyncInterval' : 60}})""" os.environ['DIM_DNS_NODE'] = 'localhost' mon_root = os.environ['HLT2MONITORINGROOT'] diff --git a/Online/Hlt2Monitoring/src/component/Hlt2AdderSvc.cpp b/Online/Hlt2Monitoring/src/component/Hlt2AdderSvc.cpp index eb5323052def047bdde59019df6be5c76e3b89e6..862e0f5f7f5b872826aa238c3574a31e5d0ea7fc 100644 --- a/Online/Hlt2Monitoring/src/component/Hlt2AdderSvc.cpp +++ b/Online/Hlt2Monitoring/src/component/Hlt2AdderSvc.cpp @@ -54,11 +54,14 @@ DECLARE_SERVICE_FACTORY(Hlt2AdderSvc) //============================================================================= Hlt2AdderSvc::Hlt2AdderSvc(const string& name, ISvcLocator* loc) - : Hlt2MonBaseSvc(name, loc), - m_stopSending{false} + : Hlt2MonBaseSvc(name, loc) { declareProperty("SendInterval", m_sendInterval = 10); declareProperty("ConnectBack", m_connectBack = true); + declareProperty("PublishInterval", m_publishInterval = 60); + declareProperty("BatchSize", m_batchSize = 500); + declareProperty("HighWaterMark", m_hwm = 5000); + declareProperty("ReceiveHighWaterMark", m_rcvHwm = 200000); } //=============================================================================== @@ -104,6 +107,7 @@ void Hlt2AdderSvc::function() { front.bind(m_frontCon.c_str()); setsockopt(front, zmq::LINGER, 0); zmq::setsockopt(front, zmq::SUBSCRIBE, ""); + zmq::setsockopt(front, zmq::RCVHWM, boost::numeric_cast<int>(m_rcvHwm)); info() << "Bound frontend to: " << m_frontCon << endmsg; // Clean up queue @@ -153,36 +157,36 @@ void Hlt2AdderSvc::function() { publish.bind(pubCon().c_str()); // Start thread to trigger publication of histograms - std::thread pubThread([this] { periodic(pubCon(), Monitoring::s_Publish, - m_stopSending, m_sendInterval); }); + std::thread pubThread([this] { periodic(pubCon(), Monitoring::s_Publish, 0.5); }); + size_t pubCounter = m_publishInterval * 2; + size_t counterPubCounter = m_publishInterval * 2; // Initialize poll set zmq::pollitem_t items[] = {{control, 0, ZMQ_POLLIN, 0}, {front, 0, ZMQ_POLLIN, 0}, {publish, 0, ZMQ_POLLIN, 0}}; - bool paused = false; + // Storage + Histograms histograms; + Counters counters; + Queue messages; + std::vector<Key> toSend; while (true) { // Process messages from both sockets zmq::message_t message; - if (!paused) zmq::poll(&items[0], 3, -1); + zmq::poll(&items[0], 3, -1); - if (paused || (items[0].revents & ZMQ_POLLIN)) { + if (items[0].revents & ZMQ_POLLIN) { auto cmd = receive<std::string>(control); if (cmd == Monitoring::s_Terminate) { + send(publish, Monitoring::s_Terminate); break; - } else if (cmd == "PAUSE") { - debug() << name() << " paused." << endmsg; - paused = true; - } else if (cmd == "RESUME") { - debug() << name() << " resumed." 
<< endmsg; - paused = false; } } - if (!paused && (items[1].revents & ZMQ_POLLIN)) { + if (items[1].revents & ZMQ_POLLIN) { // Deserialize auto msg = receive<zmq::message_t>(front); auto type = decode<string>(msg); @@ -196,11 +200,11 @@ void Hlt2AdderSvc::function() { continue; } - key_t key{hdiff.runNumber, hdiff.histId}; + Key key{hdiff.runNumber, hdiff.histId}; // Add to internal store - auto it = m_histograms.find(key); - if (it == end(m_histograms)) { - auto r = m_histograms.emplace(std::move(key), HistDiff{m_sourceID, hdiff.runNumber, hdiff.histId}); + auto it = histograms.find(key); + if (it == end(histograms)) { + auto r = histograms.emplace(std::move(key), HistDiff{m_sourceID, hdiff.runNumber, hdiff.histId}); assert(r.second); it = r.first; } @@ -209,39 +213,39 @@ void Hlt2AdderSvc::function() { auto start = receive<long>(front); auto last = receive<long>(front); auto key = make_pair(start, last); - std::map<size_t, StatEntity> counters; + std::map<size_t, StatEntity> recvCounters; msg = receive<zmq::message_t>(front); try { - counters = decode<decltype(counters)>(msg); + recvCounters = decode<decltype(recvCounters)>(msg); } catch (boost::archive::archive_exception) { warning() << "Faulty CounterDiff, ignoring " << endmsg; continue; } // Check if there is a set of counters with - auto it = m_counters.find(key); - if (it == end(m_counters)) { - auto count = begin(m_counters); - if (!m_counters.empty() && count->first.first > last) { + auto it = counters.find(key); + if (it == end(counters)) { + auto count = begin(counters); + if (!counters.empty() && count->first.first > last) { warning() << "Received counters for time [" << start << ", " << last << "), which is before first stored counter [" << count->first.first << ", " << count->first.second << endmsg; } else { - auto r = m_counters.emplace(key, std::move(counters)); + auto r = counters.emplace(key, std::move(recvCounters)); if (!r.second) assert(false); } } else { auto& storedCounters = it->second; // Check consistency ++it; - if (it != end(m_counters) && it->first.first != last) { + if (it != end(counters) && it->first.first != last) { warning() << "There is a gap in the set of counters: " << start << " " << last << " " << it->first.first << endmsg; } else { - debug() << "Received " << counters.size() << " counters for time interval [" + debug() << "Received " << recvCounters.size() << " counters for time interval [" << start << ", " << last << ")." << endmsg; } - for (const auto& entry : counters) { + for (const auto& entry : recvCounters) { Monitoring::HistId counterKey = entry.first; // Add to internal store auto counterIt = storedCounters.find(counterKey); @@ -266,71 +270,120 @@ void Hlt2AdderSvc::function() { } - if (!paused && items[2].revents & ZMQ_POLLIN) { + if (items[2].revents & ZMQ_POLLIN) { auto cmd = receive<std::string>(publish); if (cmd == Monitoring::s_Publish) { - // If there is no separate counter socket, use the hist - // socket. This happens if the back connection is bound - // instead of connected. - auto n = (publishHistDiffs(*histOut, histID) + - publishCounterDiffs(counterOut ?
*counterOut : *histOut, - counterID)); - for (const auto& id : {histID, counterID}) { + if (pubCounter > 0) { + --pubCounter; + } + + if (pubCounter == 0 && messages.empty() + && toSend.empty() && !histograms.empty()) { + toSend.reserve(histograms.size()); + for (const auto& entry : histograms) { + toSend.push_back(entry.first); + } + pubCounter = m_publishInterval * 2; + } + + if (messages.empty() && !toSend.empty()) { + while (messages.size() < m_hwm && !toSend.empty()) { + const auto& key = toSend.back(); + auto it = histograms.find(key); + if (it != end(histograms)) { + messages.emplace_back(zmq().encode(it->second)); + histograms.erase(it); + } else { + warning() << "Requested to send key " << key.first << " " << key.second + << ", but it is not in histograms." << endmsg; + } + toSend.pop_back(); + } + debug() << "Created " << messages.size() << " messages " << endmsg; + } + + if (!messages.empty()){ + // Ask the TransmitterSvc how many messages we can send. + // If we've bound our output socket, use the batch size + // property until we can talk to the publish service. + size_t n = 0; + if (m_connectBack) { + send(*histOut, Monitoring::s_Check, zmq::SNDMORE); + send(*histOut, Monitoring::s_HistDiff, zmq::SNDMORE); + send(*histOut, *histID); + + zmq::pollitem_t checkItems[] = {{*histOut, 0, ZMQ_POLLIN, 0}}; + zmq::poll(&checkItems[0], 1, 500); + if (checkItems[0].revents & ZMQ_POLLIN) { + n = receive<size_t>(*histOut); + n = std::min(messages.size(), n); + } else { + warning() << "Poll for credit to transmitter service timed out." << endmsg; + } + } else { + n = std::min(messages.size(), m_batchSize); + } + + if (UNLIKELY(msgLevel(MSG::VERBOSE))) { + verbose() << "Sending " << n << " messages " << endmsg; + } + + for (size_t m = 0; m < n; ++m) { + auto& message = messages.front(); + + // Send the message + send(*histOut, Monitoring::s_HistDiff, zmq::SNDMORE); + if (histID) send(*histOut, *histID, zmq::SNDMORE); + send(*histOut, message); + + messages.pop_front(); + } + // Don't trigger too often + if (histID && m_connectBack + && (pubCounter % m_publishInterval) == 0) { + m_transmitter->trigger(*histID); + } + } + + if (UNLIKELY(msgLevel(MSG::VERBOSE)) && toSend.empty()) { + verbose() << "No messages to send." << endmsg; + } + + // Publish counters + if (counterPubCounter == 0 && !counters.empty()) { + // If there is no separate counter socket, use the hist + // socket. This happens if the back connection is bound + // instead of connected. + auto n = publishCounterDiffs(counterOut ? *counterOut : *histOut, + counters, counterID); #if !defined(__clang__) #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wmaybe-uninitialized" #endif - // This line results in a false-positive warning from - // some gcc versions on some builds. 
- if (id && m_connectBack && (n != 0)) m_transmitter->trigger(*id); + if (counterID && m_connectBack && (n != 0)) m_transmitter->trigger(*counterID); #if !defined(__clang__) #pragma GCC diagnostic pop #endif + counterPubCounter = m_publishInterval * 2; } } } } - m_stopSending = true; pubThread.join(); } -//=============================================================================== -size_t Hlt2AdderSvc::publishHistDiffs(zmq::socket_t& socket, - boost::optional<size_t> id) const { - size_t n = 0; - - for (auto it = begin(m_histograms), last = end(m_histograms); it != last;) { - if (it->second.binDiffs.empty()) { - verbose() << "Pruning histogram " << it->first.first << " " << it->first.second << endmsg; - it = m_histograms.erase(it); - } else { - // Serialize - send(socket, Monitoring::s_HistDiff, zmq::SNDMORE); - if (id) send(socket, *id, zmq::SNDMORE); - send(socket, it->second); - - // Zero out histogram - it->second.binDiffs.clear(); - ++it; - ++n; - } - } - - debug() << "Published " << n << " histogram diffs" << endmsg; - return n; -} - //=============================================================================== size_t Hlt2AdderSvc::publishCounterDiffs(zmq::socket_t& socket, + Counters& counters, boost::optional<size_t> id) const { size_t n = 0; - auto nRemove = m_counters.size() > 5 ? m_counters.size() - 5 : 0; - for (auto it = begin(m_counters), last = end(m_counters); it != last;) { + auto nRemove = counters.size() > 5 ? counters.size() - 5 : 0; + for (auto it = begin(counters), last = end(counters); it != last;) { if (nRemove != 0) { verbose() << "Pruning counters at [" << it->first.first << ", " << it->first.second << ")" << endmsg; - it = m_counters.erase(it); + it = counters.erase(it); --nRemove; } else { // Serialize diff --git a/Online/Hlt2Monitoring/src/component/Hlt2AdderSvc.h b/Online/Hlt2Monitoring/src/component/Hlt2AdderSvc.h index 39e165734833f7d5638a3d040f43e71d12f6bc8c..8833dead523dbf3d5d1129d6539380d4046f0b22 100644 --- a/Online/Hlt2Monitoring/src/component/Hlt2AdderSvc.h +++ b/Online/Hlt2Monitoring/src/component/Hlt2AdderSvc.h @@ -31,19 +31,24 @@ */ class Hlt2AdderSvc : public Hlt2MonBaseSvc { public: + + using Key = std::pair<Monitoring::RunNumber, Monitoring::HistId>; + using Histograms = boost::unordered_map<Key, Monitoring::HistDiff>; + using Counters = std::map<std::pair<long, long>, std::map<Monitoring::HistId, StatEntity>>; + using Queue = std::deque<zmq::message_t>; + /// Standard constructor Hlt2AdderSvc(const std::string& name, ISvcLocator* sl); StatusCode initialize() override; - + // The function that does the work void function() override; private: - size_t publishHistDiffs(zmq::socket_t& socket, - boost::optional<size_t> id) const; size_t publishCounterDiffs(zmq::socket_t& socket, + Counters& counters, boost::optional<size_t> id) const; std::string pubCon() const { @@ -56,14 +61,13 @@ private: bool m_connectBack; boost::optional<zmq::socket_t> m_histOut; boost::optional<zmq::socket_t> m_counterOut; - + unsigned int m_publishInterval; + size_t m_batchSize; + size_t m_hwm; + size_t m_rcvHwm; + // data members unsigned int m_sourceID; - std::atomic<bool> m_stopSending; - - using key_t = std::pair<Monitoring::RunNumber, Monitoring::HistId>; - mutable boost::unordered_map<key_t, Monitoring::HistDiff> m_histograms; - mutable std::map<std::pair<long, long>, std::map<Monitoring::HistId, StatEntity>> m_counters; // Job Info using JobKey = std::tuple<unsigned int, std::string, std::string, std::string>; diff --git 
a/Online/Hlt2Monitoring/src/component/Hlt2MonBaseSvc.cpp b/Online/Hlt2Monitoring/src/component/Hlt2MonBaseSvc.cpp index 5506b5e2c8b15af57b2a63e1a907c4604dbb7f67..13edfc17bb896098f57a00ee8e70fada71d426f6 100644 --- a/Online/Hlt2Monitoring/src/component/Hlt2MonBaseSvc.cpp +++ b/Online/Hlt2Monitoring/src/component/Hlt2MonBaseSvc.cpp @@ -56,7 +56,7 @@ Hlt2MonBaseSvc::Hlt2MonBaseSvc(const string& name, ISvcLocator* loc, bool bindCo declareProperty("ForceTop", m_forceTop = false); declareProperty("PartitionName", m_partition); declareProperty("RunInPartitions", m_partitions = {"LHCb2"}); - declareProperty("HostnameRegex", m_hostRegex = "hlt(0[12]|(?<subfarm>[a-f][0-9]{2})(?<node>[0-9]{2})?)"); + declareProperty("HostnameRegex", m_hostRegex = "^hlt(0[12]|(?<subfarm>[a-f][0-9]{2})(?<node>[0-9]{2})?).*"); declareProperty("CheckInterval", m_checkInterval = 10); } @@ -201,20 +201,32 @@ Hlt2MonBaseSvc::receiveRunAndId(zmq::socket_t& socket, bool* more) const //=============================================================================== void Hlt2MonBaseSvc::periodic(const std::string& connection, std::string message, - const std::atomic<bool>& stop, const unsigned int interval) { + const double interval) { zmq::socket_t publish = socket(zmq::PAIR); zmq::setsockopt(publish, zmq::LINGER, 0); publish.connect(connection.c_str()); - while (!stop) { - unsigned int n = 0; - while (n < interval) { - if (stop) break; - std::chrono::seconds one{1}; - std::this_thread::sleep_for(one); - ++n; + zmq::pollitem_t items [] = { + { publish, 0, ZMQ_POLLIN, 0 }, + }; + + auto iv = boost::numeric_cast<long>(1000 * interval); + if (UNLIKELY(msgLevel(MSG::DEBUG))) { + debug() << "Sending " << message << " messages with interval " + << iv << " ms." << endmsg; + } + while (true) { + zmq::poll(&items[0], 1, iv); + if (items[0].revents & ZMQ_POLLIN) { + auto msg = zmq().receive<string>(publish); + if (msg == Monitoring::s_Terminate) { + break; + } else { + warning() << "Publish thread got unknown message: " + << msg << endmsg; + } } - if (!stop) send(publish, message); + send(publish, message); } } @@ -291,7 +303,7 @@ bool Hlt2MonBaseSvc::checkRequest(Monitoring::JobInfo jobInfo) const try { check.connect(jobInfo.connection.c_str()); } catch (const zmq::error_t& e) { - error() << "Failed to connect to " << jobInfo.connection + error() << "Failed to connect to " << jobInfo.connection << " to respond to a check request." 
<< endmsg; m_jobInfo.erase(key); return false; diff --git a/Online/Hlt2Monitoring/src/component/Hlt2MonBaseSvc.h b/Online/Hlt2Monitoring/src/component/Hlt2MonBaseSvc.h index a895d22ea1aa7f31819c05ef4db6114e385d1f83..1bb091e7d8e59bc56b9cd4a003bba368532974ba 100644 --- a/Online/Hlt2Monitoring/src/component/Hlt2MonBaseSvc.h +++ b/Online/Hlt2Monitoring/src/component/Hlt2MonBaseSvc.h @@ -94,9 +94,9 @@ protected: void enable() { m_enabled = true; } - void periodic(const std::string& connection, std::string message, - const std::atomic<bool>& stop, - const unsigned int interval); + void periodic(const std::string& connection, + std::string message, + const double interval); bool checkRequest(Monitoring::JobInfo jobInfo) const; diff --git a/Online/Hlt2Monitoring/src/component/Hlt2MonInfoSvc.cpp b/Online/Hlt2Monitoring/src/component/Hlt2MonInfoSvc.cpp index 526fe94d225b2d31ef6e3b6bd1015a205588a06e..e3876708acf1ce45e241f92fdf23d993d1515cc7 100644 --- a/Online/Hlt2Monitoring/src/component/Hlt2MonInfoSvc.cpp +++ b/Online/Hlt2Monitoring/src/component/Hlt2MonInfoSvc.cpp @@ -10,6 +10,7 @@ // boost #include <boost/regex.hpp> #include <boost/functional/hash.hpp> +#include <boost/optional.hpp> // zeromq #include "zmq/zmq.hpp" @@ -29,6 +30,7 @@ #include <Hlt2Monitoring/Histo1DDef.h> #include <Hlt2Monitoring/Histo2DDef.h> #include <Hlt2Monitoring/Utilities.h> +#include <Hlt2Monitoring/InfoUtils.h> // Local #include "Hlt2MonInfoSvc.h" @@ -57,6 +59,7 @@ namespace { using std::unordered_set; using Monitoring::hostname; + using Monitoring::HistoKey; using Monitoring::HistoKeys; using Monitoring::HistoPub; using Monitoring::HistoMap; @@ -71,6 +74,11 @@ namespace { using Monitoring::Histo2DDef; using Monitoring::CounterDef; + using MonInfo::ByKey; + using MonInfo::ByContent; + using MonInfo::HistoVariant; + using MonInfo::HistoEntry; + } // Factory for instantiation of service objects @@ -91,12 +99,10 @@ Hlt2MonInfoSvc::Hlt2MonInfoSvc(const string& name, ISvcLocator* loc) declareProperty("RunDBRetires", m_runDBRetries = 5); declareProperty("SendInterval", m_sendInterval = 5); declareProperty("ForceSourceID", m_sourceID = 0); -} - -//=============================================================================== -Hlt2MonInfoSvc::~Hlt2MonInfoSvc() -{ - + declareProperty("IPCConnectionPath", m_connectionPath = "/run/HLT2"); + declareProperty("SyncBatchSize", m_syncBatchSize = 100); + declareProperty("SyncTries", m_syncTries = 5); + declareProperty("LoadFrom", m_inputFile); } //=============================================================================== @@ -148,9 +154,9 @@ StatusCode Hlt2MonInfoSvc::initialize() m_randomSync = m_syncConnections.empty(); } else { // node - m_frontCon = string("ipc:///run/HLT2/MonInfo_0"); - m_infoCon = string("tcp://*:") + to_string(m_outPort); - m_backCon = string("tcp://hlt") + matches.str("subfarm") + ":" + to_string(m_inPort); + m_frontCon = string{"ipc://"} + m_connectionPath + "/MonInfo_0"; + m_infoCon = string{"tcp://*:"} + to_string(m_outPort); + m_backCon = string{"tcp://hlt"} + matches.str("subfarm") + ":" + to_string(m_inPort); } } else { fatal() << "Could not determine hostname." << endmsg; @@ -172,6 +178,11 @@ StatusCode Hlt2MonInfoSvc::initialize() // Sort subfarms, lexically is good enough std::sort(begin(m_subfarms), end(m_subfarms)); + + // Remove ourselves from the list of subfarms. 
+ auto it = std::find(begin(m_subfarms), end(m_subfarms), m_hostname); + if (it != end(m_subfarms)) m_subfarms.erase(it); + return sc; } @@ -182,7 +193,7 @@ void Hlt2MonInfoSvc::synchroniser() zmq::setsockopt(syncer, zmq::LINGER, 0); syncer.connect(syncCon().c_str()); - auto hostDown = [this](const string& con) -> bool { + auto hostUp = [this](const string& con) -> bool { zmq::socket_t ping = socket(zmq::REQ); zmq::setsockopt(ping, zmq::LINGER, 0); zmq::setsockopt(ping, zmq::RCVTIMEO, 100); @@ -205,46 +216,17 @@ void Hlt2MonInfoSvc::synchroniser() if (UNLIKELY(msgLevel(MSG::DEBUG)) && r.empty()) { debug() << con << " is down." << endmsg; } - return r.empty(); + return !r.empty(); }; - auto checkHosts = [&hostDown](decltype(m_syncConnections)& cons) { - for (auto it = begin(cons), last = end(cons); it != last;) { - if (hostDown(*it)) { - it = cons.erase(it); - } else { - ++it; - } - } - }; - - int i = 0; - map<size_t, string> subfarms; - for (auto s : m_subfarms) - subfarms[i++] = s; - std::mt19937 gen{std::hash<string>{}(m_hostname)}; - auto genHosts = [&subfarms, &gen, this](const size_t n) { - auto connection = [this](const string& host) { - return string{"tcp://"} + host + ":" + to_string(m_outPort); - }; - std::uniform_int_distribution<size_t> sf{0, m_subfarms.size() - 1}; - - while (m_syncConnections.size() < n) { - auto it = subfarms.find(sf(gen)); - if (it == end(subfarms)) continue; - if (it->second == m_hostname) continue; - m_syncConnections.emplace(connection(it->second)); - } - }; - auto sleep_for = [this](const unsigned int m) { unsigned int n = 0; - while (n < m) { + while (n < 10 * m) { if (m_stopSync) break; - std::chrono::seconds one{1}; - std::this_thread::sleep_for(one); + std::chrono::milliseconds hundred{100}; + std::this_thread::sleep_for(hundred); ++n; } }; @@ -255,19 +237,37 @@ void Hlt2MonInfoSvc::synchroniser() decltype(m_syncConnections) connections{m_syncConnections}; while (!m_stopSync) { - if (!m_randomSync) { - m_syncConnections = connections; - } - - checkHosts(m_syncConnections); if (m_randomSync) { - genHosts(std::min(m_subfarms.size(), size_t{m_sync})); + auto subfarms = m_subfarms; + std::mt19937 gen{std::hash<string>{}("dump_info")}; + + auto connection = [this](const string& host) -> string { + return string{"tcp://"} + host + ":" + to_string(m_outPort); + }; + + vector<string>::iterator last = subfarms.end() - 1, first = subfarms.begin(); + while (connections.size() < m_sync && last != first - 1) { + size_t s = distance(first, last); + auto it = first + std::uniform_int_distribution<size_t>{0, s ? s - 1 : 0}(gen); + auto con = connection(*it); + + if (hostUp(con)) { + connections.emplace(con); + } + std::swap(*it, *last); + --last; + } + + } else { + std::copy_if(begin(m_syncConnections), end(m_syncConnections), + std::inserter(connections, connections.end()), hostUp); } - if (m_syncConnections.empty()) { + if (connections.empty()) { debug() << "Could not find any hosts to synchronise info with." 
<< endmsg; } else { - send(syncer, Monitoring::s_Sync); + send(syncer, Monitoring::s_Sync, zmq::SNDMORE); + send(syncer, connections); } sleep_for(m_syncInterval); } @@ -300,117 +300,130 @@ void Hlt2MonInfoSvc::runDB() const using Pair = std::pair<Monitoring::RunNumber, string>; std::unordered_set<Pair, boost::hash<Pair>> runs; std::unordered_map<Monitoring::RunNumber, boost::optional<double>> deadtimes; + std::unordered_set<Pair, boost::hash<Pair>> toCheck; bool done = false; while (!done) { - bool request = false; - Monitoring::RunNumber run = 0; - size_t n = 0; - for (const auto& entry : deadtimes) { - n += bool(entry.second); - } - if (runs.size() && (runs.size() != n)) { - for (auto r : runs) { - auto it = deadtimes.find(r.first); - if (it != end(deadtimes) && bool(it->second)) { - continue; - } - if (UNLIKELY(msgLevel(MSG::VERBOSE))) - verbose() << "Requesting info for run: " << r.first << " " - << r.second << endmsg; - send(rdb, Monitoring::s_RunInfo, zmq::SNDMORE); - send(rdb, r.first, zmq::SNDMORE); - send(rdb, r.second); - request = true; - run = r.first; - break; + toCheck.clear(); + + // Find the runs for which we have no deadtime yet. + for (const auto& entry : runs) { + auto it = deadtimes.find(entry.first); + if ((it == end(deadtimes) || !(it->second))) { + toCheck.emplace(entry); } } - size_t tries = 0; - bool reply = false; - while (tries < m_runDBRetries) { - bool other = false; + auto checkIt = begin(toCheck); - zmq::pollitem_t items[] = { - {rdb, 0, ZMQ_POLLIN, 0}, - {rep, 0, ZMQ_POLLIN, 0}, - }; + // We need to keep track of whether a request has been sent already, + // so we don't send another one if we receive a new run while waiting + // for the reply on a previous request. + bool request = false; - // Poll socket for a reply, with timeout if we are waiting for some runs - zmq::poll(&items[0], 2, (request ? 200 : -1)); - - // If we got a reply, process it - if (items[0].revents & ZMQ_POLLIN) { - auto known = receive<string>(rdb); - debug() << "New reply from run DB server: " << known << endmsg; - reply = true; - boost::optional<double> dt{}; - if (known == Monitoring::s_Unknown) { - warning() << "RunDBServ does have info for run " << run << endmsg; - } else { - auto info = receive<Monitoring::RunInfo>(rdb); - assert(info.run == run); - if (info.deadtime >= 0.) { - dt = info.deadtime; + while (checkIt != end(toCheck) || toCheck.empty()) { + size_t tries = 0; + while (true) { + + zmq::pollitem_t items[] = { + {rdb, 0, ZMQ_POLLIN, 0}, + {rep, 0, ZMQ_POLLIN, 0}, + }; + + boost::optional<Monitoring::RunNumber> run; + boost::optional<string> app; + + if (checkIt != end(toCheck)) { + run = checkIt->first; + app = checkIt->second; + if (!request) { if (UNLIKELY(msgLevel(MSG::DEBUG))) - debug() << "Deadtime from run DB for run: " << info.run << " " - << info.deadtime << endmsg; - } else { - warning() << "Got reply from RunDBServ for run " << info.run - << ", but deadtime is not known." 
<< endmsg; + debug() << "Requesting info for run: " << *run << " " + << *app << " tries " << tries << endmsg; + send(rdb, Monitoring::s_RunInfo, zmq::SNDMORE); + send(rdb, *run, zmq::SNDMORE); + send(rdb, *app); + request = true; } } - deadtimes[run] = dt; - } - if (items[1].revents & ZMQ_POLLIN) { - other = true; - bool more = true; - auto type = receive<string>(rep, &more); - if (type == Monitoring::s_Command) { - auto cmd = receive<string>(rep); - if (cmd == "TERMINATE") { - send(rep, true); - done = true; - break; + // Poll socket for a reply, with timeout if we are waiting for some runs + // If a run request passes by in the middle, we simply poll a bit longer. + // As those are not very frequest and come in batches, that's fine. + auto nRep = zmq::poll(&items[0], 2, (toCheck.empty() ? -1 : 500)); + + if (items[1].revents & ZMQ_POLLIN) { + auto type = receive<string>(rep); + if (type == Monitoring::s_Command) { + auto cmd = receive<string>(rep); + if (cmd == "TERMINATE") { + send(rep, true); + done = true; + break; + } + } else if (type == Monitoring::s_RunInfo) { + auto reqRun = receive<Monitoring::RunNumber>(rep); + auto application = receive<string>(rep); + auto it = deadtimes.find(reqRun); + if (it != end(deadtimes) && bool(it->second)) { + send(rep, true, zmq::SNDMORE); + send(rep, *it->second); + } else { + send(rep, false); + runs.emplace(reqRun, application); + toCheck.emplace(reqRun, application); + } } - } else if (type == Monitoring::s_RunInfo) { - auto reqRun = receive<Monitoring::RunNumber>(rep); - auto app = receive<string>(rep); - auto it = deadtimes.find(reqRun); - if (it != end(deadtimes) && bool(it->second)) { - send(rep, true, zmq::SNDMORE); - send(rep, *it->second); + } + + // If we got a reply, process it + if (items[0].revents & ZMQ_POLLIN) { + auto known = receive<string>(rdb); + debug() << "New reply from run DB server: " << known << endmsg; + boost::optional<double> dt{}; + if (known == Monitoring::s_Unknown) { + warning() << "RunDBServ does not have info for " + << (run ? "run " + to_string(*run) : string{"a run"}) << endmsg; } else { - send(rep, false); - runs.emplace(reqRun, app); - if (!request) break; - } - } else if (more) { - while(more) { - receive<zmq::message_t>(rep, &more); + auto info = receive<Monitoring::RunInfo>(rdb); + if (info.deadtime >= 0.) { + dt = info.deadtime; + if (UNLIKELY(msgLevel(MSG::DEBUG))) + debug() << "Deadtime from run DB for run: " << info.run << " " + << info.deadtime << endmsg; + } else { + warning() << "Got reply from RunDBServ for run " << info.run + << ", but deadtime is not known." << endmsg; + } + deadtimes[info.run] = dt; + if (checkIt != end(toCheck) && (*run == info.run)) { + checkIt = toCheck.erase(checkIt); + } } + request = false; } - } - if (!reply) { - tries += !other; - } - - if (reply && deadtimes.count(run)) { - break; + // RunDB connection is now confused, recreate it. + if (nRep == 0 && request) { + if (UNLIKELY(msgLevel(MSG::DEBUG) && run)) { + debug() << "RunDB info for run: " << *run << " " << *app << " timed out." << endmsg; + } + rdb = runDBReq(); + request = false; + if (++tries == m_runDBRetries) { + if (checkIt != end(toCheck)) ++checkIt; + if (checkIt == end(toCheck)) checkIt = begin(toCheck); + break; + } + } else if (!request) { + break; + } } + if (done) break; } - if (done) break; - - // RunDB connection is now confused, restart it. 
- if (request && !reply) { - rdb = runDBReq(); - } } } @@ -425,7 +438,13 @@ void Hlt2MonInfoSvc::function() // Create frontend, backend and control sockets zmq::socket_t data = socket(ZMQ_SUB); zmq::setsockopt(data, zmq::LINGER, 0); - data.bind(m_frontCon.c_str()); + zmq::setsockopt(data, zmq::RCVHWM, 10000); + try { + data.bind(m_frontCon.c_str()); + } catch (const zmq::error_t& e) { + error() << "Failed to bind connection " << m_frontCon << endmsg; + throw e; + } zmq::setsockopt(data, zmq::SUBSCRIBE, ""); info() << "Bound data input socket to: " << m_frontCon << endmsg; @@ -475,11 +494,32 @@ void Hlt2MonInfoSvc::function() {inf, 0, ZMQ_POLLIN, 0}, {syncer, 0, ZMQ_POLLIN, 0} }; - bool paused = false; + + // Storage + Monitoring::HistoMap histograms; + Monitoring::RunInfoMap runInfo; + Monitoring::CounterMap counters; + + // Load histogram info from file + if (!m_inputFile.empty()) { + auto loaded = loadHistoInfo(histograms, m_inputFile); + if (loaded) { + debug() << "Read " << histograms.size() << " histograms from " + << m_inputFile << endmsg; + } else { + warning() << "Failed to load histo info from " << m_inputFile << endmsg; + } + } + + bool syncMore = false; + + set<string> connections; + set<string>::const_iterator connection; + size_t syncTries = 5; while (true) { // Process messages from all sockets - zmq::poll (&items[0], 4, -1); + auto n = zmq::poll (&items[0], 4, syncMore ? 0 : -1); if (items[0].revents & ZMQ_POLLIN) { // Control messages @@ -494,16 +534,10 @@ void Hlt2MonInfoSvc::function() break; } else if (cmd == Monitoring::s_Check) { m_controlConnected = true; - } else if (cmd == "PAUSE") { - debug() << name() << " paused." << endmsg; - paused = true; - } else if (cmd == "RESUME") { - debug() << name() << " resumed." << endmsg; - paused = false; } } - if (!paused && (items[1].revents & ZMQ_POLLIN)) { + if (items[1].revents & ZMQ_POLLIN) { vector<zmq::message_t> msgs; bool more = true; while (more) { @@ -517,11 +551,11 @@ void Hlt2MonInfoSvc::function() auto newMsg = false; if (type == Monitoring::s_HistoInfo) { - newMsg = decodeHistoInfo(msgs); + newMsg = decodeHistoInfo(histograms, msgs); } else if (type == Monitoring::s_RunInfo) { - newMsg = decodeRunInfo(msgs); + newMsg = decodeRunInfo(runInfo, msgs); } else if (type == Monitoring::s_CounterInfo) { - newMsg = decodeCounterInfo(msgs); + newMsg = decodeCounterInfo(counters, msgs); } else if (type == Monitoring::s_Check) { if (msgs.size() != 2) { warning() << "Faulty check request of " << msgs.size() @@ -547,32 +581,78 @@ void Hlt2MonInfoSvc::function() } } - if (!paused && (items[2].revents & ZMQ_POLLIN)) { + if (items[2].revents & ZMQ_POLLIN) { // Requests for info // what type auto type = receive<std::string>(inf); if (type == Monitoring::s_Ping) { send(inf, Monitoring::s_Pong); } else if (type == Monitoring::s_Sync) { - syncRequest(inf); + syncRequest(histograms, runInfo, counters, inf); } else if (type == Monitoring::s_CounterInfo) { - counterInfoRequest(inf); + counterInfoRequest(counters, inf); } else if (type == Monitoring::s_HistoInfo) { - histoInfoRequest(inf); + histoInfoRequest(histograms, inf); } else if (type == Monitoring::s_RunInfo) { - runInfoRequest(inf); + runInfoRequest(runInfo, inf); } else { warning() << "Unknown type of info request received: " << type << endmsg; } } - if (!paused && (items[3].revents & ZMQ_POLLIN)) { - auto cmd = receive<std::string>(syncer); - if (cmd == Monitoring::s_Sync) { - sync(); + // We should sync: + // - if we get a sync command + // - if there is more to sync, but no 
other messages + if ((items[3].revents & ZMQ_POLLIN) || (syncMore && n == 0)) { + + bool doSync = syncMore; + + // Receive new connections if they were sent + if (items[3].revents & ZMQ_POLLIN) { + auto cmd = receive<std::string>(syncer); + if (cmd == Monitoring::s_Sync) { + connections = receive<set<string>>(syncer); + connection = begin(connections); + syncTries = m_syncTries; + doSync = true; + if (UNLIKELY(msgLevel(MSG::DEBUG))) { + debug() << "Synchronising with:"; + for (auto c : connections) { + debug() << " " << c; + } + debug() << endmsg; + } + } + } + + // If we have a connection and should sync, do it. + if (doSync && connection != end(connections)) { + vector<string> what = {Monitoring::s_HistoInfo, + Monitoring::s_RunInfo, + Monitoring::s_CounterInfo}; + + if (UNLIKELY(msgLevel(MSG::VERBOSE))) { + verbose() << "Synchronising with: " << *connection << endmsg; + } + auto r = sync(*connection, what, unordered_set<Monitoring::RunNumber>{}, + histograms, HistoKeys{}, + runInfo, + counters); + if (r) { + // If we successfully synced some histograms, indicate + // whether there is more to be synced. + syncMore = *r; + } else { + // Otherwise, try the next connection, or start from + // the beginning if there are tries left. + ++connection; + if (connection == end(connections) && syncTries > 0) { + connection = begin(connections); + syncTries = m_syncTries; + } + } } } - } m_stopSync = true; @@ -584,7 +664,8 @@ void Hlt2MonInfoSvc::function() } //=============================================================================== -bool Hlt2MonInfoSvc::decodeHistoInfo(const vector<zmq::message_t>& msgs) const +bool Hlt2MonInfoSvc::decodeHistoInfo(Monitoring::HistoMap& histograms, + const vector<zmq::message_t>& msgs) const { if (msgs.size() != 5) { return false; @@ -593,24 +674,42 @@ bool Hlt2MonInfoSvc::decodeHistoInfo(const vector<zmq::message_t>& msgs) const const auto run = decode<Monitoring::RunNumber>(msgs[1]); const auto id = decode<Monitoring::HistId>(msgs[2]); const pair<Monitoring::RunNumber, Monitoring::HistId> key{run, id}; + auto type = decode<std::string>(msgs[3]); - Printer printer{}; + MonInfo::Printer printer{}; string title; - if (!m_histograms.count(key)) { - auto type = decode<std::string>(msgs[3]); + if (!histograms.count(key)) { HistoMap::const_iterator it; - bool placed; + bool placed = false; + + // New histogram, do we need to share content? 
+      boost::optional<HistoVariant> variant;
       if (type == Monitoring::s_Rate) {
-        std::tie(it, placed) = m_histograms.emplace(key, make_pair(type, decode<string>(msgs[4])));
+        variant = decode<string>(msgs[4]);
       } else if (type == Monitoring::s_Histo1D) {
-        std::tie(it, placed) = m_histograms.emplace(key, make_pair(type, decode<Histo1DDef>(msgs[4])));
+        variant = decode<Histo1DDef>(msgs[4]);
       } else if (type == Monitoring::s_Histo2D) {
-        std::tie(it, placed) = m_histograms.emplace(key, make_pair(type, decode<Histo2DDef>(msgs[4])));
+        variant = decode<Histo2DDef>(msgs[4]);
+      } else {
+        warning() << "Unknown type of histogram info: " << type
+                  << " for histogram with ID: " << run << " " << id << endmsg;
+      }
+
+      if (variant) {
+        std::tie(it, placed) = addHistogram(histograms, key, type, std::move(*variant));
+      }
+
+      if (UNLIKELY(msgLevel(MSG::VERBOSE))) {
+        if (placed) {
+          verbose() << "New histogram: " << key.first << " " << std::setw(20) << std::right
+                    << key.second << " " << std::setw(7) << std::left << type << " "
+                    << boost::apply_visitor(printer, it->content()) << endmsg;
+        } else {
+          verbose() << "Known histogram: " << key.first << " " << std::setw(20) << std::right
+                    << key.second << endmsg;
+        }
       }
-      debug() << "New histogram: " << key.first << " " << std::setw(20) << std::right
-              << key.second << " " << std::setw(7) << std::left << type << " "
-              << boost::apply_visitor(printer, it->second.second) << endmsg;
       return true;
    } else {
       return false;
@@ -618,7 +717,9 @@ bool Hlt2MonInfoSvc::decodeHistoInfo(const vector<zmq::message_t>& msgs) const
 }
 
 //===============================================================================
-bool Hlt2MonInfoSvc::decodeCounterInfo(const vector<zmq::message_t>& msgs, const bool quiet) const
+bool Hlt2MonInfoSvc::decodeCounterInfo(Monitoring::CounterMap& counters,
+                                       const vector<zmq::message_t>& msgs,
+                                       const bool quiet) const
 {
    if (msgs.size() != 3) {
       return false;
@@ -626,13 +727,13 @@ bool Hlt2MonInfoSvc::decodeCounterInfo(const vector<zmq::message_t>& msgs, const
 
    const auto id = decode<Monitoring::HistId>(msgs[1]);
 
-   Printer printer{};
+   MonInfo::Printer printer{};
    string title;
-   if (!m_counters.count(id)) {
+   if (!counters.count(id)) {
       CounterMap::const_iterator it;
       bool placed;
-      std::tie(it, placed) = m_counters.emplace(id, decode<CounterDef>(msgs[2]));
+      std::tie(it, placed) = counters.emplace(id, decode<CounterDef>(msgs[2]));
       if (UNLIKELY(msgLevel(MSG::DEBUG) && !quiet))
          debug() << std::setw(22) << std::left << "New counter: " << std::setw(20) << std::right
                  << id << " COUNTER " << printer(it->second) << endmsg;
@@ -643,31 +744,35 @@ bool Hlt2MonInfoSvc::decodeCounterInfo(const vector<zmq::message_t>& msgs, const
 }
 
 //===============================================================================
-bool Hlt2MonInfoSvc::decodeRunInfo(const vector<zmq::message_t>& msgs) const
+bool Hlt2MonInfoSvc::decodeRunInfo(Monitoring::RunInfoMap& runInfo,
+                                   const vector<zmq::message_t>& msgs) const
 {
    if (msgs.size() != 3) {
       return false;
    }
 
    auto app = decode<pair<string, string>>(msgs[1]);
-   auto runInfo = decode<Monitoring::RunInfo>(msgs[2]);
-
-   debug() << "Decoded run info: " << runInfo.run << " " << app.first << " "
-           << runInfo.start << " " << runInfo.tck << endmsg;
+   auto info = decode<Monitoring::RunInfo>(msgs[2]);
 
    // Add to internal store
-   RunInfoKey key{runInfo.run, app.first};
-   if (!m_runInfo.count(key)) {
+   RunInfoKey key{info.run, app.first};
+   if (!runInfo.count(key)) {
      bool complete = !m_top;
      if (m_top) {
-       auto dt = deadtime(runInfo.run, app.first);
+
auto dt = deadtime(info.run, app.first); if (dt) { - runInfo.deadtime = *dt; + info.deadtime = *dt; complete = true; } } if (UNLIKELY(msgLevel(MSG::DEBUG))) - debug() << "New run info for run: " << runInfo.run << " " << app.first << endmsg; - m_runInfo.emplace(std::move(key), make_pair(complete, std::move(runInfo))); + debug() << "New run info for run: " << info.run << " " << app.first + << " complete " << complete << " deadtime " << info.deadtime + << endmsg; + runInfo.emplace(std::move(key), make_pair(complete, std::move(info))); + if (UNLIKELY(msgLevel(MSG::DEBUG))) { + debug() << "Decoded run info: " << info.run << " " << app.first << " " + << info.start << " " << info.tck << endmsg; + } return true; } else { return false; @@ -675,128 +780,165 @@ bool Hlt2MonInfoSvc::decodeRunInfo(const vector<zmq::message_t>& msgs) const } //=============================================================================== -void Hlt2MonInfoSvc::sync() const +boost::optional<bool> Hlt2MonInfoSvc::sync(std::string connection, + const std::vector<string>& what, + const std::unordered_set<Monitoring::RunNumber>& reqRuns, + Monitoring::HistoMap& histograms, + const Monitoring::HistoKeys& exclude, + Monitoring::RunInfoMap& runInfo, + Monitoring::CounterMap& counters) const { - - vector<string> what = {Monitoring::s_HistoInfo, - Monitoring::s_RunInfo, - Monitoring::s_CounterInfo}; - HistoKeys histoKeys; - std::for_each(begin(m_histograms), end(m_histograms), - [&histoKeys](const HistoMap::value_type& entry) { - histoKeys.emplace(entry.first); + std::for_each(begin(histograms), end(histograms), + [&histoKeys](const HistoEntry& entry) { + histoKeys.emplace(entry.key); + }); + + std::for_each(begin(exclude), end(exclude), + [&histoKeys](const HistoKey& key) { + histoKeys.emplace(key); }); // gather run info RunInfoKeys runKeys; - std::for_each(begin(m_runInfo), end(m_runInfo), + std::for_each(begin(runInfo), end(runInfo), [&runKeys](const RunInfoMap::value_type& entry) { runKeys.emplace(entry.first); }); CounterKeys counterKeys; - std::for_each(begin(m_counters), end(m_counters), + std::for_each(begin(counters), end(counters), [&counterKeys](const CounterMap::value_type& entry) { counterKeys.emplace(entry.first); }); - if (UNLIKELY(msgLevel(MSG::DEBUG))) { - debug() << "Synchronising with:"; - for (auto c : m_syncConnections) { - debug() << " " << c; + // Connect output request socket + zmq::socket_t out = socket(zmq::REQ); + zmq::setsockopt(out, zmq::LINGER, 0); + zmq::setsockopt(out, zmq::RCVTIMEO, 100); + out.connect(connection.c_str()); + + boost::optional<bool> more; + + // Request synchronisation + send(out, Monitoring::s_Sync, zmq::SNDMORE); + + // Send what we want to synchronise in the right order + send(out, what, zmq::SNDMORE); + + for (auto w : what) { + if (w == Monitoring::s_HistoInfo) { + // Indicate we want all runs, by sending an empty set, and + // the histo keys we have. + send(out, reqRuns, zmq::SNDMORE); + send(out, histoKeys, w == what.back() ? 0 : zmq::SNDMORE); + } else if (w == Monitoring::s_RunInfo) { + // The run info we have + send(out, runKeys, w == what.back() ? 0 : zmq::SNDMORE); + } else if (w == Monitoring::s_CounterInfo) { + send(out, counterKeys, w == what.back() ? 
0 : zmq::SNDMORE); } - debug() << endmsg; } - for (auto connection : m_syncConnections) { - if (UNLIKELY(msgLevel(MSG::DEBUG))) - debug() << "Synchronising with: " << connection << endmsg; - - // Connect output request socket - zmq::socket_t out = socket(zmq::REQ); - zmq::setsockopt(out, zmq::LINGER, 0); - zmq::setsockopt(out, zmq::RCVTIMEO, 100); - out.connect(connection.c_str()); - - // Request synchronisation - send(out, Monitoring::s_Sync, zmq::SNDMORE); - // Send what we want to synchronise in the right order - send(out, what, zmq::SNDMORE); - - // The histo keys we have - send(out, histoKeys, zmq::SNDMORE); - - // The run info we have - send(out, runKeys, zmq::SNDMORE); - - // The counter keys we have - send(out, counterKeys); + boost::optional<string> rep; + zmq::pollitem_t items[] = { + {out, 0, ZMQ_POLLIN, 0} + }; - boost::optional<string> rep; - zmq::pollitem_t items[] = { - {out, 0, ZMQ_POLLIN, 0} - }; + zmq::poll(&items[0], 1, 1000); + if (items[0].revents & ZMQ_POLLIN) { + rep = receive<std::string>(out); + } - zmq::poll(&items[0], 1, 1000); - if (items[0].revents & ZMQ_POLLIN) { - rep = receive<std::string>(out); - } + if (!rep) { + debug() << "Sync request reply timed out." << endmsg; + return more; + } else if (*rep != "INCOMING") { + debug() << "Bad reply to sync request: " << *rep << endmsg; + return more; + } - if (!rep) { - debug() << "Sync request reply timed out." << endmsg; - } else if (*rep == "INCOMING") { + for (auto w : what) { + if (w == Monitoring::s_HistoInfo) { std::unordered_map<Monitoring::RunNumber, size_t> hpr; // Histograms auto histos = receive<HistoPub>(out); + auto m = receive<bool>(out); + more = (more ? *more | m : m); for (const auto& entry : histos) { - hpr[std::get<0>(entry)]++; + const auto& type = std::get<0>(entry); + const auto& info = std::get<1>(entry); + const auto& keys = std::get<2>(entry); + // Entry consists of (RunNumber, HistId, type, info_string), + // where the info string needs to be converted to a message. 
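+        // (Note on the new layout: each entry is (type, info_string, keys), i.e. one
+        //  encoded definition shared by a vector of (run, id) keys. The five info
+        //  messages are therefore built once, for the first key, and only the run and
+        //  id parts, msgs[1] and msgs[2], are swapped in for every subsequent key.)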
vector<zmq::message_t> msgs; msgs.reserve(5); - msgs.emplace_back(encode(Monitoring::s_HistoInfo)); - msgs.emplace_back(encode(std::get<0>(entry))); - msgs.emplace_back(encode(std::get<1>(entry))); - msgs.emplace_back(encode(std::get<2>(entry))); - const auto& infoString = std::get<3>(entry); - zmq::message_t msg{infoString.size()}; - std::copy_n(begin(infoString), infoString.size(), static_cast<char*>(msg.data())); - msgs.emplace_back(std::move(msg)); - decodeHistoInfo(msgs); - } + for (const auto& key : keys) { + if (msgs.empty()) { + msgs.emplace_back(encode(Monitoring::s_HistoInfo)); + msgs.emplace_back(encode(key.first)); + msgs.emplace_back(encode(key.second)); + msgs.emplace_back(encode(type)); - if (UNLIKELY(msgLevel(MSG::DEBUG))) { - for (const auto& entry : hpr) { - debug() << "Decoded " << std::right << std::setw(6) << to_string(entry.second) - << " histograms for run " << entry.first << endmsg; + zmq::message_t msg{info.size()}; + std::copy_n(begin(info), info.size(), static_cast<char*>(msg.data())); + msgs.emplace_back(std::move(msg)); + } else { + msgs[1] = encode(key.first); + msgs[2] = encode(key.second); + } + // Update known keys with received info + histoKeys.emplace(key.first, key.second); + + hpr[key.first]++; + decodeHistoInfo(histograms, msgs); } } + for (const auto& entry : hpr) { + info() << "Decoded " << std::right << std::setw(6) << to_string(entry.second) + << " histograms for run " << entry.first << endmsg; + } + if (UNLIKELY(msgLevel(MSG::VERBOSE) && hpr.empty())) { + verbose() << "No new histogram info after synchronisation." << endmsg; + } + } else if (w == Monitoring::s_RunInfo) { // Run infos auto runInfos = receive<RunInfoPub>(out); + auto m = receive<bool>(out); + more = (more ? *more | m : m); for (const auto& entry : runInfos) { + // Update known keys with received info + runKeys.emplace(entry.second.run, entry.first); + vector<zmq::message_t> msgs; msgs.reserve(3); msgs.emplace_back(encode(Monitoring::s_RunInfo)); msgs.emplace_back(encode(make_pair(entry.first, string{"fake"}))); msgs.emplace_back(encode(entry.second)); - decodeRunInfo(msgs); + decodeRunInfo(runInfo, msgs); } - + } else if (w == Monitoring::s_CounterInfo) { // Counters - auto counters = receive<CounterPub>(out); + auto counterPub = receive<CounterPub>(out); + auto m = receive<bool>(out); + more = (more ? *more | m : m); size_t nc = 0; - for (const auto& entry : counters) { + for (const auto& entry : counterPub) { + auto id = std::get<0>(entry); + counterKeys.emplace(id); + vector<zmq::message_t> msgs; msgs.reserve(3); msgs.emplace_back(encode(Monitoring::s_CounterInfo)); - msgs.emplace_back(encode(std::get<0>(entry))); + msgs.emplace_back(encode(id)); msgs.emplace_back(encode(std::get<1>(entry))); - decodeCounterInfo(msgs, true); + decodeCounterInfo(counters, msgs, true); ++nc; } @@ -804,28 +946,43 @@ void Hlt2MonInfoSvc::sync() const if (nc != 0) { debug() << "Decoded " << std::right << std::setw(6) << to_string(nc) << " counters" << endmsg; - } else if (!hpr.empty() && !runInfos.empty() && nc == 0) { - debug() << "No new info after synchronisation." << endmsg; + } else { + verbose() << "No new counter info after synchronisation." << endmsg; } } + } + } + if (UNLIKELY(msgLevel(MSG::DEBUG))) { + if (more && *more) { + debug() << "Syncing again with " << connection + << " as there is more info." 
<< endmsg; } else { - info() << "Got unexpected reply to info request: " << *rep << endmsg; + verbose() << "Done syncing with " << connection << endmsg; } } + return more; } //=============================================================================== -void Hlt2MonInfoSvc::syncRequest(zmq::socket_t& inf) const +void Hlt2MonInfoSvc::syncRequest(const Monitoring::HistoMap& histograms, + const Monitoring::RunInfoMap& runInfo, + const Monitoring::CounterMap& counters, + zmq::socket_t& inf) const { HistoKeys otherKeys; CounterKeys otherCounters; RunInfoKeys otherRuns; + unordered_set<Monitoring::RunNumber> reqRuns; - // What runs and histos does the other know? + // What types of info need synchronising? auto what = receive<vector<string>>(inf); + + // What runs and histos does the other know? for (auto type : what) { if (type == Monitoring::s_HistoInfo) { + // What runs do we want info about? + reqRuns = receive<unordered_set<Monitoring::RunNumber>>(inf); otherKeys = receive<HistoKeys>(inf); } else if (type == Monitoring::s_CounterInfo) { otherCounters = receive<CounterKeys>(inf); @@ -849,51 +1006,107 @@ void Hlt2MonInfoSvc::syncRequest(zmq::socket_t& inf) const // Send requested information in the order it was requested send(inf, "INCOMING", zmq::SNDMORE); + if (UNLIKELY(msgLevel(MSG::VERBOSE))) { + verbose() << "Received request to sync:"; + for (auto type : what) { + verbose() << " " << type; + } + verbose() << endmsg; + } + for (auto type : what) { if (type == Monitoring::s_HistoInfo) { - // Find the info that the other service does not have. - auto pubHistos = SelectPub<HistoMap, HistoKeys>{}(m_histograms, otherKeys); + + // Check wether the run of a given histogram has been requested + auto useRun = [&reqRuns](Monitoring::RunNumber run) -> bool { + return reqRuns.empty() || reqRuns.count(run); + }; + // Publish histo info HistoPub histos; - histos.reserve(pubHistos.size()); - for (auto it : pubHistos) { - auto infoString = makeString(boost::apply_visitor(encoder, it->second.second)); - histos.emplace_back(make_tuple(it->first.first, it->first.second, - it->second.first, std::move(infoString))); + + // Get our histograms by content (hash) + const auto& hbh = histograms.get<ByContent>(); + histos.reserve(std::min(m_syncBatchSize / 2, hbh.size())); + + // Loop over the hashes and for each hash add all keys that + // are not known for that given hash, if the run was + // requested. + bool more = false; + for (auto it = begin(hbh); it != end(hbh); it = hbh.upper_bound(it->hash)) { + HistoMap::index<ByContent>::type::const_iterator first, last; + std::tie(first, last) = hbh.equal_range(it->hash); + std::vector<HistoKey> keys; + keys.reserve(std::distance(first, last)); + std::for_each(first, last, [&keys, &otherKeys, &useRun](const HistoEntry& entry) { + if (useRun(entry.key.first) && !otherKeys.count(entry.key)) { + keys.push_back(entry.key); + } + }); + keys.shrink_to_fit(); + if (!keys.empty()) { + auto infoString = makeString(boost::apply_visitor(encoder, it->content())); + histos.emplace_back(make_tuple(it->type, std::move(infoString), std::move(keys))); + } + // TODO: Do we count hashes, or do we count keys?? + if (histos.size() > m_syncBatchSize / 2) { + more = true; + break; + } } - send(inf, histos, type == what.back() ? 
0 : zmq::SNDMORE); - } + // Indicate if there is more to sync, and then send the info - if (type == Monitoring::s_RunInfo) { - auto pubRuns = SelectPub<RunInfoMap, RunInfoKeys>{}(m_runInfo, otherRuns); + // Send the info and indicate if there is more to sync, + if (UNLIKELY(msgLevel(MSG::VERBOSE))) { + verbose() << "Sending " << histos.size() << " histos, more = " + << more << endmsg; + } + + send(inf, histos, zmq::SNDMORE); + send(inf, more, type == what.back() ? 0 : zmq::SNDMORE); + } else if (type == Monitoring::s_RunInfo) { + auto r = SelectPub<RunInfoMap, RunInfoKeys>{}(runInfo, otherRuns, m_syncBatchSize); // Publish run info RunInfoPub runInfos; - runInfos.reserve(pubRuns.size()); - for (auto it : pubRuns) { + runInfos.reserve(r.first.size()); + for (auto it : r.first) { runInfos.emplace_back(make_pair(it->first.second, it->second.second)); } - send(inf, runInfos, type == what.back() ? 0 : zmq::SNDMORE); - } - if (type == Monitoring::s_CounterInfo) { - auto pubCounters = SelectPub<CounterMap, CounterKeys>{}(m_counters, otherCounters); + // Send the info and indicate if there is more to sync, + if (UNLIKELY(msgLevel(MSG::VERBOSE))) { + verbose() << "Sending " << runInfos.size() << " run infos, more = " + << r.second << endmsg; + } + + send(inf, runInfos, zmq::SNDMORE); + send(inf, r.second, type == what.back() ? 0 : zmq::SNDMORE); + } else if (type == Monitoring::s_CounterInfo) { + auto r = SelectPub<CounterMap, CounterKeys>{}(counters, otherCounters, m_syncBatchSize); // Publish counter info CounterPub counters; - counters.reserve(pubCounters.size()); - for (auto it : pubCounters) { + counters.reserve(r.first.size()); + for (auto it : r.first) { counters.emplace_back(make_tuple(it->first, it->second)); } + if (UNLIKELY(msgLevel(MSG::VERBOSE))) { + verbose() << "Sending " << counters.size() << " counters, more = " + << r.second << endmsg; + } + // Send information - send(inf, counters, type == what.back() ? 0 : zmq::SNDMORE); + send(inf, counters, zmq::SNDMORE); + send(inf, r.second, type == what.back() ? 0 : zmq::SNDMORE); } } } //=============================================================================== -bool Hlt2MonInfoSvc::histoInfoRequest(zmq::socket_t& inf) const +bool Hlt2MonInfoSvc::histoInfoRequest(const Monitoring::HistoMap& histograms, + zmq::socket_t& inf) const { // What run and ID requested? 
auto key = receiveRunAndId(inf); @@ -901,8 +1114,10 @@ bool Hlt2MonInfoSvc::histoInfoRequest(zmq::socket_t& inf) const Monitoring::HistId id; std::tie(run, id) = key; - verbose() << std::setw(21) << "New histo info request: " - << std::right << run << " " << std::setw(20) << id << endmsg; + if (UNLIKELY(msgLevel(MSG::VERBOSE))) { + verbose() << std::setw(21) << "New histo info request: " + << std::right << run << " " << std::setw(20) << id << endmsg; + } // Prepare reply std::string known; @@ -911,11 +1126,11 @@ bool Hlt2MonInfoSvc::histoInfoRequest(zmq::socket_t& inf) const Encoder encoder{zmq()}; - auto it = m_histograms.find(key); - if (it != end(m_histograms)) { + auto it = histograms.find(key); + if (it != end(histograms)) { send(inf, Monitoring::s_Known, zmq::SNDMORE); - send(inf, it->second.first, zmq::SNDMORE); - send(inf, boost::apply_visitor(encoder, it->second.second)); + send(inf, it->type, zmq::SNDMORE); + send(inf, boost::apply_visitor(encoder, it->content())); } else { send(inf, Monitoring::s_Unknown); } @@ -924,20 +1139,23 @@ bool Hlt2MonInfoSvc::histoInfoRequest(zmq::socket_t& inf) const } //=============================================================================== -bool Hlt2MonInfoSvc::counterInfoRequest(zmq::socket_t& inf) const +bool Hlt2MonInfoSvc::counterInfoRequest(const Monitoring::CounterMap& counters, + zmq::socket_t& inf) const { // What run and ID requested? auto id = receive<Monitoring::HistId>(inf); - verbose() << std::setw(21) << "New counter info request: " - << " " << std::setw(27) << id << endmsg; + if (UNLIKELY(msgLevel(MSG::VERBOSE))) { + verbose() << std::setw(21) << "New counter info request: " + << " " << std::setw(27) << id << endmsg; + } // Prepare reply std::string known; std::string reply; - auto it = m_counters.find(id); - if (it != end(m_counters)) { + auto it = counters.find(id); + if (it != end(counters)) { send(inf, Monitoring::s_Known, zmq::SNDMORE); send(inf, it->second); } else { @@ -948,19 +1166,24 @@ bool Hlt2MonInfoSvc::counterInfoRequest(zmq::socket_t& inf) const } //=============================================================================== -bool Hlt2MonInfoSvc::runInfoRequest(zmq::socket_t& inf) const +bool Hlt2MonInfoSvc::runInfoRequest(Monitoring::RunInfoMap& runInfo, + zmq::socket_t& inf) const { // Incoming IDs auto run = receive<Monitoring::RunNumber>(inf); auto app = receive<string>(inf); - verbose() << std::setw(21) << "New run info requested for " << run << " " - << app << endmsg; + if (UNLIKELY(msgLevel(MSG::VERBOSE))) { + verbose() << std::setw(21) << "New run info requested for " << run << " " + << app << endmsg; + } - auto it = m_runInfo.find(make_pair(run, app)); - if (it == end(m_runInfo)) { + auto it = runInfo.find(make_pair(run, app)); + if (it == end(runInfo)) { send(inf, Monitoring::s_Unknown); - verbose() << "Run info unknown " << endmsg; + if (UNLIKELY(msgLevel(MSG::VERBOSE))) { + verbose() << "Run info unknown " << endmsg; + } } else { if (m_top && !it->second.first) { auto dt = deadtime(run, app); @@ -969,7 +1192,10 @@ bool Hlt2MonInfoSvc::runInfoRequest(zmq::socket_t& inf) const it->second.first = true; } } - verbose() << "Run info known with deadtime: " << it->second.second.deadtime << endmsg; + if (UNLIKELY(msgLevel(MSG::VERBOSE))) { + verbose() << "Run info known with deadtime: " + << it->second.second.deadtime << endmsg; + } send(inf, Monitoring::s_Known, zmq::SNDMORE); send(inf, it->second.second); } @@ -984,6 +1210,13 @@ boost::optional<double> Hlt2MonInfoSvc::deadtime(const Monitoring::RunNumber 
run m_runDBThread = make_unique<std::thread>([this]{runDB();}); } + auto vb = msgLevel(MSG::VERBOSE); + + if (UNLIKELY(vb)) { + verbose() << "Requesting deadtime for run " + << run << " from run DB thread." << endmsg; + } + auto rdb = runDBSocket(); boost::optional<double> r{}; size_t tries = 0; @@ -1005,12 +1238,16 @@ boost::optional<double> Hlt2MonInfoSvc::deadtime(const Monitoring::RunNumber run if (items[0].revents & ZMQ_POLLIN) { try { auto known = receive<bool>(rdb); + if (UNLIKELY(vb)) verbose() << "Reply from run DB thread: " + << (known ? "known: " : "unknown."); if (known) { auto dt = receive<double>(rdb); + if (UNLIKELY(vb)) verbose() << dt; if (dt >= 0.) { r = dt; } } + if (UNLIKELY(vb))verbose() << endmsg; break; } catch (const ZMQ::TimeOutException&) { rdb = runDBSocket(); diff --git a/Online/Hlt2Monitoring/src/component/Hlt2MonInfoSvc.h b/Online/Hlt2Monitoring/src/component/Hlt2MonInfoSvc.h index cae8bd57b4ece7f62c8713a7bfe7541b8dd4c5af..0a59e57504fd607d3edc8237136ccc20e109af9e 100644 --- a/Online/Hlt2Monitoring/src/component/Hlt2MonInfoSvc.h +++ b/Online/Hlt2Monitoring/src/component/Hlt2MonInfoSvc.h @@ -22,6 +22,7 @@ // Hlt2Monitoring #include <Hlt2Monitoring/Types.h> +#include <Hlt2Monitoring/HistoUtils.h> #include <Hlt2Monitoring/RunInfo.h> #include <Hlt2Monitoring/CounterDef.h> #include <Hlt2Monitoring/HashTuple.h> @@ -41,7 +42,6 @@ class Hlt2MonInfoSvc : public Hlt2MonBaseSvc { public: Hlt2MonInfoSvc(const std::string& name, ISvcLocator* sl); - virtual ~Hlt2MonInfoSvc(); // Service pure virtual member functions virtual StatusCode initialize() override; @@ -55,16 +55,32 @@ private: void synchroniser(); // Request to synchronise with others - void sync() const; - - bool decodeHistoInfo(const std::vector<zmq::message_t>& info) const; - bool decodeCounterInfo(const std::vector<zmq::message_t>& info, const bool quiet = false) const; - bool decodeRunInfo(const std::vector<zmq::message_t>& info) const; - - void syncRequest(zmq::socket_t& inf) const; - bool histoInfoRequest(zmq::socket_t& inf) const; - bool counterInfoRequest(zmq::socket_t& inf) const; - bool runInfoRequest(zmq::socket_t& inf) const; + boost::optional<bool> sync(std::string connection, + const std::vector<std::string>& what, + const std::unordered_set<Monitoring::RunNumber>& reqRuns, + Monitoring::HistoMap& histograms, + const Monitoring::HistoKeys& exclude, + Monitoring::RunInfoMap& runInfo, + Monitoring::CounterMap& counters) const; + + bool decodeHistoInfo(Monitoring::HistoMap& histograms, + const std::vector<zmq::message_t>& info) const; + bool decodeCounterInfo(Monitoring::CounterMap& counters, + const std::vector<zmq::message_t>& info, + const bool quiet = false) const; + bool decodeRunInfo(Monitoring::RunInfoMap& runInfo, + const std::vector<zmq::message_t>& info) const; + + void syncRequest(const Monitoring::HistoMap& histograms, + const Monitoring::RunInfoMap& runInfo, + const Monitoring::CounterMap& counters, + zmq::socket_t& inf) const; + bool histoInfoRequest(const Monitoring::HistoMap& histograms, + zmq::socket_t& inf) const; + bool counterInfoRequest(const Monitoring::CounterMap& counters, + zmq::socket_t& inf) const; + bool runInfoRequest(Monitoring::RunInfoMap& runInfo, + zmq::socket_t& inf) const; void runDB() const; std::string runDBThreadCon() const { @@ -91,6 +107,10 @@ private: SmartIF<ITransmitterSvc> m_transmitter; unsigned int m_sendInterval; unsigned int m_sourceID; + std::string m_connectionPath; + size_t m_syncBatchSize; + size_t m_syncTries; + std::string m_inputFile; // data members 
std::string m_hostname; @@ -113,41 +133,34 @@ private: const IZeroMQSvc& m_zmq; }; - // Visitor to print different types in an info message - class Printer : public boost::static_visitor<std::string> { - public: - std::string operator()( const std::string& eval ) { - return eval; - } - - std::string operator()( const Monitoring::Histo1DDef& def ) { - return def.title; - } + template<class C, class O, class K = typename C::key_type> + struct SelectPub { - std::string operator()( const Monitoring::Histo2DDef& def ) { - return def.title; + std::pair<std::vector<typename C::const_iterator>, bool> + operator()(const C& container, const O& other, size_t n) { + auto pred = [](typename C::const_iterator) { return true; }; + auto extract = [](typename C::const_iterator it) { return it->first; }; + return operator()(container, other, extract, pred, n); } - std::string operator()( const Monitoring::CounterDef& def ) { - return def.name + " " + def.description; - } - }; - - template<class C, class O> - struct SelectPub { - std::vector<typename C::const_iterator> operator()(const C& container, const O& other) { + std::pair<std::vector<typename C::const_iterator>, bool> + operator()(const C& container, const O& other, + std::function<K(typename C::const_iterator)> keyExtract, + std::function<bool(typename C::const_iterator)> predicate, size_t n) { std::vector<typename C::const_iterator> pub; + bool more = false; for (auto it = begin(container), last = end(container); it != last; ++it) { - if (!other.count(it->first)) pub.emplace_back(it); + if (predicate(it) && !other.count(keyExtract(it))) pub.emplace_back(it); + if (pub.size() > n) { + more = true; + break; + } } - return pub; + return make_pair(std::move(pub), more); } }; // Storage for histogram, counter, and run information - mutable Monitoring::HistoMap m_histograms; - mutable Monitoring::CounterMap m_counters; - mutable Monitoring::RunInfoMap m_runInfo; mutable boost::unordered_map<Monitoring::RunNumber, int> m_startTimes; // Run Info diff --git a/Online/Hlt2Monitoring/src/component/Hlt2RootPublishSvc.cpp b/Online/Hlt2Monitoring/src/component/Hlt2RootPublishSvc.cpp index 7494bad9c7fd0267d86bdcfad72cf43270b66fe2..d0a39570daf2fde0ee8f7d9d07be8a2b6d023e0d 100644 --- a/Online/Hlt2Monitoring/src/component/Hlt2RootPublishSvc.cpp +++ b/Online/Hlt2Monitoring/src/component/Hlt2RootPublishSvc.cpp @@ -5,6 +5,9 @@ #include <memory> #include <set> #include <thread> +#include <random> +#include <unordered_set> +#include <unordered_map> // boost #include <boost/algorithm/string/find.hpp> @@ -31,6 +34,7 @@ #include <Hlt2Monitoring/Histo1DDef.h> #include <Hlt2Monitoring/Histo2DDef.h> #include <Hlt2Monitoring/Serialize.h> +#include <Hlt2Monitoring/Utilities.h> // ROOT #include <TClass.h> @@ -51,6 +55,7 @@ namespace { using std::make_pair; using std::pair; using std::unordered_map; + using std::unordered_set; using boost::lexical_cast; namespace ba = boost::algorithm; @@ -95,14 +100,15 @@ DECLARE_SERVICE_FACTORY(Hlt2RootPublishSvc) //============================================================================= Hlt2RootPublishSvc::Hlt2RootPublishSvc(const string& name, ISvcLocator* loc) - : Hlt2MonBaseSvc(name, loc), - m_stopPublishing{false} + : Hlt2MonBaseSvc(name, loc) { declareProperty("InfoConnection", m_infoCon); declareProperty("PublishInterval", m_publishInterval = 60); declareProperty("RateStart", m_rateStart = 0.); declareProperty("RunDuration", m_runDuration = 4000.); declareProperty("RateInterval", m_rateInterval = 5.); + declareProperty("BatchSize", 
m_batchSizeProp = 400);
+  declareProperty("HighWaterMark", m_hwm = 10000);
 }
 
 //===============================================================================
@@ -113,6 +119,8 @@ StatusCode Hlt2RootPublishSvc::initialize()
       return sc;
    }
 
+   m_batchSize = m_batchSizeProp;
+
    if (m_frontCon.empty() || m_backCon.empty() || m_infoCon.empty()) {
       warning() << "Connections not correctly configured, "
                 << "Hlt2 ROOT publisher disabled" << endmsg;
@@ -133,11 +141,14 @@ void Hlt2RootPublishSvc::function()
    auto control = connectControl();
    if (!m_controlConnected) return;
 
-   zmq::socket_t front = socket(ZMQ_SUB);
-   front.connect(m_frontCon.c_str());
+   zmq::socket_t front = socket(zmq::SUB);
+   zmq::setsockopt(front, zmq::LINGER, 0);
    zmq::setsockopt(front, zmq::SUBSCRIBE, "");
-   info() << "Connected front socket to: " << m_frontCon << endmsg;
-
+   auto hwm = boost::numeric_cast<int>(m_hwm);
+   zmq::setsockopt(front, zmq::RCVHWM, hwm);
+   front.connect(m_frontCon.c_str());
+   info() << "Connected front socket to: " << m_frontCon
+          << " with HWM " << hwm << endmsg;
    // Clean up queue
    zmq::message_t msg;
    while (front.recv(&msg, ZMQ_DONTWAIT)) {
@@ -148,8 +159,11 @@ void Hlt2RootPublishSvc::function()
    info() << "Connected info socket to: " << m_infoCon << endmsg;
 
    zmq::socket_t back = socket(ZMQ_PUB);
+   zmq::setsockopt(back, zmq::LINGER, 0);
+   zmq::setsockopt(back, zmq::SNDHWM, hwm);
    back.bind(m_backCon.c_str());
-   info() << "Bound back socket to: " << m_backCon << endmsg;
+   info() << "Bound back socket to: " << m_backCon
+          << " with HWM " << hwm << endmsg;
 
    // Publish trigger thread socket
    zmq::socket_t publish = socket(zmq::PAIR);
@@ -157,15 +171,18 @@
    publish.bind(pubCon().c_str());
 
    // Start thread to trigger saving of histograms
-   std::thread pubThread([this] { periodic(pubCon(), Monitoring::s_Publish,
-                                           m_stopPublishing, m_publishInterval); });
+   std::thread pubThread([this] { periodic(pubCon(), Monitoring::s_Publish, 0.5); });
+   size_t pubCounter = m_publishInterval * 2;
 
    // Initialize poll set
    zmq::pollitem_t items[] = {{control, 0, ZMQ_POLLIN, 0},
                               {front, 0, ZMQ_POLLIN, 0},
                               {publish, 0, ZMQ_POLLIN, 0}};
 
-   bool paused = false;
+   ROOTHistos histos;
+   SentHistos sentHistos;
+   Queue messages;
+   bool sentAll = true;
 
    while (true) {
      // Process messages from both sockets
@@ -175,23 +192,13 @@
 
      if (items[0].revents & ZMQ_POLLIN) {
        auto cmd = receive<string>(control);
-       if (cmd == "TERMINATE") {
-         for (auto s : {&front, &inf, &control}) {
-           zmq::setsockopt(*s, zmq::LINGER, 0);
-         }
+       if (cmd == Monitoring::s_Terminate) {
+         send(publish, Monitoring::s_Terminate);
          break;
-       } else if (cmd == "PAUSE") {
-         if (UNLIKELY(msgLevel(MSG::DEBUG)))
-           debug() << name() << " paused." << endmsg;
-         paused = true;
-       } else if (cmd == "RESUME") {
-         if (UNLIKELY(msgLevel(MSG::DEBUG)))
-           debug() << name() << " resumed." << endmsg;
-         paused = false;
        }
      }
 
-     if (!paused && (items[1].revents & ZMQ_POLLIN)) {
+     if (items[1].revents & ZMQ_POLLIN) {
        // Deserialize
        auto type = receive<string>(front);
        if (type == Monitoring::s_HistDiff) {
@@ -202,8 +209,8 @@
          string type, dir;
          TH1* rhist = nullptr;
          HistoKey key{hdiff.runNumber, hdiff.histId};
-         auto it = m_histos.find(key);
-         if (it != end(m_histos)) { // We have it cached!
+         auto it = histos.find(key);
+         if (it != end(histos)) { // We have it cached!
type = std::get<0>(it->second); dir = std::get<1>(it->second); rhist = std::get<2>(it->second).get(); @@ -212,7 +219,7 @@ void Hlt2RootPublishSvc::function() std::tie(type, dir, hist) = getHistogram(inf, hdiff.runNumber, hdiff.histId); if (hist) { rhist = hist.get(); - m_histos.emplace(std::move(key), make_tuple(type, dir, std::move(hist))); + histos.emplace(std::move(key), make_tuple(type, dir, std::move(hist))); } } if (rhist == nullptr) { @@ -250,47 +257,160 @@ void Hlt2RootPublishSvc::function() } } - if (!paused && items[2].revents & ZMQ_POLLIN) { + if (items[2].revents & ZMQ_POLLIN) { auto cmd = receive<string>(publish); if (cmd == Monitoring::s_Publish) { - // Sync with info service and store resulting empty - // histogramsq - auto histos = syncHistograms(inf); - for (auto& entry : histos) { - m_histos.emplace(std::move(entry)); + if (pubCounter > 0) { + --pubCounter; } - // Publish - publishHistograms(back); + if ((pubCounter == 0 && messages.empty() && !histos.empty()) + || (messages.empty() && !sentAll)) { + // Find which runs have histograms that are not empty + // NOTE: This can probably be done better with a + // multi-index container that is indexed by run and + // (run, histID). For now just save all runs for which + // there is at least 1 not-empty histogram + std::unordered_set<Monitoring::RunNumber> runNotEmpty; + size_t nonEmpty = 0; + for (auto& entry : histos) { + if (std::get<2>(entry.second)->GetEntries() != 0) { + ++nonEmpty; + runNotEmpty.emplace(entry.first.first); + } + } - // Reset histograms - for (auto& entry : m_histos) { - std::get<2>(entry.second)->Reset("ICESM"); + // Sync with info service and store resulting empty + // histograms + for (auto& entry : syncHistograms(inf, histos, sentHistos, runNotEmpty)) { + histos.emplace(std::move(entry)); + } + + // Fill the queue of messages + auto s = messages.size(); + auto sent = publishHistograms(histos, sentHistos, messages); + debug() << "Created " << messages.size() - s << " messages." << endmsg; + sentAll = (sent.size() == nonEmpty); + + // Reset/cleanup histograms + + // Reset histograms for runs where we received something. If the + // number of histograms is above the HWM, delete some until half + // of the HWM is reached. + for (bool onlyEmpty : {true, false}) { + s = histos.size(); + auto ne = 0, nr = 0; + for (ROOTHistos::const_iterator it = histos.cbegin(); + it != histos.cend();) { + // If we didn't send a histogram, don't touch it + // If we have a lot of histograms, first remove all empty ones. + // If we still have a lot, remove all sent ones. + auto histo = std::get<2>(it->second).get(); + if (!sent.count(it->first)) { + ++it; + } else if ((histos.size() > m_hwm / 2) + && (!onlyEmpty || (onlyEmpty && histo->GetEntries() == 0))) { + it = histos.erase(it); + ++ne; + } else if (!onlyEmpty) { + // Only reset on the second pass to avoid doing it twice + histo->Reset("ICESM"); + ++it; + ++nr; + } else { + ++it; + } + } + + debug() << "Pruned " << s - histos.size() << " sent histograms." 
<< endmsg;
+            debug() << "Erased " << ne << " histograms, reset " << nr << endmsg;
+
+            if (histos.size() < (m_hwm / 2)) break;
+          }
+
+
+          pubCounter = m_publishInterval * 2;
+        }
+
+        // Send some more if we have a lot of messages
+        if (messages.size() > m_hwm * 4 && m_batchSize < 2 * m_batchSizeProp) {
+          m_batchSize = 2 * m_batchSizeProp;
+        } else if (messages.size() < m_hwm) {
+          m_batchSize = m_batchSizeProp;
+        }
+
+        if (!messages.empty()) {
+          auto n = std::min(messages.size(), m_batchSize);
+
+          if (UNLIKELY(msgLevel(MSG::VERBOSE))) {
+            verbose() << "Sending " << n << " messages " << endmsg;
+          }
+
+          for (size_t m = 0; m < n; ++m) {
+            auto& message = messages.front();
+            for (size_t i = 0; i < message.size(); ++i) {
+              send(back, message[i], i < (message.size() - 1) ? zmq::SNDMORE : 0);
+            }
+            messages.pop_front();
+          }
+        } else if (UNLIKELY(msgLevel(MSG::VERBOSE))) {
+          verbose() << "Neither histograms nor messages." << endmsg;
+        }
       }
     }
   }
-  m_stopPublishing = true;
   pubThread.join();
-
 }
 
 //===============================================================================
-void Hlt2RootPublishSvc::publishHistograms(zmq::socket_t& socket) const
+Hlt2RootPublishSvc::SentHistos
+Hlt2RootPublishSvc::publishHistograms(ROOTHistos& histos,
+                                      SentHistos& sentHistos,
+                                      Queue& messages) const
 {
    // Loop over histograms
    if (UNLIKELY(msgLevel(MSG::DEBUG))) debug() << "Publishing histograms." << endmsg;
-   for (auto& entry : m_histos) {
+   // Find out which runs have at least one non-empty histogram
+   std::unordered_set<Monitoring::RunNumber> runNotEmpty;
+   for (auto& entry : histos) {
+     if (std::get<2>(entry.second)->GetEntries() != 0) {
+       runNotEmpty.emplace(entry.first.first);
+     }
+   }
+
+   std::vector<ROOTHistos::const_iterator> iterators;
+   iterators.reserve(histos.size());
+   for (auto it = begin(histos), last = end(histos);
+        it != last; ++it) {
+     iterators.push_back(it);
+   }
+
+   // If we have too many histograms to send, randomize their order and send what
+   // we can.
+   if (iterators.size() > m_hwm) {
+     if (UNLIKELY(msgLevel(MSG::DEBUG))) {
+       debug() << "Not publishing all histos (" << histos.size()
+               << ") as there are more than the hwm (" << m_hwm
+               << "). Which histograms are sent has been randomized."
+               << endmsg;
+     }
+     std::mt19937 gen{std::hash<string>{}(Monitoring::hostname())};
+     std::shuffle(begin(iterators), end(iterators), gen);
+   }
+
+   SentHistos sent;
+
+   for (auto it : iterators) {
      Monitoring::RunNumber run;
      Monitoring::HistId id;
-     std::tie(run, id) = entry.first;
+     std::tie(run, id) = it->first;
 
-     string type = std::get<0>(entry.second);
-     string dir = std::get<1>(entry.second);
-     const unique_ptr<TH1>& hist = std::get<2>(entry.second);
+     string type = std::get<0>(it->second);
+     string dir = std::get<1>(it->second);
+     const unique_ptr<TH1>& hist = std::get<2>(it->second);
      if (hist == nullptr) {
        // TODO: Skip empty histograms.
GetEntries does not @@ -299,24 +419,37 @@ void Hlt2RootPublishSvc::publishHistograms(zmq::socket_t& socket) const } else if (!Monitoring::RootTypeMap.count(hist->ClassName())) { warning() << "Unknown type of histogram: " << hist->ClassName() << endmsg; continue; + } else if (hist->GetEntries() == 0 && sentHistos.count(it->first)) { + // Don't send and empty histogram twice + continue; + } else { + sentHistos.emplace(it->first); } if (UNLIKELY(msgLevel(MSG::VERBOSE))) verbose() << "Publishing " << run << " " << id << " " << dir << " " << type << " " << hist->GetName() << endmsg; - // Send run and ID - send(socket, run, zmq::SNDMORE); - send(socket, id, zmq::SNDMORE); - // Send histogram type - send(socket, type, zmq::SNDMORE); - // Send histogram directory - send(socket, dir, zmq::SNDMORE); - // Indicate histograms should be added by the SaverSvc - send(socket, true, zmq::SNDMORE); - // Send histogram - send(socket, *hist); + array<zmq::message_t, 6> message = { + // run and ID + zmq().encode(run), + zmq().encode(id), + // Send histogram type + zmq().encode(type), + // Send histogram directory + zmq().encode(dir), + // Indicate histograms should be added by the SaverSvc + zmq().encode(true), + // Send histogram + zmq().encode(*hist)}; + messages.emplace_back(std::move(message)); + + // Indicate we sent this histogram + sent.emplace(it->first); + + if (messages.size() >= m_hwm) break; } + return sent; } //=============================================================================== @@ -403,25 +536,25 @@ Hlt2RootPublishSvc::DirAndHist Hlt2RootPublishSvc::makeRate(const string& path) //=============================================================================== Hlt2RootPublishSvc::DirAndHist Hlt2RootPublishSvc::make1D(const Histo1DDef& def) const { - string dir, title; - std::tie(dir, title) = splitPath(def.title); + string dir, title; + std::tie(dir, title) = splitPath(def.title); - std::unique_ptr<TH1D> hist; - if (def.variable) { - hist = make_unique<TH1D>(title.c_str(), title.c_str(), - def.xbins, def.xedges.data()); - } else { - hist = make_unique<TH1D>(title.c_str(), title.c_str(), - def.xbins, def.xlow, def.xhigh); - } + std::unique_ptr<TH1D> hist; + if (def.variable) { + hist = make_unique<TH1D>(title.c_str(), title.c_str(), + def.xbins, def.xedges.data()); + } else { + hist = make_unique<TH1D>(title.c_str(), title.c_str(), + def.xbins, def.xlow, def.xhigh); + } - if (def.labels) setLabels(def.xlabels, hist->GetXaxis()); - - hist->SetDirectory(nullptr); - hist->Sumw2(); - if (UNLIKELY(msgLevel(MSG::VERBOSE))) - verbose() << "Created TH1D for " << def.title << endmsg; - return {dir, std::move(hist)}; + if (def.labels) setLabels(def.xlabels, hist->GetXaxis()); + + hist->SetDirectory(nullptr); + hist->Sumw2(); + if (UNLIKELY(msgLevel(MSG::VERBOSE))) + verbose() << "Created TH1D for " << def.title << endmsg; + return {dir, std::move(hist)}; } //=============================================================================== @@ -437,7 +570,7 @@ Hlt2RootPublishSvc::DirAndHist Hlt2RootPublishSvc::make2D(const Histo2DDef& def) } else if (def.xvariable) { hist = make_unique<TH2D>(title.c_str(), title.c_str(), def.xbins, def.xedges.data(), - def.ybins, def.ylow, def.yhigh); + def.ybins, def.ylow, def.yhigh); } else if (def.yvariable) { hist = make_unique<TH2D>(title.c_str(), title.c_str(), @@ -447,7 +580,7 @@ Hlt2RootPublishSvc::DirAndHist Hlt2RootPublishSvc::make2D(const Histo2DDef& def) } else { hist = make_unique<TH2D>(title.c_str(), title.c_str(), def.xbins, def.xlow, def.xhigh, - 
def.ybins, def.ylow, def.yhigh); + def.ybins, def.ylow, def.yhigh); } if (def.labels) { @@ -474,74 +607,105 @@ pair<string, string> Hlt2RootPublishSvc::splitPath(const string& path) const //=============================================================================== vector<Hlt2RootPublishSvc::ROOTHistos::value_type> -Hlt2RootPublishSvc::syncHistograms(zmq::socket_t& inf) const +Hlt2RootPublishSvc::syncHistograms(zmq::socket_t& inf, + const ROOTHistos& histos, + const SentHistos& sentHistos, + const unordered_set<Monitoring::RunNumber>& runNotEmpty) const { // Request to sync histo info to make empty histograms vector<string> what = {Monitoring::s_HistoInfo}; + // As we do some cleanup, use also the sentHistos to track which histograms + // we know about. HistoKeys histoKeys; - std::for_each(begin(m_histos), end(m_histos), + std::for_each(begin(histos), end(histos), [&histoKeys](const ROOTHistos::value_type& entry) { histoKeys.emplace(entry.first); }); - // Send request - send(inf, Monitoring::s_Sync, zmq::SNDMORE); - send(inf, what, zmq::SNDMORE); - send(inf, histoKeys); - - // Wait for reply - boost::optional<string> rep; - zmq::pollitem_t items[] = { - {inf, 0, ZMQ_POLLIN, 0} - }; + std::for_each(begin(sentHistos), end(sentHistos), + [&histoKeys](const SentHistos::value_type& entry) { + histoKeys.emplace(entry); + }); - zmq::poll(&items[0], 1, 1000); - if (items[0].revents & ZMQ_POLLIN) { - rep = receive<string>(inf); - } + unordered_map<Monitoring::RunNumber, size_t> hpr; vector<ROOTHistos::value_type> r; - if (!rep) { - // No reply, recreate info socket - inf = infoSocket(); - warning() << "No reply from Info service to sync request." << endmsg; - } else if(*rep == "INCOMING") { - unordered_map<Monitoring::RunNumber, size_t> hpr; - auto histos = receive<HistoPub>(inf); - for (const auto& entry : histos) { - hpr[std::get<0>(entry)]++; - // Entry consists of (RunNumber, HistId, type, info_string), - // where the info string needs to be converted to a message. - const auto& type = std::get<2>(entry); - - // The last entry in the tuple is a string that needs to be - // decoded according to the type; it's actually used as a - // byte array. We make a message out of it again to decode - // it. 
- const auto& infoString = std::get<3>(entry); - zmq::message_t msg{infoString.size()}; - std::copy_n(begin(infoString), infoString.size(), static_cast<char*>(msg.data())); - - // Create the histogram - auto tdh = makeHistogram(type, msg); - if (std::get<2>(tdh)) { - r.emplace_back(HistoKey{std::get<0>(entry), std::get<1>(entry)}, std::move(tdh)); - } else if (UNLIKELY(msgLevel(MSG::DEBUG))) { - debug() << "Could not create histogram for run = " << std::get<0>(entry) - << " and id = " << std::get<1>(entry) << endmsg; - } + // Loop until there is no more information, or our cache is full + bool more = true; + while (more && histos.size() < m_hwm) { + // Send request + send(inf, Monitoring::s_Sync, zmq::SNDMORE); + send(inf, what, zmq::SNDMORE); + send(inf, runNotEmpty, zmq::SNDMORE); + send(inf, histoKeys); + + // Wait for reply + boost::optional<string> rep; + zmq::pollitem_t items[] = { + {inf, 0, ZMQ_POLLIN, 0} + }; + + zmq::poll(&items[0], 1, 1000); + if (items[0].revents & ZMQ_POLLIN) { + rep = receive<string>(inf); } - if (UNLIKELY(msgLevel(MSG::DEBUG))) { - for (const auto& entry : hpr) { - debug() << "Created " << std::right << std::setw(6) << to_string(entry.second) - << " empty histograms for run " << entry.first << endmsg; + if (!rep) { + // No reply, recreate info socket + inf = infoSocket(); + warning() << "No reply from Info service to sync request." << endmsg; + break; + } else if(*rep == "INCOMING") { + auto infos = receive<HistoPub>(inf); + more = receive<bool>(inf); + for (const auto& entry : infos) { + auto type = std::get<0>(entry); + + // The second entry in the tuple is a string that needs to be + // decoded according to the type; it's actually used as a + // byte array. We make a message out of it again to decode + // it. + const auto& infoString = std::get<1>(entry); + zmq::message_t msg{infoString.size()}; + std::copy_n(begin(infoString), infoString.size(), static_cast<char*>(msg.data())); + + // The last entry is the vector of keys. + const auto& keys = std::get<2>(entry); + for (const auto& key : keys) { + auto run = std::get<0>(key); + auto histID = std::get<1>(key); + + hpr[run]++; + + // Update known keys with received info + histoKeys.emplace(run, histID); + + // Create the histogram + auto tdh = makeHistogram(type, msg); + if (std::get<2>(tdh)) { + r.emplace_back(HistoKey{run, histID}, std::move(tdh)); + } else if (UNLIKELY(msgLevel(MSG::DEBUG))) { + debug() << "Could not create histogram for run = " << run + << " and id = " << histID << endmsg; + } + } + } + if (UNLIKELY(msgLevel(MSG::DEBUG) && more)) { + debug() << "Syncing again with " << m_infoCon + << " as there is more info." 
<< endmsg; } + } else { + info() << "Got unexpected reply to info request: " << *rep << endmsg; + more = false; } - } else { - info() << "Got unexpected reply to info request: " << *rep << endmsg; } + + for (const auto& entry : hpr) { + info() << "Created " << std::right << std::setw(6) << to_string(entry.second) + << " empty histograms for run " << entry.first << endmsg; + } + return r; } diff --git a/Online/Hlt2Monitoring/src/component/Hlt2RootPublishSvc.h b/Online/Hlt2Monitoring/src/component/Hlt2RootPublishSvc.h index 7eaf8a46022dbb5c24eaf080cf9210c56e5f9d77..16563b3aa17efaf39236f12ccb6426b18d244bc7 100644 --- a/Online/Hlt2Monitoring/src/component/Hlt2RootPublishSvc.h +++ b/Online/Hlt2Monitoring/src/component/Hlt2RootPublishSvc.h @@ -37,8 +37,11 @@ class Hlt2RootPublishSvc : public Hlt2MonBaseSvc { public: using DirAndHist = std::pair<std::string, std::unique_ptr<TH1>>; using TypeDirHist = std::tuple<std::string, std::string, std::unique_ptr<TH1>>; - using ROOTHistos = boost::unordered_map<Monitoring::HistoKey, TypeDirHist>; - + using ROOTHistos = std::unordered_map<Monitoring::HistoKey, TypeDirHist, + Monitoring::KeyHash>; + using SentHistos = std::unordered_set<Monitoring::HistoKey, + Monitoring::KeyHash>; + using Queue = std::deque<std::array<zmq::message_t, 6>>; Hlt2RootPublishSvc(const std::string& name, ISvcLocator* sl); StatusCode initialize() override; @@ -46,10 +49,13 @@ public: private: - void publishHistograms(zmq::socket_t&) const; + SentHistos publishHistograms(ROOTHistos& histos, + SentHistos& sentHistos, + Queue& messages) const; std::vector<ROOTHistos::value_type> - syncHistograms(zmq::socket_t&) const; + syncHistograms(zmq::socket_t&, const ROOTHistos& histos, const SentHistos& sentHistos, + const std::unordered_set<Monitoring::RunNumber>& runNotEmpty) const; TypeDirHist makeHistogram(std::string, const zmq::message_t&) const; TypeDirHist getHistogram(zmq::socket_t&, @@ -62,8 +68,9 @@ private: zmq::socket_t infoSocket() const { zmq::socket_t inf = socket(zmq::REQ); - inf.connect(m_infoCon.c_str()); + zmq::setsockopt(inf, zmq::LINGER, 0); zmq::setsockopt(inf, zmq::RCVTIMEO, 50); + inf.connect(m_infoCon.c_str()); return inf; } @@ -77,9 +84,9 @@ private: double m_rateStart; double m_runDuration; double m_rateInterval; + size_t m_batchSizeProp; + size_t m_batchSize = 0; + size_t m_hwm; - std::atomic<bool> m_stopPublishing; - - ROOTHistos m_histos; }; #endif // HLT2ROOTPUBLISH_H diff --git a/Online/Hlt2Monitoring/src/component/Hlt2SaverSvc.cpp b/Online/Hlt2Monitoring/src/component/Hlt2SaverSvc.cpp index 3f6e7ead286ed5eb65dd2677dbd47b3e0778e148..c5518d3a8eaefd5dcd1bbc6b4b48630b18916498 100644 --- a/Online/Hlt2Monitoring/src/component/Hlt2SaverSvc.cpp +++ b/Online/Hlt2Monitoring/src/component/Hlt2SaverSvc.cpp @@ -54,12 +54,14 @@ namespace { using std::set; using std::multimap; using std::tuple; + using std::stringstream; using Monitoring::HistDiff; using Monitoring::HistoEntry; using Monitoring::SaverHistos; using Monitoring::ByName; using Monitoring::ByRun; + using Monitoring::Sorted; using Monitoring::WorkMap; using boost::lexical_cast; @@ -97,9 +99,9 @@ Hlt2SaverSvc::Hlt2SaverSvc(const string& name, ISvcLocator* loc) declareProperty("DataConnection", m_dataCon); declareProperty("InfoConnection", m_infoCon); declareProperty("TriggerConnection", m_triggerCon = "inproc://Hlt2SaverSvc_trigger"); - declareProperty("PublishPort", m_pubPort = 0); declareProperty("PublishConnection", m_pubCon); declareProperty("RegistrarConnection", m_regCon); + declareProperty("RegisterInterval", 
m_regInt = 60);
   declareProperty("BaseDirectory", m_directory);
   declareProperty("SaveInterval", m_saveInterval = 60);
   declareProperty("NormalizeRateTo", m_normalize = "Hlt2RoutingBitsWriter/RoutingBit33");
@@ -108,6 +110,7 @@
   declareProperty("UseRunInfoService", m_useInfoSvc = true);
   declareProperty("NWorkers", m_nWorkers = 10);
   declareProperty("RunInfoPollTimeout", m_runInfoPollTime = 0.2);
+  declareProperty("HighWaterMark", m_hwm = 10000);
 }
 
 //===============================================================================
@@ -226,7 +229,80 @@ void Hlt2SaverSvc::saver() {
 }
 
 //===============================================================================
-void Hlt2SaverSvc::function() {
+void Hlt2SaverSvc::registrar(std::string con) const
+{
+  auto internal = zmq().socket(zmq::PAIR);
+  zmq::setsockopt(internal, zmq::LINGER, 0);
+  internal.connect(con.c_str());
+
+  std::vector<zmq::pollitem_t> items(2);
+  items[0] = {internal, 0, zmq::POLLIN, 0};
+
+  auto makePing = [this, &items] {
+    auto ping = zmq().socket(zmq::REQ);
+    zmq::setsockopt(ping, zmq::LINGER, 0);
+    ping.connect(m_regCon.c_str());
+    items[1] = {ping, 0, zmq::POLLIN, 0};
+    return ping;
+  };
+
+  boost::optional<zmq::socket_t> ping;
+
+  bool sentPing = false;
+  bool first = true;
+
+  while (true) {
+    int timeo = sentPing ? 500 : m_regInt * 1000;
+    zmq::poll(&items[0], ping ? 2 : 1, first ? 0 : timeo);
+    if (first) first = false;
+
+    if (items[0].revents & zmq::POLLIN) {
+      auto cmd = receive<std::string>(internal);
+      if (cmd == Monitoring::s_Terminate) {
+        break;
+      } else {
+        warning() << "registrar: received bad command message " << cmd << endmsg;
+      }
+    }
+
+    if (!ping) {
+      // attempt to register
+      auto r = registerPublisher();
+      if (r) {
+        send(internal, r->first);
+        send(internal, r->second);
+        ping = makePing();
+      }
+    } else if (sentPing) {
+      sentPing = false;
+      if (items[1].revents & zmq::POLLIN) {
+        auto msg = receive<string>(*ping);
+        if (msg != Monitoring::s_Pong) {
+          warning() << "Bad reply from ping to registrar: " << msg << endmsg;
+        } else {
+          auto app = receive<string>(*ping);
+          if (app != "Registrar") {
+            warning() << "Received ping reply from " << app
+                      << " while Registrar was expected."
<< endmsg; + } + } + } else { + ping.reset(); + } + } else { + // send a ping message + sentPing = true; + if (!ping) { + ping = makePing(); + } + send(*ping, Monitoring::s_Ping); + } + } +} + +//=============================================================================== +void Hlt2SaverSvc::function() +{ TThread{}; @@ -235,10 +311,13 @@ void Hlt2SaverSvc::function() { if (!m_controlConnected) return; zmq::socket_t data = socket(zmq::SUB); - data.connect(m_dataCon.c_str()); + auto hwm = boost::numeric_cast<int>(m_hwm); + zmq::setsockopt(data, zmq::RCVHWM, hwm); zmq::setsockopt(data, zmq::SUBSCRIBE, ""); zmq::setsockopt(data, zmq::LINGER, 0); - info() << "Connected data socket to: " << m_dataCon << endmsg; + data.connect(m_dataCon.c_str()); + info() << "Connected data socket to: " << m_dataCon + << " with HWM " << hwm << endmsg; // Clean up queue zmq::message_t msg; @@ -257,7 +336,7 @@ void Hlt2SaverSvc::function() { auto workerSocket = socket(zmq::PAIR); zmq::setsockopt(workerSocket, zmq::LINGER, 0); workerSocket.bind(saveWorkerCon(i).c_str()); - m_workers.emplace_back(make_tuple(std::thread{[this] (unsigned int i){ saveWorker(i); }, i}, + m_workers.emplace_back(make_tuple(std::thread{[this](unsigned int i) { saveWorker(i); }, i}, std::move(workerSocket), WorkQueue{})); } @@ -270,49 +349,35 @@ void Hlt2SaverSvc::function() { std::thread saveThread{[this] { saver(); }}; // publish socket - boost::optional<zmq::socket_t> pub; - boost::optional<zmq::socket_t> ping; - boost::optional<string> pubCon; - boost::optional<string> pingCon; + boost::optional<zmq::socket_t> pub, ping, reg; + boost::optional<std::thread> registrarThread; + + unsigned int pingPort = 0, pubPort = 0; + string pingCon, pubCon; + + string intRegCon = "inproc://registrar"; + if (!m_pubCon.empty()) { - pubCon = m_pubCon; - } else if (m_pubPort != 0) { - pubCon = "tcp://*:" + to_string(m_pubPort); - } else if (!m_regCon.empty()) { - auto r = registerPublisher(); - if (!r) { - warning() << "Could not register to obtain port for publication, " - << " not publishing written files." << endmsg; - } else { - unsigned int pubPort = 0, pingPort = 0; - std::tie(pubPort, pingPort) = *r; - pubCon = "tcp://*:" + to_string(pubPort); - pingCon = "tcp://*:" + to_string(pingPort); - debug() << "Registered port " << pubPort << " for publication." << endmsg; - } - } - if (pubCon) { pub = zmq().socket(zmq::PUB); zmq::setsockopt(*pub, zmq::LINGER, 0); - pub->bind(pubCon->c_str()); - debug() << "Bound publish socket to " << *pubCon << endmsg; - } - if (pingCon) { - ping = zmq().socket(zmq::REP); - zmq::setsockopt(*ping, zmq::LINGER, 0); - ping->bind(pingCon->c_str()); - debug() << "Bound publish ping socket to " << *pingCon << endmsg; + pub->bind(m_pubCon.c_str()); + debug() << "Bound publish socket to " << m_pubCon << endmsg; + } else if (!m_regCon.empty()) { + reg = socket(zmq::PAIR); + zmq::setsockopt(*reg, zmq::LINGER, 0); + reg->bind(intRegCon.c_str()); + registrarThread = std::thread{[this, intRegCon]{ registrar(intRegCon); }}; } // Initialize poll set - std::vector<zmq::pollitem_t> items(ping ? 5 : 4); - items.reserve(items.size() + + m_nWorkers); + std::vector<zmq::pollitem_t> items(reg ? 
5 : 4); + items.reserve(items.size() + m_nWorkers); items[0] = {control, 0, zmq::POLLIN, 0}; items[1] = {data, 0, zmq::POLLIN, 0}; items[2] = {save, 0, zmq::POLLIN, 0}; items[3] = {trigger, 0, zmq::POLLIN, 0}; - if (ping) { - items[4] = {*ping, 0, zmq::POLLIN, 0}; + if (reg) { + items[4] = {*reg, 0, zmq::POLLIN, 0}; } for (auto& worker : m_workers) { items.push_back({std::get<1>(worker), 0, zmq::POLLIN, 0}); @@ -330,7 +395,7 @@ void Hlt2SaverSvc::function() { // Remember which workers work on what. WorkMap haveWork; - decltype(m_workers)::iterator saving; + Workers::const_iterator saving; while (!stopping || ((doneSaving.size() < m_workers.size()) @@ -345,11 +410,12 @@ void Hlt2SaverSvc::function() { auto cmd = receive<std::string>(control); if (cmd == Monitoring::s_Terminate) { // Save last histograms now. - haveWork = saveHistograms(closedRuns); + haveWork = saveHistograms(closedRuns, saving); std::set<size_t> withWork; for (const auto& entry : haveWork) { withWork.emplace(entry.first); } + send(*reg, Monitoring::s_Terminate); send(save, std::move(withWork), zmq::SNDMORE); send(save, true); stopping = true; @@ -393,13 +459,16 @@ void Hlt2SaverSvc::function() { // Either create a new histogram, or update the existing one. if (it == end(range)) { - m_histos.insert({key.first, type, dir, histo.release()}); + m_histos.insert({key.first, type, dir, histo.release(), add}); } else { // For a while, the wrong type was written to files, so update it from the published one // if needed. if (it->type != type) { hbn.modify(it, [&type](HistoEntry& entry) { entry.type = type; }); } + if (it->add != add) { + hbn.modify(it, [&add](HistoEntry& entry) { entry.add = add; }); + } if (add) { it->histo->Add(histo.get()); } else { @@ -419,7 +488,7 @@ void Hlt2SaverSvc::function() { auto w = receive<int>(save); if (w == -1) { // Fill worker queues with work. - haveWork = saveHistograms(closedRuns); + haveWork = saveHistograms(closedRuns, saving); // Send worker IDs with work to save thread. std::set<size_t> withWork; @@ -432,10 +501,12 @@ void Hlt2SaverSvc::function() { send(save, false); } else { // Tell worker w to start saving and which runs are closed - if (UNLIKELY(msgLevel(MSG::VERBOSE))) - verbose() << "Sending save command to worker " << w << endmsg; - send(std::get<1>(m_workers[w]), Monitoring::s_Save); - saving = begin(m_workers) + w; + if (std::distance(m_workers.cbegin(), saving) != w) { + if (UNLIKELY(msgLevel(MSG::VERBOSE))) + verbose() << "Sending save command to worker " << w << endmsg; + send(std::get<1>(m_workers[w]), Monitoring::s_Save); + saving = m_workers.cbegin() + w; + } } } } @@ -477,9 +548,49 @@ void Hlt2SaverSvc::function() { } } + // Setup the ping and pub connections if/when the registrar + // thread tells us the ports to use. 
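+ // The registrar thread sends two port numbers over the internal
+ // PAIR socket: first the publish port, then the ping port. Sockets
+ // are only (re)bound when the corresponding port has changed.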
+ if (reg && items[4].revents & zmq::POLLIN) { + auto p = receive<unsigned int>(*reg); + auto pp = receive<unsigned int>(*reg); + + if (p != pubPort) { + if (!pubCon.empty() ) { + pub->disconnect(pubCon.c_str()); + } + + pubCon = "tcp://*:" + std::to_string(p); + // Setup pub connection + pub = zmq().socket(zmq::PUB); + zmq::setsockopt(*pub, zmq::LINGER, 0); + pub->bind(pubCon.c_str()); + pubPort = p; + debug() << "Bound publish socket to " << pubCon << endmsg; + } + + if (pp != pingPort) { + if (!pingCon.empty() ) { + ping->disconnect(pingCon.c_str()); + } + pingCon = "tcp://*:" + std::to_string(pp); + + // Setup ping connection + ping = socket(zmq::REP); + zmq::setsockopt(*ping, zmq::LINGER, 0); + ping->bind(pingCon.c_str()); + if (pingPort != 0) { + items[5] = {*ping, 0, zmq::POLLIN, 0}; + } else { + items.insert(items.begin() + 5, {*ping, 0, zmq::POLLIN, 0}); + } + pingPort = pp; + debug() << "Bound publish ping socket to " << pingCon << endmsg; + } + } + // Reply to pings from registration server to see if we're // alive. - if (ping && items[4].revents & zmq::POLLIN) { + if (ping && items[5].revents & zmq::POLLIN) { auto msg = receive<string>(*ping); verbose() << "Received message " << msg << " on ping socket." << endmsg; if (msg == Monitoring::s_Ping) { @@ -571,7 +682,15 @@ void Hlt2SaverSvc::saveWorker(const unsigned int worker) { // Copy to the EOR file boost::system::error_code ec; fs::copy_file(src_file, dest_file, ec); - if (ec) verbose() << "Copied file for run " << run << " to " << dest_file.string() << endmsg; + if (!ec.value()) { + verbose() << "Copied file for run " << run << " to " << dest_file.string() << endmsg; + } else { + error() << "Failed to copy file for run " << run << " to " << dest_file.string() + << ". Histograms have not been saved. Error from system: " << ec.message() + << ". This indicates a problem with machine " + << Monitoring::hostname() << ", please check with Online." + << endmsg; + } return ec; } return false; @@ -625,7 +744,8 @@ void Hlt2SaverSvc::saveWorker(const unsigned int worker) { } //=============================================================================== -WorkMap Hlt2SaverSvc::saveHistograms(const std::unordered_set<Monitoring::RunNumber>& closed) +WorkMap Hlt2SaverSvc::saveHistograms(const std::unordered_set<Monitoring::RunNumber>& closed, + Workers::const_iterator nowSaving) { // Get list of runs for which we have updates @@ -640,9 +760,14 @@ WorkMap Hlt2SaverSvc::saveHistograms(const std::unordered_set<Monitoring::RunNum sizes.reserve(m_workers.size()); // Lambda to find the first worker with the least work to do. - auto findWorker = [&sizes](decltype(m_workers)& workers) { + auto findWorker = [&sizes, nowSaving](Workers& workers) { sizes.clear(); for (auto it = begin(workers), last = end(workers); it != last; ++it) { + + // Don't use the worker that is now saving for anything + if (it == nowSaving) continue; + + // Count the number of entries in the work queue const WorkQueue& queue = std::get<2>(*it); size_t n = std::count_if(begin(queue), end(queue), [](const WorkQueue::value_type& entry) { @@ -689,7 +814,12 @@ WorkMap Hlt2SaverSvc::saveHistograms(const std::unordered_set<Monitoring::RunNum while (run != end(runs)) { auto saveIt = saving.find(*run); if (saveIt != end(saving)) { - workOnRun(*run, saveIt->second, closed.count(*run)); + // Only work on a run if the worker that was already working + // on it is not doing so now. 
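+ // nowSaving points at the worker that is currently writing files;
+ // it is skipped here (and in findWorker) so it receives no extra
+ // work or save commands while a save is in progress.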
+ if (saveIt->second != nowSaving) { + workOnRun(*run, saveIt->second, closed.count(*run)); + } + // Do not dispatch this run to any worker run = runs.erase(run); } else { ++run; @@ -713,19 +843,29 @@ WorkMap Hlt2SaverSvc::saveHistograms(const std::unordered_set<Monitoring::RunNum } - for (decltype(m_workers)::const_iterator worker = begin(m_workers), last = end(m_workers); + for (Workers::const_iterator worker = begin(m_workers), last = end(m_workers); worker != last; ++worker) { + // Skip the worker that is saving right now to avoid sending it + // extra save commands. + if (worker == nowSaving) continue; const auto& queue = std::get<2>(*worker); if (queue.empty()) continue; size_t w = std::distance(m_workers.cbegin(), worker); - debug() << "Worker " << std::right << std::setw(3) << w << " works on runs:"; + stringstream msg; + msg << "Worker " << std::right << std::setw(3) << w << " works on runs:"; for (const auto& entry : queue) { if (!std::get<2>(entry).empty() || std::get<1>(entry)) { - debug() << " " << std::get<0>(entry).run; + msg << " " << std::get<0>(entry).run; haveWork.emplace(w, std::get<0>(entry).run); } } - debug() << endmsg; + if (haveWork.count(w)) { + info() << msg.str() << endmsg; + } + } + + if (haveWork.empty()) { + info() << "No work to divide." << endmsg; } // Return the set of workers that have received work. @@ -825,7 +965,7 @@ Hlt2SaverSvc::saveHistograms(const Monitoring::RunInfo& runInfo, } // Loop over histograms for that run - for (const auto& entry : histos) { + for (const auto& entry : histos.get<Sorted>()) { auto histo = entry.histo.get(); auto dir = entry.dir; auto outDir = static_cast<TDirectoryFile*>(outFile.Get(dir.c_str())); @@ -888,8 +1028,11 @@ Hlt2SaverSvc::saveHistograms(const Monitoring::RunInfo& runInfo, // Copy to the saveset fs::copy_file(outPath, file, ec); if (ec) { - warning() << "Could not copy file " << outPath << " to " - << file.string() << endmsg; + error() << "Could not copy file " << outPath << " to " << file.string() + << ". Error from system: " << ec.message() + << ". This indicates a problem with machine " + << Monitoring::hostname() << ", please check with Online." + << endmsg; } else { if (UNLIKELY(msgLevel(MSG::DEBUG))) debug() << "Saved histograms for run " << runInfo.run << " to " @@ -908,7 +1051,11 @@ Hlt2SaverSvc::saveHistograms(const Monitoring::RunInfo& runInfo, bool success = fs::remove(outPath, ec); success &= !ec; if (!success) { - warning() << "Could not remove file " << outPath.string() << endmsg; + error() << "Could not remove file " << outPath.string() + << ". Error from system: " << ec.message() + << ". This indicates a problem with machine " + << Monitoring::hostname() << ", please check with Online." + << endmsg; } return make_tuple(closed, file.string(), fileByRun.string()); } @@ -1056,7 +1203,11 @@ std::pair<fs::path, bool> Hlt2SaverSvc::filename(const Monitoring::RunInfo& runI bool success = fs::create_directories(directory, ec); success &= !ec; if (!success) { - warning() << "Failed to create directory " << directory << endmsg; + error() << "Failed to create directory " << directory + << ". Histograms will not be saved. Error from system: " + << ec.message() << ". This indicates a problem with machine " + << Monitoring::hostname() << ", please check with Online." 
+ << endmsg; return make_pair(directory, false); } } diff --git a/Online/Hlt2Monitoring/src/component/Hlt2SaverSvc.h b/Online/Hlt2Monitoring/src/component/Hlt2SaverSvc.h index 419b216dbc7545f8ceee3b0e118ad0c0e3ff297f..62ffb1f374a9eb653bbf422a3cd3acb68cb88500 100644 --- a/Online/Hlt2Monitoring/src/component/Hlt2SaverSvc.h +++ b/Online/Hlt2Monitoring/src/component/Hlt2SaverSvc.h @@ -29,6 +29,11 @@ class Hlt2SaverSvc : public Hlt2MonBaseSvc { public: + // Add boolean to indicate closing + using WorkQueue = std::vector<std::tuple<Monitoring::RunInfo, bool, Monitoring::SaverHistos>>; + using Worker = std::tuple<std::thread, zmq::socket_t, WorkQueue>; + using Workers = std::vector<Worker>; + /// Standard constructor Hlt2SaverSvc(const std::string& name, ISvcLocator* sl); @@ -39,16 +44,15 @@ public: private: - // Add boolean to indicate closing - using WorkQueue = std::vector<std::tuple<Monitoring::RunInfo, bool, Monitoring::SaverHistos>>; - using Worker = std::tuple<std::thread, zmq::socket_t, WorkQueue>; - using Workers = std::vector<Worker>; - // Function used by thread to trigger saving of histograms void saver(); + // Function used by thread to communicate with the registrar + void registrar(std::string con) const; + // Save all known histograms to file - Monitoring::WorkMap saveHistograms(const std::unordered_set<Monitoring::RunNumber>& closed); + Monitoring::WorkMap saveHistograms(const std::unordered_set<Monitoring::RunNumber>& closed, + Workers::const_iterator nowSaving); // Save all histograms of a single run to file std::tuple<bool, std::string, std::string> @@ -94,9 +98,9 @@ private: std::string m_dataCon; std::string m_infoCon; std::string m_triggerCon; - unsigned int m_pubPort; std::string m_pubCon; std::string m_regCon; + int m_regInt; std::string m_normalize; std::string m_application; int m_saveInterval; @@ -104,6 +108,7 @@ private: bool m_useInfoSvc; double m_runInfoPollTime; std::string m_runInfoType; + size_t m_hwm; // Data members std::atomic<bool> m_stopSaving; diff --git a/Online/Hlt2Monitoring/src/component/ZmqTransmitterSvc.cpp b/Online/Hlt2Monitoring/src/component/ZmqTransmitterSvc.cpp index a04c1d162c2b0894e8f73e21fe38c3191ff751a3..3d5bdfc5e914c6c9d2ff53f04ee1033656c05bfa 100644 --- a/Online/Hlt2Monitoring/src/component/ZmqTransmitterSvc.cpp +++ b/Online/Hlt2Monitoring/src/component/ZmqTransmitterSvc.cpp @@ -1,4 +1,3 @@ -// Include files #include <string> #include <tuple> #include <vector> @@ -8,8 +7,8 @@ #include <thread> #include <unordered_map> #include <sstream> +#include <chrono> -// boost #include <boost/regex.hpp> #include <boost/archive/text_oarchive.hpp> #include <boost/numeric/conversion/cast.hpp> @@ -17,17 +16,14 @@ #include <boost/optional.hpp> #include <boost/functional/hash.hpp> -// Gaudi #include <GaudiKernel/ParsersFactory.h> -// ZeroMQ #include <ZeroMQ/IZeroMQSvc.h> -// Hlt2Monitoring #include <Hlt2Monitoring/Utilities.h> #include <Hlt2Monitoring/Serialize.h> +#include <Hlt2Monitoring/Types.h> -// local #include "ZmqTransmitterSvc.h" //----------------------------------------------------------------------------- @@ -66,6 +62,7 @@ namespace { namespace fs = boost::filesystem; using ms = chr::duration<double, std::milli>; + using namespace std::chrono_literals; #if __cplusplus <= 201103L //TODO: this adds C++14 'make_unique'... remove once we move to C++14... 
@@ -81,13 +78,13 @@ namespace { //============================================================================= ZmqTransmitterSvc::ZmqTransmitterSvc(const string& name, ISvcLocator* pSvcLocator) - : base_class (name , pSvcLocator), - m_internalConCounter{1} + : base_class (name , pSvcLocator) { declareProperty("HostnameRegex", m_hostRegex = "hlt(?<subfarm>[a-f]{1}[0-9]{2})(?<node>[0-9]{2})?"); declareProperty("Application", m_application = {"Moore2", "v1r0"}); declareProperty("InfoPort", m_infoPort = 31339); declareProperty("IPCConnectionPath", m_connectionPath = "/run/HLT2"); + declareProperty("MaxMonRestart", m_maxMonRestart = 0); } //============================================================================= @@ -105,9 +102,11 @@ StatusCode ZmqTransmitterSvc::initialize() if (!m_zmqSvc) { fatal() << "ZeroMQSvc not found" << endmsg; return StatusCode::FAILURE; - } + } // Create directories needed for ipc connections. + debug() << "Checking for existence of " << m_connectionPath + << " and creating if needed." << endmsg; fs::path p(m_connectionPath); if (!fs::exists(p)) { boost::system::error_code ec; @@ -123,6 +122,82 @@ StatusCode ZmqTransmitterSvc::initialize() return sc; } +//============================================================================= +void ZmqTransmitterSvc::transmitWrapper() +{ + auto internal = internalSocket(internalCon("internal"), false); + auto checkCon = internalCon("check"); + auto check = internalSocket(checkCon, true, {{zmq::SNDTIMEO, 100}}); + auto pubCon = internalCon("publish"); + auto pub = internalSocket(pubCon, true, {{zmq::SNDTIMEO, 100}}); + auto debugCon = internalCon("debug"); + + std::exception_ptr checkException{nullptr}; + std::thread checkThread{[this, checkCon, debugCon, &checkException] { + try { + sendCheck(checkCon, debugCon); + } catch (const zmq::error_t& e) { + error() << "Check caught unhandled 0MQ exception: " << e.what() << endmsg; + checkException = std::current_exception(); + } catch(const std::exception& e) { + error() << "Check caught unhandled std exception: " << e.what() << endmsg; + checkException = std::current_exception(); + } catch (...) { + error() << "Check caught unhandled other exception." << endmsg; + checkException = std::current_exception(); + } + }}; + + std::exception_ptr pubException{nullptr}; + std::thread pubThread{[this, pubCon, &pubException] { + try { + publish(pubCon); + } catch (const zmq::error_t& e) { + error() << "Publish caught unhandled 0MQ exception: " << e.what() << endmsg; + pubException = std::current_exception(); + } catch (const std::exception& e) { + error() << "Publish caught unhandled std exception: " << e.what() << endmsg; + pubException = std::current_exception(); + } catch (...) { + error() << "Publish caught unhandled other exception." << endmsg; + pubException = std::current_exception(); + } + }}; + + try { + transmit(internal, check, pub, pubException, checkException); + } catch (const zmq::error_t& e) { + error() << "Transmit caught unhandled 0MQ exception: " << e.what() << endmsg; + m_transmitException = std::current_exception(); + } catch (const std::exception& e) { + error() << "Transmit caught unhandled std exception: " << e.what() << endmsg; + m_transmitException = std::current_exception(); + } catch (...) { + error() << "Transmit caught unhandled other exception." 
<< endmsg; + m_transmitException = std::current_exception(); + } + + try { + if (!checkException) { + zmq().send(check, Monitoring::s_Command, zmq::SNDMORE); + zmq().send(check, Monitoring::s_Terminate); + } + if (!pubException) { + zmq().send(pub, Monitoring::s_Terminate); + } + } catch (const zmq::error_t& e) { + error() << "Caught zmq error while exiting: " << e.what() << endmsg; + } + checkThread.join(); + pubThread.join(); + + if (pubException) { + m_transmitException = pubException; + } else if (checkException) { + m_transmitException = checkException; + } +} + //============================================================================= void ZmqTransmitterSvc::setup() { @@ -153,7 +228,24 @@ void ZmqTransmitterSvc::setup() } if (!m_thread) { - m_thread = make_unique<std::thread>([this]{ transmit(); }); + m_thread = make_unique<std::thread>([this]{ + unsigned int tries = 0; + while (tries <= m_maxMonRestart) { + transmitWrapper(); + if (m_transmitException) { + warning() << "Restarting monitoring thread try " << tries << endmsg; + std::this_thread::sleep_for(1s); + ++tries; + if (tries > m_maxMonRestart) { + error() << "Maximum number monitoring restarts " + << "reached, giving up " << endmsg; + } + } else { + break; + } + } + m_ok = false; + }); } } @@ -167,8 +259,10 @@ StatusCode ZmqTransmitterSvc::start() { StatusCode ZmqTransmitterSvc::finalize() { if (m_thread) { - zmq().send(*m_internal, Monitoring::s_Command, zmq::SNDMORE); - zmq().send(*m_internal, Monitoring::s_Terminate); + if (!m_transmitException) { + zmq().send(*m_internal, Monitoring::s_Command, zmq::SNDMORE); + zmq().send(*m_internal, Monitoring::s_Terminate); + } m_thread->join(); } m_internal.reset(); @@ -183,7 +277,7 @@ ZmqTransmitterSvc::outputSocket(size_t nMsg, size_t hwm, const std::string forwardType, const unsigned int sourceID, const bool checkEverySend, - const unsigned int interval) + const unsigned int i) { // If we haven't started yet, but one of our users is in start, // we're still good to go. @@ -197,14 +291,19 @@ ZmqTransmitterSvc::outputSocket(size_t nMsg, size_t hwm, boost::hash<pair<string, unsigned int>> hasher{}; size_t id = hasher(make_pair(forwardType, count)); + // Batch size for sending is such that all credit (and messages) + // should be sent in 10 parts of a second, so keep the interval + // larger than 10 seconds + auto interval = (i != 0 && i < 12) ? 
12 : i; + debug() << "Registering forward type: " << forwardType << endmsg << " id: " << id << endmsg << " connection: " << outputInfo.second << endmsg << " message size: " << nMsg << endmsg << " high water mark: " << hwm << endmsg + << " check every send: " << checkEverySend << endmsg << " interval: " << interval << endmsg; - Transmitter::Forwarder forward{nMsg, hwm, std::move(outputInfo), internalCon("internal", count), std::move(forwardType), sourceID, @@ -212,6 +311,7 @@ ZmqTransmitterSvc::outputSocket(size_t nMsg, size_t hwm, auto internal = zmq().socket(zmq::PAIR); zmq::setsockopt(internal, zmq::LINGER, 0); + zmq::setsockopt(internal, zmq::SNDTIMEO, 100); internal.bind(forward.internalCon.c_str()); zmq().send(*m_internal, Monitoring::s_Command, zmq::SNDMORE); @@ -227,27 +327,58 @@ ZmqTransmitterSvc::outputSocket(size_t nMsg, size_t hwm, return r; } +//=============================================================================== +void ZmqTransmitterSvc::publish(const string pubCon) const { + + auto internal = zmq().socket(zmq::PAIR); + zmq::setsockopt(internal, zmq::LINGER, 0); + internal.connect(pubCon.c_str()); + + zmq::pollitem_t items [] = { + { internal, 0, ZMQ_POLLIN, 0 }, + }; + + while(true) { + zmq::poll(&items[0], 1, 1000); + if (items[0].revents & ZMQ_POLLIN) { + auto msg = zmq().receive<string>(internal); + if (msg == Monitoring::s_Terminate) { + break; + } else { + warning() << "Publish thread got unknown message: " + << msg << endmsg; + } + } + zmq().send(internal, Monitoring::s_Publish); + } +} + //=============================================================================== void ZmqTransmitterSvc::sendCheck(const string checkCon, const string debugCon) const { auto dbg = zmq().socket(zmq::PAIR); zmq::setsockopt(dbg, zmq::LINGER, 0); + zmq::setsockopt(dbg, zmq::SNDTIMEO, 100); dbg.connect(debugCon.c_str()); auto sendDebug = [this, &dbg](string message) { - zmq().send(dbg, message); + try { + zmq().send(dbg, message); + } catch (const zmq::error_t&) { + debug() << message << endmsg; + } }; auto internal = zmq().socket(zmq::PAIR); zmq::setsockopt(internal, zmq::LINGER, 0); internal.connect(checkCon.c_str()); - sendDebug("sendCheck: connected internal connection to " + checkCon); + sendDebug("connected internal connection to " + checkCon); auto trigger = zmq().socket(zmq::SUB); zmq::setsockopt(trigger, zmq::LINGER, 0); trigger.connect(m_triggerCon.c_str()); zmq::setsockopt(trigger, zmq::SUBSCRIBE, ""); - sendDebug("sendCheck: connected trigger connection to " + m_triggerCon); + sendDebug("connected trigger connection to " + m_triggerCon); zmq::pollitem_t items [] = { { internal, 0, ZMQ_POLLIN, 0 }, @@ -258,7 +389,9 @@ void ZmqTransmitterSvc::sendCheck(const string checkCon, const string debugCon) using KeyHash = boost::hash<Key>; // Map of type for which we are checking to {type connected, check interval for type} unordered_map<Key, pair<bool, unsigned int>, KeyHash> checkInfo; - + unordered_map<Key, bool, KeyHash> checking; + + bool stop = false; unordered_map<size_t, string> id2Type; @@ -337,28 +470,24 @@ void ZmqTransmitterSvc::sendCheck(const string checkCon, const string debugCon) } } - auto checkAgain = [&intervals, &checkInfo, maxTries](const tuple<string, size_t, bool>& t) { + auto checkAgain = [&intervals, &checkInfo, &checking, maxTries](const tuple<string, size_t, bool>& t) { Key key{std::get<0>(t), std::get<1>(t)}; - // Start checking this forwardType again. 
- auto it = intervals.find(key); - if (it == end(intervals)) { - // If we were not yet checking, start now with the registered interval - // if the registered interval is 0, it means no regular check, so force a - // check by setting the interval to 1. - auto interval = checkInfo[key].second != 0 ? checkInfo[key].second : 1; - intervals.emplace(key, make_pair(interval, maxTries)); - } else if (checkInfo[key].second != 0) { - // If we were already checking, extend the interval back to the original - it->second.first = checkInfo[key].second; + // Check this forward type now. + auto it = checking.find(key); + if (it == end(checking) || !it->second) { + checking[key] = false; + intervals[key] = make_pair(500, maxTries); } }; - auto shouldCheck = [](const decltype(intervals)::value_type& entry) { - return entry.second.first <= 0 && entry.second.second > 0; + auto shouldCheck = [&checking](const decltype(intervals)::value_type& entry) { + return !checking[entry.first] && entry.second.first <= 0 && entry.second.second > 0; }; vector<tuple<string, size_t, bool>> connected; std::set<Key> tried; + double timeout = 999; + while (!stop) { double diff = 0.; @@ -367,61 +496,43 @@ void ZmqTransmitterSvc::sendCheck(const string checkCon, const string debugCon) if (interval.second.first > 0) interval.second.first -= diff; } - // if (msgLevel(MSG::DEBUG)) { - // stringstream s; - // s << "Poll returned: " << connected.size(); - // for (size_t i = 0; i < connected.size(); ++i) { - // s << " " << i << " " << connected[i].first << " " << connected[i].second; - // } - // s << " " << diff; - // sendDebug(s.str()); - // sendDebug(string{"stop: "} + to_string(stop)); - // s.str(string{}); - - // s << "Intervals:"; - // for (const auto& entry : intervals) { - // s << " " << entry.first << " " << entry.second.first << " " << entry.second.second; - // } - // sendDebug(s.str()); - // s.str(string{}); - - // s << "Tried:"; - // for (const auto& entry : tried) { - // s << " " << entry; - // } - // sendDebug(s.str()); - // s.str(string{}); - // } - - if (stop) break; + if (UNLIKELY(msgLevel(MSG::VERBOSE))) { + stringstream s; + s << "Poll returned: " << connected.size(); + for (size_t i = 0; i < connected.size(); ++i) { + s << " " << i << " " << std::get<0>(connected[i]) << " " + << std::get<1>(connected[i]) << " " << std::get<2>(connected[i]); + } + s << " " << diff; + sendDebug(s.str()); + sendDebug(string{"stop: "} + to_string(stop)); + s.str(string{}); + + s << "Intervals:"; + for (const auto& entry : intervals) { + s << " " << entry.first.first << " " << entry.first.second + << " " << entry.second.first << " " << entry.second.second; + } + sendDebug(s.str()); + s.str(string{}); - for (const auto& entry : connected) { - if (!std::get<0>(entry).empty() and !std::get<2>(entry)) { - checkAgain(entry); + s << "Tried:"; + for (const auto& entry : tried) { + s << " " << entry.first << " " << entry.second; } - } + sendDebug(s.str()); + s.str(string{}); - // If nobody was connected, reduce all trial counts by 1 and remove those that reached the max tried. 
- if (none_of(begin(connected), end(connected), [](const decltype(connected)::value_type& entry) { - return std::get<2>(entry); - })) { - auto tr = tried.begin(); - while(tr != end(tried)) { - auto it = intervals.find(*tr); - if (it != end(intervals)) { - --(it->second.second); - if (it->second.second == 0) { - tr = tried.erase(tr); - intervals.erase(it); - } else { - ++tr; - } - } else { - ++tr; - } + s << "Checking:"; + for (const auto& entry : checking) { + s << " " << entry.first.first << " " << entry.first.second << " " + << entry.second; } + sendDebug(s.str()); } + if (stop) break; + for (const auto& entry : connected) { if (std::get<0>(entry).empty()) continue; Key key{std::get<0>(entry), std::get<1>(entry)}; @@ -434,9 +545,9 @@ void ZmqTransmitterSvc::sendCheck(const string checkCon, const string debugCon) } if (std::get<2>(entry)) { - Key key{std::get<0>(entry), std::get<1>(entry)}; tried.erase(key); - + checking[key] = false; + if (!it->second.first) { it->second.first = true; @@ -451,63 +562,91 @@ void ZmqTransmitterSvc::sendCheck(const string checkCon, const string debugCon) } } + for (const auto& entry : connected) { + if (!std::get<0>(entry).empty() && !std::get<2>(entry)) { + checkAgain(entry); + } + } + + timeout -= diff; + + if (timeout > 0) continue; + + timeout = 999; + + // If nobody was connected, reduce all trial counts by 1 and remove those that reached the max tried. + if (none_of(begin(connected), end(connected), [](const decltype(connected)::value_type& entry) { + return std::get<2>(entry); + }) && timeout < 0) { + auto tr = tried.begin(); + while(tr != end(tried)) { + auto it = intervals.find(*tr); + if (it != end(intervals)) { + --(it->second.second); + if (it->second.second == 0) { + tr = tried.erase(tr); + intervals.erase(it); + checking[*tr] = false; + } else { + ++tr; + } + } else { + ++tr; + } + } + } + // Find out which ones should be checked for connectivity for (auto& interval : intervals) { if (!shouldCheck(interval)) continue; - // Flag that we are no longer connected + // Flag that we are no longer connected and checking + checking[interval.first] = true; auto it = checkInfo.find(interval.first); - if (it != end(checkInfo)) it->second.first = false; + if (it != end(checkInfo)) { + it->second.first = false; + } + + stringstream s; + s << "Sending check message for type " << interval.first.first << " " << interval.first.second; + sendDebug(string{s.str()}); // Send check message zmq().send(internal, Monitoring::s_Publish, zmq::SNDMORE); zmq().send(internal, interval.first.first, zmq::SNDMORE); zmq().send(internal, interval.first.second); + tried.insert(interval.first); } } } //=============================================================================== -void ZmqTransmitterSvc::transmit() const { +void ZmqTransmitterSvc::transmit(zmq::socket_t& internal, zmq::socket_t& check, + zmq::socket_t& pub, + std::exception_ptr& pubException, + std::exception_ptr& checkException) const { boost::regex reInfoSvc{"tcp://\\*(:[0-9]+)"}; std::unordered_map<size_t, zmq::socket_t> dataInConnections; std::unordered_map<std::string, std::pair<size_t, zmq::socket_t>> svcConnections; - // check if our info service is tcp, such that we can create the right - // connection string for the remote check sender to connect to. 
- - using Options = vector<pair<zmq::SocketOptions, int>>; - auto internalSocket = [this](string con, bool bind = true, - Options options = Options{}) -> zmq:: socket_t{ - auto s = zmq().socket(zmq::PAIR); - zmq::setsockopt(s, zmq::LINGER, 0); - for (const auto& e : options) zmq::setsockopt(s, e.first, e.second); - bind ? s.bind(con.c_str()) : s.connect(con.c_str()); - return s; - }; - - auto internal = internalSocket(internalCon("internal"), false); - auto checkCon = internalCon("check"); - auto check = internalSocket(checkCon, true, {{zmq::SNDTIMEO, 100}}); auto debugCon = internalCon("debug"); auto dbg = internalSocket(debugCon, true); - std::thread checkThread{[this, checkCon, debugCon] { - sendCheck(checkCon, debugCon); - }}; - std::vector<zmq::pollitem_t> items; items.push_back({internal, 0, ZMQ_POLLIN, 0}); items.push_back({check, 0, ZMQ_POLLIN, 0}); + items.push_back({pub, 0, ZMQ_POLLIN, 0}); items.push_back({dbg, 0, ZMQ_POLLIN, 0}); using Messages = vector<zmq::message_t>; // key: forward type - // value: (JobInfo, output socket, messages to be send, connected, checking) - using InfoTuple = tuple<Monitoring::JobInfo, zmq::socket_t, deque<Messages>, bool, bool>; + // value: (JobInfo, output socket, messages to be send, connected, checking, + // credit, batchSize, number of publishes) + using InfoTuple = tuple<Monitoring::JobInfo, zmq::socket_t, deque<Messages>, bool, bool, + size_t, size_t, size_t>; using Key = pair<string, size_t>; using Infos = unordered_map<Key, InfoTuple, boost::hash<Key>>; Infos infos; @@ -517,24 +656,19 @@ void ZmqTransmitterSvc::transmit() const { Forwarders forwarders; auto sendCheckMsg = [this] (zmq::socket_t& output, const Monitoring::JobInfo& jobInfo) { - // Send our connection state to the check thread - debug() << "Sending check message to request reply to " << jobInfo.connection << endmsg; + if (UNLIKELY(msgLevel(MSG::DEBUG))) { + debug() << "Sending check message to request reply to " << jobInfo.connection << endmsg; + } zmq().send(output, Monitoring::s_Check, zmq::SNDMORE); zmq().send(output, jobInfo); }; - auto sendMessage = [this](zmq::socket_t& output, Messages& msgs) { - size_t n = msgs.size(); - for (unsigned int i = 0; i < n; ++i) { - zmq().send(output, msgs[i], (i < (n - 1)) ? 
zmq::SNDMORE : 0); - } - }; - // Output socket factory lambda auto outputFactory = [this](zmq::SocketTypes type, const std::string con) { auto output = [this, con, type] { zmq::socket_t s = zmq().socket(type); zmq::setsockopt(s, zmq::LINGER, 0); + zmq::setsockopt(s, zmq::SNDHWM, 10000); s.connect(con.c_str()); return s; }; @@ -550,18 +684,17 @@ void ZmqTransmitterSvc::transmit() const { std::unordered_map<Key, bool, boost::hash<Key>> recreated; auto checkRecreated = [&recreated](const Key& k) { return recreated.count(k) && recreated[k]; }; + m_ok = true; + while (true) { // Process messages from all sockets zmq::poll (&items[0], items.size(), 1000); for (const auto& entry : infos) recreated[entry.first] = false; if (items[0].revents & ZMQ_POLLIN) { - auto typeMsg = zmq().receive<zmq::message_t>(internal); - auto type = zmq().decode<string>(typeMsg); + auto type = zmq().receive<string>(internal); if (type == Monitoring::s_Command) { - auto cmdMsg = zmq().receive<zmq::message_t>(internal); - auto cmd = zmq().decode<string>(cmdMsg); - zmq().send(check, typeMsg, zmq::SNDMORE); + auto cmd = zmq().receive<string>(internal); if (cmd == Monitoring::s_Register) { // If a register command is received, add that forward destination // Receive the forwarder information @@ -605,14 +738,16 @@ void ZmqTransmitterSvc::transmit() const { << forwarder.internalCon.c_str() << endmsg; } items.push_back({r.first->second, 0, ZMQ_POLLIN, 0}); - InfoTuple t = make_tuple(std::move(info), makeOutput(), deque<Messages>{}, false, true); + InfoTuple t = make_tuple(std::move(info), makeOutput(), deque<Messages>{}, false, true, + 0, 0, 0); infos.emplace(make_pair(ft, id), std::move(t)); // Store the forwarder info forwarders.emplace(make_pair(ft, id), make_pair(std::move(forwarder), std::move(makeOutput))); // Finally forward to the check thread - zmq().send(check, cmdMsg, zmq::SNDMORE); + zmq().send(check, type, zmq::SNDMORE); + zmq().send(check, cmd, zmq::SNDMORE); zmq().send(check, ft, zmq::SNDMORE); zmq().send(check, id, zmq::SNDMORE); zmq().send(check, interval); @@ -621,10 +756,10 @@ void ZmqTransmitterSvc::transmit() const { debug() << "Check thread reports " << (success ? "" : "un") << "successful registration of forwarder of type " << ft << " with interval " << interval << endmsg; + // send result of registration to outputSocket call zmq().send(internal, success); } else if (cmd == Monitoring::s_Terminate) { - zmq().send(check, cmdMsg); break; } } @@ -642,68 +777,80 @@ void ZmqTransmitterSvc::transmit() const { << " with id " << id << " and message type " << msgType << endmsg; } else if (msgType == Monitoring::s_Publish) { // If we are not checking for a connection, we've been connected before, but we want to recheck. 
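+ // If this forward type is not already being checked, mark it as
+ // unverified so that a fresh check message is sent below.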
- debug() << "Received publish message for forward type: " << forwardType - << ", id: " << id << ", connected: " << infoConnected(it->second) - << ", checking: " << infoChecking(it->second) << endmsg; - if (!infoChecking(it->second)) { + auto checking = infoChecking(it->second); + if (!checking) { infoConnected(it->second) = false; infoChecking(it->second) = true; } // If we are not connected, send a check message if (!infoConnected(it->second)) { + if (UNLIKELY(msgLevel(MSG::DEBUG))) { + debug() << "Sending check message for forward type: " + << forwardType << ", id: " << id << endmsg; + } sendCheckMsg(dataOut(it->second), jobInfo(it->second)); } } } - // Debug messages from check thread - if (items[2].revents & ZMQ_POLLIN) { - debug() << "sendCheck: " << zmq().receive<string>(dbg) << endmsg; - } - // Loop over data input connections to process messages for (auto& entry : dataInConnections) { if (items[entry.first].revents & ZMQ_POLLIN) { bool more = true; - auto forwardTypeMsg = zmq().receive<zmq::message_t>(entry.second, &more); - auto forwardType = zmq().decode<string>(forwardTypeMsg); + auto forwardType = zmq().receive<string>(entry.second, &more); boost::optional<size_t> id; Key key; - auto it = end(forwarders); - if (more) { - id = zmq().receive<size_t>(entry.second, &more); + + if (forwardType == Monitoring::s_Check) { + forwardType = zmq().receive<string>(entry.second); + id = zmq().receive<size_t>(entry.second); key = Key{forwardType, *id}; - verbose() << "Received data in message for forward type " << forwardType - << " id " << *id << endmsg; - it = forwarders.find(key); - } else { - error() << "Received data in message of only size 1" << endmsg; - } - if (it != end(forwarders)) { - Transmitter::Forwarder& forward = it->second.first; - Messages msgs; - msgs.reserve(forward.nMsg); - msgs.emplace_back(std::move(forwardTypeMsg)); - for (unsigned int i = 1; i < forward.nMsg; ++i) { - if (!more) { - throw ZMQ::MoreException{}; - } - msgs.emplace_back(zmq().receive<zmq::message_t>(entry.second, &more)); - } - // If we are not connected and we are above the high water mark, throw some data away. auto infoIt = infos.find(key); - auto& messages = infoMessages(infoIt->second); - if (messages.size() > forward.hwm) { - messages.pop_front(); + if (infoIt != end(infos) && infoConnected(infoIt->second)) { + const auto& messages = infoMessages(infoIt->second); + size_t credit = forwarders[key].first.hwm - messages.size(); + size_t batchSize = std::get<6>(infoIt->second); + zmq().send(entry.second, credit < batchSize ? credit : batchSize); + } else { + zmq().send(entry.second, 0); } - if (!infoConnected(infoIt->second)) { - messages.emplace_back(std::move(msgs)); + } else { + auto it = end(forwarders); + if (more) { + id = zmq().receive<size_t>(entry.second, &more); + key = Key{forwardType, *id}; + it = forwarders.find(key); } else { - sendMessage(dataOut(infoIt->second), msgs); + error() << "Received data in message of only size 1" << endmsg; + } + if (it != end(forwarders)) { + Transmitter::Forwarder& forward = it->second.first; + Messages msgs; + msgs.reserve(forward.nMsg); + msgs.emplace_back(zmq().encode(forwardType)); + for (unsigned int i = 1; i < forward.nMsg; ++i) { + if (!more) { + throw ZMQ::MoreException{}; + } + msgs.emplace_back(zmq().receive<zmq::message_t>(entry.second, &more)); + } + if (more) { + error() << "Received too many messages for forward type: " << forwardType << endmsg; + } + // If we are not connected and we are above the high water mark, throw some data away. 
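+ // The message is buffered regardless of connection state; buffered
+ // messages are drained on the publish tick, and the oldest entry is
+ // dropped once the buffer exceeds the forwarder's HWM.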
+ auto infoIt = infos.find(key); + auto& messages = infoMessages(infoIt->second); + if (messages.size() > forward.hwm) { + if (UNLIKELY(msgLevel(MSG::DEBUG))) { + debug() << "Removing buffered message, above HWM of " << forward.hwm << endmsg; + } + messages.pop_front(); + } + messages.emplace_back(std::move(msgs)); + } else if (id) { + warning() << "Received forward message for unknown forward type: " << forwardType + << " id: " << *id << endmsg; } - } else if (id) { - warning() << "Received forward message for unknown forward type: " << forwardType - << " id: " << *id << endmsg; } } } @@ -717,8 +864,10 @@ void ZmqTransmitterSvc::transmit() const { if (msgType == Monitoring::s_Check) { auto forwardType = zmq().receive<string>(svcSocket); auto id = zmq().receive<size_t>(svcSocket); - debug() << "Got reply to check message on service connection " << entry.second.first - << " for type " << forwardType << " id " << id << endmsg; + if (UNLIKELY(msgLevel(MSG::DEBUG))) { + debug() << "Got reply to check message on service connection " << entry.second.first + << " for type " << forwardType << " id " << id << endmsg; + } // We got a reply to our check message in the form or a check request, so we are connected and // can stop checking. Reply with our service connection so the other side can check that it // sent the request to the right connection. @@ -728,6 +877,11 @@ void ZmqTransmitterSvc::transmit() const { infoConnected(info) = true; infoChecking(info) = false; + auto& queue = infoMessages(info); + auto credit = queue.size() < 1000 ? 667 : queue.size(); + std::get<5>(info) = boost::numeric_cast<size_t>(credit * 1.5); + std::get<6>(info) = std::max(std::get<5>(info) / 10 + 1, 200ul); + // Send the connection that others should connect to as reply string infoSvcOut = boost::regex_replace(entry.first, reInfoSvc, string{"tcp://"} + Monitoring::hostname() + "$1"); @@ -744,28 +898,87 @@ void ZmqTransmitterSvc::transmit() const { } } - for (auto& info : infos) { - InfoTuple& infos = info.second; - auto& forwardType = info.first; - unsigned int hwm = forwarders[forwardType].first.hwm; - if (!infoConnected(infos) && !infoChecking(infos) && (infoMessages(infos).size() > (hwm >> 2))) { - infoChecking(infos) = true; - if (!checkRecreated(forwardType)) dataOut(infos) = forwarders[forwardType].second(); - sendCheckMsg(dataOut(infos), jobInfo(infos)); - } + // Send messages + if (items[2].revents & ZMQ_POLLIN) { + zmq().receive<string>(pub); + for (auto& info : infos) { + InfoTuple& infoTuple = info.second; + const auto& forwardType = info.first; + + auto& nPub = std::get<7>(infoTuple); + ++nPub; + + if (!infoConnected(infoTuple) + && !infoMessages(infoTuple).empty()) { + infoChecking(infoTuple) = true; + if (!checkRecreated(forwardType) + && (nPub != 0) && (nPub % 5 == 0)) { + + if (UNLIKELY(msgLevel(MSG::DEBUG))) { + debug() << "Recreating socket for forward type " + << forwardType << endmsg; + } - if (infoConnected(infos) && !infoMessages(infos).empty()) { - if (msgLevel(MSG::DEBUG) && infoMessages(infos).size() > 1) - debug() << "Sending " << infoMessages(infos).size() << " buffered messages." 
<< endmsg; - zmq::socket_t& output = dataOut(infos); - for (auto& entry : infoMessages(infos)) { - sendMessage(output, entry); + dataOut(infoTuple) = forwarders[forwardType].second(); + recreated[info.first] = true; + } + sendCheckMsg(dataOut(infoTuple), jobInfo(infoTuple)); } - infoMessages(infos).clear(); - if (forwarders[forwardType].first.checkEverySend) infoConnected(infos) = false; + + if (infoConnected(infoTuple) && !infoMessages(infoTuple).empty()) { + + auto& credit = std::get<5>(infoTuple); + auto batchSize = std::get<6>(infoTuple); + + zmq::socket_t& output = dataOut(infoTuple); + auto& messages = infoMessages(infoTuple); + + // Send either batchSize messages or all remaining if + // fewer than batchSize + // Send never more than the remaining credit + auto n = std::min(credit, std::min(batchSize, messages.size())); + + if (UNLIKELY(msgLevel(MSG::DEBUG) && n > 0)) { + debug() << "Sending " << n << " buffered messages for " + << forwardType << endmsg; + } + + for (size_t i = 0; i < n; ++i) { + auto& message = messages.front(); + size_t nMsg = message.size(); + for (unsigned int i = 0; i < nMsg; ++i) { + zmq().send(output, message[i], (i < (nMsg - 1)) ? zmq::SNDMORE : 0); + } + messages.pop_front(); + } + credit -= n; + + if (forwarders[forwardType].first.checkEverySend && nPub == 10) { + infoConnected(infoTuple) = false; + } + } + if (nPub == 10) nPub = 0; } } - } - checkThread.join(); + // Debug messages from check thread + if (items[3].revents & ZMQ_POLLIN) { + auto msg = zmq().receive<string>(dbg); + if (UNLIKELY(msgLevel(MSG::DEBUG))) { + debug() << "sendCheck: " << msg << endmsg; + } + } + + // If an exception has occurred in one of the daughter threads, send the other one a terminate and then exit ourselves + if (pubException && !checkException) { + error() << "Publish exception was caught, exiting transmit." << endmsg; + zmq().send(check, Monitoring::s_Command, zmq::SNDMORE); + zmq().send(check, Monitoring::s_Terminate); + break; + } else if (checkException && !pubException) { + error() << "Check exception was caught, exiting transmit." 
<< endmsg; + zmq().send(pub, Monitoring::s_Terminate); + break; + } + } } diff --git a/Online/Hlt2Monitoring/src/component/ZmqTransmitterSvc.h b/Online/Hlt2Monitoring/src/component/ZmqTransmitterSvc.h index bd3ccf240a0982c5aa2ae46c1d5ba16bf6342725..a025f0d9945d69e186ba69cf5e040d65e7710144 100644 --- a/Online/Hlt2Monitoring/src/component/ZmqTransmitterSvc.h +++ b/Online/Hlt2Monitoring/src/component/ZmqTransmitterSvc.h @@ -8,6 +8,8 @@ #include <boost/optional.hpp> // Include files +#include <GaudiKernel/IIncidentSvc.h> +#include <GaudiKernel/IIncidentListener.h> #include <GaudiKernel/Service.h> // from Hlt2Monitoring @@ -100,10 +102,17 @@ class GAUDI_API ZmqTransmitterSvc : public extends<Service, ITransmitterSvc> { return m_application; } + bool ok() const override + { + return m_ok; + } + private: + using Options = std::vector<std::pair<zmq::SocketOptions, int>>; + std::string infoSvcCon(const std::string& dataCon) const { - if (dataCon.substr(0, 4) == "tcp") { + if (dataCon.substr(0, 3) == "tcp") { return std::string{"tcp://*:"} + std::to_string(m_infoPort); } else { return std::string{"ipc://"} + m_connectionPath + "/" + name() + "_" + std::to_string(::getpid()); @@ -115,26 +124,45 @@ class GAUDI_API ZmqTransmitterSvc : public extends<Service, ITransmitterSvc> { } void sendCheck(const std::string internalCon, const std::string debugCon) const; + void publish(const std::string internalCon) const; std::string internalCon(const std::string& type, unsigned int i = 0) const { return std::string{"inproc://"} + name() + "_" + type + "_" + std::to_string(i); } + zmq::socket_t internalSocket(std::string con, bool bind = true, + Options options = Options{}) const + { + auto s = zmq().socket(zmq::PAIR); + zmq::setsockopt(s, zmq::LINGER, 0); + for (const auto& e : options) zmq::setsockopt(s, e.first, e.second); + bind ? 
s.bind(con.c_str()) : s.connect(con.c_str()); + return s; + }; + + void setup(); - void transmit() const; + void transmitWrapper(); + void transmit(zmq::socket_t& internal, zmq::socket_t& check, + zmq::socket_t& pub, + std::exception_ptr& pubException, + std::exception_ptr& checkException) const; std::pair<std::string, std::string> m_application; std::string m_hostRegex; unsigned int m_infoPort; std::string m_connectionPath; + unsigned int m_maxMonRestart; + mutable std::atomic<bool> m_ok{false}; SmartIF<IZeroMQSvc> m_zmqSvc; std::unique_ptr<zmq::socket_t> m_trigger; std::string m_triggerCon; - unsigned int m_internalConCounter; + unsigned int m_internalConCounter = 1; std::unique_ptr<zmq::socket_t> m_internal; std::unique_ptr<std::thread> m_thread; + std::exception_ptr m_transmitException = nullptr; }; #endif // ZMQTRANSMITTERSVC_H diff --git a/Online/Hlt2Monitoring/src/lib/HistoUtils.cpp b/Online/Hlt2Monitoring/src/lib/HistoUtils.cpp index 8c8bf380c49c9688c179fa439463a40bb34d0f1f..7a1c87b61f95fc0aa16b389c520cf1896461589b 100644 --- a/Online/Hlt2Monitoring/src/lib/HistoUtils.cpp +++ b/Online/Hlt2Monitoring/src/lib/HistoUtils.cpp @@ -7,17 +7,41 @@ // boost #include <boost/numeric/conversion/cast.hpp> +#include <boost/iostreams/filter/bzip2.hpp> +#include <boost/iostreams/filtering_stream.hpp> +#include <boost/iostreams/device/file.hpp> +#include <boost/iostreams/stream.hpp> + +#include <boost/filesystem.hpp> + +#include <boost/archive/text_iarchive.hpp> + +// range v3 +#include <range/v3/algorithm.hpp> +#include <range/v3/view.hpp> + // ROOT #include <THashList.h> #include <TObjString.h> // local -#include "Hlt2Monitoring/HistoUtils.h" +#include <Hlt2Monitoring/Types.h> +#include <Hlt2Monitoring/HistoUtils.h> +#include <Hlt2Monitoring/Histo1DDef.h> +#include <Hlt2Monitoring/Histo2DDef.h> +#include <Hlt2Monitoring/InfoUtils.h> namespace { using std::string; using std::tuple; using std::vector; + using std::ofstream; + using namespace ranges; + using boost::math::sign; + using boost::math::epsilon_difference; + + namespace io = boost::iostreams; + namespace fs = boost::filesystem; } tuple<int, double, double, vector<double>> axisDefinition(const Gaudi::Axis& axis) { @@ -60,3 +84,117 @@ vector<string> getLabels(const Gaudi::Axis& axis) { } return labels; } + +bool same_bins(double ll, double lh, int lb, + double rl, double rh, int rb) { + if (lb != rb) { + return false; + } else if ((sign(ll) != sign(rl)) + || (sign(lh) != sign(rh))) { + return false; + } else if ((epsilon_difference(ll, rl) > 2) + || (epsilon_difference(lh, rh) > 2)) { + return false; + } else { + return true; + } +} + +bool same_labels(const std::vector<std::string>& ll, + const std::vector<std::string>& rl) { + if (ll.size() != rl.size()) { + return false; + } else { + return all_of(view::zip(ll, rl), [](const std::tuple<std::string, std::string>& t) { + return std::get<0>(t) == std::get<1>(t); + }); + } +} + +// Hash a Histo1DDef +size_t Monitoring::hash_value(const Monitoring::Histo1DDef& def) { + std::size_t h = 0; + boost::hash_combine(h, def.title); + if (def.variable) { + boost::hash_combine(h, def.xedges); + } else { + boost::hash_combine(h, def.xlow); + boost::hash_combine(h, def.xhigh); + boost::hash_combine(h, def.xbins); + } + if (def.labels) { + boost::hash_combine(h, def.xlabels); + } + return h; +} + +// Hash a Histo2DDef +size_t Monitoring::hash_value(const Monitoring::Histo2DDef& def) { + std::size_t h = 0; + boost::hash_combine(h, def.title); + if (def.xvariable) { + boost::hash_combine(h, def.xedges); + } 
else { + boost::hash_combine(h, def.xlow); + boost::hash_combine(h, def.xhigh); + boost::hash_combine(h, def.xbins); + } + if (def.yvariable) { + boost::hash_combine(h, def.yedges); + } else { + boost::hash_combine(h, def.ylow); + boost::hash_combine(h, def.yhigh); + boost::hash_combine(h, def.ybins); + } + if (def.labels) { + boost::hash_combine(h, def.xlabels); + boost::hash_combine(h, def.ylabels); + } + return h; +} + +std::pair<Monitoring::HistoMap::const_iterator, bool> +addHistogram(Monitoring::HistoMap& histograms, + const Monitoring::HistoKey& key, + const string& type, + MonInfo::HistoVariant variant) { + size_t hash = boost::hash<MonInfo::HistoVariant>{}(variant); + auto vit = histograms.get<MonInfo::ByContent>().find(hash); + std::shared_ptr<MonInfo::HistoVariant> shared; + if (vit == end(histograms.get<MonInfo::ByContent>())) { + // Completely new + shared = std::make_shared<MonInfo::HistoVariant>(std::move(variant)); + } else { + shared = vit->cnt; + } + return histograms.emplace(std::move(key), std::move(type), + hash, std::move(shared)); +} + +size_t loadHistoInfo(Monitoring::HistoMap& histograms, string input_file) { + fs::path filename = fs::path{input_file}; + if (!fs::exists(filename)) { + return 0; + } + + io::stream<io::file_source> input(filename.string(), ofstream::in | ofstream::binary); + io::filtering_istream filter; + filter.push(io::bzip2_decompressor()); + filter.push(input); + boost::archive::text_iarchive ta(filter); + + Monitoring::HistoMap tmp; + size_t n_read = 0; + while (!filter.eof()) { + try { + ta >> tmp; + ++n_read; + } catch (const boost::archive::archive_exception&) { + break; + } + for (const auto& entry : tmp) { + addHistogram(histograms, entry.key, entry.type, entry.content()); + } + } + return n_read; +} diff --git a/Online/Hlt2Monitoring/src/lib/Utilities.cpp b/Online/Hlt2Monitoring/src/lib/Utilities.cpp index 4776eca4ee8c13638787e0e9a474c1093aff40ee..54ff57ae33cea49d8c6e61f423b7ca8063e012c3 100644 --- a/Online/Hlt2Monitoring/src/lib/Utilities.cpp +++ b/Online/Hlt2Monitoring/src/lib/Utilities.cpp @@ -1,23 +1,12 @@ -// Include files #include <iostream> #include <string> #include <map> -// boost #include <boost/numeric/conversion/cast.hpp> #include <boost/range/iterator_range.hpp> #include <boost/lexical_cast.hpp> - -// ROOT -#include <TDirectory.h> -#include <TClass.h> -#include <TFile.h> -#include <TKey.h> - -// boost #include <boost/regex.hpp> -// local #include "Hlt2Monitoring/Utilities.h" namespace { @@ -42,16 +31,6 @@ namespace Gaudi { } #endif -#ifdef STANDALONE -IZeroMQSvc* zmqSvc() { - static std::unique_ptr<IZeroMQSvc> zmqSvc; - if (!zmqSvc) { - zmqSvc.reset(new IZeroMQSvc{}); - } - return zmqSvc.get(); -} -#endif - //=============================================================================== unsigned int Monitoring::sourceID(boost::regex regex, string host) { boost::smatch matches; @@ -99,6 +78,10 @@ string Monitoring::hostname() { string hn; if (!gethostname(hname, sizeof(hname))) { hn = string{hname}; + auto pos = hn.find('.'); + if (pos != string::npos) { + hn = hn.substr(0, pos); + } } return hn; } diff --git a/Online/Hlt2Monitoring/test/dump_info.cpp b/Online/Hlt2Monitoring/test/dump_info.cpp new file mode 100644 index 0000000000000000000000000000000000000000..34f2a2ebf128156ebee3d1660c97ac3109a86594 --- /dev/null +++ b/Online/Hlt2Monitoring/test/dump_info.cpp @@ -0,0 +1,530 @@ +#include <iostream> +#include <vector> +#include <string> +#include <csignal> +#include <fstream> +#include <random> + +#include <boost/format.hpp> 
+#include <boost/optional.hpp> +#include <boost/functional/hash.hpp> + +#include <boost/archive/text_oarchive.hpp> + +#include <boost/serialization/shared_ptr.hpp> +#include <boost/serialization/serialization.hpp> +#include <boost/serialization/set.hpp> +#include <boost/serialization/map.hpp> +#include <boost/serialization/string.hpp> +#include <boost/serialization/vector.hpp> +#include <boost/serialization/shared_ptr.hpp> +#include <boost/serialization/variant.hpp> + +#include <boost/iostreams/filter/bzip2.hpp> +#include <boost/iostreams/filtering_stream.hpp> +#include <boost/iostreams/device/file.hpp> +#include <boost/iostreams/stream.hpp> + +#include <boost/program_options.hpp> + +#include <boost/filesystem.hpp> + +#include <zmq/zmq.hpp> +#include <ZeroMQ/functions.h> +#include <ZeroMQ/IZeroMQSvc.h> + +#include <Hlt2Monitoring/Types.h> +#include <Hlt2Monitoring/Histo1DDef.h> +#include <Hlt2Monitoring/Histo2DDef.h> +#include <Hlt2Monitoring/HistoUtils.h> +#include <Hlt2Monitoring/InfoUtils.h> +#include <Hlt2Monitoring/Utilities.h> + +namespace { + using std::cout; + using std::endl; + using std::vector; + using std::string; + using std::ofstream; + + using std::array; + using std::set; + using std::vector; + using std::string; + using std::to_string; + using std::unique_ptr; + using std::make_pair; + using std::make_tuple; + using std::pair; + using std::map; + using std::unordered_set; + + using boost::optional; + + using Monitoring::HistoKey; + using Monitoring::HistoKeys; + using Monitoring::HistoPub; + using Monitoring::HistoMap; + using Monitoring::Histo1DDef; + using Monitoring::Histo2DDef; + + using MonInfo::ByKey; + using MonInfo::ByContent; + using MonInfo::HistoVariant; + using MonInfo::HistoEntry; + + namespace io = boost::iostreams; + namespace fs = boost::filesystem; + namespace po = boost::program_options; + + volatile std::sig_atomic_t interrupted = 0; +} + +IZeroMQSvc& zmqSvc() { + static std::unique_ptr<IZeroMQSvc> svc; + if (!svc) { + svc = std::make_unique<IZeroMQSvc>(); + } + return *svc; +} + +void signal_handler(int) +{ + interrupted = 1; +} + +// Sync in the old way +optional<bool> syncOld(Monitoring::HistoMap& histograms, const HistoKeys& written, std::string connection); + +// Sync in the new way +optional<bool> syncNew(Monitoring::HistoMap& histograms, const HistoKeys& written, std::string connection); + +// Decode histo information +bool decodeHistoInfo(const vector<zmq::message_t>& msgs, Monitoring::HistoMap& histograms); + +int main(int ac, char* av[]) { + + string method; + unsigned int port; + + // Declare the supported options. + po::options_description desc("Allowed options"); + desc.add_options() + ("help", "produce help message") + ("method,m", po::value<string>(&method)->default_value("new"), "input file") + ("input-file,i", po::value<vector<string>>(), "input file") + ("output-file,o", po::value<string>(), "output file") + ("host,h", po::value<vector<string>>(), "host to sync with") + ("port,p", po::value<unsigned int>(&port)->default_value(31352), "port to connect to") + ("node,n", po::value<vector<string>>(), "node to sync with"); + + po::positional_options_description p; + p.add("output-file", 1); + + po::variables_map vm; + po::store(po::command_line_parser(ac, av). 
+ options(desc).positional(p).run(), vm); + po::notify(vm); + + if (vm.count("help")) { + cout << desc << "\n"; + return 1; + } + + if (method != "old" && method != "new") { + cout << "method must be either old or new" << endl; + return 1; + } + + + vector<string> input_files; + if (vm.count("input-file")) { + input_files = vm["input-file"].as< vector<string>>(); + } + + auto output_file = vm["output-file"].as<string>(); + if (fs::exists(fs::path{output_file})) { + input_files.push_back(output_file); + } + + set<string> syncConnections; + if (vm.count("host")) { + for (auto host : vm["host"].as< vector<string>>()) { + syncConnections.emplace(host); + } + } else { + // Build set of all subfarms + vector<string> subfarms; + for (auto rack : string{"abcdef"}) { + for (int row = 1; row < 11; ++row) { + subfarms.emplace_back((boost::format{"hlt%s%02d"} % rack % row).str()); + } + } + + std::mt19937 gen{std::hash<string>{}("dump_info")}; + + auto connection = [port](const string& host) -> string { + return string{"tcp://"} + host + ":" + to_string(port); + }; + + vector<string>::iterator last = subfarms.end() - 1, first = subfarms.begin(); + while (syncConnections.size() < 5 && last != first - 1) { + size_t s = distance(first, last); + auto it = first + std::uniform_int_distribution<size_t>{0, s ? s - 1 : 0}(gen); + auto con = connection(*it); + + zmq::socket_t ping = zmqSvc().socket(zmq::REQ); + zmq::setsockopt(ping, zmq::LINGER, 0); + zmq::setsockopt(ping, zmq::RCVTIMEO, 100); + ping.connect(con.c_str()); + zmqSvc().send(ping, Monitoring::s_Ping); + string r; + + zmq::pollitem_t items[] = { + {ping, 0, ZMQ_POLLIN, 0} + }; + + auto n = zmq::poll(&items[0], 1, 500); + + if (items[0].revents & ZMQ_POLLIN) { + try { + r = zmqSvc().receive<string>(ping); + } catch (const ZMQ::TimeOutException&) { + } + } + + if (n == 0 || r.empty()) { + cout << "no reply from " << con << endl; + } else { + syncConnections.emplace(con); + } + + std::swap(*it, *last); + --last; + } + } + + HistoKeys written; + Monitoring::HistoMap histograms; + + auto n_read = 0; + for (auto input_file : input_files) { + auto n = loadHistoInfo(histograms, input_file); + if (n) { + ++n_read; + cout << "Read " << histograms.size() << " histograms from " << input_file << endl; + } + } + cout << "Read " << n_read << " sets of histograms." 
<< endl; + + + io::stream<io::file_sink> output(output_file, ofstream::out | ofstream::binary); + io::filtering_stream<io::output> filter; + filter.push(io::bzip2_compressor()) ; + filter.push(output); + boost::archive::text_oarchive ta(filter); + + size_t tries = 100; + bool more = true; + + std::signal(SIGINT, signal_handler); + + cout << "Synchronising with:"; + for (auto c : syncConnections) { + cout << " " << c; + } + cout << endl; + + auto it = begin(syncConnections); + while (more && it != end(syncConnections) && interrupted == 0) { + auto connection = *it; + cout << "Synchronising with: " << connection << endl; + while (more && interrupted == 0) { + optional<bool> r; + try { + if (method == "old") { + r = syncOld(histograms, written, connection); + } else { + r = syncNew(histograms, written, connection); + } + } catch (const zmq::error_t& e) { + more = false; + break; + } + if (!r) { + more = true; + break; + } else { + more = *r; + } + + } + ++it; + if (it == end(syncConnections) && tries > 0) { + it = begin(syncConnections); + --tries; + } + } + + if (!histograms.empty()) { + ta << histograms; + cout << "Total: " << histograms.size() << endl; + } +} + +std::pair<HistoMap::const_iterator, bool> addHistogram(HistoMap& histograms, + const HistoKey& key, + const string& type, + HistoVariant variant) { + size_t hash = boost::hash<MonInfo::HistoVariant>{}(variant); + auto vit = histograms.get<ByContent>().find(hash); + std::shared_ptr<HistoVariant> shared; + if (vit == end(histograms.get<ByContent>())) { + // Completely new + shared = std::make_shared<HistoVariant>(std::move(variant)); + } else { + shared = vit->cnt; + } + return histograms.emplace(std::move(key), std::move(type), + hash, std::move(shared)); +} + +optional<bool> syncOld(Monitoring::HistoMap& histograms, + const HistoKeys& written, + std::string connection) +{ + + vector<string> what = {Monitoring::s_HistoInfo}; + + HistoKeys histoKeys; + std::for_each(begin(histograms), end(histograms), + [&histoKeys](const HistoEntry& entry) { + histoKeys.emplace(entry.key); + }); + + std::for_each(begin(written), end(written), + [&histoKeys](const HistoKey& key) { + histoKeys.emplace(key); + }); + + // Connect output request socket + zmq::socket_t out = zmqSvc().socket(zmq::REQ); + zmq::setsockopt(out, zmq::LINGER, 0); + zmq::setsockopt(out, zmq::RCVTIMEO, 1000); + out.connect(connection.c_str()); + + optional<bool> more; + + // Request synchronisation + zmqSvc().send(out, Monitoring::s_Sync, zmq::SNDMORE); + + // Send what we want to synchronise in the right order + zmqSvc().send(out, what, zmq::SNDMORE); + + // Indicate we want all runs, by sending an empty set, and + // the histo keys we have. + zmqSvc().send(out, std::unordered_set<Monitoring::RunNumber>{}, zmq::SNDMORE); + zmqSvc().send(out, histoKeys); + + optional<string> rep; + zmq::pollitem_t items[] = { + {out, 0, ZMQ_POLLIN, 0} + }; + + zmq::poll(&items[0], 1, 5000); + if (items[0].revents & ZMQ_POLLIN) { + rep = zmqSvc().receive<std::string>(out); + } + + if (!rep) { + cout << "Sync request reply timed out." 
<< endl; + return more; + } else if (*rep != "INCOMING") { + cout << "Bad reply to sync request: " << *rep << endl; + return more; + } + + std::unordered_map<Monitoring::RunNumber, size_t> hpr; + + // Old typedef + using HistoPub = std::vector<std::tuple<Monitoring::RunNumber, Monitoring::HistId, + std::string, std::string>>; + + // Histograms + auto histos = zmqSvc().receive<HistoPub>(out); + more = zmqSvc().receive<bool>(out); + for (const auto& entry : histos) { + auto run = std::get<0>(entry); + auto histID = std::get<1>(entry); + hpr[run]++; + + // Update known keys with received info + histoKeys.emplace(run, histID); + + // Entry consists of (RunNumber, HistId, type, info_string), + // where the info string needs to be converted to a message. + vector<zmq::message_t> msgs; + msgs.reserve(5); + msgs.emplace_back(zmqSvc().encode(Monitoring::s_HistoInfo)); + msgs.emplace_back(zmqSvc().encode(run)); + msgs.emplace_back(zmqSvc().encode(histID)); + msgs.emplace_back(zmqSvc().encode(std::get<2>(entry))); + + const auto& infoString = std::get<3>(entry); + zmq::message_t msg{infoString.size()}; + std::copy_n(begin(infoString), infoString.size(), static_cast<char*>(msg.data())); + msgs.emplace_back(std::move(msg)); + decodeHistoInfo(msgs, histograms); + } + + for (const auto& entry : hpr) { + cout << "Decoded " << std::right << std::setw(6) << to_string(entry.second) + << " histograms for run " << entry.first << endl; + } + + return more; +} + +optional<bool> syncNew(Monitoring::HistoMap& histograms, + const HistoKeys& written, + std::string connection) +{ + + vector<string> what = {Monitoring::s_HistoInfo}; + + HistoKeys histoKeys; + std::for_each(begin(histograms), end(histograms), + [&histoKeys](const HistoEntry& entry) { + histoKeys.emplace(entry.key); + }); + + + std::for_each(begin(written), end(written), + [&histoKeys](const HistoKey& key) { + histoKeys.emplace(key); + }); + + // Connect output request socket + zmq::socket_t out = zmqSvc().socket(zmq::REQ); + zmq::setsockopt(out, zmq::LINGER, 0); + zmq::setsockopt(out, zmq::RCVTIMEO, 100); + out.connect(connection.c_str()); + + optional<bool> more; + + // Request synchronisation + zmqSvc().send(out, Monitoring::s_Sync, zmq::SNDMORE); + + // Send what we want to synchronise in the right order + zmqSvc().send(out, what, zmq::SNDMORE); + + // Indicate we want all runs, by sending an empty set, and + // the histo keys we have. + zmqSvc().send(out, std::unordered_set<Monitoring::RunNumber>{}, zmq::SNDMORE); + zmqSvc().send(out, histoKeys); + + optional<string> rep; + zmq::pollitem_t items[] = { + {out, 0, ZMQ_POLLIN, 0} + }; + + zmq::poll(&items[0], 1, 1000); + if (items[0].revents & ZMQ_POLLIN) { + rep = zmqSvc().receive<std::string>(out); + } + + if (!rep) { + cout << "Sync request reply timed out." << endl; + return more; + } else if (*rep != "INCOMING") { + cout << "Bad reply to sync request: " << *rep << endl; + return more; + } + + std::unordered_map<Monitoring::RunNumber, size_t> hpr; + + // Histograms + auto histos = zmqSvc().receive<HistoPub>(out); + more = zmqSvc().receive<bool>(out); + for (const auto& entry : histos) { + const auto& type = std::get<0>(entry); + const auto& info = std::get<1>(entry); + const auto& keys = std::get<2>(entry); + + // Entry consists of (RunNumber, HistId, type, info_string), + // where the info string needs to be converted to a message. 
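+    // Unlike the old protocol above, each entry here carries a single
+    // (type, info) pair together with a list of (run, id) keys: the
+    // five-part message is built once for the first key and only the run
+    // and id frames (msgs[1] and msgs[2]) are replaced for the remaining
+    // keys before each call to decodeHistoInfo().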
+ vector<zmq::message_t> msgs; + msgs.reserve(5); + + for (const auto& key : keys) { + if (msgs.empty()) { + msgs.emplace_back(zmqSvc().encode(Monitoring::s_HistoInfo)); + msgs.emplace_back(zmqSvc().encode(key.first)); + msgs.emplace_back(zmqSvc().encode(key.second)); + msgs.emplace_back(zmqSvc().encode(type)); + + zmq::message_t msg{info.size()}; + std::copy_n(begin(info), info.size(), static_cast<char*>(msg.data())); + msgs.emplace_back(std::move(msg)); + } else { + msgs[1] = zmqSvc().encode(key.first); + msgs[2] = zmqSvc().encode(key.second); + } + // Update known keys with received info + histoKeys.emplace(key.first, key.second); + + hpr[key.first]++; + decodeHistoInfo(msgs, histograms); + } + } + + for (const auto& entry : hpr) { + cout << "Decoded " << std::right << std::setw(6) << to_string(entry.second) + << " histograms for run " << entry.first << endl; + } + + if (more && *more) { + cout << "Syncing again with " << connection + << " as there is more info." << endl; + } + return more; +} + +//=============================================================================== +bool decodeHistoInfo(const vector<zmq::message_t>& msgs, Monitoring::HistoMap& histograms) +{ + if (msgs.size() != 5) { + return false; + } + + const auto run = zmqSvc().decode<Monitoring::RunNumber>(msgs[1]); + const auto id = zmqSvc().decode<Monitoring::HistId>(msgs[2]); + const pair<Monitoring::RunNumber, Monitoring::HistId> key{run, id}; + auto type = zmqSvc().decode<std::string>(msgs[3]); + + string title; + if (!histograms.count(key)) { + HistoMap::const_iterator it; + + // New histogram, do we need to share content? + optional<HistoVariant> variant; + if (type == Monitoring::s_Rate) { + variant = zmqSvc().decode<string>(msgs[4]); + } else if (type == Monitoring::s_Histo1D) { + variant = zmqSvc().decode<Histo1DDef>(msgs[4]); + } else if (type == Monitoring::s_Histo2D) { + variant = zmqSvc().decode<Histo2DDef>(msgs[4]); + } else { + cout << "Unkown type of histogram info: " << type + << " for histogram with ID: " << run << " " << id << endl; + } + + if (variant) { + addHistogram(histograms, key, type, std::move(*variant)); + } + + return true; + } else { + return false; + } +} diff --git a/Online/Hlt2Monitoring/test/test_registrar b/Online/Hlt2Monitoring/test/test_registrar deleted file mode 100755 index d00a534b7663cd8ad2d70110594d20a5148a27bb..0000000000000000000000000000000000000000 Binary files a/Online/Hlt2Monitoring/test/test_registrar and /dev/null differ diff --git a/Online/Hlt2Monitoring/test/test_registrar.cpp b/Online/Hlt2Monitoring/test/test_registrar.cpp index 9d1289aac1fbf61d28a7701ac8ee0fd483329144..b44138ae56286c5caabb4e75180fd20aede50734 100644 --- a/Online/Hlt2Monitoring/test/test_registrar.cpp +++ b/Online/Hlt2Monitoring/test/test_registrar.cpp @@ -1,9 +1,13 @@ #include <string> #include <iostream> +#include <thread> #include <boost/optional.hpp> #include <boost/lexical_cast.hpp> +#include <boost/program_options.hpp> + #include <ZeroMQ/IZeroMQSvc.h> +#include <Hlt2Monitoring/Utilities.h> #include <zmq/zmq.hpp> @@ -15,23 +19,32 @@ namespace { using std::make_unique; using boost::lexical_cast; + + namespace po = boost::program_options; } -int main() { +string g_hostname; - IZeroMQSvc zmqSvc{}; +const IZeroMQSvc& zmqSvc() { + static std::unique_ptr<IZeroMQSvc> svc; + if (!svc) { + svc = std::make_unique<IZeroMQSvc>(); + } + return *svc; +} - std::string hostname = "hltperf-quanta01-e52630v4"; - std::string regCon = "tcp://" + hostname + ":31360"; - // std::string regCon = 
"ipc:///tmp/test_registrar"; +//=============================================================================== +boost::optional<std::pair<unsigned int, unsigned int>> +registerPublisher(std::string regCon) +{ boost::optional<std::pair<unsigned int, unsigned int>> r; - + zmq::pollitem_t items[1]; unsigned int tries = 0; - auto makeReg = [&zmqSvc, &items, &tries, ®Con] { - auto reg = make_unique<zmq::socket_t>(zmqSvc.context(), zmq::REQ); + auto makeReg = [&items, &tries, ®Con] { + auto reg = make_unique<zmq::socket_t>(zmqSvc().context(), zmq::REQ); items[0] = {reg->operator void*(), 0, zmq::POLLIN, 0}; zmq::setsockopt(*reg, zmq::LINGER, 0); try { @@ -50,9 +63,9 @@ int main() { if (!reg) break; - zmqSvc.send(*reg, "REGISTER", zmq::SNDMORE); - zmqSvc.send(*reg, hostname, zmq::SNDMORE); - zmqSvc.send(*reg, "TEST"); + zmqSvc().send(*reg, "REGISTER", zmq::SNDMORE); + zmqSvc().send(*reg, g_hostname, zmq::SNDMORE); + zmqSvc().send(*reg, "TEST"); zmq::poll(&items[0], 1, 1000); @@ -60,7 +73,7 @@ int main() { // use string here to allow easier interop with possible python // remote end. string mp, mpp; - auto msg = zmqSvc.receive<string>(*reg); + auto msg = zmqSvc().receive<string>(*reg); if (msg == "TAKEN") { cout << "A service publishing files with name " << "TEST" << " already exists, not publishing files." << endl; @@ -68,8 +81,8 @@ int main() { } try { - mp = zmqSvc.receive<string>(*reg); - mpp = zmqSvc.receive<string>(*reg); + mp = zmqSvc().receive<string>(*reg); + mpp = zmqSvc().receive<string>(*reg); r = make_pair(lexical_cast<unsigned int>(mp), lexical_cast<unsigned int>(mpp)); } catch (boost::bad_lexical_cast) { cout << "Got unexpected reply from registar: " << mp << " " << mpp << endl; @@ -82,4 +95,163 @@ int main() { if (r) { cout << r->first << " " << r->second << endl; } + return r; +} + +//=============================================================================== +void registrar(std::string regCon, std::string con) +{ + auto internal = zmqSvc().socket(zmq::PAIR); + zmq::setsockopt(internal, zmq::LINGER, 0); + internal.connect(con.c_str()); + + std::vector<zmq::pollitem_t> items(2); + items[0] = {internal, 0, zmq::POLLIN, 0}; + + auto makePing = [&items, regCon] { + auto ping = zmqSvc().socket(zmq::REQ); + zmq::setsockopt(ping, zmq::LINGER, 0); + ping.connect(regCon.c_str()); + items[1] = {ping, 0, zmq::POLLIN, 0}; + return ping; + }; + + boost::optional<zmq::socket_t> ping; + + bool sentPing = false; + while (true) { + int timeo = sentPing ? 500 : 10 * 1000; + zmq::poll(&items[0], ping ? 2 : 1, timeo); + + if (items[0].revents & zmq::POLLIN) { + auto cmd = zmqSvc().receive<std::string>(internal); + if (cmd == Monitoring::s_Terminate) { + break; + } else { + cout << "registar: received bad command message " << cmd << endl; + } + } + + if (!ping) { + // attempt to register + auto r = registerPublisher(regCon); + if (r) { + zmqSvc().send(internal, r->first); + zmqSvc().send(internal, r->second); + ping = makePing(); + } + } else if (sentPing) { + sentPing = false; + if (items[1].revents & zmq::POLLIN) { + auto msg = zmqSvc().receive<string>(*ping); + if (msg != Monitoring::s_Pong) { + cout << "Bad reply from ping to registrar: " << msg << endl; + } else { + auto app = zmqSvc().receive<string>(*ping); + cout << "Received reply to ping: " << app << endl; + } + } else { + ping.reset(); + cout << "No reply to ping." 
<< endl; + } + } else { + // send a ping message + sentPing = true; + if (!ping) { + ping = makePing(); + } + zmqSvc().send(*ping, Monitoring::s_Ping); + cout << "Sent ping" << endl; + } + } +} + +int main(int ac, char* av[]) { + + // We need this to send to the registrar we are testing, so it + // knows where to find us. + g_hostname = Monitoring::hostname(); + + string regCon; + + // Declare the supported options. + po::options_description desc("Allowed options"); + desc.add_options() + ("help", "produce help message") + ("connection,c", po::value<string>(®Con)->default_value("tcp://" + g_hostname + ":31360"), + "connection to registrar."); + + po::variables_map vm; + po::store(po::command_line_parser(ac, av). + options(desc).run(), vm); + po::notify(vm); + + if (vm.count("help")) { + cout << desc << "\n"; + return 1; + } + + string intRegCon = "inproc://registrar"; + auto reg = zmqSvc().socket(zmq::PAIR); + zmq::setsockopt(reg, zmq::LINGER, 0); + reg.bind(intRegCon.c_str()); + std::thread registrarThread{[regCon, intRegCon]{ registrar(regCon, intRegCon); }}; + + std::vector<zmq::pollitem_t> items; + items.push_back({reg, 0, zmq::POLLIN, 0}); + + boost::optional<zmq::socket_t> ping; + unsigned int pingPort = 0; + string pingCon; + + size_t n = 0; + + while (n < 10) { + zmq::poll(&items[0], items.size(), -1); + + if (items[0].revents & zmq::POLLIN) { + auto pubPort = zmqSvc().receive<unsigned int>(reg); + auto pp = zmqSvc().receive<unsigned int>(reg); + + cout << "received publisher ports: " << pubPort << " " << pp << endl; + + if (pp != pingPort) { + if (!pingCon.empty() ) { + ping->disconnect(pingCon.c_str()); + } + + pingCon = "tcp://*:" + std::to_string(pp); + + // Setup ping connection + ping = zmqSvc().socket(zmq::REP); + zmq::setsockopt(*ping, zmq::LINGER, 0); + ping->bind(pingCon.c_str()); + if (pingPort != 0) { + items[1] = {*ping, 0, zmq::POLLIN, 0}; + } else { + items.push_back({*ping, 0, zmq::POLLIN, 0}); + } + pingPort = pp; + } + } + + // Reply to pings from registration server to see if we're + // alive. + if (ping && items[1].revents & zmq::POLLIN) { + auto msg = zmqSvc().receive<string>(*ping); + cout << "Received message " << msg << " on ping socket." 
<< endl;
+      if (msg == Monitoring::s_Ping) {
+        zmqSvc().send(*ping, Monitoring::s_Pong, zmq::SNDMORE);
+        zmqSvc().send(*ping, "TEST");
+        ++n;
+      } else {
+        cout << "Received unknown message on ping socket: " << msg << endl;
+      }
+    }
+    if (n == 10) {
+      zmqSvc().send(reg, Monitoring::s_Terminate);
+    }
+  }
+
+  registrarThread.join();
 }
diff --git a/Online/Monitoring/python/Monitoring/Communicator.py b/Online/Monitoring/python/Monitoring/Communicator.py
index 63f2c0f08f24236cc2d3a4c8b5d5fa34385af7e1..0c6e4db6e44a19267bbc039644be1152341fbae2 100644
--- a/Online/Monitoring/python/Monitoring/Communicator.py
+++ b/Online/Monitoring/python/Monitoring/Communicator.py
@@ -1,6 +1,7 @@
 import os
 import pydim
 import socket
+import errno


 class State(object):
@@ -58,8 +59,16 @@ class Communicator(object):
     def status(self):
         return self.__status

+    def pipe(self):
+        return self.__process_end
+
     def get_command(self):
-        return self.__process_end.recv()
+        while True:
+            try:
+                return self.__process_end.recv()
+            except IOError as e:
+                if e.errno != errno.EINTR:
+                    raise

     def set_status(self, status):
         self.__status = status
diff --git a/Online/Monitoring/python/Monitoring/DimMonitor.py b/Online/Monitoring/python/Monitoring/DimMonitor.py
index ee94e6259825f6026390e898c1d64a0906eaf32c..13dbfe1c5470baf8d904114e00e9b4d173075aa0 100644
--- a/Online/Monitoring/python/Monitoring/DimMonitor.py
+++ b/Online/Monitoring/python/Monitoring/DimMonitor.py
@@ -113,7 +113,7 @@ class DimMonitor(object):

 class DimForwarder(object):
     def __init__(self, service, t=None, dim_dns_node="mona08",
-                 connection_path="/tmp", context=None):
+                 connection_path="/tmp", context=None, pipe=None):
         self.__dns_node = dim_dns_node
         self.__connection_path = connection_path
         if not os.path.exists(connection_path):
@@ -123,6 +123,7 @@ class DimForwarder(object):
         self.__type = t if t is not None else "C"
         self.__context = context
         self.__internal = None
+        self.__pipe = pipe

     def __callback(self, tag, val):
         # the callback is called from another thread, so we need a socket to
@@ -134,6 +135,9 @@ class DimForwarder(object):
             self.__internal.send(val)

     def __call__(self):
+        if self.__pipe:
+            self.__pipe.recv()
+
         if self.__context is None:
             self.__context = zmq.Context()

@@ -154,12 +158,14 @@ class DimForwarder(object):
         for s in (internal, control):
             poller.register(s, zmq.POLLIN)

+        if self.__pipe:
+            self.__pipe.send("ready")
+
         done = False
         while not done:
             socks = dict(poller.poll())

             if control in socks and socks[control] == zmq.POLLIN:
-                message = control.recv()
-                print "Recieved control command: %s" % message
+                message = control.recv_string()
                 if message == "TERMINATE":
                     done = True

@@ -175,6 +181,7 @@ class DimForwarder(object):
     def control_connection(self):
         return self.service_connection() + "_control"

+
 __all__ = (DimMonitor,
            DimForwarder,
            get_dns_node,
diff --git a/Online/Monitoring/python/Monitoring/DiskMonitor.py b/Online/Monitoring/python/Monitoring/DiskMonitor.py
new file mode 100644
index 0000000000000000000000000000000000000000..929aa56e94de53cb47e0c444ecad5b628aa5a48a
--- /dev/null
+++ b/Online/Monitoring/python/Monitoring/DiskMonitor.py
@@ -0,0 +1,195 @@
+import re
+import socket
+from Monitoring.DimMonitor import DimForwarder
+from Monitoring.decorators import zmq
+from Hlt2Monitoring.Utilities import node_type
+from Configurables import MonitoringJob
+from multiprocessing import Process, Pipe
+from ROOT import TH1D
+from ROOT import kOrange, kGreen, kBlue, kMagenta
+from Communicator import (Communicator, State)
+
+
+def nodes_info(val):
+    node_re = re.compile(
+ r"hlt(?P<rack>[a-f])(?P<row>[0-9]{2})(?P<node>[0-9]{2})") + info = {} + s = val.split('|') + if not s: + return info + for entry in s: + if not entry: + continue + if not entry[0].isalnum(): + break + host, numbers = entry.split(' ') + m = node_re.match(host) + if not m: + continue + n = numbers.split('/') + total, free, nd, gd = float(n[0]), float(n[1]), int(n[2]), int(n[3]) + info[(host, m.group('rack'), int(m.group('row')), + int(m.group('node')))] = (total, free, nd, gd) + return info + + +plots = {0: ("slow", kMagenta - 3), + 1: ("medium", kOrange + 5), + 2: ("fast", kBlue - 3), + 3: ("faster", kGreen - 3)} +histos = {} + + +def run(outputLevel=3, auto=True, **kwargs): + + forward_pipe, start_pipe = Pipe() + filling_svc = DimForwarder('FarmStatus/StorageStatus', + dim_dns_node="ecs03", + connection_path="/tmp/DiskMonitor", + pipe=start_pipe) + forwarder = Process(target=filling_svc) + forwarder.daemon = True + # fork must happen here, so start now + forwarder.start() + + # Start the communicator + com = Communicator('DISKMONITOR') + + mj = MonitoringJob(**kwargs) + mj.JobName = "DiskFilling" + + # FSM loop + state = State.NOT_READY + com.set_status(state) + + gaudi = None + + print '[DEBUG] DiskMonitor launched' + + poller = zmq.Poller() + command_pipe = com.pipe() + poller.register(command_pipe, zmq.POLLIN) + + # socket placeholders + input_socket = None + control = None + zmqSvc = None + + while True: + + if not (auto and state in (State.NOT_READY, State.READY)): + timeout = -1 + else: + timeout = 0 + + rep = dict(poller.poll(timeout)) + command = None + + # Handle commands + if command_pipe in rep and rep[command_pipe] == zmq.POLLIN: + command = com.get_command() + print 'Got command %s' % command + + if ((command and command.startswith('configure') or auto) + and state == State.NOT_READY): + + from Monitoring.MonitoringJob import initialize + initialize() + + for nt, (name, color) in plots.iteritems(): + histo = TH1D(name, name, 101, -0.05, 1.05) + histo.SetLineColor(color) + histo.SetLineWidth(2) + histos[nt] = histo + + state = State.READY + elif ((command and command.startswith('start') or auto) + and state == State.READY): + # Really start the forwarder process + start_pipe.send('start') + + # Start our main job + from Monitoring.MonitoringJob import start + monSvc = None + try: + _, monSvc = start() + except RuntimeError, e: + print e + state = State.ERROR + break + + zmqSvc = monSvc.zmq() + input_socket = zmqSvc.socket(zmq.SUB) + input_socket.connect(filling_svc.service_connection()) + input_socket.setsockopt(zmq.SUBSCRIBE, "") + poller.register(input_socket, zmq.POLLIN) + + # Wait for forwarder to be ready + r = forward_pipe.poll(60) + if r: + forward_pipe.recv() + else: + state = State.ERROR + break + + # Connect control connection + control = zmqSvc.socket(zmq.PAIR) + control.connect(filling_svc.control_connection()) + + state = State.RUNNING + elif (command and command.startswith('stop') + and state in (State.RUNNING, State.READY)): + if gaudi: + gaudi.stop() + state = State.READY + elif command and command.startswith('reset'): + if zmqSvc: + control.send("TERMINATE") + if forwarder.is_alive(): + forwarder.join() + + if gaudi: + gaudi.finalize() + state = State.NOT_READY + break + elif command is not None: + print ('[ERROR]: RunDB server: bad transition ' + 'from %s to %s' % (state, command)) + state = State.ERROR + break + + if (command is not None) or (auto and state in (State.NOT_READY, + State.READY, + State.RUNNING)): + # Once we've auto-started, set auto to off 
to prevent + # further auto actions, for example after a stop command + if auto and state == State.RUNNING: + auto = False + # Set the status + com.set_status(state) + + # Handle input from the forwarder + if (input_socket and input_socket in rep + and rep[input_socket] == zmq.POLLIN): + info = input_socket.recv_string() + # Reset the histograms + for histo in histos.itervalues(): + histo.Reset() + + for k, v in nodes_info(info).iteritems(): + (host, _, row, node) = k + gd = v[3] + if gd != 2: + continue + _, nt = node_type(*k[1:]) + histos[nt].Fill((v[0] - v[1]) / v[0] if v[3] else 0.) + + for histo in histos.itervalues(): + monSvc.publishHistogram("DiskFilling", histo, add=False) + + # Set our status one last time + com.set_status(state) + + +if __name__ == '__main__': + run() diff --git a/Online/Monitoring/python/Monitoring/Manager.py b/Online/Monitoring/python/Monitoring/Manager.py index e5c0a60713080a217caa0819044ca53c941dc315..51328168bbafc452c9d026d10a7df060ff472457 100644 --- a/Online/Monitoring/python/Monitoring/Manager.py +++ b/Online/Monitoring/python/Monitoring/Manager.py @@ -5,6 +5,7 @@ import pydim import shlex from multiprocessing import Lock, Condition + class State(object): UNKNOWN = "UNKNOWN" ERROR = "ERROR" @@ -13,6 +14,7 @@ class State(object): RUNNING = "RUNNING" PAUSED = "PAUSED" + class Manager(object): def __init__(self, utgid, command, env): self.state = State.UNKNOWN @@ -23,10 +25,12 @@ class Manager(object): cmd = command if type(command) == str: cmd = shlex.split(command) - self.p = subprocess.Popen(cmd, env = os_env) + self.p = subprocess.Popen(cmd, env=os_env) print self.p.pid - self.__dim_svc = pydim.dic_info_service(utgid + '/status', "C", self.callback, pydim.MONITORED, 0, 0, None) + self.__dim_svc = pydim.dic_info_service(utgid + '/status', "C", + self.callback, + pydim.MONITORED, 0, 0, None) self.condition = Condition(Lock()) def _notify(self): @@ -34,7 +38,7 @@ class Manager(object): self.condition.notify() self.condition.release() - def send_command(self, command, wait = True): + def send_command(self, command, wait=True): if wait: self.condition.acquire() pydim.dic_cmnd_service(self.utgid, (command,), "C") @@ -94,3 +98,7 @@ class Manager(object): self.state = State.NOT_READY print 'RESET' self._notify() + elif val.strip().startswith("ERROR"): + self.state = State.ERROR + print 'ERROR' + self._notify() diff --git a/Online/Monitoring/python/Monitoring/MonitoringJob.py b/Online/Monitoring/python/Monitoring/MonitoringJob.py index cf9bdeacd4093b751e498faf1e511ad0e39c34fb..9b66d3cf3d62be383fedf0e4bce65a195fb734ea 100644 --- a/Online/Monitoring/python/Monitoring/MonitoringJob.py +++ b/Online/Monitoring/python/Monitoring/MonitoringJob.py @@ -13,7 +13,7 @@ class MonitoringJob(ConfigurableUser): "DimDNSNode": "mona08", "Saver": True, "Sender": True, - "RegistrarConnection": None, + "RegistrarConnection": "tcp://hist01:31360", "HistogramDirectories": None} def connection(self, t, conType="ipc"): @@ -117,8 +117,7 @@ class MonitoringJob(ConfigurableUser): saverSvc.UseRunInfoService = (self.getProp("RunInfoType") != "None") timeOut = 1. 
if (self.getProp("RunInfoType") != "Job") else 0.2 saverSvc.RunInfoPollTimeout = timeOut - if self.isPropertySet("RegistrarConnection"): - saverSvc.RegistrarConnection = self.getProp("RegistrarConnection") + saverSvc.RegistrarConnection = self.getProp("RegistrarConnection") zmqSvcs.append(saverSvc) # Publish services, one per directory to make the histograms @@ -137,35 +136,47 @@ class MonitoringJob(ConfigurableUser): svc.RunInPartitions = [self.getProp("Partition")] -def start(): +def initialize(gaudi=None): mj = MonitoringJob() - from GaudiPython.Bindings import AppMgr, InterfaceCast, gbl + + from GaudiPython.Bindings import AppMgr from Monitoring.RunDBInfoService import RunDBInfoService from Monitoring.MonitoringSvc import MonitoringSvc - gaudi = AppMgr() + if gaudi is None: + gaudi = AppMgr() - if mj.getProp("RunInfoType") == "RunDB": - RunDBInfoService("RunDBInfoService", Connection=mj.connection("info")) + monSvc = MonitoringSvc("MonitoringSvc") - monSvc = None - if mj.getProp("Sender"): - monSvc = MonitoringSvc("MonitoringSvc") + if mj.getProp("RunInfoType") == "RunDB": + RunDBInfoService("RunDBInfoService", Connection=mj.connection("info"), + OutputLevel=1) # if we are running as part of GaudiOnline, it will fire the right # incident. - if mj.getProp("GaudiOnline"): - return gaudi, monSvc + if not mj.getProp("GaudiOnline"): + # Initialize and start Gaudi + sc = gaudi.initialize() + if not sc.isSuccess(): + raise RuntimeError("Failed to initialize Gaudi") + return monSvc - # Initialize and start Gaudi - sc = gaudi.initialize() - if not sc.isSuccess(): - raise RuntimeError("Failed to initialize Gaudi") + +def start(): + from GaudiPython.Bindings import AppMgr, gbl, InterfaceCast + gaudi = AppMgr() + + if gaudi.FSMState() < gbl.Gaudi.StateMachine.INITIALIZED: + monSvc = initialize(gaudi) + else: + # Trigger the decoration of AppMgr.service + from Monitoring import Service + monSvc = gaudi.service('MonitoringSvc') incSvc = gaudi.service("IncidentSvc").getInterface() incSvc = InterfaceCast(gbl.IIncidentSvc)(incSvc) - gaudi.start() + sc = gaudi.start() if not sc.isSuccess(): raise RuntimeError("Failed to start Gaudi") diff --git a/Online/Monitoring/python/Monitoring/MonitoringSvc.py b/Online/Monitoring/python/Monitoring/MonitoringSvc.py index a8cd98c09d4f4376891da14732e35b16639dcef9..393dada0c34dedb9001731e537a8a36e76d3ab8b 100644 --- a/Online/Monitoring/python/Monitoring/MonitoringSvc.py +++ b/Online/Monitoring/python/Monitoring/MonitoringSvc.py @@ -24,13 +24,13 @@ class MonitoringSvc(PyService): t = type(histo) if (t not in self.__types): types = [str(ht) for ht in self.__types.keys()] - raise TypeError("Unsopported type of histogram:" + raise TypeError("Unsupported type of histogram:" " %s, only %s are supported." 
% (str(type(histo)), types)) return self.__types[t] def zmq(self): - return self.__zmqSvc + return self.service(gbl.IZeroMQSvc, "ZeroMQSvc") def socket(self, t): return self.zmq().socket(t) @@ -46,7 +46,6 @@ class MonitoringSvc(PyService): if self.RunNumberFromDIM: self.__dimSvc = self.service(gbl.DimMonitorSvc, "DimMonitorSvc") - self.__zmqSvc = self.service(gbl.IZeroMQSvc, "ZeroMQSvc") return sc def start(self): @@ -68,7 +67,7 @@ class MonitoringSvc(PyService): self.__triggerSocket = self.socket(zmq.PAIR) self.__triggerSocket.setsockopt(zmq.LINGER, 0) self.__triggerSocket.connect("inproc://runNumber") - fun = gbl.DimHelper('int').dispatcher(self.__zmqSvc, + fun = gbl.DimHelper('int').dispatcher(self.zmq(), self.__triggerSocket) self.__runNumber = monitor(self.RunNumberService, fun) # Start the thread that listens for run number triggers and diff --git a/Online/Monitoring/python/Monitoring/PublicationListener.py b/Online/Monitoring/python/Monitoring/PublicationListener.py index 69d0828e91f821e1bf866e7dfc8f1d3535eb82a6..b5877a6e76a26784660f1722d3593405a8e0591b 100644 --- a/Online/Monitoring/python/Monitoring/PublicationListener.py +++ b/Online/Monitoring/python/Monitoring/PublicationListener.py @@ -66,6 +66,7 @@ class PublicationListener(PyService): publishers = [] publisher = 0 + tries = {} timeo = 0 wait_for_reply = False @@ -85,11 +86,20 @@ class PublicationListener(PyService): msg = ping.recv_string() if msg == "PONG" and ping.more(): wait_for_reply = False - msg = ping.recv('string') - self.Verbose("Got PONG from app %s at %s" % (msg, address)) - if msg != publishers[publisher][0]: - replace(context, *publishers[publisher], new=msg) + app, host, pub_port, ping_port = publishers[publisher] + remote_app = ping.recv('string') + self.Verbose("Got PONG from app %s at %s" % (remote_app, + address)) + if remote_app != app: + replace(context, app, host, pub_port, ping_port, + remote_app) + tries.pop((app, host)) + tries[(remote_app, host)] = 0 + else: + tries[(app, host)] = 0 + publisher += 1 + if publisher == len(publishers): timeo = self.__ping_freq else: @@ -108,6 +118,7 @@ class PublicationListener(PyService): # list if publisher == len(publishers): publishers = request(context) + tries = {(a, h): 0 for a, h, _, _ in publishers} self.Verbose("Received publishers %s" % publishers) # If we received a list of publishers, set the # timeout to 0 to immediately start pinging, @@ -119,20 +130,27 @@ class PublicationListener(PyService): publisher = 0 # We were waiting for a reply to a ping, but none came elif wait_for_reply: - # A publisher is offline, send it to the server thread - offline(context, *publishers[publisher]) + app, host, pub_port, ping_port = publishers[publisher] # Since no reply was received, the ping socket is # in a bad state, so make a new one ping = make_ping() wait_for_reply = False - # Remove the offline publisher from our list - publishers.pop(publisher) - # If the offline publisher was the last one, wait longer - if publisher == len(publishers): - timeo = self.__ping_freq + self.Debug("No reply to ping from %s on %s, %d tries." 
% + (app, host, tries[(app, host)])) + if tries[(app, host)] >= 2: + # A publisher is offline, send it to the server thread + offline(context, app, host, pub_port, ping_port) + # Remove the offline publisher from our list + publishers.pop(publisher) + tries.pop((app, host)) + # If the offline publisher was the last one, + # wait longer + if publisher == len(publishers): + timeo = self.__ping_freq # The poll timeout is used to wait before sending the next ping else: - _, host, _, ping_port = publishers[publisher] + app, host, _, ping_port = publishers[publisher] + tries[(app, host)] += 1 address = "tcp://%s:%d" % (host, ping_port) self.Verbose("Sending PING to %s" % address) ping.connect(address) diff --git a/Online/Monitoring/python/Monitoring/PublicationRegistrar.py b/Online/Monitoring/python/Monitoring/PublicationRegistrar.py index ce870360a946f645e3aea20ea787204053dcb5cb..ff0015f8375eb77ce2e21791a90357b4cd9d8b53 100644 --- a/Online/Monitoring/python/Monitoring/PublicationRegistrar.py +++ b/Online/Monitoring/python/Monitoring/PublicationRegistrar.py @@ -204,6 +204,8 @@ class PublicationRegistrar(PublicationListener): external.send_multipart(["AVAILABLE"] + info) else: external.send("UNKNOWN") + elif msg == "PING": + external.send_multipart(["PONG", "Registrar"]) # Message from ping thread, either request for list of # publishers or message that one of them is offline. diff --git a/Online/Monitoring/python/Monitoring/RunDBInfoServer.py b/Online/Monitoring/python/Monitoring/RunDBInfoServer.py index 24c5704537992b143d7350a23e5ee6c1e3f78442..5a55f488646bb13240b52831d732a7c869ed238d 100644 --- a/Online/Monitoring/python/Monitoring/RunDBInfoServer.py +++ b/Online/Monitoring/python/Monitoring/RunDBInfoServer.py @@ -2,7 +2,7 @@ import os from Communicator import (Communicator, State) -def run(connection="ipc:///tmp/test"): +def run(connection="ipc:///tmp/test", outputLevel=3): # Start the communicator: com = Communicator('RUNDBINFOSERV') @@ -31,7 +31,8 @@ def run(connection="ipc:///tmp/test"): from RunDBInfoService import RunDBInfoService gaudi = AppMgr() RunDBInfoService("RunDBInfoService", - Connection=connection) + Connection=connection, + OutputLevel=outputLevel) gaudi.initialize() state = State.READY diff --git a/Online/Monitoring/python/Monitoring/RunDBInfoService.py b/Online/Monitoring/python/Monitoring/RunDBInfoService.py index 9b9b3456fe7ba8679cbc26e05f2e8b7a959faf4a..9c04ffe1eee01363aa87e00998671eadc5559664 100644 --- a/Online/Monitoring/python/Monitoring/RunDBInfoService.py +++ b/Online/Monitoring/python/Monitoring/RunDBInfoService.py @@ -48,8 +48,16 @@ class RunDBInfoService(PyService): def function(self): from Monitoring.decorators import zmq self.Info("Started RunDB info thread.") - socket = self.zmq().socket(zmq.REP) - socket.bind(self.Connection) + + def makeSocket(): + socket = self.zmq().socket(zmq.REP) + socket.setsockopt(zmq.LINGER, 0) + socket.setsockopt(zmq.RCVTIMEO, 2000) + socket.setsockopt(zmq.SNDTIMEO, 500) + socket.bind(self.Connection) + return socket + + socket = makeSocket() poller = zmq.Poller() poller.register(socket, zmq.POLLIN) @@ -64,20 +72,41 @@ class RunDBInfoService(PyService): self.Verbose("No run info requests in the last 10 seconds.") continue - msg = socket.recv_string() + msg = None + try: + msg = socket.recv_string() + except Exception: + self.Warning("Failed to receive message, recreating socket.") + poller.unregister(socket) + socket = makeSocket() + poller.register(socket, zmq.POLLIN) + continue if msg == "TERMINATE": socket.send("") break run = 
None + application = None if msg != Monitoring.s_RunInfo: self.Warning('Unknown info request: %s' % msg) while socket.more(): socket.recv_message() + poller.unregister(socket) + socket = makeSocket() + poller.register(socket, zmq.POLLIN) + continue + + try: + run = socket.recv(long) + application = socket.recv_string() + except Exception: + self.Warning( + "Failed to receive run and application, recreating socket.") + poller.unregister(socket) + socket = makeSocket() + poller.register(socket, zmq.POLLIN) continue - run = socket.recv(long) - application = socket.recv_string() # See if we got an integer as request, if not reply with INVALID # We got a run number, check if we already know its info, if not, @@ -96,12 +125,25 @@ class RunDBInfoService(PyService): run_info.deadtime = ri.get('avPhysDeadTime', -1) infos[run] = run_info except (RequestError, TypeError): - socket.send_string("UNKNOWN") - continue + try: + socket.send_string("UNKNOWN") + except Exception: + self.Warning( + "Failed to send UNKNOWN, recreating socket.") + poller.unregister(socket) + socket = makeSocket() + poller.register(socket, zmq.POLLIN) + continue else: run_info = infos[run] # Get the deadtime and reply with it. - socket.send_multipart([Monitoring.s_Known, run_info]) + try: + socket.send_multipart([Monitoring.s_Known, run_info]) + except Exception: + self.Warning("Failed to send reply, recreating socket.") + poller.unregister(socket) + socket = makeSocket() + poller.register(socket, zmq.POLLIN) return SUCCESS diff --git a/Online/Monitoring/python/Monitoring/Service.py b/Online/Monitoring/python/Monitoring/Service.py index 162796fa7249777315aa6f231f69fa95c0fa1a2e..97424dc76793873d859510b49feef77c98d5aa80 100644 --- a/Online/Monitoring/python/Monitoring/Service.py +++ b/Online/Monitoring/python/Monitoring/Service.py @@ -1,3 +1,4 @@ +from functools import wraps from GaudiPython.Bindings import iService, AppMgr, InterfaceCast, Helper from GaudiPython.GaudiAlgs import _hasProperty_ import cppyy @@ -7,6 +8,36 @@ gInterpreter = gbl.gInterpreter gInterpreter.Declare('#include <Monitoring/PyService.h>') +def static_vars(**kwargs): + """ Add an attribute to the function that can be used as a static + variable. """ + def decorate(func): + for k in kwargs: + setattr(func, k, kwargs[k]) + return func + return decorate + + +def _service_wrapper(func): + """ + Wraper the AppMgr.service call to return GaudiPython-based + instances of services if they exist. + """ + @wraps(func) + @static_vars(orig=func) + def wrapper(self, name, interface=None): + if ('GaudiPythonServices' in self.__dict__ + and name in self.__dict__['GaudiPythonServices']): + return self.__dict__['GaudiPythonServices'][name] + else: + return wrapper.orig(self, name, interface) + + return wrapper + + +AppMgr.service = _service_wrapper(AppMgr.service) + + def _init_(self, name, **args): """ The constructor from a unique service instance name & parameters diff --git a/Online/Monitoring/python/Monitoring/TestUtilities.py b/Online/Monitoring/python/Monitoring/TestUtilities.py index 38f8bd7cc3f7db0db8dba672d8e1b365f933e4c7..37ebbce025982049fa5c2f66c6c81e83ee7e8cdd 100644 --- a/Online/Monitoring/python/Monitoring/TestUtilities.py +++ b/Online/Monitoring/python/Monitoring/TestUtilities.py @@ -138,10 +138,14 @@ class Publisher(object): print 'Got unknown control message:', msg if not sockets: - if n < len(self.__files): + if self.__files and n < len(self.__files): filename = self.__files[n] + elif self.__files: + print 'Publishing done.' 
+ break else: filename = "/tmp/test_%d" % n + print 'Publishing %s' % filename publish.send(filename) n += 1 @@ -152,9 +156,10 @@ class Publisher(object): class Registrar(object): - def __init__(self, reg_con="ipc:///tmp/registrar"): + def __init__(self, reg_con="ipc:///tmp/registrar", **kwargs): self.__reg_con = reg_con self.__reg = None + self.__args = kwargs def reg_con(self): return self.__reg_con @@ -164,8 +169,8 @@ class Registrar(object): from PublicationRegistrar import PublicationRegistrar self.__reg = PublicationRegistrar("PublicationRegistrar", StartPort=31361, - OutputLevel=2, - RegistrarConnection=self.reg_con()) + RegistrarConnection=self.reg_con(), + **self.__args) self.__reg.initialize() self.__reg.start() diff --git a/Online/Monitoring/scripts/DiskMonitor.py b/Online/Monitoring/scripts/DiskMonitor.py deleted file mode 100644 index 16c5e59561a8adfa6beebb68aa1c5863c98b6bd5..0000000000000000000000000000000000000000 --- a/Online/Monitoring/scripts/DiskMonitor.py +++ /dev/null @@ -1,99 +0,0 @@ -import atexit -import re -import socket -from Monitoring.DimMonitor import DimForwarder -from Monitoring.decorators import zmq, gbl -from Hlt2Monitoring.Utilities import node_type -from Configurables import MonitoringJob -from multiprocessing import Process -from ROOT import TH1D -from ROOT import kOrange, kGreen, kBlue, kMagenta - -filling_svc = DimForwarder('FarmStatus/StorageStatus', dim_dns_node="ecs03", - connection_path="/tmp/DiskMonitor") -p = Process(target=filling_svc) -p.start() -atexit.register(p.terminate) - -mj = MonitoringJob() -mj.JobName = "DiskFilling" -mj.SavePath = "/tmp/histograms" -mj.RegistrarConnection = "tcp://%s:31360" % socket.gethostname() -# Run the DIM monitoring service in another service to allow the DIM DNS node -# to be ecs03 while we need mona08 to publish to the presenter - -# Start our main job -from Monitoring.MonitoringJob import start -gaudi, monSvc = start() - - -def nodes_info(val): - node_re = re.compile( - r"hlt(?P<rack>[a-f])(?P<row>[0-9]{2})(?P<node>[0-9]{2})") - info = {} - s = val.split('|') - if not s: - return info - for entry in s: - if not entry: - continue - if not entry[0].isalnum(): - break - host, numbers = entry.split(' ') - m = node_re.match(host) - if not m: - continue - n = numbers.split('/') - total, free, nd, gd = float(n[0]), float(n[1]), int(n[2]), int(n[3]) - info[(host, m.group('rack'), int(m.group('row')), - int(m.group('node')))] = (total, free, nd, gd) - return info - - -plots = {0: ("slow", kMagenta - 3), - 1: ("medium", kOrange + 5), - 2: ("fast", kBlue - 3), - 3: ("faster", kGreen - 3)} - -histos = {} -for nt, (name, color) in plots.iteritems(): - histo = TH1D(name, name, 101, -0.05, 1.05) - histo.SetLineColor(color) - histo.SetLineWidth(2) - histos[nt] = histo - -zmqSvc = monSvc.zmq() -input_socket = zmqSvc.socket(zmq.SUB) -input_socket.connect(filling_svc.service_connection()) -input_socket.setsockopt(zmq.SUBSCRIBE, "") - -control = zmqSvc.socket(zmq.PAIR) -control.connect(filling_svc.service_connection()) - -while True: - info = None - try: - info = input_socket.recv_string() - except KeyboardInterrupt: - break - - # Reset the histograms - for histo in histos.itervalues(): - histo.Reset() - - for k, v in nodes_info(info).iteritems(): - rd = {k: i + 1 for i, k in enumerate('abcdef')} - (host, _, row, node) = k - gd = v[3] - if gd != 2: - continue - _, nt = node_type(*k[1:]) - histos[nt].Fill((v[0] - v[1]) / v[0] if v[3] else 0.) 
- - for histo in histos.itervalues(): - monSvc.publishHistogram("DiskFilling", histo, add=False) - -zmqSvc.send(control, "TERMINATE") -p.join() -gaudi.finalize() -gaudi.exit() diff --git a/Online/Monitoring/scripts/registrar.py b/Online/Monitoring/scripts/registrar.py index 9e37ce4f4bb18e4fa3e2c5ba02422b5d08fbac9d..0d0b7ce495f5d37381d802af3778493e7c1ad138 100644 --- a/Online/Monitoring/scripts/registrar.py +++ b/Online/Monitoring/scripts/registrar.py @@ -15,12 +15,7 @@ context = zmqSvc.context() hostname = socket.gethostname() reg_con = "tcp://*:31360" - -def ctrl_con(suffix): - return "inproc://control_%s" % suffix - - -registrar = Registrar(reg_con) +registrar = Registrar(reg_con, OutputLevel=2) print "Start registrar" registrar.start() diff --git a/Online/Monitoring/scripts/registrar_request.py b/Online/Monitoring/scripts/registrar_request.py new file mode 100644 index 0000000000000000000000000000000000000000..cae55455d9ec26902cb4863a705cc98996705d08 --- /dev/null +++ b/Online/Monitoring/scripts/registrar_request.py @@ -0,0 +1,51 @@ +import pprint +import socket +import argparse +from itertools import izip_longest +from Monitoring.decorators import zmq +from GaudiPython.Bindings import AppMgr, InterfaceCast, gbl + + +parser = argparse.ArgumentParser(usage='usage: %(prog)s app') +parser.add_argument("app", type=str, nargs='?', default="") +parser.add_argument("-c", "--connection", type=str, dest="con", + default="tcp://hist01:31360") +args = parser.parse_args() + + +gaudi = AppMgr() +zmqSvc = gaudi.createSvc("ZeroMQSvc") +zmqSvc = InterfaceCast(gbl.IZeroMQSvc)(zmqSvc) +gaudi.initialize() +gaudi.start() + +context = zmqSvc.context() + +hostname = socket.gethostname() + +req = context.socket(zmq.REQ) +req.connect(args.con) + + +def request(app): + msg = ["REQUEST"] + if app: + msg += [app] + req.send_multipart(msg) + msg = req.recv_string() + if msg == "AVAILABLE": + rest = [] + while req.more(): + rest.append(req.recv_string()) + return rest + else: + return [] + + +def grouper(iterable, n, fillvalue=None): + args = [iter(iterable)] * n + return izip_longest(*args, fillvalue=fillvalue) + + +for app in grouper(request(args.app), 4): + print ' '.join(app) diff --git a/Online/Monitoring/scripts/test_disk_monitor.py b/Online/Monitoring/scripts/test_disk_monitor.py new file mode 100644 index 0000000000000000000000000000000000000000..b8d04092818eff0bb1d948ac03cadc9944542a10 --- /dev/null +++ b/Online/Monitoring/scripts/test_disk_monitor.py @@ -0,0 +1,22 @@ +import os +import atexit +from Hlt2Monitoring.Manager import Manager + +utgid = 'TEST_DISKMONITOR_00' + +cmd = """ +import socket +from Monitoring import DiskMonitor +DiskMonitor.run(SavePath='/tmp/histograms', + RegistrarConnection='tcp://%s:31360' % socket.gethostname(), + DimDNSNode='mona08') +""" + +os.environ['DIM_DNS_NODE'] = 'mona08' +env = {'LC_ALL': 'C', 'UTGID': utgid, 'PARTITION': 'LHCb', + 'PARTITION_NAME': 'LHCb'} + +command = 'python -c "%s"' % cmd +manager = Manager(utgid, command, env) + +atexit.register(manager.terminate) diff --git a/Online/Monitoring/scripts/test_poller.py b/Online/Monitoring/scripts/test_poller.py new file mode 100644 index 0000000000000000000000000000000000000000..fb58f400ce73e00a95fbe2c9e5675f03677b1d81 --- /dev/null +++ b/Online/Monitoring/scripts/test_poller.py @@ -0,0 +1,92 @@ +import random +from time import sleep +from multiprocessing import Process, Pipe +from ZeroMQ.decorators import zmq +from GaudiPython.Bindings import AppMgr, InterfaceCast, gbl + +gaudi = AppMgr() +zmqSvc = 
gaudi.createSvc("ZeroMQSvc") +zmqSvc = InterfaceCast(gbl.IZeroMQSvc)(zmqSvc) +gaudi.initialize() + + +def zmq_send(): + gaudi.start() + context = zmqSvc.context() + + sockets = [] + for i in range(1, 3): + s = context.socket(zmq.PAIR) + s.connect('ipc:///tmp/test%d' % i) + sockets.append(s) + + for i in range(5): + for s in sockets: + s.send(str(i)) + sleep(1) + gaudi.stop() + gaudi.finalize() + gaudi.exit() + + +zmq_p = Process(target=zmq_send) +zmq_p.start() + +gaudi.start() +context = zmqSvc.context() + + +def pipe_send(p, s): + random.seed(s) + for i in range(3): + p.send(i) + sleep(random.randint(1, 4)) + + +processes = {} +for i in range(3): + parent_con, child_con = Pipe() + p = Process(target=pipe_send, args=(child_con, i)) + processes[i] = (p, parent_con) + + +sockets = [] +for i in range(1, 3): + s = context.socket(zmq.PAIR) + s.bind('ipc:///tmp/test%d' % i) + sockets.append(s) + +poller = zmq.Poller() +for socket in sockets: + poller.register(socket, zmq.POLLIN) + +for i, (p, pipe) in processes.iteritems(): + if i < 3: + poller.register(pipe, zmq.POLLIN) + else: + poller.register(pipe.fileno(), zmq.POLLIN) + p.start() + +n = 0 +while n < 10: + rep = dict(poller.poll()) + for s in sockets: + if s in rep and rep[s] == zmq.POLLIN: + print 'from socket', s.recv_string() + for _, s in processes.itervalues(): + if s in rep and rep[s] == zmq.POLLIN: + print 'from pipe ', s.recv() + n += 1 + + +for socket in sockets: + poller.unregister(socket) + +for i, (p, pipe) in processes.iteritems(): + if i < 3: + poller.unregister(pipe) + else: + poller.unregister(pipe.fileno()) + p.join() + +zmq_p.join() diff --git a/Online/Monitoring/scripts/test_poller_rep.py b/Online/Monitoring/scripts/test_poller_rep.py deleted file mode 100644 index cda179e6d3b31ccf62d475124ec9f7986a0dbe42..0000000000000000000000000000000000000000 --- a/Online/Monitoring/scripts/test_poller_rep.py +++ /dev/null @@ -1,24 +0,0 @@ -from ZeroMQ.decorators import zmq - - -from GaudiPython.Bindings import AppMgr, InterfaceCast, gbl - -gaudi = AppMgr() -zmqSvc = gaudi.createSvc("ZeroMQSvc") -zmqSvc = InterfaceCast(gbl.IZeroMQSvc)(zmqSvc) -gaudi.initialize() -gaudi.start() - -context = zmqSvc.context() - -sockets = [] -for i in range(1, 3): - s = context.socket(zmq.PAIR) - s.bind('ipc:///tmp/test%d' % i) - sockets.append(s) - -poller = zmq.Poller() -for socket in sockets: - poller.register(socket, zmq.POLLIN) - -rep = dict(poller.poll()) diff --git a/Online/Monitoring/scripts/test_poller_req.py b/Online/Monitoring/scripts/test_poller_req.py deleted file mode 100644 index 718687f1966e628a38edcf7dfe6ad38ea0648ec5..0000000000000000000000000000000000000000 --- a/Online/Monitoring/scripts/test_poller_req.py +++ /dev/null @@ -1,16 +0,0 @@ -import zmq -import time - - -context = zmq.Context() -sockets = [] -for i in range(1, 3): - s = context.socket(zmq.PAIR) - s.connect('ipc:///tmp/test%d' % i) - sockets.append(s) - -for i in range(5): - print i - for s in sockets: - s.send(str(i)) - time.sleep(1) diff --git a/Online/Monitoring/scripts/test_rundb.py b/Online/Monitoring/scripts/test_rundb.py index 4e90539b30012f5f17cdd43356c0f07b79c33dcc..e01298e91eb58ac458f91c79d6053539f9e1a072 100644 --- a/Online/Monitoring/scripts/test_rundb.py +++ b/Online/Monitoring/scripts/test_rundb.py @@ -1,3 +1,4 @@ +import time from Configurables import ApplicationMgr ApplicationMgr().ExtSvc += ['ZeroMQSvc'] @@ -11,8 +12,8 @@ gaudi.initialize() gaudi.start() context = zmqSvc.context() -run = 177444 -connection = 'ipc:///tmp/testRunDB' +runs = [177444 + i for i 
in range(10)] +connection = 'ipc:///run/HLT2/testRunDB' req = context.socket(zmq.REQ) req.connect(connection) @@ -20,15 +21,17 @@ req.connect(connection) poller = zmq.Poller() poller.register(req, zmq.POLLIN) -req.send_multipart(["RUNINFO", long(run), "TEST"]) +for run in runs: + req.send_multipart(["RUNINFO", long(run), "TEST"]) -sockets = dict(poller.poll(2000)) -if req in sockets and sockets[req] == zmq.POLLIN: - msg = req.recv_string() - if msg == "KNOWN": - run_info = req.recv(gbl.Monitoring.RunInfo) - print 'Got run info for run %d: %f' % (run, run_info.deadtime) + sockets = dict(poller.poll(2000)) + if req in sockets and sockets[req] == zmq.POLLIN: + msg = req.recv_string() + if msg == "KNOWN": + run_info = req.recv(gbl.Monitoring.RunInfo) + print 'Got run info for run %d: %f' % (run, run_info.deadtime) + else: + print msg else: - print msg -else: - print 'No reply from run DB server on %s' % connection + print 'No reply from run DB server on %s' % connection + time.sleep(2) diff --git a/Online/Monitoring/scripts/test_rundb_info_serv.py b/Online/Monitoring/scripts/test_rundb_info_serv.py index e2825b211ff1cbe779458a8b64d7ec3299c45732..ff779725c81ae46cc0c2f01374976d2b06d5e29f 100644 --- a/Online/Monitoring/scripts/test_rundb_info_serv.py +++ b/Online/Monitoring/scripts/test_rundb_info_serv.py @@ -4,13 +4,13 @@ from Monitoring.Manager import Manager utgid = 'TEST_RUNDBINFOSERV_00' -connection = 'ipc:///tmp/test' -os.environ['DIM_DNS_NODE'] = 'localhost' +connection = 'ipc:///run/HLT2/testRunDB' +os.environ['DIM_DNS_NODE'] = 'hlt01' cmd = """import GaudiKernel.ProcessJobOptions from Gaudi.Configuration import importOptions GaudiKernel.ProcessJobOptions.printing_level=3 from Monitoring import RunDBInfoServer -RunDBInfoServer.run('%(connection)s') +RunDBInfoServer.run('%(connection)s', 2) """ % {'connection': connection} env = {'LC_ALL': 'C', 'UTGID': utgid, 'PARTITION': 'LHCb2', diff --git a/Online/ZeroMQ/ZeroMQ/ZeroMQPoller.h b/Online/ZeroMQ/ZeroMQ/ZeroMQPoller.h index 465dcd919c5cb32066526ef977a48df7a49ba687..44aaf2d010152d56e8a103f35d05e74ddc25f827 100644 --- a/Online/ZeroMQ/ZeroMQ/ZeroMQPoller.h +++ b/Online/ZeroMQ/ZeroMQ/ZeroMQPoller.h @@ -12,28 +12,51 @@ class ZeroMQPoller { public: - using entry_t = std::tuple<const zmq::socket_t*, size_t, zmq::PollType>; + using entry_t = std::tuple<size_t, zmq::PollType, const zmq::socket_t*>; // The key is what zmq::socket_t stores inside, and what goes into // pollitem_t through zmq::socket_t's conversion to void* operator using sockets_t = std::unordered_map<void*, entry_t>; + + using fd_entry_t = std::tuple<size_t, zmq::PollType>; + using fds_t = std::unordered_map<int, fd_entry_t>; + using free_t = std::deque<int>; ZeroMQPoller() = default; - // ZeroMQPoller(sockets_t sockets) - // : m_sockets{std::move(sockets)} {} - std::vector<std::pair<size_t, int>> poll(int timeo = -1) { std::vector<std::pair<size_t, int>> r; - if (m_sockets.empty()) throw std::runtime_error("No sockets registered"); - zmq::poll(&m_items[0], m_items.size(), timeo); + if (m_items.empty()) { + throw std::runtime_error("No sockets registered"); + } + while (true) { + try { + auto n = zmq::poll(&m_items[0], m_items.size(), timeo); + if (n == 0) return r; + break; + } catch (const zmq::error_t& e) { + if (e.num() != EINTR) { + std::cerr << e.what() << std::endl; + throw; + } + } + } // TODO: replace this with ranges::v3::zip for (size_t i = 0; i < m_items.size(); ++i) { - auto socket = static_cast<zmq::socket_t*>(m_items[i].socket); - auto entry = m_sockets[socket]; - if 
(m_items[i].revents & short(std::get<2>(entry))) { - r.emplace_back(std::get<1>(entry), std::get<2>(entry)); + void* socket = m_items[i].socket; + size_t index = 0; + int flags = 0; + if (socket == nullptr) { + // an fd was registered + std::tie(index, flags) = m_fds[m_items[i].fd]; + } else { + // a socket was registered + const zmq::socket_t* s; + std::tie(index, flags, s) = m_sockets[socket]; + } + if (m_items[i].revents & short(flags)) { + r.emplace_back(index, flags); } } return r; @@ -41,7 +64,7 @@ public: size_t size() const { - return m_sockets.size(); + return m_items.size(); } size_t register_socket(zmq::socket_t& socket, zmq::PollType type) @@ -49,19 +72,37 @@ public: zmq::socket_t* s = &socket; auto it = m_sockets.find(s); if (it != m_sockets.end()) { - return std::get<1>(it->second); + return std::get<0>(it->second); } - size_t index = m_free.empty() ? m_sockets.size() : m_free.front(); + size_t index = m_free.empty() ? m_items.size() : m_free.front(); if (!m_free.empty()) m_free.pop_front(); // NOTE: tis uses the conversion-to-void* operator of // zmq::socket_t, which returns the wrapped object m_items.push_back({socket, 0, type, 0}); // We need to lookup by the pointer to the object wrapped by zmq::socket_t - m_sockets.emplace(m_items.back().socket, std::make_tuple(s, index, type)); + m_sockets.emplace(m_items.back().socket, std::make_tuple(index, type, s)); + return index; + } + + size_t register_socket(int fd, zmq::PollType type) + { + auto it = m_fds.find(fd); + if (it != m_fds.end()) { + return std::get<0>(it->second); + } + size_t index = m_free.empty() ? m_items.size() : m_free.front(); + if (!m_free.empty()) m_free.pop_front(); + // NOTE: tis uses the conversion-to-void* operator of + // zmq::socket_t, which returns the wrapped object + m_items.push_back({nullptr, fd, type, 0}); + + // We need to lookup by the pointer to the object wrapped by zmq::socket_t + m_fds.emplace(fd, std::make_tuple(index, type)); return index; } + size_t unregister_socket(zmq::socket_t& socket) { if (!m_sockets.count(socket.operator void*())) { @@ -73,9 +114,9 @@ public: // (zmq::socket_t) auto it = std::find_if(begin(m_sockets), end(m_sockets), [&socket](const decltype(m_sockets)::value_type& entry) { - return &socket == std::get<0>(entry.second); + return &socket == std::get<2>(entry.second); }); - auto index = std::get<1>(it->second); + auto index = std::get<0>(it->second); m_free.push_back(index); m_sockets.erase(it); @@ -89,12 +130,37 @@ public: return index; } + size_t unregister_socket(int fd) + { + if (!m_fds.count(fd)) { + throw std::out_of_range("fileno is not registered"); + } + // Remove from m_fds + auto it = m_fds.find(fd); + auto index = std::get<0>(it->second); + m_free.push_back(index); + m_fds.erase(it); + + // Remove from m_items + auto iit = std::find_if(begin(m_items), end(m_items), [&it](const zmq::pollitem_t& item) { + return it->first == item.fd; + }); + assert(iit != end(m_items)); + m_items.erase(iit); + + return index; + } + + private: // Vector of (socket, flags) - free_t m_free; std::vector<zmq::pollitem_t> m_items; sockets_t m_sockets; + fds_t m_fds; + + // free slots in items + free_t m_free; }; #endif // ZEROMQPOLLER_H diff --git a/Online/ZeroMQ/python/ZeroMQ/decorators.py b/Online/ZeroMQ/python/ZeroMQ/decorators.py index ed11fdeae7f7a60890fa182b18cb99eeac45b1e9..136d8368a945e51609ab6b699edfc5a31cb65341 100644 --- a/Online/ZeroMQ/python/ZeroMQ/decorators.py +++ b/Online/ZeroMQ/python/ZeroMQ/decorators.py @@ -1,11 +1,15 @@ +import os import cppyy import pickle 
+import errno +from functools import wraps gbl = cppyy.gbl gbl.ROOT.EnableThreadSafety() gInterpreter = gbl.gInterpreter gInterpreter.Load('libzmq.so') +gInterpreter.Load('libZMQ.so') gInterpreter.Declare('#include <ZeroMQ/IZeroMQSvc.h>') gInterpreter.Declare('#include <ZeroMQ/ZeroMQHelper.h>') gInterpreter.Declare('#include <ZeroMQ/ZeroMQPoller.h>') @@ -15,9 +19,12 @@ gInterpreter.Declare('#include <ZeroMQ/functions.h>') IZeroMQSvc = cppyy.gbl.IZeroMQSvc Helper = cppyy.gbl.ZeroMQHelper TObject = cppyy.gbl.TObject +zmq = cppyy.gbl.zmq def static_vars(**kwargs): + """ Add an attribute to the function that can be used as a static + variable. """ def decorate(func): for k in kwargs: setattr(func, k, kwargs[k]) @@ -25,6 +32,19 @@ def static_vars(**kwargs): return decorate +def protect_interrupt(func): + """ Protect agains interrupted systemcall exception by trying again.""" + @wraps(func) + def wrapper(*args, **kwargs): + while True: + try: + return func(*args, **kwargs) + except IOError, e: + if e.errno != errno.EINTR: + raise + return wrapper + + @static_vars(svc=None) def _get_svc(): if _get_svc.svc is None: @@ -51,6 +71,7 @@ def _recv(self, T): # This method may block, so inform TMethodProxy that the GIL should be # released recv._threaded = True + recv = protect_interrupt(recv) r = recv(_get_svc(), self) self._more = r.second return r.first @@ -104,7 +125,7 @@ def _recv_string(self): def _recv_message(self): - return _recv(self, cppyy.gbl.zmq.message_t) + return _recv(self, zmq.message_t) def _socket_init(self, context, t): @@ -125,15 +146,40 @@ def _init_(self): def _register(self, socket, flags): - i = self.register_socket(socket, flags) + i = None + if isinstance(socket, zmq.socket_t): + i = self.register_socket(socket, flags) + elif hasattr(socket, 'fileno'): + i = self.register_socket(socket.fileno(), flags) + elif type(socket) == int: + try: + os.fstat(socket) + i = self.register_socket(socket, flags) + except OSError: + pass + if i is None: + raise TypeError("Socket must be either zmq::socket_t, have a" + " fileno function that returns a valid file" + "descriptor, or be a valid file descriptor.") self._sockets[int(i)] = socket def _unregister(self, socket): - i = self.unregister_socket(socket) + i = None + if isinstance(socket, zmq.socket_t): + i = self.unregister_socket(socket) + elif hasattr(socket, 'fileno'): + i = self.unregister_socket(socket.fileno()) + elif type(socket) == int: + i = self.unregister_socket(socket) + else: + raise TypeError("Socket must be either zmq::socket_t, have a" + " fileno function that returns a valid file" + "descriptor, or be a valid file descriptor.") self._sockets.pop(int(i)) +@protect_interrupt def _poll(self, timeout=-1): sockets = self._cpp_poll_(timeout) return {self._sockets[int(s.first)]: s.second for s in sockets} @@ -148,11 +194,6 @@ Poller.poll = _poll Poller.register = _register Poller.unregister = _unregister -# Convenience functions to for setsockopt -# Library load seems to be necessary... -cppyy.gbl.gSystem.Load('libZMQ.so') - -zmq = cppyy.gbl.zmq zmq.Poller = Poller # Decorate socket_t
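
The EINTR handling introduced throughout this patch (the retry loop in ZeroMQPoller::poll, Communicator.get_command, and the protect_interrupt decorator above) is the same retry-on-interrupted-system-call idiom each time. Below is a minimal standalone sketch of that idiom using only the Python standard library; the names retry_on_eintr and read_fd are illustrative and do not exist in the patch.

import errno
import os
from functools import wraps


def retry_on_eintr(func):
    """Keep retrying a call that fails with EINTR (interrupted system call)."""
    @wraps(func)
    def wrapper(*args, **kwargs):
        while True:
            try:
                return func(*args, **kwargs)
            except (IOError, OSError) as e:
                # Only interrupted system calls are retried; anything else is
                # re-raised to the caller.
                if e.errno != errno.EINTR:
                    raise
    return wrapper


@retry_on_eintr
def read_fd(fd, n):
    return os.read(fd, n)


if __name__ == '__main__':
    r, w = os.pipe()
    os.write(w, b'ping')
    print(read_fd(r, 4))

On Python 3.5 and later, PEP 475 retries most interrupted system calls automatically; the explicit loop is still needed for the Python 2 interpreter these modules run under, which is why both Communicator.get_command and protect_interrupt re-raise only when errno is not EINTR.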