From 0033a625ee3385919ab344ad7dba5f3642e280a8 Mon Sep 17 00:00:00 2001 From: amete <serhanmete@gmail.com> Date: Fri, 18 Dec 2020 12:10:09 +0100 Subject: [PATCH 1/2] Various updates to PerfMonMTSvc --- .../PerfMonComps/src/PerfMonMTSvc.cxx | 191 +++++++++++++----- .../PerfMonComps/src/PerfMonMTSvc.h | 10 +- .../PerfMonComps/src/PerfMonMTUtils.h | 2 + 3 files changed, 146 insertions(+), 57 deletions(-) diff --git a/Control/PerformanceMonitoring/PerfMonComps/src/PerfMonMTSvc.cxx b/Control/PerformanceMonitoring/PerfMonComps/src/PerfMonMTSvc.cxx index bd14c1b9e88c..02a76cc66af6 100644 --- a/Control/PerformanceMonitoring/PerfMonComps/src/PerfMonMTSvc.cxx +++ b/Control/PerformanceMonitoring/PerfMonComps/src/PerfMonMTSvc.cxx @@ -13,10 +13,15 @@ // PerfMonComps includes #include "PerfMonMTSvc.h" #include "PerfMonUtils.h" // borrow from existing code +#include "SemiDetMisc.h" // borrow from existing code // STD includes #include <algorithm> +// Boost includes +#include "boost/format.hpp" +#include "boost/filesystem.hpp" + /* * Constructor */ @@ -137,7 +142,7 @@ void PerfMonMTSvc::startAud(const std::string& stepName, const std::string& comp if (!m_doComponentLevelMonitoring) return; // Start component auditing - auto ctx = Gaudi::Hive::currentContext(); + auto ctx = Gaudi::Hive::currentContext(); startCompAud(stepName, compName, ctx); } } @@ -155,7 +160,7 @@ void PerfMonMTSvc::stopAud(const std::string& stepName, const std::string& compN if (!m_doComponentLevelMonitoring) return; // Stop component auditing - auto ctx = Gaudi::Hive::currentContext(); + auto ctx = Gaudi::Hive::currentContext(); stopCompAud(stepName, compName, ctx); } } @@ -207,8 +212,8 @@ void PerfMonMTSvc::startCompAud(const std::string& stepName, const std::string& // Generate State PMonMT::StepComp currentState = generate_state(stepName, compName); - // Check if this is the first time calling if so create the mesurement data if not use the existing one. - // Metrics are collected per slot then aggregated before reporting + // Check if this is the first time calling if so create the mesurement data if not use the existing one. + // Metrics are collected per slot then aggregated before reporting data_map_t& compLevelDataMap = m_compLevelDataMapVec[ctx.valid() ? ctx.slot() : 0]; if(compLevelDataMap.find(currentState) == compLevelDataMap.end()) { compLevelDataMap[currentState] = new PMonMT::MeasurementData(); @@ -312,6 +317,33 @@ bool PerfMonMTSvc::isPower(uint64_t input, uint64_t base) { return (input == 1); } +/* + * Helper finction to estimate CPU efficiency + */ +int PerfMonMTSvc::getCpuEfficiency() const { + + // In AthenaMT only the event-loop is executed concurrently + // In this metric, we scale the event-loop wall-time by + // the number of slots to take the concurrency into account + // Then we divide the total cpu-time by this number + // It's A metric not THE metric... + + const float totalCpuTime = + m_snapshotData[CONFIGURE].getDeltaCPU() + + m_snapshotData[INITIALIZE].getDeltaCPU() + + m_snapshotData[EXECUTE].getDeltaCPU() + + m_snapshotData[FINALIZE].getDeltaCPU(); + + const float scaledWallTime = + m_snapshotData[CONFIGURE].getDeltaWall() * 1. + + m_snapshotData[INITIALIZE].getDeltaWall() * 1. + + m_snapshotData[EXECUTE].getDeltaWall() * m_numberOfSlots + + m_snapshotData[FINALIZE].getDeltaWall() * 1.; + + return ( scaledWallTime > 0 ? totalCpuTime / scaledWallTime * 100. : 0 ); + +} + /* * Report the results to the log and the JSON file */ @@ -345,6 +377,7 @@ void PerfMonMTSvc::report2Log() { // Summary and system information report2Log_Summary(); report2Log_CpuInfo(); + report2Log_EnvInfo(); } /* @@ -377,7 +410,7 @@ void PerfMonMTSvc::report2Log_ComponentLevel() { ATH_MSG_INFO(" Component Level Monitoring "); ATH_MSG_INFO("======================================================================================="); - ATH_MSG_INFO(format("%1% %|15t|%2% %|25t|%3% %|40t|%4% %|55t|%5% %|75t|%6%") % "Step" % "Count" % "CPU Time [ms]" % + ATH_MSG_INFO(format("%1% %|15t|%2% %|25t|%3% %|40t|%4% %|55t|%5% %|75t|%6%") % "Step" % "Count" % "CPU Time [ms]" % "Vmem [kB]" % "Malloc [kB]" % "Component"); ATH_MSG_INFO("---------------------------------------------------------------------------------------"); @@ -408,7 +441,7 @@ void PerfMonMTSvc::report2Log_ComponentLevel() { it.second->getCallCount() % it.second->getDeltaCPU() % it.second->getDeltaVmem() % it.second->getDeltaMalloc() % it.first.compName); } - if(counter>0) { + if(counter>0) { ATH_MSG_INFO("======================================================================================="); } } @@ -426,8 +459,8 @@ void PerfMonMTSvc::report2Log_EventLevel_instant() const { long pss = m_eventLevelData.getEventLevelPss(m_eventCounter); long swap = m_eventLevelData.getEventLevelSwap(m_eventCounter); - ATH_MSG_INFO("Event [" << std::setw(5) << m_eventCounter << "] CPU Time: " << scaleTime(cpu_time) << - ", Wall Time: " << scaleTime(wall_time) << ", Vmem: " << scaleMem(vmem) << + ATH_MSG_INFO("Event [" << std::setw(5) << m_eventCounter << "] CPU Time: " << scaleTime(cpu_time) << + ", Wall Time: " << scaleTime(wall_time) << ", Vmem: " << scaleMem(vmem) << ", Rss: " << scaleMem(rss) << ", Pss: " << scaleMem(pss) << ", Swap: " << scaleMem(swap)); } @@ -438,8 +471,8 @@ void PerfMonMTSvc::report2Log_EventLevel() { using boost::format; ATH_MSG_INFO(" Event Level Monitoring "); - ATH_MSG_INFO(" (Only first " << m_eventLoopMsgLimit.toString() << - " measurements are explicitly printed)"); + ATH_MSG_INFO(" (Only the first " << m_eventLoopMsgLimit.toString() << + " and the last measurements are explicitly printed)"); ATH_MSG_INFO("======================================================================================="); ATH_MSG_INFO(format("%1% %|16t|%2% %|28t|%3% %|40t|%4% %|52t|%5% %|64t|%6% %|76t|%7%") % "Event" % "CPU [s]" % @@ -448,14 +481,19 @@ void PerfMonMTSvc::report2Log_EventLevel() { ATH_MSG_INFO("---------------------------------------------------------------------------------------"); m_eventLoopMsgCounter = 0; // reset counter + unsigned long nMeasurements = m_eventLevelData.getNMeasurements(); for (const auto& it : m_eventLevelData.getEventLevelData()) { - if(m_eventLoopMsgCounter < m_eventLoopMsgLimit) { + // Print + if(m_eventLoopMsgCounter < m_eventLoopMsgLimit || m_eventLoopMsgCounter == nMeasurements - 1) { + if(m_eventLoopMsgCounter > m_eventLoopMsgLimit) { + ATH_MSG_INFO(format("%|=87|") % "..."); + } ATH_MSG_INFO(format("%1% %|16t|%2$.2f %|28t|%3$.2f %|40t|%4% %|52t|%5% %|64t|%6% %|76t|%7%") % it.first % (it.second.cpu_time * 0.001) % (it.second.wall_time * 0.001) % it.second.mem_stats.at("vmem") % it.second.mem_stats.at("rss") % it.second.mem_stats.at("pss") % it.second.mem_stats.at("swap")); - m_eventLoopMsgCounter++; } + m_eventLoopMsgCounter++; // Add to leak estimate if (it.first >= 25) { m_fit_vmem.addPoint(it.first, it.second.mem_stats.at("vmem")); @@ -495,6 +533,7 @@ void PerfMonMTSvc::report2Log_Summary() { (m_snapshotData[EXECUTE].getDeltaCPU() / m_eventCounter)); ATH_MSG_INFO(format("%1% %|35t|%2$.3f ") % "Events per second:" % (m_eventCounter / m_snapshotData[EXECUTE].getDeltaWall() * 1000.)); + ATH_MSG_INFO(format("%1% %|35t|%2% ") % "CPU utilization efficiency [%]:" % getCpuEfficiency()); if (m_doEventLoopMonitoring) { ATH_MSG_INFO("***************************************************************************************"); @@ -523,10 +562,28 @@ void PerfMonMTSvc::report2Log_CpuInfo() const { ATH_MSG_INFO(format("%1% %|34t|%2% ") % "CPU Model:" % get_cpu_model_info()); ATH_MSG_INFO(format("%1% %|35t|%2% ") % "Number of Available Cores:" % get_cpu_core_info()); + ATH_MSG_INFO(format("%1% %|35t|%2% ") % "Total Memory:" % scaleMem(get_memory_info())); ATH_MSG_INFO("======================================================================================="); } +/* + * Report run-time enviroment information + */ +void PerfMonMTSvc::report2Log_EnvInfo() const { + using boost::format; + using boost::filesystem::path; + + ATH_MSG_INFO(" Environment Information "); + ATH_MSG_INFO("======================================================================================="); + + ATH_MSG_INFO(format("%1% %|35t|%2% ") % "Malloc Library:" % path(PMonSD::symb2lib("malloc")).filename().string()); + ATH_MSG_INFO(format("%1% %|35t|%2% ") % "Math Library:" % path(PMonSD::symb2lib("atan2")).filename().string()); + + ATH_MSG_INFO("======================================================================================="); + +} + /* * Report data to JSON */ @@ -569,7 +626,7 @@ void PerfMonMTSvc::report2JsonFile() { * Report summary data to JSON */ void PerfMonMTSvc::report2JsonFile_Summary(nlohmann::json& j) const { - + // Report snapshot level results for(int i=0; i < NSNAPSHOTS; i++){ @@ -617,29 +674,43 @@ void PerfMonMTSvc::report2JsonFile_Summary(nlohmann::json& j) const { // Report Sys info const std::string cpuModel = get_cpu_model_info(); const int coreNum = get_cpu_core_info(); + const long totMem = get_memory_info(); j["summary"]["sysInfo"] = {{"cpuModel", cpuModel}, - {"coreNum", coreNum}}; + {"coreNum", coreNum}, + {"totMem", totMem}}; + + // Report Enviroment info + const std::string mallocLib = boost::filesystem::path(PMonSD::symb2lib("malloc")).filename().string(); + const std::string mathLib = boost::filesystem::path(PMonSD::symb2lib("atan2")).filename().string(); + + j["summary"]["envInfo"] = {{"mallocLib", mallocLib}, + {"mathLib", mathLib}}; + + // Report CPU utilization efficiency; + const int cpuUtilEff = getCpuEfficiency(); + j["summary"]["misc"] = {{"cpuUtilEff", cpuUtilEff}}; + } void PerfMonMTSvc::report2JsonFile_ComponentLevel(nlohmann::json& j) const { for (const auto& dataMapPerStep : m_stdoutVec_serial) { - + for(const auto& meas : dataMapPerStep){ const std::string step = meas.first.stepName; - const std::string component = meas.first.compName; + const std::string component = meas.first.compName; const uint64_t count = meas.second->getCallCount(); const double cpuTime = meas.second->getDeltaCPU(); - const long vmem = meas.second->getDeltaVmem(); + const long vmem = meas.second->getDeltaVmem(); const int mall = meas.second->getDeltaMalloc(); j["componentLevel"][step][component] = {{"count", count}, {"cpuTime", cpuTime}, {"vmem", vmem}, {"malloc", mall}}; - } + } } @@ -648,7 +719,7 @@ void PerfMonMTSvc::report2JsonFile_ComponentLevel(nlohmann::json& j) const { void PerfMonMTSvc::report2JsonFile_EventLevel(nlohmann::json& j) const { for (const auto& it : m_eventLevelData.getEventLevelData()) { - + const uint64_t event = it.first; const double cpuTime = it.second.cpu_time; const double wallTime = it.second.wall_time; @@ -724,7 +795,7 @@ void PerfMonMTSvc::divideData2Steps() { std::string PerfMonMTSvc::scaleTime(double timeMeas) const { // Not a huge fan of this, we should eventually unify the types // Just want to be explicit about what's happening - long ms = (long) timeMeas; + long ms = (long) timeMeas; // Compute hrs and offset auto hrs = ms / 3600000; @@ -739,13 +810,18 @@ std::string PerfMonMTSvc::scaleTime(double timeMeas) const { // Primarily care about H:M:S std::stringstream ss; ss.fill('0'); - ss << std::setw(2) << hrs << "h" << - std::setw(2) << mins << "m" << + ss << std::setw(2) << hrs << "h" << + std::setw(2) << mins << "m" << std::setw(2) << secs << "s"; return ss.str(); } std::string PerfMonMTSvc::scaleMem(long memMeas) const { + // The memory measurements should be positive + // Only delta(A,B) can go negative but this method + // is not used for those cases, at least for now + if (memMeas<0) return "NA"; + std::ostringstream ss; ss << std::fixed; ss << std::setprecision(2); @@ -789,43 +865,48 @@ std::string PerfMonMTSvc::scaleMem(long memMeas) const { /* * Collect some hardware information */ -std::string PerfMonMTSvc::get_cpu_model_info() const { - std::string cpu_model; - - std::ifstream file("/proc/cpuinfo"); - std::string line; - if (file.is_open()) { - std::string delimiter = ":"; - while (getline(file, line)) { - std::string key = line.substr(0, line.find(delimiter)); - if (key == "model name ") { - cpu_model = line.substr(line.find(delimiter) + 1, line.length()); - break; +std::string PerfMonMTSvc::get_info_from_file(const std::string& fileName, + const std::string& fieldName) const { + // Helper function to read files of type Key : Value + // Returns the last instance if there are multiple matches + // This is because we use this method to get the processor count + std::string result{""}; + + std::ifstream file{fileName}; + std::string line{""}; + + while (std::getline(file, line)) { + if (line.empty()) continue; + size_t splitIdx = line.find(":"); + if (splitIdx != std::string::npos) { + std::string val = line.substr(splitIdx + 1); + if (val.empty()) continue; + if (line.size() >= fieldName.size() && + line.compare(0, fieldName.size(), fieldName) == 0) { + result = val; } } - file.close(); - return cpu_model; - } else { - return "Unable to open /proc/cpuinfo"; } + + file.close(); + + return result; +} + +std::string PerfMonMTSvc::get_cpu_model_info() const { + return get_info_from_file("/proc/cpuinfo","model name") + + get_info_from_file("/proc/cpuinfo","cache size"); } int PerfMonMTSvc::get_cpu_core_info() const { - int logical_core_num = 0; - - std::ifstream file("/proc/cpuinfo"); - std::string line; - if (file.is_open()) { - std::string delimiter = ":"; - while (getline(file, line)) { - std::string key = line.substr(0, line.find(delimiter)); - if (key == "processor ") { - logical_core_num++; - } - } - file.close(); - return logical_core_num; - } else { - return -1; - } + std::string val = get_info_from_file("/proc/cpuinfo","processor"); + if (val.empty()) return 0; + return std::stoi(val) + 1; +} + +long PerfMonMTSvc::get_memory_info() const { + std::string val = get_info_from_file("/proc/meminfo","MemTotal"); + if (val.empty()) return 0; + val = val.substr(0, val.size() - 3); // strip the trailing kB + return std::stol(val); } diff --git a/Control/PerformanceMonitoring/PerfMonComps/src/PerfMonMTSvc.h b/Control/PerformanceMonitoring/PerfMonComps/src/PerfMonMTSvc.h index d825d8a6a1b2..b432a7484918 100644 --- a/Control/PerformanceMonitoring/PerfMonComps/src/PerfMonMTSvc.h +++ b/Control/PerformanceMonitoring/PerfMonComps/src/PerfMonMTSvc.h @@ -28,8 +28,6 @@ #include <iomanip> #include <nlohmann/json.hpp> -#include "boost/format.hpp" - // Other Libraries #include <algorithm> #include <cmath> @@ -86,6 +84,7 @@ class PerfMonMTSvc : virtual public IPerfMonMTSvc, virtual public IIncidentListe void report2Log_EventLevel(); void report2Log_Summary(); // make it const void report2Log_CpuInfo() const; + void report2Log_EnvInfo() const; /// Report to the JSON File void report2JsonFile(); @@ -104,8 +103,12 @@ class PerfMonMTSvc : virtual public IPerfMonMTSvc, virtual public IIncidentListe bool isCheckPoint(); + /// A few helper methods to get system information + /// These should be carried to PerfMonMTUtils at some point + std::string get_info_from_file(const std::string& fileName, const std::string& fieldName) const; std::string get_cpu_model_info() const; int get_cpu_core_info() const; + long get_memory_info() const; PMonMT::StepComp generate_state(const std::string& stepName, const std::string& compName) const; @@ -196,6 +199,9 @@ class PerfMonMTSvc : virtual public IPerfMonMTSvc, virtual public IIncidentListe PerfMon::LinFitSglPass m_fit_vmem; PerfMon::LinFitSglPass m_fit_pss; + // Estimate CPU efficiency + int getCpuEfficiency() const; + }; // class PerfMonMTSvc #endif // PERFMONCOMPS_PERFMONMTSVC_H diff --git a/Control/PerformanceMonitoring/PerfMonComps/src/PerfMonMTUtils.h b/Control/PerformanceMonitoring/PerfMonComps/src/PerfMonMTUtils.h index 2b592779bd74..26b0cdb49c07 100644 --- a/Control/PerformanceMonitoring/PerfMonComps/src/PerfMonMTUtils.h +++ b/Control/PerformanceMonitoring/PerfMonComps/src/PerfMonMTUtils.h @@ -191,6 +191,8 @@ struct MeasurementData { event_meas_map_t getEventLevelData() const { return m_eventLevel_delta_map; } + unsigned long getNMeasurements() const { return m_eventLevel_delta_map.size(); } + double getEventLevelCpuTime(unsigned long long event_count) const { return m_eventLevel_delta_map.at(event_count).cpu_time; } -- GitLab From b00cab62a1d01fe2779ea0e5dc39d92927c97da1 Mon Sep 17 00:00:00 2001 From: amete <serhanmete@gmail.com> Date: Fri, 18 Dec 2020 12:24:29 +0100 Subject: [PATCH 2/2] Fix typo in message --- Control/PerformanceMonitoring/PerfMonComps/src/PerfMonMTSvc.cxx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Control/PerformanceMonitoring/PerfMonComps/src/PerfMonMTSvc.cxx b/Control/PerformanceMonitoring/PerfMonComps/src/PerfMonMTSvc.cxx index 02a76cc66af6..59cafdc5ecef 100644 --- a/Control/PerformanceMonitoring/PerfMonComps/src/PerfMonMTSvc.cxx +++ b/Control/PerformanceMonitoring/PerfMonComps/src/PerfMonMTSvc.cxx @@ -509,7 +509,7 @@ void PerfMonMTSvc::report2Log_EventLevel() { void PerfMonMTSvc::report2Log_Summary() { using boost::format; - ATH_MSG_INFO(" Snaphots Summary "); + ATH_MSG_INFO(" Snapshots Summary "); ATH_MSG_INFO("======================================================================================="); ATH_MSG_INFO(format("%1% %|13t|%2% %|25t|%3% %|37t|%4% %|44t|%5% %|55t|%6% %|66t|%7% %|77t|%8%") % "Step" % -- GitLab