Commit ccd4ff19 authored by Luciano Orsini, committed by Dainius Simelevicius
Browse files

references #202: added diagnostics in hyperdaq page and fault tolerance code in sentinel probe

parent ac74a345
......@@ -71,13 +71,15 @@ namespace sentinel
// HyperDAQ entry page; bound under the name "Default" in the constructor.
void Default (xgi::Input * in, xgi::Output * out) ;
// Bound under the name "view" in the constructor.
void view (xgi::Input * in, xgi::Output * out) ;
// Bound under the name "detail"; writes a JSON description of the exception
// stored in the diagnostics table for the "name" query parameter.
void detail (xgi::Input * in, xgi::Output * out) ;
// Bound under the name "inject"; marked as a test function in the constructor.
void inject (xgi::Input * in, xgi::Output * out) ;
// Emits the tabbed layout; the visible part calls ExceptionsTabPage() and DiagnosticsTabPage().
void TabPanel (xgi::Output * out);
void ExceptionsTabPage (xgi::Output * out);
// Renders the outgoing/loss counters as table rows.
void StatisticsTabPage (xgi::Output * out);
// Renders one table row per entry of exceptionsDiagnosticTable_.
void DiagnosticsTabPage (xgi::Output * out);
// Publishes exception e on the output bus; call sites pass "notify" or "revoke" as command.
void publishEvent (const std::string & command, xcept::Exception& e) ;
// Same as above, additionally keyed by name: the per-name diagnostic counters are updated under this key.
void publishEvent (const std::string & name, const std::string & command, xcept::Exception& e) ;
bool handleException (xcept::Exception& ex, void* context);
......@@ -100,7 +102,10 @@ namespace sentinel
//! queue for revokes to send out
std::list<xcept::Exception> revokes_;
xdata::UnsignedInteger64 outgoingLossCounter_;
xdata::UnsignedInteger64 outgoingReadinessLossCounter_;
xdata::UnsignedInteger64 outgoingAllocationLossCounter_;
xdata::UnsignedInteger64 outgoingNetworkLossCounter_;
xdata::UnsignedInteger64 outgoingSerializationLossCounter_;
xdata::UnsignedInteger64 outgoingCounter_;
toolbox::mem::Pool* pool_;
......@@ -110,6 +115,20 @@ namespace sentinel
xdata::String outputBus_;
xdata::Boolean loggingLocalExceptions_;
// diagnostics
// Per-name bookkeeping of what happened to an exception/alarm on its way to
// the output bus. One entry per name is kept in exceptionsDiagnosticTable_
// and rendered by DiagnosticsTabPage().
//
// NOTE: entries are created with positional aggregate initialization
// ({0,0,0,0,0,0,0,0,e}) in publishEvent(), so the member order must not be
// changed and no default member initializers should be added without
// updating that call site.
struct ExceptionDiagnostic {
	size_t scanned;           // number of publishEvent() calls seen for this name
	size_t fired;             // calls whose type was not "revoke"
	size_t revoked;           // calls whose type was "revoke"
	size_t published;         // reports successfully handed to the eventing bus
	size_t readinessLoss;     // reports dropped because the bus could not publish or could not be accessed
	size_t serializationLoss; // reports dropped because serializing the exception failed
	size_t networkingLoss;    // reports dropped because the publish call itself failed
	size_t allocationLoss;    // reports dropped because the message buffer could not be allocated
	xcept::Exception e;       // exception captured when the table entry was created
};
std::map<std::string, ExceptionDiagnostic> exceptionsDiagnosticTable_;
};
}
......
......@@ -15,10 +15,10 @@
// !!! Edit this line to reflect the latest package version !!!
#define WORKSUITE_SENTINELPROBE_VERSION_MAJOR 2
#define WORKSUITE_SENTINELPROBE_VERSION_MINOR 0
#define WORKSUITE_SENTINELPROBE_VERSION_PATCH 1
#define WORKSUITE_SENTINELPROBE_VERSION_MINOR 5
#define WORKSUITE_SENTINELPROBE_VERSION_PATCH 0
// If any previous versions available E.g. #define WORKSUITE_SENTINELPROBE_PREVIOUS_VERSIONS "3.8.0,3.8.1"
#define WORKSUITE_SENTINELPROBE_PREVIOUS_VERSIONS "2.0.0"
#define WORKSUITE_SENTINELPROBE_PREVIOUS_VERSIONS "2.0.0,2.0.1,2.1.0,2.2.0,2.3.0,2.4.0"
//
......
......@@ -9,6 +9,8 @@
* For the licensing terms see LICENSE. *
* For the list of contributors see CREDITS. *
*************************************************************************/
#include <iostream> // std::cout, std::endl
#include <iomanip> // std::setbase
#include "sentinel/probe/Application.h"
#include "sentinel/utils/Serializer.h"
......@@ -75,7 +77,10 @@ sentinel::probe::Application::Application (xdaq::ApplicationStub * s)
maxExceptionMessageSize_ = 0x10000; // 64KB
outgoingCounter_ = 0;
outgoingLossCounter_ = 0;
outgoingReadinessLossCounter_ = 0;
outgoingAllocationLossCounter_ = 0;
outgoingNetworkLossCounter_ = 0;
outgoingSerializationLossCounter_ = 0;
// General configuration parameters
......@@ -92,6 +97,7 @@ sentinel::probe::Application::Application (xdaq::ApplicationStub * s)
xgi::framework::deferredbind(this, this, &sentinel::probe::Application::Default, "Default");
//xgi::framework::deferredbind(this, this, &sentinel::probe::Application::view, "view");
xgi::bind(this, &sentinel::probe::Application::view, "view");
xgi::bind(this, &sentinel::probe::Application::detail, "detail");
// test function
xgi::bind(this, &sentinel::probe::Application::inject, "inject");
......@@ -239,10 +245,11 @@ void sentinel::probe::Application::actionPerformed (xdata::Event& e)
try
{
this->publishEvent("notify", alarm->getException());
this->publishEvent(available.itemName(),"notify", alarm->getException());
}
catch (sentinel::probe::exception::Exception& e)
{
LOG4CPLUS_ERROR(this->getApplicationLogger(), xcept::stdformat_exception_history(e));
return;
}
......@@ -264,12 +271,13 @@ void sentinel::probe::Application::actionPerformed (xdata::Event& e)
try
{
this->publishEvent("revoke", alarm->getException());
this->publishEvent(revoke.itemName(),"revoke", alarm->getException());
return;
}
catch (sentinel::probe::exception::Exception& e)
{
// fall through
LOG4CPLUS_ERROR(this->getApplicationLogger(), xcept::stdformat_exception_history(e));
}
repositoryLock_.take();
......@@ -291,18 +299,40 @@ void sentinel::probe::Application::actionPerformed (xdata::Event& e)
}
}
void sentinel::probe::Application::publishEvent (const std::string & type, xcept::Exception& e)
void sentinel::probe::Application::publishEvent (const std::string & name, const std::string & type, xcept::Exception& e)
{
LOG4CPLUS_DEBUG(this->getApplicationLogger(), "detail begin " << xcept::stdformat_exception_history(e) << " end of detail");
if ( exceptionsDiagnosticTable_.find(name) == exceptionsDiagnosticTable_.end() )
{
// initialize diagnostic table entry
exceptionsDiagnosticTable_[name] = {0,0,0,0,0,0,0,0,e};
}
if ( type == "revoke" )
{
exceptionsDiagnosticTable_[name].revoked++;
}
else
{
exceptionsDiagnosticTable_[name].fired++;
}
exceptionsDiagnosticTable_[name].scanned++;
try
{
if (! this->getEventingBus(outputBus_.toString()).canPublish())
{
outgoingLossCounter_++;
outgoingReadinessLossCounter_++;
exceptionsDiagnosticTable_[name].readinessLoss++;
return;
}
}
catch (eventing::api::exception::Exception & e)
{
outgoingReadinessLossCounter_++;
exceptionsDiagnosticTable_[name].readinessLoss++;
XCEPT_RETHROW(sentinel::probe::exception::Exception, "Failed to access output bus: " + outputBus_.toString(), e);
}
......@@ -318,7 +348,8 @@ void sentinel::probe::Application::publishEvent (const std::string & type, xcept
}
catch (toolbox::mem::exception::Exception & ex)
{
outgoingLossCounter_++;
outgoingAllocationLossCounter_++;
exceptionsDiagnosticTable_[name].allocationLoss++;
XCEPT_RETHROW(sentinel::probe::exception::Exception, "Failed to allocate message for monitor report", e);
}
......@@ -328,22 +359,31 @@ void sentinel::probe::Application::publishEvent (const std::string & type, xcept
xdaq::XceptSerializer::writeTo(e, &outBuffer);
ref->setDataSize(outBuffer.tellp());
}
catch (xdata::exception::Exception & e)
catch (xdata::exception::Exception & ex)
{
outgoingLossCounter_++;
XCEPT_RETHROW(sentinel::probe::exception::Exception, "Failed to serialize exception", e);
outgoingSerializationLossCounter_++;
exceptionsDiagnosticTable_[name].serializationLoss++;
ref->release();
std::stringstream ss;
ss << "Failed to serialize exception " << e.name() << " of type (" << type << ")";
XCEPT_RETHROW(sentinel::probe::exception::Exception, ss.str(), ex);
}
try
{
this->getEventingBus(outputBus_.toString()).publish("urn:xdaq-exception:any", ref, plist);
LOG4CPLUS_DEBUG(this->getApplicationLogger(), "published " << plist.getProperty("urn:sentinel-exception:identifier") << " with severity " << e.getProperty("severity"));
exceptionsDiagnosticTable_[name].published++;
outgoingCounter_++;
}
catch(eventing::api::exception::Exception & e)
{
LOG4CPLUS_ERROR(this->getApplicationLogger(), "failed to send heartbeat to heartbeatd: " << xcept::stdformat_exception_history(e));
outgoingLossCounter_++;
outgoingNetworkLossCounter_++;
exceptionsDiagnosticTable_[name].networkingLoss++;
ref->release();
XCEPT_RETHROW(sentinel::probe::exception::Exception, "Failed to send exception notification", e);
}
......@@ -373,7 +413,7 @@ void sentinel::probe::Application::processPendingEvents ()
std::stringstream o;
o << item.second;
item.first.setProperty("occurrences", o.str());
this->publishEvent("notify", item.first);
this->publishEvent(item.first.getProperty("identifier"),"notify", item.first);
exceptions_.pop_front();
}
}
......@@ -389,17 +429,22 @@ void sentinel::probe::Application::processPendingEvents ()
alarms_->lock();
std::map<std::string, xdata::Serializable *, std::less<std::string> > alarms = alarms_->match(".*");
LOG4CPLUS_DEBUG(this->getApplicationLogger(), "found " << alarms.size() << " alarms in infospace");
for (std::map<std::string, xdata::Serializable *, std::less<std::string> >::iterator i = alarms.begin(); i != alarms.end(); i++)
{
LOG4CPLUS_DEBUG(this->getApplicationLogger(), "going to publish alarm " << (*i).first << " alarms in infospace") ;
sentinel::utils::Alarm* alarm = dynamic_cast<sentinel::utils::Alarm*>((*i).second);
if (alarm != 0)
{
try
{
this->publishEvent("notify", alarm->getException());
this->publishEvent((*i).first , "notify", alarm->getException());
}
catch (sentinel::probe::exception::Exception& e)
{
LOG4CPLUS_ERROR(this->getApplicationLogger(), xcept::stdformat_exception_history(e));
alarms_->unlock();
return;
}
......@@ -421,12 +466,13 @@ void sentinel::probe::Application::processPendingEvents ()
{
xcept::Exception& item = revokes_.front();
item.setProperty("occurrences", "1");
this->publishEvent("revoke", item);
this->publishEvent(item.getProperty("identifier"),"revoke", item);
revokes_.pop_front();
}
}
catch (sentinel::probe::exception::Exception& e)
{
LOG4CPLUS_ERROR(this->getApplicationLogger(), xcept::stdformat_exception_history(e));
repositoryLock_.give();
return;
}
......@@ -550,7 +596,73 @@ void sentinel::probe::Application::TabPanel (xgi::Output * out)
this->ExceptionsTabPage(out);
*out << "</div>";
*out << "<div class=\"xdaq-tab\" title=\"Diagnostics\">" << std::endl;
this->DiagnosticsTabPage(out);
*out << "</div>";
*out << "</div>";
}
size_t scanned;
size_t published;
size_t serializationLoss;
size_t networkingLoss;
size_t allocationLoss;
/**
 * Render the "Diagnostics" tab: one table row per entry of
 * exceptionsDiagnosticTable_, showing the per-name counters and a link that
 * opens the "detail" callback for that name in a pop-up window.
 *
 * @param out HTML output stream of the current request.
 */
void sentinel::probe::Application::DiagnosticsTabPage (xgi::Output * out)
{
	const std::string headerStyle = "vertical-align: top; font-weight: bold;";
	const char* const columnTitles[] = {
		"Name",
		"Scanned",
		"Fired",
		"Revoked",
		"Published",
		"Readiness Loss",
		"Serialization Failure",
		"Networking Failure",
		"Allocation Failure",
		"Last Occurred"
	};
	const size_t columnCount = sizeof(columnTitles) / sizeof(columnTitles[0]);

	*out << cgicc::table().set("class","xdaq-table").set("style", "width: 100%;");
	*out << cgicc::caption("Exceptions Diagnostics");

	*out << cgicc::thead();
	*out << cgicc::tr();
	for (size_t c = 0; c < columnCount; ++c)
	{
		*out << cgicc::th(columnTitles[c]).set("style", headerStyle);
	}
	*out << cgicc::tr() << std::endl;
	*out << cgicc::thead();

	*out << cgicc::tbody();
	for (auto i = exceptionsDiagnosticTable_.begin(); i != exceptionsDiagnosticTable_.end(); ++i)
	{
		const std::string& name = i->first;
		const ExceptionDiagnostic& diag = i->second;

		// NOTE(review): name is placed into the URL and the HTML verbatim; names containing
		// characters such as '&', '#', '<' or spaces would need encoding here.
		const std::string viewlink = "/" + this->getApplicationDescriptor()->getURN() + "/detail?name=" + name;

		*out << cgicc::tr();
		*out << cgicc::td(name) << std::endl;

		// Counters are size_t, so they must be formatted with %zu; %d reads an int
		// from the variadic argument list, which is undefined for size_t values.
		const size_t counters[] = {
			diag.scanned,
			diag.fired,
			diag.revoked,
			diag.published,
			diag.readinessLoss,
			diag.serializationLoss,
			diag.networkingLoss,
			diag.allocationLoss
		};
		const size_t counterCount = sizeof(counters) / sizeof(counters[0]);
		for (size_t c = 0; c < counterCount; ++c)
		{
			*out << cgicc::td(toolbox::toString("%zu", counters[c])) << std::endl;
		}

		// Icon cell: clicking opens the detail view in a separate window.
		*out << cgicc::td();
		std::stringstream clickHandler;
		clickHandler << "window.open('" << viewlink << "', 'Sentinel Probe - Exception Viewer', 'toolbar=no, scrollbars=yes, resizable=yes, width=900, height=900'); return false;";
		*out << cgicc::a().set("href", viewlink).set("onClick", clickHandler.str());
		*out << cgicc::img().set("src", "/sentinel/images/alert.gif").set("height", "16").set("width", "16").set("border", "0");
		*out << cgicc::a();
		*out << cgicc::td();

		// NOTE(review): this cell has no matching header column and prints the raw link;
		// kept for backward compatibility, it looks like a debugging aid.
		*out << cgicc::td(viewlink) << std::endl;
		*out << cgicc::tr() << std::endl;
	}
	*out << cgicc::tbody();
	*out << cgicc::table();
}
void sentinel::probe::Application::ExceptionsTabPage (xgi::Output * out)
......@@ -640,13 +752,42 @@ void sentinel::probe::Application::StatisticsTabPage (xgi::Output * out)
//
*out << cgicc::tr();
*out << cgicc::th();
*out << "Total reports lost";
*out << "Total reports lost (publish/subscribe bus readiness)";
*out << cgicc::th();
*out << cgicc::td();
*out << outgoingLossCounter_.toString();
*out << outgoingReadinessLossCounter_.toString();
*out << cgicc::td();
*out << cgicc::tr() << std::endl;
*out << cgicc::tr();
*out << cgicc::th();
*out << "Total reports lost (memory allocation problem)";
*out << cgicc::th();
*out << cgicc::td();
*out << outgoingAllocationLossCounter_.toString();
*out << cgicc::td();
*out << cgicc::tr() << std::endl;
*out << cgicc::tr();
*out << cgicc::th();
*out << "Total reports lost (networking problem)";
*out << cgicc::th();
*out << cgicc::td();
*out << outgoingNetworkLossCounter_.toString();
*out << cgicc::td();
*out << cgicc::tr() << std::endl;
*out << cgicc::tr();
*out << cgicc::th();
*out << "Total reports lost (serialization problem)";
*out << cgicc::th();
*out << cgicc::td();
*out << outgoingSerializationLossCounter_.toString();
*out << cgicc::td();
*out << cgicc::tr() << std::endl;
*out << cgicc::tbody() << std::endl;
*out << cgicc::table() << std::endl;
}
......@@ -690,3 +831,110 @@ void sentinel::probe::Application::view (xgi::Input * in, xgi::Output * out)
}
}
/**
 * Percent-encode a string.
 *
 * ASCII letters, digits and the punctuation characters ; / ? : @ & = + $ , - _ . ! ~ * ' ( )
 * are copied unchanged. Every other byte is written as '%' followed by exactly
 * two lowercase hexadecimal digits.
 *
 * Each byte is treated as unsigned, so bytes >= 0x80 (e.g. UTF-8 sequences)
 * encode as "%e9" rather than a sign-extended "%ffffffe9" on platforms where
 * char is signed.
 *
 * @param s input string, may be empty and may contain embedded NUL bytes
 * @return the encoded copy of s
 */
std::string escape(const std::string& s)
{
	static const char hexDigits[] = "0123456789abcdef";
	// Punctuation that is passed through without encoding.
	static const std::string passThrough = ";/?:@&=+$,-_.!~*'()";

	std::string encoded;
	encoded.reserve(s.length());

	for (const char c : s)
	{
		const unsigned char byte = static_cast<unsigned char>(c);
		// Explicit ranges instead of std::isalnum to stay locale-independent.
		const bool alphanumeric = (byte >= 'a' && byte <= 'z')
			|| (byte >= 'A' && byte <= 'Z')
			|| (byte >= '0' && byte <= '9');

		if (alphanumeric || passThrough.find(c) != std::string::npos)
		{
			encoded += c;
		}
		else
		{
			encoded += '%';
			encoded += hexDigits[byte >> 4];
			encoded += hexDigits[byte & 0x0F];
		}
	}
	return encoded;
}
/**
 * HTTP callback that writes a JSON description of the exception stored in the
 * diagnostics table under the "name" query parameter.
 *
 * The exception history is walked from rbegin() to rend(); every element
 * becomes an object of the form
 *   {"label":"<identifier>","properties":[{"name":..,"value":..},...],"children":[{ <next element> }]}
 * so the output is a chain of nested single-child objects.
 *
 * @param in  incoming request; must carry a "name" parameter
 * @param out output stream receiving the JSON text
 * @throws xgi::exception::Exception if the "name" parameter is missing, if no
 *         entry exists for it, or if any std::exception is raised while
 *         building the response
 */
void sentinel::probe::Application::detail (xgi::Input * in, xgi::Output * out)
{
	try
	{
		cgicc::Cgicc cgi(in);

		// Do not dereference the lookup result blindly: a request without a
		// "name" parameter yields the end iterator.
		auto nameEntry = cgi.getElement("name");
		if (nameEntry == cgi.getElements().end())
		{
			XCEPT_RAISE(xgi::exception::Exception, "Missing 'name' parameter in request");
		}
		const std::string name = nameEntry->getValue();

		auto entry = exceptionsDiagnosticTable_.find(name);
		if (entry == exceptionsDiagnosticTable_.end())
		{
			std::stringstream msg;
			msg << "Requested non existing exception identified by '" << name << "'";
			XCEPT_RAISE(xgi::exception::Exception, msg.str());
		}

		xcept::Exception& stored = entry->second.e;

		std::stringstream blob;
		blob << "{";

		xcept::Exception::const_reverse_iterator ri = stored.rbegin();
		while (ri != stored.rend())
		{
			blob << "\"label\":\"" << (*ri).getProperty("identifier") << "\",";
			blob << "\"properties\":[";

			std::map<std::string, std::string, std::less<std::string> >::const_iterator mi = (*ri).begin();
			while (mi != (*ri).end())
			{
				// The message is percent-encoded, all other values are JSON-quoted.
				if ((*mi).first == "message")
				{
					blob << "{\"name\":\"" << (*mi).first << "\",\"value\":\"" << escape((*mi).second) << "\"}";
				}
				else
				{
					blob << "{\"name\":\"" << (*mi).first << "\",\"value\":\"" << toolbox::jsonquote((*mi).second) << "\"}";
				}

				++mi;
				if (mi != (*ri).end())
				{
					blob << ",";
				}
			}

			++ri;
			blob << "]";

			// Every element except the last opens a nested "children" array.
			if (ri != stored.rend())
			{
				blob << ",\"children\":[{";
			}
		}

		// Close the (size - 1) nested "children" arrays opened above. Counting from 1
		// avoids the unsigned underflow of "size() - 1" when the history is empty.
		for (size_t depth = 1; depth < stored.size(); ++depth)
		{
			blob << "}]";
		}
		blob << "}";

		*out << blob.str() << std::endl;
	}
	catch (const std::exception & e)
	{
		XCEPT_RAISE(xgi::exception::Exception, e.what());
	}
}
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment