Skip to content
Snippets Groups Projects
Commit 7809870e authored by Ricardo Rocha's avatar Ricardo Rocha
Browse files

LCGDM-999: integrate SAM LFC probes into nagios lcgdm.

parent 2cf87ce2
No related branches found
No related tags found
No related merge requests found
......@@ -6,7 +6,7 @@
%define pnp4nagios_templates_dir %{_datadir}/nagios/html/pnp4nagios/templates.lcgdm
Name: nagios-plugins-lcgdm
Version: 0.9.4
Version: 0.9.5
Release: 1%{?dist}
Summary: Nagios probes to be run remotely against DPM / LFC nodes
Group: Applications/Internet
......@@ -14,8 +14,8 @@ License: ASL 2.0
URL: https://svnweb.cern.ch/trac/lcgdm/wiki/Dpm/Admin/Monitoring
# The source of this package was pulled from upstream's vcs. Use the
# following commands to generate the tarball:
# svn export http://svn.cern.ch/guest/lcgdm/nagios-plugins/tags/nagios-plugins_0_9_4 nagios-plugins-lcgdm-0.9.4
# tar -czvf nagios-plugins-lcgdm-0.9.4.tar.gz nagios-plugins-lcgdm-0.9.4
# svn export http://svn.cern.ch/guest/lcgdm/nagios-plugins/tags/nagios-plugins_0_9_5 nagios-plugins-lcgdm-0.9.5
# tar -czvf nagios-plugins-lcgdm-0.9.5.tar.gz nagios-plugins-lcgdm-0.9.5
Source0: %{name}-%{version}.tar.gz
Buildroot: %{_tmppath}/%{name}-%{version}-%{release}-root-%(%{__id_u} -n)
......@@ -172,8 +172,12 @@ rm -rf %{buildroot}
%config(noreplace) %{_sysconfdir}/nrpe.d/lcgdm-lfc.cfg
%{nagios_plugins_dir}/lcgdm/check_lfc_perf
%{nagios_plugins_dir}/lcgdm/check_oracle_expiration
%{nagios_plugins_dir}/lcgdm/check_lfc_sam
%changelog
* Wed Mar 06 2012 Ricardo Rocha <ricardo.rocha@cern.ch> - 0.9.5-1
- Update for new upstream release
* Tue Oct 22 2012 Ricardo Rocha <ricardo.rocha@cern.ch> - 0.9.4-1
- Update for new upstream release
......
......@@ -29,7 +29,7 @@ install(
install(
FILES check_lfc_perf check_oracle_expiration
FILES check_lfc_perf check_oracle_expiration check_lfc_sam
DESTINATION usr/lib${LIB_SUFFIX}/nagios/plugins/lcgdm/
PERMISSIONS OWNER_EXECUTE OWNER_WRITE OWNER_READ
GROUP_EXECUTE GROUP_READ
......
#!/usr/bin/env python
#
# LFC Probe script. Runs a set of probes against an LFC server
#
# Includes a reasonably generic probe framework
#
# James Casey <james.casey@cern.ch>
#
import os
import sys
import getopt
import time
import errno
try:
from gridmon import probe
import lfc
except ImportError,e:
summary = "UNKNOWN: Error loading modules : %s" % (e)
sys.stdout.write(summary+'\n')
sys.stdout.write(summary+'\nsys.path: %s\n'% str(sys.path))
sys.exit(3)
def parse_uri(uri):
"""Return the [host, port] from the lfc URI. Accepts:
lfc://host/
lfc://host:port/
lfc://host
lfc://host:port
host
host:port"""
import re
match = re.match('([a-z]*://)?([^/:$]*):?(\d+)?/?', uri)
return (match.group(2), match.group(3))
class LFCMetrics(probe.MetricGatherer) :
"""A Metric Gatherer specific for the LFC. Handles the same
metrics as the original perl LEMON sensor code"""
OPS_DIR="/grid/ops"
def __init__(self, tuples, timeout= None):
probe.MetricGatherer.__init__(self,tuples,'LFC')
if not tuples.has_key('serviceURI'):
raise TypeError("No serviceURI passed in")
[self.hostName, self.portNumber] = parse_uri(tuples['serviceURI'])
os.environ['LFC_HOST'] = self.hostName
if self.portNumber:
os.environ['LFC_PORT'] = self.portNumber
os.environ['LFC_CONRETRY']="0"
if timeout:
os.environ['LFC_CONNTIMEOUT']= timeout
else:
os.environ['LFC_CONNTIMEOUT']="15"
self.errbuf=" "*120
lfc.lfc_seterrbuf(self.errbuf,len(self.errbuf))
self.probeinfo = { 'probeName' : 'ch.cern.LFC-Probe',
'probeVersion' : '1.0',
'serviceVersion' : '>= 1.6.0'}
_metrics = { 'Read' :{ 'metricName' : 'ch.cern.LFC-Read',
'metricLocality' : 'remote', 'metricType' : 'status',
'metricDescription' :"Test if we can read an entry in the catalog"},
'Write' :{'metricName' : 'ch.cern.LFC-Write',
'metricLocality' : 'remote', 'metricType' : 'status',
'metricDescription' : "Test if we can update the modification time of an entry in the catalog"},
'Readdir' :{'metricName' : 'ch.cern.LFC-Readdir',
'metricLocality' : 'remote', 'metricType' : 'performance',
'dataType' : 'float',
'metricDescription':"Time how long it takes to read a directory (/grid)"},
'ReadDli' :{'metricName' : 'ch.cern.LFC-ReadDli',
'metricLocality' : 'remote', 'metricType' : 'status',
'metricDescription':"Do a read from a DLI"},
'Ping' :{'metricName' : 'ch.cern.LFC-Ping',
'metricLocality' : 'remote', 'metricType' : 'status',
'metricDescription':"Ping LFC service."}}
self.ns = 'ch.cern'
self.set_metrics(_metrics)
def metricRead(self):
"Test if we can read an entry in the catalog"
DIR_NAME = "/grid/%s"%self.voName
stat = lfc.lfc_filestatg()
res = lfc.lfc_statg(DIR_NAME, "", stat)
if res == 0:
return (0, "OK")
else:
err_num = lfc.cvar.serrno
err_string = lfc.sstrerror(err_num)
return (2, "Trying to statg(%s) : %s"%(DIR_NAME,err_string))
def metricWrite(self):
"Test if we can update the modification time of an entry in the catalog"
FILE_NAME="/grid/%s/file-lfc-probe-%s"%(self.voName,self.hostName)
res = lfc.lfc_utime(FILE_NAME, None)
if res == 0:
return (0, "OK")
else:
err_num = lfc.cvar.serrno
# try again since this could just be non-retried DB problem
if err_num != errno.ENOENT :
res = lfc.lfc_utime(FILE_NAME,None)
if res != 0:
err_num = lfc.cvar.serrno
err_string = lfc.sstrerror(err_num)
return (2, err_string)
else: # try and create the file
uuid=None
import commands
def uuidgen():
return commands.getoutput('uuidgen')
guid = uuidgen()
lfc.lfc_umask(0000)
res = lfc.lfc_creatg(FILE_NAME, guid, 0664)
if res != 0:
err_num = lfc.cvar.serrno
err_string = lfc.sstrerror(err_num)
return (2, err_string)
res = lfc.lfc_addreplica(guid, None, "test.example.com",
"srm://test.example.com/%s/file-%s-%s"%(self.voName, self.hostName, guid),
"-", "P","", "")
if res != 0:
err_num = lfc.cvar.serrno
err_string = lfc.sstrerror(err_num)
return (2, "Can't add replica : %s"%err_string)
# can't rely on default group ACLs on the parent directory
# add VO's root group write access ACL explicitly
try:
# lfc.lfc_getgrpbynam() swig wrap is buggy; use lfc2
import lfc2
gid = lfc2.lfc_getgrpbynam(self.voName)
except Exception, e:
return (3, "Couldn't add VO's root group write access ACL: %s" % str(e))
else:
_, acls_list = lfc.lfc_getacl(FILE_NAME,
lfc.CA_MAXACLENTRIES)
acl_grp = lfc.lfc_acl()
acl_grp.a_type = lfc.CNS_ACL_GROUP
acl_grp.a_id = gid
acl_grp.a_perm = 6
acls_list.append(acl_grp)
# we need to add mask as well
acl_m = lfc.lfc_acl()
acl_m.a_type = lfc.CNS_ACL_MASK
acl_m.a_id = 0
acl_m.a_perm = 6
acls_list.append(acl_m)
try:
lfc2.lfc_setacl(FILE_NAME, acls_list)
except Exception, e:
return (3, "Couldn't add VO's root group write access ACL: %s" % str(e))
return (0, "OK")
def metricReaddir(self):
"Time how long it takes to read a directory (/grid)"
READDIR_DIR='/grid'
dir = lfc.lfc_opendir(READDIR_DIR)
if dir == None:
err_num = lfc.cvar.serrno
err_string = lfc.sstrerror(err_num)
return (2, err_string)
start = time.time()
entry = lfc.lfc_readdirg(dir)
while entry:
entry = lfc.lfc_readdirg(dir)
end = time.time()
return (0, "%2.3f"%(end-start))
def metricReadDli(self):
"Do a read from a DLI"
FILE_NAME="/grid/%s/file-lfc-probe-%s"%(self.voName,self.hostName)
try:
from SOAPpy import SOAPProxy, SOAPConfig, faultType
except ImportError, e:
return (3, "Cannot load SOAPpy to gather metric : %s"%e)
c = SOAPConfig()
remote = SOAPProxy("http://%s:8085/"%self.hostName, namespace="urn:DataLocationInterface",
config = c)
# SOAPpy 0.11.4 (deployed with gLite 3.0) writes the fault to stdout
# so we redirect it during the call to a dummy - only need to handle write...
class dummy_stdout :
def __init__(self) :
pass
def write(write, *kw) :
pass
sys.stdout = dummy_stdout()
try:
try:
result = remote.listReplicas(inputDataType="lfn", inputData=FILE_NAME)
except faultType, fault:
if fault.faultstring == 'InputData' :
return (2, "Could not find LFN : %s"%FILE_NAME)
if fault.faultstring == 'NoURLFound' :
return (2, "No PFN for LFN : %s"%FILE_NAME)
return (3, "Unknown SOAP Fault : %s"%fault)
except Exception, e:
return (3, "Unknown Exception : %s"%e)
finally:
sys.stdout = sys.__stdout__
return (0, "Found %d PFN"%len(result))
def metricPing(self):
'Ping LFC service.'
ver = ' '*256
res = lfc.lfc_ping(self.hostName, ver)
if res == 0:
return ('OK', '%s\n'%ver.rstrip())
else:
err_num = lfc.cvar.serrno
err_string = lfc.sstrerror(err_num)
return ('CRITICAL', err_string)
runner = probe.Runner(LFCMetrics, probe.ProbeFormatRenderer())
sys.exit(runner.run(sys.argv))
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment