Commit 2290ae64 authored by Marian Babik's avatar Marian Babik
Browse files

added abrt; fixed xroot and srm tests

parent f4af54b3
......@@ -18,8 +18,8 @@ RUN yum -y install voms globus-gsi-sysconfig globus-gsi-cert-utils globus-gssapi
# Condor client
RUN yum -y install condor condor-python
# SRM
RUN yum -y install lcg-util gfal2-util lcg-util-python gfal-python globus-ftp-client \
# SRM todo: test removing globus deps
RUN yum -y install gfal2-all gfal2-python gfal2-util globus-ftp-client \
globus-gass-transfer globus-ftp-control globus-xio globus-gssapi-error \
globus-gsi-sysconfig globus-gsi-openssl-error globus-openssl-module \
globus-gsi-proxy-ssl
......@@ -32,6 +32,10 @@ RUN yum -y install xrootd-python-4.7.1-1.osg34.el7 xrootd-client-4.7.1-1.osg34.e
COPY docker/etf-cms/config/grid-env.sh /etc/profile.d/
RUN echo "source /etc/profile.d/grid-env.sh" >> /opt/omd/sites/$CHECK_MK_SITE/.profile
# ABRTD
RUN yum -y install abrt
COPY docker/etf-cms/config/mailx_event.conf /etc/
# VOMS config
# RUN mkdir -p /etc/vomses/
# COPY ./config/cms-lcg-voms2.cern.ch /etc/vomses/
......
#!/usr/bin/python
#!/usr/bin/env python
# ########################################################################### #
#
# SAM xrootd file access probe of CMS
......
......@@ -45,7 +45,7 @@ SAM (Service Availability Monitoring)
import os
import sys
import getopt
import time #@UnresolvedImport
import time
import commands
import errno
import re
......@@ -56,6 +56,7 @@ import pickle
import datetime
import urlparse
import filecmp
import requests
try:
from gridmon import probe
......@@ -63,7 +64,7 @@ try:
from gridmon import gridutils
from gridmon.process import signaling
import gfal2
except ImportError,e:
except ImportError as e:
summary = "UNKNOWN: Error loading modules : %s" % (e)
sys.stdout.write(summary+'\n')
sys.stdout.write(summary+'\nsys.path: %s\n'% str(sys.path))
......@@ -79,6 +80,7 @@ LCG_UTIL_TIMEOUT_SRM = 180
gfal2.set_verbose(gfal2.verbose_level.debug)
class SRMVOMetrics(probe.MetricGatherer) :
"""A Metric Gatherer specific for SRM."""
......@@ -171,9 +173,7 @@ class SRMVOMetrics(probe.MetricGatherer) :
},
}
def __init__(self, tuples):
probe.MetricGatherer.__init__(self, tuples, 'SRM')
self.usage=""" Metrics specific options:
......@@ -224,9 +224,9 @@ class SRMVOMetrics(probe.MetricGatherer) :
curhour=datetime.datetime.now().hour
self._fileHistoryVoInfoDictionary = self.workdir_metric+"/VOInfoDictionary_%s"%curhour
self._fileVoInfoDictionary = self.workdir_metric+"/VOInfoDictionary"
#Read dictionary from current cache
# Read dictionary from current cache
try:
#Clean up stale current cache entries (older than 3 days)
# Clean up stale current cache entries (older than 3 days)
try:
modtime=os.path.getmtime(self._fileVoInfoDictionary)
if (time.time()-modtime>3*86400):
......@@ -299,7 +299,6 @@ class SRMVOMetrics(probe.MetricGatherer) :
"""
try: os.unlink(self._fileLock)
except OSError: pass
def saveVoInfoDictionary(self,filename):
fp = open(filename, "w")
......@@ -313,29 +312,29 @@ class SRMVOMetrics(probe.MetricGatherer) :
return voInfoDict
def weightEndpointCriticality(self,VOtest):
DetailedMsg=''
CriticalResult=[]
DetailedMsg = ''
CriticalResult = []
for srmendpt in self._voInfoDictionary.keys():
try:
try:
criticality=self._voInfoDictionary[srmendpt]['criticality']
try:
criticality = self._voInfoDictionary[srmendpt]['criticality']
except KeyError:
criticality = 1
if criticality == 1:
CriticalResult.append(self._voInfoDictionary[srmendpt][VOtest][0])
# DetailedMsg = DetailedMsg + str(self._voInfoDictionary[srmendpt])
DetailedMsg = DetailedMsg + \
str(self._voInfoDictionary[srmendpt]['space_token']) + \
" critical= " + str(criticality) + \
" " + str(self._voInfoDictionary[srmendpt][VOtest][1]) + \
" file= " + str(self._voInfoDictionary[srmendpt]['fn']) + \
"\n"
# self.printd('VO specific Detailed Output: %s' % str(DetailedMsg))
except IndexError:
return 'UNKNOWN', 'No SRM endpoints found in internal dictionary'
except KeyError:
criticality=1
if criticality==1:
CriticalResult.append(self._voInfoDictionary[srmendpt][VOtest][0])
#DetailedMsg = DetailedMsg + str(self._voInfoDictionary[srmendpt])
DetailedMsg = DetailedMsg + \
str(self._voInfoDictionary[srmendpt]['space_token']) +\
" critical= "+ str(criticality) +\
" "+ str(self._voInfoDictionary[srmendpt][VOtest][1]) +\
" file= " + str(self._voInfoDictionary[srmendpt]['fn'])+\
"\n"
#self.printd('VO specific Detailed Output: %s' % str(DetailedMsg))
except IndexError:
return ('UNKNOWN', 'No SRM endpoints found in internal dictionary')
except KeyError:
return ('UNKNOWN', 'No test results found in internal dictionary for SRM endpoint')
#print " GLOBAL result \n \n \n \n \n "
return 'UNKNOWN', 'No test results found in internal dictionary for SRM endpoint'
# print " GLOBAL result \n \n \n \n \n "
## ordering criticality
self.printd('VO specific Detailed Output: %s' % str(DetailedMsg))
if 'CRITICAL' in CriticalResult: # it's enough one CRIT
......@@ -359,18 +358,17 @@ class SRMVOMetrics(probe.MetricGatherer) :
def metricAllLHCb(self):
return self.metricAll('AllLHCb')
def metricGetPFNFromTFC(self,testLFN="/store/unmerged/SAM/testSRM"):
def metricGetPFNFromTFC(self, testLFN="/store/unmerged/SAM/testSRM"):
"""Get full SRM endpoint(s) and storage areas from PhEDEx DataService.
"""
try:
self.__workdir_lock()
except IOError, e:
except IOError as e:
self.printd('Failed to lock. %s' % str(e))
return 'UNKNOWN', 'UNKNOWN: Failed to lock working directory.'
#URLs for PhEDEx DataService for lfn2pfn
# URLs for PhEDEx DataService for lfn2pfn
tfcURL="https://cmsweb.cern.ch/phedex/datasvc/json/prod/lfn2pfn?node="
pfnMatchURL="&lfn="
pfnProtocolOption = "&protocol=srmv2"
......@@ -384,6 +382,7 @@ class SRMVOMetrics(probe.MetricGatherer) :
opener=urllib2.build_opener()
header='grid-monitoring-probes-org.cms.SRM-GetPFNFromTFC/1.0 (CMS) %s/%s %s/%s (%s)' % (urllib2.__name__,urllib2.__version__,platform.system(),platform.release(),platform.processor())
opener.addheaders = [('User-agent', header)]
headers = {'user-agent': header}
# LFN path for file to test transfers
self.printd('The LFN used for testing will be in: '+testLFN)
......@@ -391,16 +390,16 @@ class SRMVOMetrics(probe.MetricGatherer) :
try:
self.printd("Contacting PhEDEx dataservice to perform SEName-to-PhEDExNodeName at URL:")
self.printd(seNamesURL % nodeName)
seNames=opener.open(seNamesURL % nodeName)
seNamesJSON = simplejson.load(seNames)
req = requests.get(seNamesURL % nodeName, headers=headers, verify=False, timeout=120)
req.raise_for_status()
seNames = req.content
seNamesJSON = simplejson.loads(seNames)
phedexNodeNames = seNamesJSON[u'phedex'][u'senames']
except (urllib2.URLError, KeyError):
except (requests.HTTPError, KeyError):
self.printd('WARNING: Unable to open PhEDEx DataService senames API to perform SEName-to-PhEDExNodeName matching for SEName %s' % nodeName)
if len(self._voInfoDictionary):
self.printd("WARNING: using cached PFN")
# Update timestamp/uuid in cached PFN
# Update timestamp/uuid in cached PFN
for pfn in self._voInfoDictionary:
try:
self._voInfoDictionary[pfn]['fn'] = self._voInfoDictionary[pfn]['fntemp'] % (str(int(time.time())), samutils.uuidstr())
......@@ -408,13 +407,11 @@ class SRMVOMetrics(probe.MetricGatherer) :
self.printd(pfn+" : "+str(self._voInfoDictionary[pfn]))
except KeyError:
self.printd("WARNING: no cached PFN found")
return('WARNING',"WARNING: Unable to open PhEDEx DataService senames API, no cached PFN found")
return('OK',"WARNING: Unable to open PhEDEx DataService senames API, using cached PFN")
return 'WARNING', "WARNING: Unable to open PhEDEx DataService senames API, no cached PFN found"
return 'OK', "WARNING: Unable to open PhEDEx DataService senames API, using cached PFN"
else:
self.printd("WARNING: no cached PFN found")
return('WARNING',"WARNING: Unable to open PhEDEx DataService senames API, no cached PFN found")
return 'WARNING', "WARNING: Unable to open PhEDEx DataService senames API, no cached PFN found"
outputList={}
......@@ -441,15 +438,17 @@ class SRMVOMetrics(probe.MetricGatherer) :
self.printd(pfnUrl)
try:
pfnFile=opener.open(pfnUrl)
pfnJSON = simplejson.load(pfnFile)
req = requests.get(pfnUrl, headers=headers, verify=False, timeout=120)
req.raise_for_status()
pfnFile = req.content
pfnJSON = simplejson.loads(pfnFile)
pfn = (((pfnJSON[u'phedex'])[u'mapping'])[0])[u'pfn']
spacetoken = (((pfnJSON[u'phedex'])[u'mapping'])[0])[u'space_token']
except (urllib2.URLError,KeyError):
except (requests.HTTPError, KeyError):
self.printd('WARNING: Unable to open PhEDEx DataService lfn2pfn URL to perform LFN-to-PFN matching for Site %s' % siteName)
continue
if pfn == None:
if not pfn:
self.printd("ERROR: LFN did not match to any PFN - probably the TFC does not contain any rule for the srmv2 protocol.")
continue
......@@ -463,15 +462,15 @@ class SRMVOMetrics(probe.MetricGatherer) :
if re.compile("^srm://.+srm/managerv2\?SFN=.+$").match(pfn) or re.compile("^srm://.+srm/v2/server\?SFN=.+$").match(pfn):
pfntonode=re.sub(":.+$","",re.sub("^srm://","",pfn))
if pfntonode!=nodeName :
if pfntonode != nodeName :
self.printd("WARNING: the resulting PFN matches to SRM "+pfntonode+" instead of SRM "+nodeName)
continue
else:
fntemp = self._fileSRMPattern % (spacetokendesc,'%s','%s')
fntemp = self._fileSRMPattern % (spacetokendesc, '%s', '%s')
fn = fntemp % (str(int(time.time())), samutils.uuidstr())
outputList[pfn]={'fntemp' : fntemp, 'fn': fn, 'space_token': spacetoken, 'space_token_get': spacetoken, 'userspace' : testLFN}
outputList[pfn] = {'fntemp': fntemp, 'fn': fn, 'space_token': spacetoken, 'space_token_get': spacetoken, 'userspace' : testLFN}
elif pfn.startswith("gsiftp://"):
pfntonode=urlparse.urlparse(pfn).hostname
pfntonode = urlparse.urlparse(pfn).hostname
fntemp = self._fileSRMPattern % (spacetokendesc,'%s','%s')
fn = fntemp % (str(int(time.time())), samutils.uuidstr())
outputList[pfn]={'fntemp' : fntemp, 'fn': fn, 'space_token': spacetoken, 'space_token_get': spacetoken, 'userspace' : testLFN}
......@@ -486,10 +485,12 @@ class SRMVOMetrics(probe.MetricGatherer) :
# Extract a random PFN from the dictionary of PFN matches. It will be used for testing, other PFN matches will be ignored
# Print warning if not all PFN matches are the same.
if len(outputList)==0:
self.printd("WARNING: "+nodeName+" not found in SRM list")
self.printd("WARNING: This error usually means that the site is not running PhEDEx agents in the Prod instance,")
self.printd("WARNING: or that the TrivialFileCatalog published by the site's PhEDEx agents doesn't have a valid srmv2 protocol rule for "+nodeName)
if len(outputList) == 0:
self.printd("WARNING: " + nodeName + " not found in SRM list")
self.printd(
"WARNING: This error usually means that the site is not running PhEDEx agents in the Prod instance,")
self.printd(
"WARNING: or that the TrivialFileCatalog published by the site's PhEDEx agents doesn't have a valid srmv2 protocol rule for " + nodeName)
if len(self._voInfoDictionary):
self.printd("WARNING: using cached PFN")
# Update timestamp/uuid in cached PFN
......@@ -497,20 +498,19 @@ class SRMVOMetrics(probe.MetricGatherer) :
try:
self._voInfoDictionary[pfn]['fn'] = self._voInfoDictionary[pfn]['fntemp'] % (str(int(time.time())), samutils.uuidstr())
self.printd("The PFN path used for testing will be:")
self.printd(pfn+" : "+str(self._voInfoDictionary[pfn]))
self.printd(pfn + " : " + str(self._voInfoDictionary[pfn]))
except KeyError:
self.printd("WARNING: no cached PFN found")
return('WARNING',"WARNING: "+nodeName+" not found in SRM list, no cached PFN found")
return('OK',"WARNING: "+nodeName+" not found in SRM list, using cached PFN")
return ('WARNING', "WARNING: " + nodeName + " not found in SRM list, no cached PFN found")
return ('OK', "WARNING: " + nodeName + " not found in SRM list, using cached PFN")
else:
self.printd("WARNING: no cached PFN found")
return('WARNING',"WARNING: "+nodeName+" not found in SRM list, no cached PFN found")
return ('WARNING', "WARNING: " + nodeName + " not found in SRM list, no cached PFN found")
else:
self._voInfoDictionary=outputList
else:
self._voInfoDictionary = outputList
for outputPfns in outputList:
self.printd("The PFN path used for testing will be:")
self.printd(outputPfns+" : "+str(outputList[outputPfns]))
......@@ -557,16 +557,17 @@ class SRMVOMetrics(probe.MetricGatherer) :
self.printd(str(agis_endpoint_info))
self.printd(str(self._voInfoDictionary))
try:
fp = open(self._ldap_fileEndptSAPath, "w")
for info in agis_endpoint_info:
ep=info.split()[0]+'\n'
fp.write(ep)
fp.close()
except IOError, e:
try:
os.unlink(self._ldap_fileEndptSAPath)
except OSError: pass
return ('UNKNOWN', 'IOError: %s' % str(e))
fp = open(self._ldap_fileEndptSAPath, "w")
for info in agis_endpoint_info:
ep = info.split()[0] + '\n'
fp.write(ep)
fp.close()
except IOError as e:
try:
os.unlink(self._ldap_fileEndptSAPath)
except OSError:
pass
return ('UNKNOWN', 'IOError: %s' % str(e))
#print self._ldap_fileEndptSAPath
......
EVENT=notify
# Handler for the "notify" event: runs reporter-mailx with --notify-only.
# All Mailx_* settings are passed inline as environment variables so that we
# do not rely on the default config nor on the config file.
# The subject is assembled with `cat` from the files `package` (falling back
# to `executable`), `crash_function` and `reason` (falling back to
# "<executable> crashed").
Mailx_Subject="[abrt] $(cat package || cat executable): $(cat crash_function && echo "():") $(cat reason || (cat executable && echo " crashed"))" \
Mailx_EmailFrom="ABRT Daemon <DoNotReply>" \
Mailx_EmailTo="root@localhost" \
reporter-mailx --notify-only
EVENT=notify-dup
# Handler for the "notify-dup" event: runs the same reporter-mailx command as
# the "notify" handler.
# All Mailx_* settings are passed inline as environment variables so that we
# do not rely on the default config nor on the config file.
# The subject is assembled with `cat` from the files `package` (falling back
# to `executable`), `crash_function` and `reason` (falling back to
# "<executable> crashed").
Mailx_Subject="[abrt] $(cat package || cat executable): $(cat crash_function && echo "():") $(cat reason || (cat executable && echo " crashed"))" \
Mailx_EmailFrom="ABRT Daemon <DoNotReply>" \
Mailx_EmailTo="root@localhost" \
reporter-mailx --notify-only
......@@ -92,6 +92,13 @@ else
/usr/bin/disable_nstream
fi
# Optionally start the ABRT daemon; opt in by setting ABRT_ENABLED=1.
# The variable defaults to 0 here so that an unset or empty ABRT_ENABLED does
# not make the numeric test fail with "integer expression expected".
if [ "${ABRT_ENABLED:-0}" -eq "1" ] ; then
    echo "Enabling abrt ..."
    # Switch the OpenGPGCheck setting from "yes" to "no" in abrt's package-data config.
    sed -e "s/OpenGPGCheck = yes/OpenGPGCheck = no/g" -i /etc/abrt/abrt-action-save-package-data.conf
    # Install the mailx event handlers into libreport's events.d directory.
    cp -f /etc/mailx_event.conf /etc/libreport/events.d/mailx_event.conf
    /usr/sbin/abrtd
fi
echo "Fetching CMS credentials ..."
su etf -c "/usr/lib/nagios/plugins/globus/refresh_proxy --vo-fqan /cms/Role=lcgadmin --myproxyuser nagios -H myproxy.cern.ch -t 120 --key /opt/omd/sites/etf/etc/nagios/globus/etf_srv_key.pem --vo cms --lifetime 24 --name NagiosRetrieve-ETF-cms -x /opt/omd/sites/etf/etc/nagios/globus/userproxy.pem--cms-Role_lcgadmin --cert /opt/omd/sites/etf/etc/nagios/globus/etf_srv_cert.pem"
su etf -c "/usr/lib/nagios/plugins/globus/refresh_proxy --vo-fqan /cms/Role=production --myproxyuser nagios -H myproxy.cern.ch -t 120 --key /opt/omd/sites/etf/etc/nagios/globus/etf_srv_key.pem --vo cms --lifetime 24 --name NagiosRetrieve-ETF-cms -x /opt/omd/sites/etf/etc/nagios/globus/userproxy.pem--cms-Role_production --cert /opt/omd/sites/etf/etc/nagios/globus/etf_srv_cert.pem"
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment