Commit aab41302 authored by Marian Babik
Browse files

new ETF release; latest mw; ETF API; new job submission (jess); new worker...

new ETF release; latest mw; ETF API; new job submission (jess); new worker node framework (wn-wm); os, system updates
parent 911407ec
......@@ -35,21 +35,19 @@ RUN yum -y install gfal2-all gfal2-python gfal2-util globus-ftp-client \
COPY docker/etf-cms/config/grid-env.sh /etc/profile.d/
RUN echo "source /etc/profile.d/grid-env.sh" >> /opt/omd/sites/$CHECK_MK_SITE/.profile
# VOMS config
# RUN mkdir -p /etc/vomses/
# COPY ./config/cms-lcg-voms2.cern.ch /etc/vomses/
# COPY ./config/cms-voms2.cern.ch /etc/vomses/
#RUN mkdir -p /etc/grid-security/vomsdir/cms/
#COPY ./config/lcg-voms2.cern.ch.lsc /etc/grid-security/vomsdir/cms/
#COPY ./config/voms2.cern.ch.lsc /etc/grid-security/vomsdir/cms/
# ETF base plugins
RUN yum -y install nagios-plugins-wlcg-condor nagios-plugins-globus nagios-plugins
# ETF JESS setup
# RUN yum -y install python-jess python-nap && chmod 755 /usr/lib64/nagios/plugins/check_js
# COPY ./config/check_condor.cfg /etc/ncgx/metrics.d/
# COPY ./config/metrics.cfg /etc/ncgx/metrics.d/wlcg_cms.cfg
RUN yum -y install python-pip
RUN pip install pexpect ptyprocess argparse
RUN yum -y install python-jess python-wnfm nagios-plugins-globus nagios-plugins
# ETF WN-qFM payload
RUN mkdir -p /usr/libexec/grid-monitoring/wnfm/lib/python/site-packages
RUN mkdir -p /usr/libexec/grid-monitoring/wnfm/bin
RUN cp /usr/bin/etf_wnfm /usr/libexec/grid-monitoring/wnfm/bin/
RUN cp -r /usr/lib/python2.7/site-packages/pexpect /usr/libexec/grid-monitoring/wnfm/lib/python/site-packages
RUN cp -r /usr/lib/python2.7/site-packages/ptyprocess /usr/libexec/grid-monitoring/wnfm/lib/python/site-packages
RUN cp -r /usr/lib/python2.7/site-packages/wnfm /usr/libexec/grid-monitoring/wnfm/lib/python/site-packages
RUN cp /usr/lib/python2.7/site-packages/argparse.py /usr/libexec/grid-monitoring/wnfm/lib/python/site-packages/
# ETF streaming
RUN mkdir -p /var/spool/nstream/outgoing && chmod 777 /var/spool/nstream/outgoing
......
......@@ -40,12 +40,18 @@ COPY docker/etf-cms/config/grid-env_ipv6.sh /etc/profile.d/grid-env.sh
RUN echo "source /etc/profile.d/grid-env.sh" >> /opt/omd/sites/$CHECK_MK_SITE/.profile
# ETF base plugins
RUN yum -y install nagios-plugins-wlcg-condor nagios-plugins-globus nagios-plugins
# ETF JESS setup
# RUN yum -y install python-jess python-nap && chmod 755 /usr/lib64/nagios/plugins/check_js
# COPY ./config/check_condor.cfg /etc/ncgx/metrics.d/
# COPY ./config/metrics.cfg /etc/ncgx/metrics.d/wlcg_cms.cfg
RUN yum -y install python-pip
RUN pip install pexpect ptyprocess argparse
RUN yum -y install python-jess python-wnfm nagios-plugins-globus nagios-plugins
# ETF WN-qFM payload
RUN mkdir -p /usr/libexec/grid-monitoring/wnfm/lib/python/site-packages
RUN mkdir -p /usr/libexec/grid-monitoring/wnfm/bin
RUN cp /usr/bin/etf_wnfm /usr/libexec/grid-monitoring/wnfm/bin/
RUN cp -r /usr/lib/python2.7/site-packages/pexpect /usr/libexec/grid-monitoring/wnfm/lib/python/site-packages
RUN cp -r /usr/lib/python2.7/site-packages/ptyprocess /usr/libexec/grid-monitoring/wnfm/lib/python/site-packages
RUN cp -r /usr/lib/python2.7/site-packages/wnfm /usr/libexec/grid-monitoring/wnfm/lib/python/site-packages
RUN cp /usr/lib/python2.7/site-packages/argparse.py /usr/libexec/grid-monitoring/wnfm/lib/python/site-packages/
# ETF streaming
RUN mkdir -p /var/spool/nstream/outgoing && chmod 777 /var/spool/nstream/outgoing
......
......@@ -80,6 +80,17 @@ if [[ "${ETF_ALERTS_ENABLED}" -eq "1" ]] && [[ -f /etc/check_mk/notifications.mk
sed -i "s/ETF_HOSTED_BY/${ETF_HOSTED_BY}/" /opt/omd/sites/${CHECK_MK_SITE}/etc/check_mk/conf.d/wato/notifications.mk
sed -i "s/ETF_NAGIOS_HOST/${ETF_NAGIOS_HOST}/" /opt/omd/sites/${CHECK_MK_SITE}/etc/check_mk/conf.d/wato/notifications.mk
fi
# Make sure the status snapshot file exists and hand ownership to the etf user/group.
touch /var/www/html/status_snapshot.json && chown etf.etf /var/www/html/status_snapshot.json
echo "Configuring HT-Condor ..."
# When condor_status is installed, query this host's pool for the schedd's
# MyAddress attribute and record it in the condor spool directory.
if [[ -f /usr/bin/condor_status ]]; then
/usr/bin/condor_status -schedd -af MyAddress -pool $(hostname) > /var/lib/condor/spool/.schedd_address
fi
# [[ -s ]] is true only for an existing, non-empty file, so this branch also
# fires when condor_status is missing and the file was never written.
# Exit non-zero after a short pause; NOTE(review): the message implies an
# external supervisor restarts the script/container - confirm.
if [[ ! -s /var/lib/condor/spool/.schedd_address ]]; then
echo "Schedd address file empty, waiting 5 seconds before restarting ..."
sleep 5
exit 1
fi
cp /etc/ncgx/templates/generic/handlers.cfg /opt/omd/sites/etf/etc/nagios/conf.d/
......@@ -97,11 +108,12 @@ echo "Configuring ETF ..."
if [[ -z "${ETF_NAGIOS_HOST}" ]]; then
echo " Variable ETF_NAGIOS_HOST is not defined, using hostname"
ETF_NAGIOS_HOST=`hostname`
grep -qF "${ETF_NAGIOS_HOST}" /etc/ncgx/ncgx.cfg || echo "NAGIOS_HOST = \"${ETF_NAGIOS_HOST}\"" >> /etc/ncgx/ncgx.cfg
else
grep -qF "${ETF_NAGIOS_HOST}" /etc/ncgx/ncgx.cfg || echo "NAGIOS_HOST = \"${ETF_NAGIOS_HOST}\"" >> /etc/ncgx/ncgx.cfg
fi
grep -qF "${ETF_NAGIOS_HOST}" /etc/ncgx/ncgx.cfg || echo "NAGIOS_HOST = \"${ETF_NAGIOS_HOST}\"" >> /etc/ncgx/ncgx.cfg
sed -i "s/ETF_NAGIOS_HOST/${ETF_NAGIOS_HOST}/" /etc/httpd/conf.d/welcome.conf
sed -i "s/ETF_NAGIOS_HOST/${ETF_NAGIOS_HOST}/" /var/www/html/index.html
su etf -c "ncgx --log | tee /opt/omd/sites/etf/var/log/ncgx.log"
su - etf -c "cmk -II; cmk -O"
if [[ "${NSTREAM_ENABLED}" -eq "1" ]] ; then
......
import logging
import itertools
import requests
import urlparse
import xml.etree.ElementTree as ET
import json
from ncgx.inventory import Hosts, Checks, Groups
from vofeed.api import VOFeed
......@@ -15,38 +12,43 @@ FLAVOR_MAP = {'CREAM-CE': 'cream',
'GLOBUS': 'gt',
'OSG-CE': 'gt'}
SAME_CODES = {'OK': 0, 'INFO': 3, 'NOTICE': 3, 'WARNING': 1, 'ERROR': 2, 'CRITICAL': 2, 'MAINTENANCE': 3}
CE_STATE_METRICS = [
'org.sam.CONDOR-JobState-/cms/Role=lcgadmin']
'org.sam.CONDOR-JobState-/cms/Role=lcgadmin']
CE_METRICS = (
'org.sam.CONDOR-JobSubmit-/cms/Role=lcgadmin',
'org.cms.WN-analysis-/cms/Role=lcgadmin',
'org.cms.WN-isolation-/cms/Role=lcgadmin',
'org.cms.WN-basic-/cms/Role=lcgadmin',
'org.cms.WN-cvmfs-/cms/Role=lcgadmin',
'org.cms.WN-env-/cms/Role=lcgadmin',
'org.cms.WN-frontier-/cms/Role=lcgadmin',
'org.cms.WN-mc-/cms/Role=lcgadmin',
'org.cms.WN-squid-/cms/Role=lcgadmin',
'org.cms.WN-xrootd-access-/cms/Role=lcgadmin',
'org.cms.WN-xrootd-fallback-/cms/Role=lcgadmin')
'org.sam.CONDOR-JobSubmit-/cms/Role=lcgadmin',)
SE_METRICS = (
'org.cms.SRM-AllCMS-/cms/Role=production',
'org.cms.SRM-GetPFNFromTFC-/cms/Role=production',
'org.cms.SRM-VODel-/cms/Role=production',
'org.cms.SRM-VOGet-/cms/Role=production',
'org.cms.SRM-VOGetTURLs-/cms/Role=production',
'org.cms.SRM-VOLs-/cms/Role=production',
'org.cms.SRM-VOLsDir-/cms/Role=production',
'org.cms.SRM-VOPut-/cms/Role=production')
'org.cms.SRM-AllCMS-/cms/Role=production',
'org.cms.SRM-GetPFNFromTFC-/cms/Role=production',
'org.cms.SRM-VODel-/cms/Role=production',
'org.cms.SRM-VOGet-/cms/Role=production',
'org.cms.SRM-VOGetTURLs-/cms/Role=production',
'org.cms.SRM-VOLs-/cms/Role=production',
'org.cms.SRM-VOLsDir-/cms/Role=production',
'org.cms.SRM-VOPut-/cms/Role=production')
XROOT_METRICS = (
'org.cms.SE-xrootd-contain',
'org.cms.SE-xrootd-connection',
'org.cms.SE-xrootd-version',
'org.cms.SE-xrootd-contain',
'org.cms.SE-xrootd-connection',
'org.cms.SE-xrootd-version',
)
WN_METRICS = {
'CE-cms-analysis.sing': 'org.cms.WN-analysis-/cms/Role=lcgadmin',
'CE-cms-singularity': 'org.cms.WN-isolation-/cms/Role=lcgadmin',
'CE-cms-basic': 'org.cms.WN-basic-/cms/Role=lcgadmin',
'WN-cvmfs': 'org.cms.WN-cvmfs-/cms/Role=lcgadmin',
'CE-cms-env': 'org.cms.WN-env-/cms/Role=lcgadmin',
'CE-cms-frontier.sing': 'org.cms.WN-frontier-/cms/Role=lcgadmin',
'CE-cms-mc.sing': 'org.cms.WN-mc-/cms/Role=lcgadmin',
'CE-cms-squid.sing': 'org.cms.WN-squid-/cms/Role=lcgadmin',
'CE-cms-xrootd-access.sing': 'org.cms.WN-xrootd-access-/cms/Role=lcgadmin',
'CE-cms-xrootd-fallback.sing': 'org.cms.WN-xrootd-fallback-/cms/Role=lcgadmin'
}
def run(url, ipv6=False):
log.info("Processing vo feed: %s" % url)
......@@ -75,7 +77,8 @@ def run(url, ipv6=False):
# Add corresponding metrics to tags
# creates /etc/ncgx/conf.d/generated_checks.cfg
c = Checks()
c.add_all(CE_METRICS, tags=["CREAM-CE", "ARC-CE", "GLOBUS", "HTCONDOR-CE"])
c.add_all(CE_METRICS, tags=["CREAM-CE", "ARC-CE", "HTCONDOR-CE"])
c.add_all(WN_METRICS.values(), tags=["CREAM-CE", "ARC-CE", "HTCONDOR-CE"])
c.add_all(SE_METRICS, tags=["SRM"])
c.add_all(XROOT_METRICS, tags=["XROOTD"])
# IPv6
......@@ -102,6 +105,22 @@ def run(url, ipv6=False):
else:
c.add("org.cms.SE-xrootd-read", hosts=(host,), params={'args': {'--site': site.pop(), '--endpoint': endpoint, '-4': ''}, '_tags': 'XROOTD'})
# ETF env - environment variables to export on the worker node (global for all sites), such as:
# ETF_TESTS - points to a list of WN tests to execute (stored in WN_METRICS)
# ETF_LEGACY should be a subset of ETF_TESTS that identifies SFT tests (those that are not nagios compliant)
# SAME* environment variables are needed by the legacy/SFT tests
with open('/tmp/etf-env.sh', 'w') as etf_env:
etf_env.write('ETF_TESTS={}\n'.format(','.join(['etf/probes/org.cms/'+m for m in WN_METRICS.keys()])))
for code, value in SAME_CODES.items():
etf_env.write('SAME_{}={}\n'.format(code, value))
etf_env.write('SAME_VO=cms\n')
etf_env.write('SAME_TEST_DIRNAME=$HOME/etf/probes/org.cms/testjob/tests\n')
etf_env.write('SAME_SENSOR_HOME=$HOME/etf/probes/org.cms/testjob\n')
# ETF WN-qFM config - maps WN tests to metrics (e.g. WN-cvmfs -> org.cms.WN-cvmfs-/cms/Role=lcgadmin)
with open('/tmp/etf_wnfm.json', 'w') as etf_wnfm:
json.dump({'wn_metric_map': WN_METRICS, 'counter_enabled': True}, etf_wnfm)
# Queues
for service in services:
host = service[0]
......
......@@ -73,89 +73,49 @@ metrics = {
"-x" : "::CMS_PROXY_PROD"
}
},
"org.globus.MyProxy-ProxyLifetime-/cms/Role=pilot" : {
"extends" : "org.globus.MyProxy-ProxyLifetime",
"args" : {
"-x" : "::CMS_PROXY_PILOT"
}
},
"org.globus.MyProxy-ProxyLifetime-/cms/Role=lcgadmin" : {
"extends" : "org.globus.MyProxy-ProxyLifetime",
"args" : {
"-x" : "::CMS_PROXY_LCGADMIN"
}
},
"org.sam.CONDOR-JobState-/cms/Role=lcgadmin" : {
"extends" : "org.sam.CONDOR-JobState",
"depends" : "org.globus.GridProxy-Valid-/cms/Role=lcgadmin",
"interval" : 15,
"retry_interval" : 15,
"check_js" : {
"command" : "/usr/lib64/nagios/plugins/check_js",
"args" : {
"-x" : "::CMS_PROXY_LCGADMIN",
"--vo-fqan" : "::CMS_FQAN_LCGADMIN",
"--add-wntar-nag" : "/usr/libexec/grid-monitoring/probes/org.cms/wnjob/org.cms,/usr/libexec/grid-monitoring/probes/org.cms/wnjob/org.cms.lcgadmin",
}
},
"org.sam.CONDOR-JobState-/cms/Role=pilot" : {
"--vo" : "::VO",
"-t" : 600,
"-x" : "::CMS_PROXY_LCGADMIN",
"--suffix" : "::CMS_FQAN_LCGADMIN",
"--work-dir" : "/var/lib/gridprobes",
"--executable" : "/usr/bin/etf_run.sh",
"--vo-fqan" : "::CMS_FQAN_LCGADMIN",
"--add-payload" : "/usr/libexec/grid-monitoring/probes/org.cms/wnjob/org.cms,/usr/libexec/grid-monitoring/probes/org.cms/wnjob/org.cms.lcgadmin",
"--job-schedule" : 12,
"--env-file" : "/tmp/etf-env.sh",
"--wnfm-config" : "/tmp/etf_wnfm.json",
},
"docurl" : "https://etf.cern.ch/docs",
"doc" : "https://etf.cern.ch/docs",
"ocsp" : 0,
"max_check_attempts" : 1,
"timeout" : 600,
"interval" : 15,
"retry_interval" : 15,
"extends" : "org.sam.CONDOR-JobState",
"depends" : "org.globus.GridProxy-Valid-/cms/Role=pilot",
"args" : {
"-x" : "::CMS_PROXY_PILOT",
"--vo-fqan" : "::CMS_FQAN_PILOT",
"--add-wntar-nag" : "/usr/libexec/grid-monitoring/probes/org.cms/wnjob/org.cms.glexec",
}
"retry_interval" : 15
},
"org.sam.CONDOR-JobState-/cms/Role=production" : {
"org.sam.CONDOR-JobState-/cms/Role=lcgadmin" : {
"extends" : "check_js",
"depends" : "org.globus.GridProxy-Valid-/cms/Role=lcgadmin",
"interval" : 15,
"retry_interval" : 15,
"extends" : "org.sam.CONDOR-JobState",
"depends" : "org.globus.GridProxy-Valid-/cms/Role=production",
"args" : {
"-x" : "::CMS_PROXY_PROD",
"--vo-fqan" : "::CMS_FQAN_PROD",
"--add-wntar-nag" : "/usr/libexec/grid-monitoring/probes/org.cms/wnjob/org.cms,/usr/libexec/grid-monitoring/probes/org.cms/wnjob/org.cms.production",
}
},
"org.sam.CONDOR-JobMonit-/cms/Role=lcgadmin" : {
"extends" : "org.sam.CONDOR-JobMonit",
"target" : "localhost",
"depends" : "org.globus.GridProxy-Valid-/cms/Role=lcgadmin",
"args" : {
"-x" : "::CMS_PROXY_LCGADMIN",
"--vo-fqan" : "::CMS_FQAN_LCGADMIN",
}
},
"org.sam.CONDOR-JobMonit-/cms/Role=pilot" : {
"extends" : "org.sam.CONDOR-JobMonit",
"target" : "localhost",
"depends" : "org.globus.GridProxy-Valid-/cms/Role=pilot",
"args" : {
"-x" : "::CMS_PROXY_PILOT",
"--vo-fqan" : "::CMS_FQAN_PILOT",
}
},
"org.sam.CONDOR-JobMonit-/cms/Role=production" : {
"extends" : "org.sam.CONDOR-JobMonit",
"target" : "localhost",
"depends" : "org.globus.GridProxy-Valid-/cms/Role=production",
"args" : {
"-x" : "::CMS_PROXY_PROD",
"--vo-fqan" : "::CMS_FQAN_PROD",
"--prefix" : "org.sam.CONDOR",
"--backend" : "scondor",
"--timeout-limits" : "global:1410,IDLE:1380"
}
},
"org.sam.CONDOR-JobSubmit-/cms/Role=lcgadmin" : {
"passive" : True,
"depends" : "org.sam.CONDOR-JobMonit-/cms/Role=lcgadmin org.sam.CONDOR-JobState-/cms/Role=lcgadmin"
},
"org.sam.CONDOR-JobSubmit-/cms/Role=pilot" : {
"passive" : True,
"depends" : "org.sam.CONDOR-JobMonit-/cms/Role=pilot org.sam.CONDOR-JobState-/cms/Role=pilot"
},
"org.sam.CONDOR-JobSubmit-/cms/Role=production" : {
"passive" : True,
"depends" : "org.sam.CONDOR-JobMonit-/cms/Role=production org.sam.CONDOR-JobState-/cms/Role=production"
"depends" : "org.sam.CONDOR-JobState-/cms/Role=lcgadmin"
},
"org.cms.SRM-AllCMS-/cms/Role=production" : {
"command" : "/usr/libexec/grid-monitoring/probes/org.cms/srmvometrics.py",
......@@ -197,11 +157,6 @@ metrics = {
"depends" : "org.cms.SRM-AllCMS-/cms/Role=production",
"passive" : True,
},
"org.cms.WN-cvmfs-/cms/Role=production" : {
"docurl" : "https://twiki.cern.ch/twiki/bin/view/CMS/LhcbItSamNagiosProbes#WnProbes",
"doc" : "https://twiki.cern.ch/twiki/bin/view/CMS/LhcbItSamNagiosProbes#WnProbes",
"passive" : True,
},
"org.cms.SE-xrootd-read" : {
"command" : "/usr/libexec/grid-monitoring/probes/org.cms/cmssam_xrootd_endpnt.py",
"args" : {
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment