Commit 434b24eb authored by Miciah Masters, committed by Miciah Dashiel Butler Masters

alerts: Add ingresscontroller degraded/unavailable

Add an ingress_controller_conditions Prometheus metric that reports the
status conditions of ingresscontrollers, and add Prometheus rules to raise
alerts if an ingresscontroller is unavailable or degraded.

This commit is related to bug 1955854.

https://bugzilla.redhat.com/show_bug.cgi?id=1955854

* cmd/ingress-operator/start.go (start): Call StartMetricsListener from the
operator package, and call RegisterMetrics from the ingress and canary
controller packages.
* manifests/0000_90_ingress-operator_03_prometheusrules.yaml: Add alerts
using the new ingress_controller_conditions metric to warn if an
ingresscontroller is degraded or unavailable.
* pkg/manifests/bindata.go: Regenerate.
* pkg/operator/controller/canary/metrics.go (registerCanaryMetrics): Rename
from this...
(RegisterMetrics): ...to this.
(StartMetricsListener): Move from here...
* pkg/operator/metrics.go: ...to here.  New file.
* pkg/operator/controller/ingress/metrics.go (ingressControllerConditions):
New variable.  Define an "ingress_controller_conditions" Prometheus gauge.
(metricsList): New variable.  Define the list of metrics for this
controller.  Currently this list comprises ingressControllerConditions.
(reportedConditions): New variable.  Define the ingresscontroller status
conditions that are published in the ingress_controller_conditions metric.
(SetIngressControllerConditionsMetric): New function.  Update the new
"ingress_controller_conditions" gauge with the status conditions of the
given ingresscontroller.
(RegisterMetrics): New function.  Register metricsList with Prometheus.
* pkg/operator/controller/ingress/status.go (syncIngressControllerStatus):
Call SetIngressControllerConditionsMetric after updating status conditions.
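
For illustration only, a minimal sketch (not part of this change) of how the entry points named above fit together. The package name, function name, listen address, and sample IngressController are hypothetical; the exported functions and import paths are the ones added in the diff below, and the condition type is assumed to be operatorv1.OperatorCondition from openshift/api. Error handling is elided.

package example // hypothetical package, for illustration only

import (
	"context"

	operatorv1 "github.com/openshift/api/operator/v1"

	canarycontroller "github.com/openshift/cluster-ingress-operator/pkg/operator/controller/canary"
	ingresscontroller "github.com/openshift/cluster-ingress-operator/pkg/operator/controller/ingress"
	"github.com/openshift/cluster-ingress-operator/pkg/operator"
)

func wireMetrics(ctx context.Context) {
	// Serve /metrics until ctx is cancelled, as cmd/ingress-operator/start.go does.
	go operator.StartMetricsListener(":60000", ctx) // listen address is hypothetical

	// Register each controller's metrics with the default Prometheus registry.
	_ = canarycontroller.RegisterMetrics()
	_ = ingresscontroller.RegisterMetrics()

	// Publish the Available/Degraded conditions of a hypothetical ingresscontroller.
	ic := &operatorv1.IngressController{}
	ic.Name = "default"
	ic.Status.Conditions = []operatorv1.OperatorCondition{
		{Type: "Available", Status: operatorv1.ConditionTrue},
		{Type: "Degraded", Status: operatorv1.ConditionFalse},
	}
	ingresscontroller.SetIngressControllerConditionsMetric(ic)
	// Resulting series:
	//   ingress_controller_conditions{name="default",condition="Available"} 1
	//   ingress_controller_conditions{name="default",condition="Degraded"}  0
}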
parent 313ea5fd
......@@ -15,6 +15,7 @@ import (
operatorconfig "github.com/openshift/cluster-ingress-operator/pkg/operator/config"
operatorcontroller "github.com/openshift/cluster-ingress-operator/pkg/operator/controller"
canarycontroller "github.com/openshift/cluster-ingress-operator/pkg/operator/controller/canary"
ingresscontroller "github.com/openshift/cluster-ingress-operator/pkg/operator/controller/ingress"
statuscontroller "github.com/openshift/cluster-ingress-operator/pkg/operator/controller/status"
"sigs.k8s.io/controller-runtime/pkg/client"
......@@ -123,7 +124,15 @@ func start(opts *StartOptions) error {
}
// Start operator metrics.
go canarycontroller.StartMetricsListener(opts.MetricsListenAddr, signal)
go operator.StartMetricsListener(opts.MetricsListenAddr, signal)
log.Info("registering Prometheus metrics for canary_controller")
if err := canarycontroller.RegisterMetrics(); err != nil {
log.Error(err, "unable to register metrics for canary_controller")
}
log.Info("registering Prometheus metrics for ingress_controller")
if err := ingresscontroller.RegisterMetrics(); err != nil {
log.Error(err, "unable to register metrics for ingress_controller")
}
// Set up and start the file watcher.
watcher, err := fsnotify.NewWatcher()
......
......@@ -27,3 +27,21 @@ spec:
severity: critical
annotations:
message: "HAProxy metrics are reporting that HAProxy is down on pod {{ $labels.namespace }} / {{ $labels.pod }}"
- alert: IngressControllerDegraded
expr: ingress_controller_conditions{condition="Degraded"} == 1
for: 5m
labels:
severity: warning
annotations:
message: |
The {{ $labels.namespace }}/{{ $labels.name }} ingresscontroller is
degraded: {{ $labels.reason }}.
- alert: IngressControllerUnavailable
expr: ingress_controller_conditions{condition="Available"} == 0
for: 5m
labels:
severity: warning
annotations:
message: |
The {{ $labels.namespace }}/{{ $labels.name }} ingresscontroller is
unavailable: {{ $labels.reason }}.
......@@ -23,7 +23,7 @@
// manifests/0000_90_ingress-operator_00_prometheusrole.yaml (446B)
// manifests/0000_90_ingress-operator_01_prometheusrolebinding.yaml (514B)
// manifests/0000_90_ingress-operator_02_servicemonitor.yaml (720B)
// manifests/0000_90_ingress-operator_03_prometheusrules.yaml (997B)
// manifests/0000_90_ingress-operator_03_prometheusrules.yaml (1.68kB)
// manifests/01-cluster-role-binding.yaml (578B)
// manifests/01-role-binding.yaml (1.196kB)
// manifests/01-role.yaml (1.219kB)
......@@ -562,7 +562,7 @@ func manifests0000_90_ingressOperator_02_servicemonitorYaml() (*asset, error) {
return a, nil
}
var _manifests0000_90_ingressOperator_03_prometheusrulesYaml = []byte("\x1f\x8b\x08\x00\x00\x00\x00\x00\x00\xff\x9c\x93\x41\x6b\xdc\x3e\x10\xc5\xef\xfb\x29\x86\xe5\x7f\xb5\xf3\xcf\xa1\x17\x41\x0e\x85\x52\x7a\x0c\x39\xf4\xba\x4c\xac\xb7\xde\xa1\x92\x46\x8c\xe4\x4d\x42\xd8\xef\x5e\x64\x3b\x26\x74\x5b\x4a\xaa\x93\xc5\xbc\x19\xbf\xf7\x1b\xc4\x59\xbe\xc3\x8a\x68\x72\x14\x35\x49\x55\x93\x34\xf6\x83\x1a\xb4\xf4\x83\xc6\x9b\xf3\xed\xee\x87\x24\xef\xe8\xde\x34\xa2\x9e\x30\x95\x87\x29\x60\x17\x51\xd9\x73\x65\xb7\x23\x4a\x1c\xe1\x48\xd2\x68\x28\xa5\xd3\x0c\xe3\xaa\xb6\x16\x4a\xe6\x01\x8e\x34\x23\x95\x93\x1c\x6b\xf7\x1b\x5d\xe0\x47\x84\xd2\x46\x11\x99\x06\x38\xe2\x00\xab\x9d\x4d\x01\x65\x47\xc4\x29\x69\xe5\x2a\x9a\x56\x91\xa4\x21\x4c\x1e\xbd\x21\x80\x0b\xfa\x6d\x7a\x2f\x7a\x23\x8f\xb1\x1b\x82\x4e\xbe\x8b\x9c\x78\x84\x77\xb4\xaf\x36\x61\xff\xf7\xd6\x82\x70\x7c\xeb\xea\x4e\x32\x9e\x3a\x3e\xb3\x04\x7e\x94\x20\xf5\xe5\x03\x73\x24\x8d\x01\x5d\x52\x8f\xce\xe3\x8c\xd0\xc2\x6e\xed\x25\x63\x68\x39\x46\xd3\x29\xaf\x89\xba\x95\xe2\x15\xa7\xfe\x0d\xc2\x0c\xa7\x7d\xbb\xf5\xd2\x2d\x94\x1c\x7d\xfb\x7c\x6f\xfa\xfc\xf2\x80\xa0\xec\xbf\xb2\x84\x55\x40\x84\xe7\x6c\x8e\x2a\x62\x0e\x5c\x71\x30\x9d\x2a\xec\x60\xb3\xf0\x70\x64\x09\x93\x81\xee\xee\xe8\x76\xeb\x38\xaa\x39\xfa\x14\xb7\xfb\xfb\xd5\x2c\xa7\xe0\x0c\x9b\x69\x3c\xb1\x25\x49\xe3\x56\xbb\x5a\xd3\x72\x22\x4a\xe1\x11\x8e\xf6\xab\x51\x5a\x0c\x14\x62\x03\x35\x17\x92\x46\xd2\x44\xaf\xaf\xf4\xdf\xf2\xbf\x3e\xab\xa7\xcb\xa5\xa7\x87\xd9\x31\x49\xa1\xa4\x95\x0c\x0d\x5d\x6d\x72\xc3\x80\x54\xc3\x0b\x0d\x06\xae\xf0\xa4\x46\x51\xbd\x1c\x05\x9e\xe6\x9c\x65\xff\x07\x4e\x5f\xf4\x29\xfd\x42\xe8\xc4\xb9\x55\x0e\x53\x6e\x34\xfe\xff\x27\x1a\x83\x49\x95\x81\xc3\xc7\x71\x44\x54\x93\x61\xc1\x61\xc8\x6a\x73\xc2\x7a\xe2\xfa\x66\xb9\x01\xf0\xfa\x94\x1a\xa5\x86\xe6\x1d\xa9\xed\x8d\xd1\xe5\x42\x37\xd7\x0c\xf7\xbb\x9f\x01\x00\x00\xff\xff\xca\x2c\x36\xfc\xe5\x03\x00\x00")
var _manifests0000_90_ingressOperator_03_prometheusrulesYaml = []byte("\x1f\x8b\x08\x00\x00\x00\x00\x00\x00\xff\xcc\x94\x41\x6f\xdb\x38\x10\x85\xef\xfe\x15\x03\x63\xaf\x92\x37\x87\xbd\x08\xc8\x21\xd8\x60\xb1\xbd\x05\x41\xdb\xab\x31\x11\x9f\xa5\x41\x29\x8e\x30\xa4\x9c\x04\xa9\xff\x7b\x41\x49\x56\xdc\xb8\x41\x90\x1c\x8a\xf2\x44\x72\x66\x9e\xe7\x7d\x1c\x8b\x7b\xf9\x0a\x8b\xa2\xa1\xa2\x4e\x83\x24\x35\x09\x4d\x59\xab\x41\x63\x59\x6b\xb7\xd9\x5f\xac\xbe\x49\x70\x15\xdd\x98\x76\x48\x2d\x86\x78\x3b\x78\xac\x3a\x24\x76\x9c\xb8\x5a\x11\x05\xee\x50\x91\x84\xc6\x10\x63\xa1\x3d\x8c\x93\xda\x1c\x88\x3d\xd7\xa8\x48\x7b\x84\xd8\xca\x2e\x15\xbf\xc8\xf3\x7c\x07\x1f\xb3\x14\x91\xa9\x47\x45\xec\x61\xa9\xb0\xc1\x23\xae\x88\x38\x04\x4d\x9c\x44\xc3\x9c\x24\xa1\xf6\x83\x43\x69\xf0\xe0\x88\x72\x51\x2f\x45\x37\x72\xd7\x15\xb5\xd7\xc1\x15\x1d\x07\x6e\xe0\x2a\x5a\x27\x1b\xb0\x7e\xbb\x34\xc2\xef\x8e\x55\x45\x2b\x4d\x5b\xf0\x9e\xc5\xf3\x9d\x78\x49\x8f\xef\xd0\x91\xd0\x78\x14\x41\x1d\x0a\x87\x3d\x7c\x36\xbb\x94\xc7\x1e\x75\xf6\xd1\x98\x0e\xfd\xec\xa8\x98\x29\x9e\x71\x2a\x8f\x10\x46\x38\x79\x5f\xcd\x87\x62\xa2\x54\xd1\xff\x57\x37\xa6\x0f\x8f\xb7\xf0\xca\xee\x3f\x16\x3f\x27\x10\xe1\xa1\xb7\x8a\x12\xba\xde\x73\xc2\xd6\x74\x48\xb0\xad\x8d\x89\xdb\x1d\x8b\x1f\x0c\x74\x79\x49\x17\x4b\xc5\x4e\xad\xa2\x7f\xba\xe5\x7c\xfa\x34\xd3\x8a\xd8\xc3\x46\x1a\xf7\x6c\x41\x42\xb3\xc4\xce\x9e\x69\x5a\x1d\x62\xe4\x06\x15\xad\xe7\x46\x69\x6a\x20\x12\x1b\x28\x77\x21\xa1\x21\x0d\xf4\xf4\x44\x7f\x4d\xbf\x57\xf6\xea\xe8\x70\x28\xe9\x76\xec\x98\x24\x52\xd0\x44\x86\x8c\x2e\xe5\x74\x43\x8d\x90\xfc\x23\xd5\x06\x4e\x70\xa4\x46\x9d\x3a\xd9\x09\x1c\x8d\x3e\xe3\xfa\x15\x4e\xd7\x7a\x1f\x5e\x10\x6a\xb9\xcf\x91\xed\xd0\x67\x1a\x7f\x7f\x88\x46\x6d\x92\xa4\x66\xff\x7e\x1c\x1d\x92\x49\x3d\xe1\x30\xf4\x6a\xa3\xc3\xd4\x72\x3a\xb6\x9c\x01\x38\xbd\x0f\x99\x52\x46\x73\x42\x6a\xf9\x8f\xd1\xe1\x40\x9b\x73\x86\x67\x14\x3e\x4d\x73\xf5\xaf\x86\x64\xea\x3d\xec\x1a\x8d\xb1\x83\x7b\xc1\x64\x9e\xbf\x6d\xbd\x24\xe6\xad\x93\xd1\xd0\xd3\xb2\xbd\x5c\x1f\xeb\xd7\x87\xdf\x37\x4a\xdf\x4f\x2e\x89\x3e\xb7\x78\x0d\xc9\xe6\xc5\x7d\xa6\x34\x3b\x7b\x36\x46\x12\x7f\xd2\x73\xb3\xa3\xea\x54\xd5\xc0\x51\x43\x9e\xca\x37\x89\x7e\x09\xf3\x67\xc3\xe3\xa3\x50\xaf\x8e\x02\x13\xd5\x8f\x8d\xe4\x9f\x45\x75\x78\xa6\xf2\x1a\xd8\x1f\x01\x00\x00\xff\xff\x6d\x5c\x89\x0e\x90\x06\x00\x00")
func manifests0000_90_ingressOperator_03_prometheusrulesYamlBytes() ([]byte, error) {
return bindataRead(
......@@ -577,8 +577,8 @@ func manifests0000_90_ingressOperator_03_prometheusrulesYaml() (*asset, error) {
return nil, err
}
info := bindataFileInfo{name: "manifests/0000_90_ingress-operator_03_prometheusrules.yaml", size: 997, mode: os.FileMode(420), modTime: time.Unix(1, 0)}
a := &asset{bytes: bytes, info: info, digest: [32]uint8{0x43, 0x32, 0x10, 0x38, 0x4c, 0xd3, 0xab, 0x95, 0x2b, 0xba, 0x2e, 0xe5, 0x96, 0x40, 0x8, 0xa7, 0xe3, 0xab, 0x4e, 0xa5, 0x5d, 0xc9, 0x9f, 0x24, 0x2e, 0xe9, 0xc1, 0xcb, 0xfa, 0x45, 0x68, 0xd2}}
info := bindataFileInfo{name: "manifests/0000_90_ingress-operator_03_prometheusrules.yaml", size: 1680, mode: os.FileMode(420), modTime: time.Unix(1, 0)}
a := &asset{bytes: bytes, info: info, digest: [32]uint8{0xeb, 0x8c, 0xe1, 0x85, 0x67, 0x39, 0xac, 0x76, 0x85, 0x40, 0xe9, 0xaf, 0xf0, 0x4d, 0xb2, 0x43, 0x71, 0x9c, 0x32, 0x7d, 0x35, 0x67, 0x4d, 0x6, 0x86, 0x7b, 0xa6, 0x68, 0x8f, 0x4d, 0x1b, 0x20}}
return a, nil
}
......
package canary
import (
"context"
"net/http"
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/client_golang/prometheus/promhttp"
ctrlruntimemetrics "sigs.k8s.io/controller-runtime/pkg/metrics"
)
var (
......@@ -56,9 +51,9 @@ func SetCanaryRouteReachableMetric(host string, status bool) {
}
}
// registerCanaryMetrics calls prometheus.Register
// on each metric in metricsList, and returns on errors.
func registerCanaryMetrics() error {
// RegisterMetrics calls prometheus.Register on each metric in metricsList, and
// returns on errors.
func RegisterMetrics() error {
for _, metric := range metricsList {
err := prometheus.Register(metric)
if err != nil {
......@@ -67,41 +62,3 @@ func registerCanaryMetrics() error {
}
return nil
}
// StartMetricsListener starts the metrics listener on addr.
func StartMetricsListener(addr string, signal context.Context) {
// These metrics get registered in controller-runtime's registry via an init in the internal/controller/metrics package.
// Unregister the controller-runtime metrics, so that we can combine the controller-runtime metric's registry
// with that of the ingress-operator. This shouldn't have any side effects, as long as no 2 metrics across
// controller runtime or the ingress operator share the same name (which is unlikely). See
// https://github.com/kubernetes/test-infra/blob/master/prow/metrics/metrics.go for additional context.
ctrlruntimemetrics.Registry.Unregister(prometheus.NewGoCollector())
ctrlruntimemetrics.Registry.Unregister(prometheus.NewProcessCollector(prometheus.ProcessCollectorOpts{}))
// Create prometheus handler by combining the ingress-operator registry
// with the ingress-operator's controller runtime metrics registry.
handler := promhttp.HandlerFor(
prometheus.Gatherers{prometheus.DefaultGatherer, ctrlruntimemetrics.Registry},
promhttp.HandlerOpts{},
)
log.Info("registering Prometheus metrics")
if err := registerCanaryMetrics(); err != nil {
log.Error(err, "unable to register metrics")
}
log.Info("starting metrics listener", "addr", addr)
mux := http.NewServeMux()
mux.Handle("/metrics", handler)
s := http.Server{Addr: addr, Handler: mux}
go func() {
if err := s.ListenAndServe(); err != nil && err != http.ErrServerClosed {
log.Error(err, "metrics listener exited")
}
}()
<-signal.Done()
if err := s.Shutdown(context.Background()); err != http.ErrServerClosed {
log.Error(err, "error stopping metrics listener")
}
}
......@@ -4,6 +4,8 @@ import (
"context"
"fmt"
"github.com/prometheus/client_golang/prometheus"
corev1 "k8s.io/api/core/v1"
"github.com/openshift/cluster-ingress-operator/pkg/manifests"
......@@ -13,8 +15,59 @@ import (
"k8s.io/apimachinery/pkg/api/errors"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/types"
"k8s.io/apimachinery/pkg/util/sets"
)
var (
// ingressControllerConditions reports the status conditions of each
// IngressController using the ingress_controller_conditions metric.
ingressControllerConditions = prometheus.NewGaugeVec(prometheus.GaugeOpts{
Name: "ingress_controller_conditions",
Help: "Report the conditions for ingress controllers. 0 is False and 1 is True.",
}, []string{"name", "condition"})
// metricsList is a list of metrics for this package.
metricsList = []prometheus.Collector{
ingressControllerConditions,
}
)
// reportedConditions is the set of ingresscontroller status conditions that are
// reported in the ingress_controller_conditions metric.
var reportedConditions = sets.NewString("Available", "Degraded")
// SetIngressControllerConditionsMetric updates the
// ingress_controller_conditions metric values for the given IngressController.
func SetIngressControllerConditionsMetric(ic *operatorv1.IngressController) {
for _, c := range ic.Status.Conditions {
if !reportedConditions.Has(c.Type) {
continue
}
switch c.Status {
case operatorv1.ConditionFalse, operatorv1.ConditionTrue:
default:
log.V(4).Info("skipping metrics for IngressController condition because it is neither True nor False", "ingresscontroller", ic.Name, "condition_type", c.Type, "condition_status", c.Status)
continue
}
var v float64 = 0
if c.Status == operatorv1.ConditionTrue {
v = 1
}
ingressControllerConditions.WithLabelValues(ic.Name, string(c.Type)).Set(v)
}
}
// RegisterMetrics calls prometheus.Register on each metric in metricsList, and
// returns on errors.
func RegisterMetrics() error {
for _, metric := range metricsList {
if err := prometheus.Register(metric); err != nil {
return err
}
}
return nil
}
// ensureMetricsIntegration ensures that router prometheus metrics is integrated with openshift-monitoring for the given ingresscontroller.
func (r *reconciler) ensureMetricsIntegration(ci *operatorv1.IngressController, svc *corev1.Service, deploymentRef metav1.OwnerReference) error {
statsSecret := manifests.RouterStatsSecret(ci)
......
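
For illustration, a minimal test-style sketch (not part of this commit) of how the gauge set by SetIngressControllerConditionsMetric could be checked with client_golang's testutil helper. It is assumed to live in the same ingress package so it can read the unexported ingressControllerConditions vector; the test name and condition values are hypothetical.

package ingress

import (
	"testing"

	operatorv1 "github.com/openshift/api/operator/v1"
	"github.com/prometheus/client_golang/prometheus/testutil"
)

func TestSetIngressControllerConditionsMetric(t *testing.T) {
	ic := &operatorv1.IngressController{}
	ic.Name = "default"
	ic.Status.Conditions = []operatorv1.OperatorCondition{
		{Type: "Degraded", Status: operatorv1.ConditionTrue},
	}
	SetIngressControllerConditionsMetric(ic)

	// A Degraded=True condition is exported as 1, which is the value the
	// IngressControllerDegraded alert expression matches.
	if got := testutil.ToFloat64(ingressControllerConditions.WithLabelValues("default", "Degraded")); got != 1 {
		t.Fatalf("ingress_controller_conditions{name=%q,condition=%q} = %v, want 1", "default", "Degraded", got)
	}
}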
......@@ -71,6 +71,8 @@ func (r *reconciler) syncIngressControllerStatus(ic *operatorv1.IngressControlle
if !IngressStatusesEqual(updated.Status, ic.Status) {
if err := r.client.Status().Update(context.TODO(), updated); err != nil {
errs = append(errs, fmt.Errorf("failed to update ingresscontroller status: %v", err))
} else {
SetIngressControllerConditionsMetric(updated)
}
}
......
package operator
import (
"context"
"net/http"
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/client_golang/prometheus/promhttp"
ctrlruntimemetrics "sigs.k8s.io/controller-runtime/pkg/metrics"
)
// StartMetricsListener starts the metrics listener on addr.
func StartMetricsListener(addr string, signal context.Context) {
// These metrics get registered in controller-runtime's registry via an init in the internal/controller/metrics package.
// Unregister the controller-runtime metrics, so that we can combine the controller-runtime metric's registry
// with that of the ingress-operator. This shouldn't have any side effects, as long as no 2 metrics across
// controller runtime or the ingress operator share the same name (which is unlikely). See
// https://github.com/kubernetes/test-infra/blob/master/prow/metrics/metrics.go for additional context.
ctrlruntimemetrics.Registry.Unregister(prometheus.NewGoCollector())
ctrlruntimemetrics.Registry.Unregister(prometheus.NewProcessCollector(prometheus.ProcessCollectorOpts{}))
// Create prometheus handler by combining the ingress-operator registry
// with the ingress-operator's controller runtime metrics registry.
handler := promhttp.HandlerFor(
prometheus.Gatherers{prometheus.DefaultGatherer, ctrlruntimemetrics.Registry},
promhttp.HandlerOpts{},
)
log.Info("starting metrics listener", "addr", addr)
mux := http.NewServeMux()
mux.Handle("/metrics", handler)
s := http.Server{Addr: addr, Handler: mux}
go func() {
if err := s.ListenAndServe(); err != nil && err != http.ErrServerClosed {
log.Error(err, "metrics listener exited")
}
}()
<-signal.Done()
if err := s.Shutdown(context.Background()); err != http.ErrServerClosed {
log.Error(err, "error stopping metrics listener")
}
}