diff --git a/.helmdocsignore b/.helmdocsignore new file mode 100644 index 0000000000000000000000000000000000000000..88301c0048b186ce92c9df5738ca6ab4c51f1328 --- /dev/null +++ b/.helmdocsignore @@ -0,0 +1 @@ +charts/* \ No newline at end of file diff --git a/.helmignore b/.helmignore index e40aded5435ba4d39cba63f1e06e69785794841a..64e134a40f4f0d977604b215c788a69d576b0f00 100644 --- a/.helmignore +++ b/.helmignore @@ -10,6 +10,7 @@ .yamllint config README.md +README.md.gotmpl .gitignore .bzr/ .bzrignore diff --git a/README.md b/README.md index ca7b904bc771b5001de49b6208e9c66663ef0946..16fdafa84a30bbe1d2808e1dafd12b14513ad1ff 100644 --- a/README.md +++ b/README.md @@ -1,15 +1,156 @@ -# CERN IT Monitoring Kubernetes Helm Chart +# cern-it-monitoring-kubernetes + +Helm Chart provided by IT Monitoring Service to install and configure required components to gather and send monitoring data from kubernetes clusters to central service. ## Overview -The **CERN IT Monitoring Kubernetes Helm Chart** provides a solution for monitoring Kubernetes clusters at CERN. It enables the collection of **metrics**, **logs**, and future support for **traces**, which are forwarded to the central CERN monitoring infrastructure. From there users can consume them using the day-to-day tools that they already user like [Grafana](https://monit-docs.web.cern.ch/access/grafana/) or [OpenSearch](https://monit-docs.web.cern.ch/access/opensearch/). +The **CERN IT Monitoring Kubernetes Helm Chart** provides a solution +for monitoring Kubernetes clusters at CERN. It enables the collection +of **metrics**, **logs**, and future support for **traces**, which are +forwarded to the central CERN monitoring infrastructure. From there +users can consume them using the day-to-day tools that they already +user like [Grafana](https://monit-docs.web.cern.ch/access/grafana/) or +[OpenSearch](https://monit-docs.web.cern.ch/access/opensearch/). 
-This Helm chart simplifies the deployment and configuration of necessary components for observability, making it easier to manage monitoring across various Kubernetes clusters and their applications. +This Helm chart simplifies the deployment and configuration of +necessary components for observability, making it easier to manage +monitoring across various Kubernetes clusters and their applications. ## Quick Start See [getting started](docs/getting_started.md). +## Values + +| Key | Type | Default | Description | +|-----|------|---------|-------------| +| crds.enabled | bool | `true` | whether to install Prometheus operator's CRDs | +| fluentbit.image.imagePullPolicy | string | `"IfNotPresent"` | image pull policy applied to all Fluent Bit instances | +| fluentbit.image.repository | string | `"registry.cern.ch/monit/cern-it-monitoring-fluent-bit"` | image repository applied to all Fluent Bit instances | +| fluentbit.image.tag | string | `"3.2.6"` | image tag applied to all Fluent Bit instances | +| kubernetes.clusterName | string | `""` | name of the kubernetes cluster to monitor. This value will be appended to every metric and log via k8s_cluster_name label. This bit is required if fluentbit is enabled (default) | +| logs.enabled | bool | `false` | indicates if logs components should be enabled or not. 
If set to false no logs component will be installed nor configured | +| logs.fluentbit.customParsers | string | `""` | | +| logs.fluentbit.enabled | bool | `false` | indicates if fluentbit logs component should be installed or not | +| logs.fluentbit.extraVolumeMounts | list | `[]` | | +| logs.fluentbit.extraVolumes | list | `[]` | | +| logs.fluentbit.filters | string | partly autogenerated -- see values.yaml | fluentbit filters as a yaml list in a multiline string | +| logs.fluentbit.image.imagePullPolicy | string | `""` | image pull policy for Fluent Bit (logs) | +| logs.fluentbit.image.repository | string | `""` | repository to use for Fluent Bit (logs) | +| logs.fluentbit.image.tag | string | `""` | tag to use for Fluent Bit (logs) | +| logs.fluentbit.inputs | string | partly autogenerated -- see values.yaml | fluentbit inputs as a yaml list in a multiline string | +| logs.fluentbit.luaScripts | object | `{}` | extra Lua scripts for user-provided transformations | +| logs.fluentbit.outputs | string | partly autogenerated -- see values.yaml | fluentbit outputs as a yaml list in a multiline string | +| logs.fluentbit.resources.limits.cpu | string | `"20m"` | | +| logs.fluentbit.resources.limits.memory | string | `"25Mi"` | | +| logs.fluentbit.resources.requests.cpu | string | `"5m"` | | +| logs.fluentbit.resources.requests.memory | string | `"15Mi"` | | +| logs.fluentbit.service | string | partly autogenerated -- see values.yaml | fluentbit service configuration options in a multiline string | +| metrics.alertmanager.enabled | bool | `false` | if true alertmanager will be installed and prometheus reconfigured to use it as the alerting endpoint | +| metrics.alertmanager.image | string | `"registry.cern.ch/monit/cern-it-monitoring-alertmanager"` | alertmanager image to use by the local cluster alertmanager | +| metrics.alertmanager.ingress.className | string | `""` | class name to be used by the alertmanager ingress | +| metrics.alertmanager.ingress.enabled | bool 
| `false` | if set to true an ingress will be created for the alertmanager service | +| metrics.alertmanager.ingress.hosts | list | `[]` | list of hosts for the alertmanager ingress | +| metrics.alertmanager.ingress.path | string | `"/"` | entry path for the alertmanager ingress | +| metrics.alertmanager.ingress.pathType | string | `"ImplementationSpecific"` | path type for the alertmanager ingress | +| metrics.alertmanager.ingress.tls | object | `{}` | tls configuration for the alertmanager ingress | +| metrics.alertmanager.nodeSelector | object | `{}` | node selector configuration for the alertmanager | +| metrics.alertmanager.pullPolicy | string | `"IfNotPresent"` | pull policy for the alertmanager image | +| metrics.alertmanager.replicas | int | `3` | number of replicas for the alertmanager deployment | +| metrics.alertmanager.tag | string | `"v0.27.0"` | alertmanager image tag to be used when pulling it | +| metrics.alertmanager.volumeMounts | list | `[]` | list of volumes to be mounted | +| metrics.alertmanager.volumes | list | `[]` | list of volumes to be declared | +| metrics.apiServer.serviceMonitor.relabelings | list | `[]` | | +| metrics.coredns.serviceMonitor.relabelings | list | `[]` | | +| metrics.defaultNodeSelector | object | `{}` | the default node selector will be applied when possible. In to the following components: metrics collectors (prometheus and fluentbit), metrics exporters (kube state). | +| metrics.enabled | bool | `true` | indicates if all metrics components should be enabled or not. 
If set to false no metrics component will be installed nor configured | +| metrics.etcd.serviceMonitor.relabelings | list | `[]` | | +| metrics.fluentbit.diskMaxCache | string | `"5G"` | max size for in-disk storage for fluent-bit | +| metrics.fluentbit.enabled | bool | `true` | if true fluentbit metrics forwarder will be installed | +| metrics.fluentbit.filters | string | partly autogenerated -- see values.yaml | fluentbit filters as a yaml list in a multiline string | +| metrics.fluentbit.image.imagePullPolicy | string | `""` | image pull policy for Fluent Bit (metrics) | +| metrics.fluentbit.image.repository | string | `""` | repository to use for Fluent Bit (metrics) | +| metrics.fluentbit.image.tag | string | `""` | tag to use for Fluent Bit (metrics) | +| metrics.fluentbit.inputs | string | partly autogenerated -- see values.yaml | fluentbit inputs as a yaml list in a multiline string | +| metrics.fluentbit.luaScripts | object | `{}` | extra Lua scripts for user-provided transformations | +| metrics.fluentbit.nodeSelector | object | `{}` | | +| metrics.fluentbit.outputs | string | partly autogenerated -- see values.yaml | fluentbit outputs as a yaml list in a multiline string | +| metrics.fluentbit.prometheusRemoteWriteInputConfig.bufferChunkSize | string | `"128M"` | | +| metrics.fluentbit.prometheusRemoteWriteInputConfig.bufferMaxSize | string | `"2G"` | | +| metrics.fluentbit.prometheusRemoteWriteInputConfig.listen | string | `"0.0.0.0"` | | +| metrics.fluentbit.prometheusRemoteWriteInputConfig.port | int | `8080` | | +| metrics.fluentbit.prometheusRemoteWriteInputConfig.successfulResponseCode | int | `201` | | +| metrics.fluentbit.prometheusRemoteWriteInputConfig.tag | string | `"monit.prom.k8s"` | | +| metrics.fluentbit.prometheusRemoteWriteInputConfig.tagFromUri | bool | `false` | | +| metrics.fluentbit.prometheusRemoteWriteInputConfig.threaded | bool | `false` | | +| metrics.fluentbit.prometheusRemoteWriteInputConfig.uri | string | `"/api/prom/push"` | 
| +| metrics.fluentbit.replicas | int | `2` | | +| metrics.fluentbit.resources.limits.cpu | string | `"1"` | | +| metrics.fluentbit.resources.limits.memory | string | `"1Gi"` | | +| metrics.fluentbit.resources.requests.cpu | string | `"1"` | | +| metrics.fluentbit.resources.requests.memory | string | `"512Mi"` | | +| metrics.fluentbit.service | string | partly autogenerated -- see values.yaml | fluentbit service configuration options in a multiline string | +| metrics.ingress.nginx.serviceMonitor.relabelings | list | `[]` | | +| metrics.kubeProxy.serviceMonitor.relabelings | list | `[]` | | +| metrics.kubeState.enabled | bool | `true` | if true kube state will be installed together with a service monitor | +| metrics.kubeState.nodeSelector | object | `{}` | | +| metrics.kubeState.resources.limits.cpu | string | `"20m"` | | +| metrics.kubeState.resources.limits.memory | string | `"25Mi"` | | +| metrics.kubeState.resources.requests.cpu | string | `"5m"` | | +| metrics.kubeState.resources.requests.memory | string | `"15Mi"` | | +| metrics.kubeState.scrapeInterval | string | `"30s"` | indicates how often this exporter will be scraped by the local prometheus | +| metrics.kubeState.serviceMonitor.relabelings | list | `[]` | | +| metrics.kubecontroller.serviceMonitor.relabelings | list | `[]` | | +| metrics.kubelet.serviceMonitor.relabelings | list | `[]` | | +| metrics.nodeExporter.enabled | bool | `true` | if true node exporter will be installed as a daemon set together with a pod monitor | +| metrics.nodeExporter.resources.limits.cpu | string | `"20m"` | | +| metrics.nodeExporter.resources.limits.memory | string | `"25Mi"` | | +| metrics.nodeExporter.resources.requests.cpu | string | `"5m"` | | +| metrics.nodeExporter.resources.requests.memory | string | `"15Mi"` | | +| metrics.nodeExporter.scrapeInterval | string | `""` | indicates how often this exporter will be scraped by the local prometheus | +| metrics.nodeExporter.serviceMonitor.relabelings | list | `[]` | | +| 
metrics.prometheus.enabled | bool | `true` | if true prometheus operator and a prometheus server will be installed | +| metrics.prometheus.operator.nodeSelector | object | `{}` | | +| metrics.prometheus.operator.resources.limits.cpu | string | `"100m"` | | +| metrics.prometheus.operator.resources.limits.memory | string | `"100Mi"` | | +| metrics.prometheus.operator.resources.requests.cpu | string | `"5m"` | | +| metrics.prometheus.operator.resources.requests.memory | string | `"25Mi"` | | +| metrics.prometheus.server.extraLabelsForMetrics | object | `{}` | set of static labels and values to add to all the metrics gathered by the in-cluster prometheus when exported to central monitoring | +| metrics.prometheus.server.image | string | `"registry.cern.ch/monit/cern-it-monitoring-prometheus:v2.53.3"` | prometheus image to use by the local cluster prometheus | +| metrics.prometheus.server.nodeSelector | object | `{}` | prometheus server node selectors. If set it will override the metrics.defaultNodeSelector | +| metrics.prometheus.server.relabelings | list | `[]` | allows to drop / relabel node Exporter metrics. 
| +| metrics.prometheus.server.remoteWrite | object | `{}` | remote write prometheus configuration | +| metrics.prometheus.server.resources.limits.cpu | string | `"500m"` | | +| metrics.prometheus.server.resources.limits.memory | string | `"5Gi"` | | +| metrics.prometheus.server.resources.requests.cpu | string | `"100m"` | | +| metrics.prometheus.server.resources.requests.memory | string | `"2Gi"` | | +| metrics.prometheus.server.retention | string | `"24h"` | interval during which local cluster prometheus will store metrics | +| metrics.prometheus.server.scrapeInterval | string | `"10s"` | interval used to self scrape metrics | +| metrics.prometheus.server.scrapeTimeout | string | `"5s"` | timeout for self scraped metrics | +| metrics.prometheus.server.serviceMonitors | list | `[]` | service monitors to be created | +| metrics.prometheus.server.version | string | `"v2.53.3"` | prometheus version to use by the local cluster prometheus | +| metrics.pushgateway.enabled | bool | `false` | pushgateway allows you to send metrics to the monitoring infrastructure by pushing them to the local cluster service it-monit-metrics-collector-pushgateway. | +| metrics.pushgateway.image.pullPolicy | string | `"IfNotPresent"` | | +| metrics.pushgateway.image.repository | string | `"registry.cern.ch/monit/cern-it-monitoring-pushgateway"` | | +| metrics.pushgateway.image.tag | string | `"v1.10.0"` | | +| metrics.pushgateway.ingress.className | string | `""` | | +| metrics.pushgateway.ingress.enabled | bool | `false` | if set to true will register a new ingress with the given configuration. 
| +| metrics.pushgateway.ingress.hosts | list | `[]` | | +| metrics.pushgateway.ingress.path | string | `"/"` | | +| metrics.pushgateway.ingress.pathType | string | `"ImplementationSpecific"` | | +| metrics.pushgateway.ingress.tls | object | `{}` | | +| metrics.pushgateway.nodeSelector | object | `{}` | if given will override the defaultNodeSelector and install the component only on the nodes that match the given condition. | +| metrics.pushgateway.resources.limits.cpu | float | `0.2` | | +| metrics.pushgateway.resources.limits.memory | string | `"100Mi"` | | +| metrics.pushgateway.resources.requests.cpu | float | `0.2` | | +| metrics.pushgateway.resources.requests.memory | string | `"100Mi"` | | +| metrics.scheduler.serviceMonitor.relabelings | list | `[]` | | +| metrics.scheduler.serviceMonitor.relabelings | list | `[]` | | +| otlp.endpoint | string | `"monit-otlp.cern.ch"` | otlp endpoint where the otlp receivers are listening | +| otlp.port | int | `4319` | otlp port where the otlp receivers are listening | +| tenant.name | string | `""` | username used for authenticating in the MONIT infrastructure | +| tenant.password | string | `""` | password (plain) used for authenticating in the MONIT infrastructure | + ## Contributing We welcome contributions! If you're interested in helping improve this project, please review our [contribution guidelines](CONTRIBUTING.md). In brief: diff --git a/README.md.gotmpl b/README.md.gotmpl new file mode 100644 index 0000000000000000000000000000000000000000..76973b0870304ffcbb842ea5e79887019c8f3721 --- /dev/null +++ b/README.md.gotmpl @@ -0,0 +1,44 @@ +{{ template "chart.header" . }} +{{ template "chart.description" . }} + +## Overview + +The **CERN IT Monitoring Kubernetes Helm Chart** provides a solution +for monitoring Kubernetes clusters at CERN. It enables the collection +of **metrics**, **logs**, and future support for **traces**, which are +forwarded to the central CERN monitoring infrastructure. 
From there +users can consume them using the day-to-day tools that they already +use like [Grafana](https://monit-docs.web.cern.ch/access/grafana/) or +[OpenSearch](https://monit-docs.web.cern.ch/access/opensearch/). + +This Helm chart simplifies the deployment and configuration of +necessary components for observability, making it easier to manage +monitoring across various Kubernetes clusters and their applications. + +## Quick Start + +See [getting started](docs/getting_started.md). + +{{ template "chart.valuesSection" . }} + +## Contributing + +We welcome contributions! If you're interested in helping improve this project, please review our [contribution guidelines](CONTRIBUTING.md). In brief: + +1. **Fork** the repository. +2. Create a **feature branch**. +3. Implement, provide tests and validate your changes. +4. Submit a **Merge Request (MR)** to the `master` branch. + +For a full contribution workflow, visit the [contribution guide](CONTRIBUTING.md). + +## Documentation + +Complete documentation for this chart, including setup and configuration details, is available: + +- GitLab Repository: [link](docs) +- Project Documentation: [link](https://monit-docs.web.cern.ch) + +## License + +This repository is licensed under the [Apache License 2.0](https://www.apache.org/licenses/LICENSE-2.0). See the [LICENSE](LICENSE) file for more information. diff --git a/docs/values.md b/docs/values.md deleted file mode 100644 index e57ef01626757b07d81ff2401c1b23e2571bc90d..0000000000000000000000000000000000000000 --- a/docs/values.md +++ /dev/null @@ -1,95 +0,0 @@ -# CERN IT Monitoring Kubernetes Helm Chart Default Values -This file contains the markdown version of the default values that this chart takes. 
- -| Key | Type | Default | Description | -|-----|------|---------|-------------| -| crds.enabled | bool | `true` | whether to install Prometheus operator's CRDs | -| fluentbit.image.imagePullPolicy | string | `IfNotPresent` | image pull policy applied to all Fluent Bit instances | -| fluentbit.image.repository | string | `registry.cern.ch/monit/cern-it-monitoring-fluent-bit` | image repository applied to all Fluent Bit instances | -| fluentbit.image.tag | string | `3.2.6` | image tag applied to all Fluent Bit instances | -| kubernetes.clusterName | string | - | name of the kubernetes cluster to monitor. This value will be appended to every metric and log via k8s_cluster_name label | -| logs.enabled | bool | `false` | indicates if logs components should be enabled or not. If set to false no logs component will be installed nor configured | -| logs.fluentbit.customParsers | string | `""` | | -| logs.fluentbit.enabled | bool | `false` | indicates if fluentbit logs component should be installed or not | -| logs.fluentbit.extraVolumeMounts | list | `[]` | | -| logs.fluentbit.extraVolumes | list | `[]` | | -| logs.fluentbit.filters | string | Kubernetes filter. See `values.yaml` file. | fluentbit filters as a yaml list in a multiline string | -| logs.fluentbit.image.imagePullPolicy | string | `""` | image pull policy applied to Fluent Bit for logs if no global is set | -| logs.fluentbit.image.repository | string | `""` | image repository applied to Fluent Bit for logs if no global is set | -| logs.fluentbit.image.tag | string | `""` | image tag applied to Fluent Bit for logs if no global is set | -| logs.fluentbit.inputs | string | Tail plugin over `/var/log/containers/*.log` files. See `values.yaml` file. | -| logs.fluentbit.outputs | string | OpenTelemetry plugin using `otlp.endpoint`, `otlp.port`, `tenant.username` and `tenant.password`. See `values.yaml`. 
| fluentbit outputs as a yaml list in a multiline string | -| logs.fluentbit.resources.limits.cpu | string | `"20m"` | | -| logs.fluentbit.resources.limits.memory | string | `"25Mi"` | | -| logs.fluentbit.resources.requests.cpu | string | `"5m"` | | -| logs.fluentbit.resources.requests.memory | string | `"15Mi"` | | -| logs.fluentbit.scrapeInterval | string | `"15s"` | interval used by the local prometheus (if installed) to scrape metrics from logs fluentbits | -| logs.fluentbit.service | string | Daemon mode off listening on port 2020. See `values.yaml`. | fluentbit service configuration options in a multiline string | -| metrics.alertmanager.enabled | bool | `false` | if true alertmanager will be installed and prometheus reconfigured to use it as the alerting endpoint | -| metrics.alertmanager.image | string | `"registry.cern.ch/monit/cern-it-monitoring-alertmanager"` | alertmanager image to use by the local cluster alertmanager | -| metrics.alertmanager.ingress.className | string | `""` | class name to be used by the alertmanager ingress | -| metrics.alertmanager.ingress.enabled | bool | `false` | if set to true an ingress will be created for the alertmanager service | -| metrics.alertmanager.ingress.hosts | Array | `[]` | list of hosts for the alertmanager ingress | -| metrics.alertmanager.ingress.path | string | `"/"` | entry path for the alertmanager ingress | -| metrics.alertmanager.ingress.pathType | string | `"ImplementationSpecific"` | path type for the alertmanager ingress | -| metrics.alertmanager.ingress.tls | Hash | `{}` | tls configuration for the alertmanager ingress | -| metrics.alertmanager.nodeSelector | Hash | `{}` | node selector configuration for the alertmanager | -| metrics.alertmanager.pullPolicy | string | `"IfNotPresent"` | pull policy for the alertmanager image | -| metrics.alertmanager.replicas | int | `3` | number of replicas for the alertmanager deployment, defaults 3 for HA | -| metrics.alertmanager.tag | string | `"v0.27.0"` | 
alertmanager image tag to be used when pulling it | -| metrics.alertmanager.volumeMounts | Array | `[]` | list of volumes to be mounted | -| metrics.alertmanager.volumes | Array | `[]` | list of volumes to be declared | -| metrics.defaultNodeSelector | map | `{}` | if set will be used as `nodeSelector` for those components that allow one | -| metrics.enabled | bool | `true` | indicates if all metrics components should be enabled or not. If set to false no metrics component will be installed nor configured | -| metrics.fluentbit.diskMaxCache | string | `5G` | max size for in-disk storage for fluent-bit | -| metrics.fluentbit.enabled | bool | `true` | if true fluentbit will be installed | -| metrics.fluentbit.extraVolumeMounts | list | `[]` | | -| metrics.fluentbit.extraVolumes | list | `[]` | | -| metrics.fluentbit.filters | string | `"nil"` | fluentbit filters as a yaml list in a multiline string | -| metrics.fluentbit.image.imagePullPolicy | string | `""` | image pull policy applied to Fluent Bit for metrics if no global is set | -| metrics.fluentbit.image.repository | string | `""` | image repository applied to Fluent Bit for metrics if no global is set | -| metrics.fluentbit.image.tag | string | `""` | image tag applied to Fluent Bit for metrics if no global is set | -| metrics.fluentbit.inputs | string | Configuration to scrape local prometheus. See `values.yaml`. | fluentbit inputs as a yaml list in a multiline string | -| metrics.fluentbit.matchQuery | string | `"match[]={job!=\"\"}"` | Query parameter to apply to the federate Prometheus URL, use this to filter and send only specific metrics | -| metrics.fluentbit.nodeSelector | hash | `"nil"` | fluentbit statefulset node selectors | -| metrics.fluentbit.prometheusScrapeBufferMaxSize | string | `"100M"` | fluentbit buffer size. 
The more metrics to send the bigger needs to be | -| metrics.fluentbit.prometheusScrapeInterval | string | `"60s"` | interval used by fluentbit to scrape metrics from prometheus | -| metrics.fluentbit.resources.limits.cpu | string | `"1"` | | -| metrics.fluentbit.resources.limits.memory | string | `"1Gi"` | | -| metrics.fluentbit.resources.requests.cpu | string | `"1"` | | -| metrics.fluentbit.resources.requests.memory | string | `"150Mi"` | | -| metrics.fluentbit.service | string | Daemon mode off listening on port 2020. See `values.yaml`. | fluentbit service configuration options in a multiline string | -| metrics.kubeState.enabled | bool | `true` | if true kube state will be installed together with a service monitor | -| metrics.kubeState.nodeSelector | hash | `"nil"` | kubeState deployment node selectors | -| metrics.kubeState.resources.limits.cpu | string | `"20m"` | | -| metrics.kubeState.resources.limits.memory | string | `"25Mi"` | | -| metrics.kubeState.resources.requests.cpu | string | `"5m"` | | -| metrics.kubeState.resources.requests.memory | string | `"15Mi"` | | -| metrics.kubeState.scrapeInterval | string | `"15s"` | indicates how often kube state will be scraped by the local prometheus | -| metrics.nodeExporter.enabled | bool | `true` | if true node exporter will be installed as a daemon set together with a pod monitor | -| metrics.nodeExporter.resources.limits.cpu | string | `"20m"` | | -| metrics.nodeExporter.resources.limits.memory | string | `"25Mi"` | | -| metrics.nodeExporter.resources.requests.cpu | string | `"5m"` | | -| metrics.nodeExporter.resources.requests.memory | string | `"15Mi"` | | -| metrics.nodeExporter.scrapeInterval | string | `"15s"` | indicates how often node exporter will be scraped by the local prometheus | -| metrics.prometheus.enabled | bool | `true` | if true prometheus operator and a prometheus server will be installed | -| metrics.prometheus.operator | object | Resources configuration. See `values.yaml`. 
| specific configuration for the prometheus operator | -| metrics.prometheus.operator.nodeSelector | hash | `"nil"` | prometheus operator node selectors | -| metrics.prometheus.server.extraLabelsForMetrics | hash | `{}` | set of static labels and values to add to all the metrics gathered by the in-cluster prometheus when exported to central monitoring | -| metrics.prometheus.server.image | string | `"registry.cern.ch/monit/cern-it-monitoring-prometheus:v2.50.0"` | prometheus image to use by the local cluster prometheus | -| metrics.prometheus.server.nodeSelector | hash | `"nil"` | prometheus server node selectors | -| metrics.prometheus.server.remoteWrite | object | `{}` | remote write prometheus configuration | -| metrics.prometheus.server.resources.limits.cpu | string | `"500m"` | | -| metrics.prometheus.server.resources.limits.memory | string | `"5Gi"` | | -| metrics.prometheus.server.resources.requests.cpu | string | `"100m"` | | -| metrics.prometheus.server.resources.requests.memory | string | `"2Gi"` | | -| metrics.prometheus.server.retention | string | `"24h"` | interval during which local cluster prometheus will store metrics | -| metrics.prometheus.server.scrapeInterval | string | `"10s"` | interval used to self scrape metrics | -| metrics.prometheus.server.scrapeTimeout | string | `"5s"` | timeout for self scraped metrics | -| metrics.prometheus.server.version | string | `"v2.50.0"` | prometheus version to use by the local cluster prometheus | -| otlp.endpoint | string | `"monit-otlp.cern.ch"` | otlp endpoint where the otlp receivers are listening | -| otlp.port | int | `4319` | otlp port where the otlp receivers are listening | -| tenant.name | string | - | username used for authenticating with the MONIT infrastructure | -| tenant.password | string | - | password (plain) used for authenticating with the MONIT infrastructure | - ----------------------------------------------- diff --git a/values.yaml b/values.yaml index 
0a7ebcbc9b0e89a97b87608bbf66542106bfff66..423526d1c24c34eb0220e2996689d5505ffeb142 100644 --- a/values.yaml +++ b/values.yaml @@ -1,5 +1,6 @@ # CRDs crds: + # -- whether to install Prometheus operator's CRDs enabled: true # OTLP default configuration. @@ -11,25 +12,26 @@ otlp: # Tenant configuration. Username and Password are provided via CERN Central IT # Monitoring service. This bit is required if fluentbit is enabled (default) -# tenant: +tenant: # -- username used for authenitcating in the MONIT infrastructure - # name: example + name: '' # -- password (plain) used for authenitcating in the MONIT infrastructure - # password: example + password: '' # Kubernetes configuration. -# kubernetes: - # -- name of the kubernetes cluster to monitor. This value will be appended to very metric and log via k8sClusterName label. This bit is required if fluentbit is enabled (default) - # clusterName: nil +kubernetes: + # -- name of the kubernetes cluster to monitor. This value will be appended to every metric and log via k8s_cluster_name label. 
This bit is required if fluentbit is enabled (default) + clusterName: '' +# Global Fluent Bit configuration fluentbit: # Default image used by all Fluent Bit instances image: - # -- If defined, a repository applied to all Fluent Bit instances - repository: registry.cern.ch/monit/cern-it-monitoring-fluent-bit - # -- If defined, a tag applied to all Fluent Bit instances + # -- image repository applied to all Fluent Bit instances + repository: registry.cern.ch/monit/cern-it-monitoring-fluent-bit + # -- image tag applied to all Fluent Bit instances tag: 3.2.6 - # -- If defined, an image pull policy applied to all Fluent Bit instances + # -- image pull policy applied to all Fluent Bit instances imagePullPolicy: IfNotPresent # The metrics section includes all the components meant to produce, scrape, @@ -82,7 +84,7 @@ metrics: prometheus: # -- if true prometheus operator and a prometheus server will be installed enabled: true - # -- specific configuration for the prometheus operator + # Specific configuration for the prometheus operator operator: # If set it will override the metrics.defaultNodeSelector. nodeSelector: {} @@ -94,10 +96,11 @@ metrics: cpu: "100m" memory: "100Mi" server: - # If set it will override the metrics.defaultNodeSelector. + # -- prometheus server node selectors. If set it will override the metrics.defaultNodeSelector nodeSelector: {} - # -- prometheus version to use by the local cluster prometheus. Make sure this version exists in the indicated image repository. + # -- prometheus image to use by the local cluster prometheus image: registry.cern.ch/monit/cern-it-monitoring-prometheus:v2.53.3 + # -- prometheus version to use by the local cluster prometheus version: "v2.53.3" # -- interval used to self scrape metrics scrapeInterval: "10s" @@ -120,7 +123,8 @@ metrics: limits: cpu: "500m" memory: "5Gi" - # Service Monitors to be created by the helm chart install / upgrade. 
Ex: + # -- service monitors to be created + serviceMonitors: [] # serviceMonitors: # - name: my-nginx-sm # spec: @@ -135,16 +139,15 @@ metrics: # app.kubernetes.io/component: controller # app.kubernetes.io/instance: cern-magnum # app.kubernetes.io/name: ingress-nginx - serviceMonitors: [] - # Allows to drop / relabel node Exporter metrics. + # -- allows to drop / relabel node Exporter metrics. + relabelings: [] # More info on: https://prometheus.io/docs/prometheus/latest/configuration/configuration/#relabel_config # Example: # - action: drop # sourceLabels: # - __name__ # regex: "my_custom_counter_total|my_custom_counter_sum|my_custom_gauge" - relabelings: [] - # This fluentbit is used to allow scraping and fordwarding metrics from + # This fluentbit is used to allow scraping and forwarding metrics from # the local prometheus and send them to Open Telemetry Collector. # If the local .Values.metrics.prometheus enable=false will not be able # to scrape from local prometheus. Provide different inputs then. @@ -153,14 +156,11 @@ metrics: enabled: true ## Fluent Bit for metrics image image: - # -- Repository to use for Fluent Bit (metrics) - # @default -- `""` (defaults to fluentbit.image.repository) + # -- repository to use for Fluent Bit (metrics) repository: "" - # -- Tag to use for Fluent Bit (metrics) - # @default -- `""` (defaults to fluentbit.image.tag) + # -- tag to use for Fluent Bit (metrics) tag: "" - # -- Image pull policy for Fluent Bit (metrics) - # @default -- `""` (defaults to fluentbit.image.imagePullPolicy) + # -- image pull policy for Fluent Bit (metrics) imagePullPolicy: "" replicas: 2 # If set it will override the metrics.defaultNodeSelector. @@ -183,6 +183,8 @@ metrics: uri: /api/prom/push threaded: false + # -- extra Lua scripts for user-provided transformations + luaScripts: {} # These scripts are available in the fluentbit /fluent-bit/etc/scripts path. 
# Include your lua scripts in the following format: # luaScripts: @@ -192,12 +194,12 @@ metrics: # return 2, timestamp, record # end # my_other_lua_script.lua: ... - luaScripts: {} # -- max size for in-disk storage for fluent-bit diskMaxCache: "5G" # -- fluentbit service configuration options in a multiline string + # @default -- partly autogenerated -- see values.yaml service: | daemon: off flush: 1 @@ -215,6 +217,7 @@ metrics: storage.max_chunks_up: 507 # -- fluentbit inputs as a yaml list in a multiline string + # @default -- partly autogenerated -- see values.yaml inputs: | - name: prometheus_remote_write tag: {{ .Values.metrics.fluentbit.prometheusRemoteWriteInputConfig.tag }} @@ -228,9 +231,11 @@ metrics: threaded: {{ .Values.metrics.fluentbit.prometheusRemoteWriteInputConfig.threaded }} # -- fluentbit filters as a yaml list in a multiline string + # @default -- partly autogenerated -- see values.yaml filters: | # -- fluentbit outputs as a yaml list in a multiline string + # @default -- partly autogenerated -- see values.yaml outputs: | - name: opentelemetry match: monit.prom.k8s @@ -246,9 +251,9 @@ metrics: storage.total_limit_size: {{ .Values.metrics.fluentbit.diskMaxCache }} header: User-Agent {{ .Chart.Name }}/{{ .Chart.Version }} - # Pushgateway allows you to send metrics to the monitoring infrastructure - # by pushing them to the local cluster service it-monit-metrics-collector-pushgateway. pushgateway: + # -- pushgateway allows you to send metrics to the monitoring infrastructure + # by pushing them to the local cluster service it-monit-metrics-collector-pushgateway. enabled: false image: repository: registry.cern.ch/monit/cern-it-monitoring-pushgateway @@ -261,40 +266,52 @@ metrics: limits: cpu: 0.2 memory: 100Mi - # If set to true will install register a new ingress with the given - # configuration. ingress: + # -- if set to true will install register a new ingress with the given + # configuration. 
enabled: false className: "" # If no class set then default. path: / pathType: ImplementationSpecific hosts: [] tls: {} - # If given will override the defaultNodeSelector and install the component + # -- if given will override the defaultNodeSelector and install the component # only on the nodes that match the given condition. nodeSelector: {} # Alertmanager configuration. If configured the local prometheus will be # automatically configured with this alertmanager as target for alerts. alertmanager: + # -- if true alertmanager will be installed and prometheus reconfigured to use it as the alerting endpoint enabled: false + # -- alertmanager image to use by the local cluster alertmanager image: registry.cern.ch/monit/cern-it-monitoring-alertmanager + # -- alertmanager image tag to be used when pulling it tag: v0.27.0 + # -- pull policy for the alertmanager image pullPolicy: IfNotPresent + # -- number of replicas for the alertmanager deployment replicas: 3 # If set to true will install register a new ingress with the given # configuration. ingress: + # -- if set to true an ingress will be created for the alertmanager service enabled: false - className: "" # If no class set then default. + # -- class name to be used by the alertmanager ingress + className: "" + # -- entry path for the alertmanager ingress path: / + # -- path type for the alertmanager ingress pathType: ImplementationSpecific + # -- list of hosts for the alertmanager ingress hosts: [] + # -- tls configuration for the alertmanager ingress tls: {} - # If given will override the defaultNodeSelector and install the component - # only on the nodes that match the given condition. 
+ # -- node selector configuration for the alertmanager nodeSelector: {} + # -- list of volumes to be declared volumes: [] + # -- list of volumes to be mounted volumeMounts: [] apiServer: @@ -338,14 +355,11 @@ logs: enabled: false ## Fluent Bit for logs image image: - # -- Repository to use for Fluent Bit (logs) - # @default -- `""` (defaults to fluentbit.image.repository) + # -- repository to use for Fluent Bit (logs) repository: "" - # -- Tag to use for Fluent Bit (logs) - # @default -- `""` (defaults to fluentbit.image.tag) + # -- tag to use for Fluent Bit (logs) tag: "" - # -- Image pull policy for Fluent Bit (logs) - # @default -- `""` (defaults to fluentbit.image.imagePullPolicy) + # -- image pull policy for Fluent Bit (logs) imagePullPolicy: "" # -- interval used by the local prometheus (if installed) to scrape metrics from logs fluentbits scrapeInterval: "15s" @@ -358,6 +372,7 @@ logs: memory: "25Mi" # -- fluentbit service configuration options in a multiline string + # @default -- partly autogenerated -- see values.yaml service: | [SERVICE] Daemon Off @@ -371,6 +386,7 @@ logs: Health_Check On # -- fluentbit inputs as a yaml list in a multiline string + # @default -- partly autogenerated -- see values.yaml inputs: | [INPUT] Name tail @@ -381,6 +397,7 @@ logs: Skip_Long_Lines Off # -- fluentbit filters as a yaml list in a multiline string + # @default -- partly autogenerated -- see values.yaml filters: | [FILTER] Name kubernetes @@ -436,6 +453,7 @@ logs: call add_timestamp_from_time # -- fluentbit outputs as a yaml list in a multiline string + # @default -- partly autogenerated -- see values.yaml outputs: | [OUTPUT] name opentelemetry @@ -461,6 +479,8 @@ logs: ## -- extra volumes to mount in the fluentbits, can be used to scrape metrics from pvcs extraVolumeMounts: [] + # -- extra Lua scripts for user-provided transformations + luaScripts: {} # These scripts are available in the fluentbit /fluent-bit/etc/scripts path. 
# Include your lua scripts in the following format: # luaScripts: @@ -470,4 +490,3 @@ logs: # return 2, timestamp, record # end # my_other_lua_script.lua: ... - luaScripts: {}