diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 1f74bfc968afbec179b0a1fc6d2ef9e0f0b0837e..88b6c64f86f172d30cf9ea5aef3a02011a0f2e92 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -7,6 +7,7 @@ include: stages: - lint + - validate - test - build - deploy @@ -39,11 +40,23 @@ helm_lint: rules: - if: '$CI_COMMIT_TAG' when: never + - if: $CI_PIPELINE_SOURCE == "push" image: alpine/helm script: - helm dep update . - helm lint --strict . +helm_docs_needs_running: + stage: validate + image: registry.cern.ch/docker.io/jnorwood/helm-docs:v1.14.2 + rules: + - if: '$CI_COMMIT_TAG' + when: never + - if: $CI_PIPELINE_SOURCE == "push" + script: + - helm-docs -o test.md + - diff -ura test.md README.md + unittest: stage: test image: registry.cern.ch/docker.io/helmunittest/helm-unittest:3.17.0-0.7.2 diff --git a/.helmdocsignore b/.helmdocsignore new file mode 100644 index 0000000000000000000000000000000000000000..88301c0048b186ce92c9df5738ca6ab4c51f1328 --- /dev/null +++ b/.helmdocsignore @@ -0,0 +1 @@ +charts/* \ No newline at end of file diff --git a/.helmignore b/.helmignore index e40aded5435ba4d39cba63f1e06e69785794841a..64e134a40f4f0d977604b215c788a69d576b0f00 100644 --- a/.helmignore +++ b/.helmignore @@ -10,6 +10,7 @@ .yamllint config README.md +README.md.gotmpl .gitignore .bzr/ .bzrignore diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 7ec0d44cbe495fc868bab1a540f59f8db67e3885..40341c099ec7f7c0778db23e6fba8d99239b5bcb 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -28,7 +28,7 @@ See [getting started](docs/getting_started.md) for hints on how to install the c This procedure ensures that your changes do not introduce any unintended issues. ### 4. Update Documentation -If your changes modify or extend the functionality of the chart, do not forget to update the relevant documentation (such as `README.md`, `docs/values.md` or any related configuration files) to reflect these modifications. 
+If your changes modify or extend the functionality of the chart, do not forget to update the relevant documentation. `README.md` is autogenerated using [helm-docs](https://github.com/norwoodj/helm-docs), so please do not edit it by hand: change `values.yaml` or `README.md.gotmpl` instead, as needed. Then regenerate `README.md` with helm-docs and include the regenerated file in your merge request. ### 5. Submit a Merge Request Once your changes are ready, push your branch to your fork and create a Merge Request (MR) targeting the `master` branch of the main repository. diff --git a/Chart.yaml b/Chart.yaml index c51e87c51931c545bc48bf4bafd571336a51e94e..a95dc52264898d1f4f352a8d1d35caf590ec1ac1 100644 --- a/Chart.yaml +++ b/Chart.yaml @@ -3,7 +3,7 @@ name: cern-it-monitoring-kubernetes version: 0.0.0 # DO NOT UPDATE MANUALLY! type: application kubeVersion: ">=1.27.0-0" -description: Helm Chart provided by IT Monitoring Service to install and configure required components to gather and send monitoring data from kubernetes clusters to central service. +description: Helm Chart provided by the IT Monitoring Service to install and configure the required components to gather and send monitoring data from Kubernetes clusters to the central service. home: https://cern.ch/monitoring dependencies: - name: crds diff --git a/README.md b/README.md index ca7b904bc771b5001de49b6208e9c66663ef0946..a4ef22e0a309057936ae43ee4270a07719574e34 100644 --- a/README.md +++ b/README.md @@ -1,15 +1,153 @@ -# CERN IT Monitoring Kubernetes Helm Chart +# cern-it-monitoring-kubernetes -## Overview +## Description -The **CERN IT Monitoring Kubernetes Helm Chart** provides a solution for monitoring Kubernetes clusters at CERN. It enables the collection of **metrics**, **logs**, and future support for **traces**, which are forwarded to the central CERN monitoring infrastructure.
From there users can consume them using the day-to-day tools that they already user like [Grafana](https://monit-docs.web.cern.ch/access/grafana/) or [OpenSearch](https://monit-docs.web.cern.ch/access/opensearch/). +The **CERN IT Monitoring Kubernetes Helm Chart** provides a solution +for monitoring Kubernetes clusters at CERN. It enables the collection +of **metrics**, **logs**, and future support for **traces**, which are +forwarded to the central CERN monitoring infrastructure. From there +users can consume them using the day-to-day tools that they already +user like [Grafana](https://monit-docs.web.cern.ch/access/grafana/) or +[OpenSearch](https://monit-docs.web.cern.ch/access/opensearch/). -This Helm chart simplifies the deployment and configuration of necessary components for observability, making it easier to manage monitoring across various Kubernetes clusters and their applications. +This Helm chart simplifies the deployment and configuration of +necessary components for observability, making it easier to manage +monitoring across various Kubernetes clusters and their applications. ## Quick Start See [getting started](docs/getting_started.md). +## Values + +| Key | Type | Default | Description | +|-----|------|---------|-------------| +| crds.enabled | bool | `true` | whether to install Prometheus operator's CRDs | +| fluentbit.image.imagePullPolicy | string | `"IfNotPresent"` | image pull policy applied to all Fluent Bit instances | +| fluentbit.image.repository | string | `"registry.cern.ch/monit/cern-it-monitoring-fluent-bit"` | image repository applied to all Fluent Bit instances | +| fluentbit.image.tag | string | `"3.2.6"` | image tag applied to all Fluent Bit instances | +| kubernetes.clusterName | string | `""` | name of the kubernetes cluster to monitor. This value will be appended to very metric and log via k8s_cluster_name label. 
This bit is required if fluentbit is enabled (default) | +| logs.enabled | bool | `false` | indicates if logs metrics components should be enabled or not. If set to false no logs component will be installed nor configured | +| logs.fluentbit.customParsers | string | `""` | | +| logs.fluentbit.enabled | bool | `false` | indicates if fluentbit logs component should be installed or not | +| logs.fluentbit.extraVolumeMounts | list | `[]` | | +| logs.fluentbit.extraVolumes | list | `[]` | | +| logs.fluentbit.filters | string | partly autogenerated -- see values.yaml | fluentbit filters as a yaml list in a multiline string | +| logs.fluentbit.image.imagePullPolicy | string | `""` | image pull policy for Fluent Bit (logs) | +| logs.fluentbit.image.repository | string | `""` | repository to use for Fluent Bit (logs) | +| logs.fluentbit.image.tag | string | `""` | tag to use for Fluent Bit (logs) | +| logs.fluentbit.inputs | string | partly autogenerated -- see values.yaml | fluentbit inputs as a yaml list in a multiline string | +| logs.fluentbit.luaScripts | object | `{}` | extra Lua scripts for user-provided transformations | +| logs.fluentbit.outputs | string | partly autogenerated -- see values.yaml | fluentbit outputs as a yaml list in a multiline string | +| logs.fluentbit.resources.limits.cpu | string | `"20m"` | | +| logs.fluentbit.resources.limits.memory | string | `"25Mi"` | | +| logs.fluentbit.resources.requests.cpu | string | `"5m"` | | +| logs.fluentbit.resources.requests.memory | string | `"15Mi"` | | +| logs.fluentbit.service | string | partly autogenerated -- see values.yaml | fluentbit service configuration options in a multiline string | +| metrics.alertmanager.enabled | bool | `false` | if true alertmanager will be installed and prometheus reconfigured to use it as the alerting endpoint | +| metrics.alertmanager.image | string | `"registry.cern.ch/monit/cern-it-monitoring-alertmanager"` | alertmanager image to use by the local cluster alertmanager | +| 
metrics.alertmanager.ingress.className | string | `""` | class name to be used by the alertmanager ingress | +| metrics.alertmanager.ingress.enabled | bool | `false` | if set to true an ingress will be created for the alertmanager service | +| metrics.alertmanager.ingress.hosts | list | `[]` | list of hosts for the alertmanager ingress | +| metrics.alertmanager.ingress.path | string | `"/"` | entry path for the alertmanager ingress | +| metrics.alertmanager.ingress.pathType | string | `"ImplementationSpecific"` | path type for the alertmanager ingress | +| metrics.alertmanager.ingress.tls | object | `{}` | tls configuration for the alertmanager ingress | +| metrics.alertmanager.nodeSelector | object | `{}` | node selector configuration for the alertmanager | +| metrics.alertmanager.pullPolicy | string | `"IfNotPresent"` | pull policy for the alertmanager image | +| metrics.alertmanager.replicas | int | `3` | number of replicas for the alertmanager deployment | +| metrics.alertmanager.tag | string | `"v0.27.0"` | alertmanager image tag to be used when pulling it | +| metrics.alertmanager.volumeMounts | list | `[]` | list of volumes to be mounted | +| metrics.alertmanager.volumes | list | `[]` | list of volumes to be declared | +| metrics.apiServer.serviceMonitor.relabelings | list | `[]` | | +| metrics.coredns.serviceMonitor.relabelings | list | `[]` | | +| metrics.defaultNodeSelector | object | `{}` | the default node selector will be applied when possible. In to the following components: metrics collectors (prometheus and fluentbit), metrics exporters (kube state). | +| metrics.enabled | bool | `true` | indicates if all metrics components should be enabled or not. 
If set to false no metrics component will be installed nor configured | +| metrics.etcd.serviceMonitor.relabelings | list | `[]` | | +| metrics.fluentbit.diskMaxCache | string | `"5G"` | max size for in-disk storage for fluent-bit | +| metrics.fluentbit.enabled | bool | `true` | if true fluentbit metrics forwarder will be installed | +| metrics.fluentbit.filters | string | partly autogenerated -- see values.yaml | fluentbit filters as a yaml list in a multiline string | +| metrics.fluentbit.image.imagePullPolicy | string | `""` | image pull policy for Fluent Bit (metrics) | +| metrics.fluentbit.image.repository | string | `""` | repository to use for Fluent Bit (metrics) | +| metrics.fluentbit.image.tag | string | `""` | tag to use for Fluent Bit (metrics) | +| metrics.fluentbit.inputs | string | partly autogenerated -- see values.yaml | fluentbit inputs as a yaml list in a multiline string | +| metrics.fluentbit.luaScripts | object | `{}` | extra Lua scripts for user-provided transformations | +| metrics.fluentbit.nodeSelector | object | `{}` | | +| metrics.fluentbit.outputs | string | partly autogenerated -- see values.yaml | fluentbit outputs as a yaml list in a multiline string | +| metrics.fluentbit.prometheusRemoteWriteInputConfig.bufferChunkSize | string | `"128M"` | | +| metrics.fluentbit.prometheusRemoteWriteInputConfig.bufferMaxSize | string | `"2G"` | | +| metrics.fluentbit.prometheusRemoteWriteInputConfig.listen | string | `"0.0.0.0"` | | +| metrics.fluentbit.prometheusRemoteWriteInputConfig.port | int | `8080` | | +| metrics.fluentbit.prometheusRemoteWriteInputConfig.successfulResponseCode | int | `201` | | +| metrics.fluentbit.prometheusRemoteWriteInputConfig.tag | string | `"monit.prom.k8s"` | | +| metrics.fluentbit.prometheusRemoteWriteInputConfig.tagFromUri | bool | `false` | | +| metrics.fluentbit.prometheusRemoteWriteInputConfig.threaded | bool | `false` | | +| metrics.fluentbit.prometheusRemoteWriteInputConfig.uri | string | `"/api/prom/push"` | 
| +| metrics.fluentbit.replicas | int | `2` | | +| metrics.fluentbit.resources.limits.cpu | string | `"1"` | | +| metrics.fluentbit.resources.limits.memory | string | `"1Gi"` | | +| metrics.fluentbit.resources.requests.cpu | string | `"1"` | | +| metrics.fluentbit.resources.requests.memory | string | `"512Mi"` | | +| metrics.fluentbit.service | string | partly autogenerated -- see values.yaml | fluentbit service configuration options in a multiline string | +| metrics.ingress.nginx.serviceMonitor.relabelings | list | `[]` | | +| metrics.kubeProxy.serviceMonitor.relabelings | list | `[]` | | +| metrics.kubeState.enabled | bool | `true` | if true kube state will be installed together with a service monitor | +| metrics.kubeState.nodeSelector | object | `{}` | | +| metrics.kubeState.resources.limits.cpu | string | `"20m"` | | +| metrics.kubeState.resources.limits.memory | string | `"25Mi"` | | +| metrics.kubeState.resources.requests.cpu | string | `"5m"` | | +| metrics.kubeState.resources.requests.memory | string | `"15Mi"` | | +| metrics.kubeState.scrapeInterval | string | `"30s"` | indicates how often this exporter will be scraped by the local prometheus | +| metrics.kubeState.serviceMonitor.relabelings | list | `[]` | | +| metrics.kubecontroller.serviceMonitor.relabelings | list | `[]` | | +| metrics.kubelet.serviceMonitor.relabelings | list | `[]` | | +| metrics.nodeExporter.enabled | bool | `true` | if true node exporter will be installed as a daemon set together with a pod monitor | +| metrics.nodeExporter.resources.limits.cpu | string | `"20m"` | | +| metrics.nodeExporter.resources.limits.memory | string | `"25Mi"` | | +| metrics.nodeExporter.resources.requests.cpu | string | `"5m"` | | +| metrics.nodeExporter.resources.requests.memory | string | `"15Mi"` | | +| metrics.nodeExporter.scrapeInterval | string | `""` | indicates how often this exporter will be scraped by the local prometheus | +| metrics.nodeExporter.serviceMonitor.relabelings | list | `[]` | | +| 
metrics.prometheus.enabled | bool | `true` | if true prometheus operator and a prometheus server will be installed | +| metrics.prometheus.operator.nodeSelector | object | `{}` | | +| metrics.prometheus.operator.resources.limits.cpu | string | `"100m"` | | +| metrics.prometheus.operator.resources.limits.memory | string | `"100Mi"` | | +| metrics.prometheus.operator.resources.requests.cpu | string | `"5m"` | | +| metrics.prometheus.operator.resources.requests.memory | string | `"25Mi"` | | +| metrics.prometheus.server.extraLabelsForMetrics | object | `{}` | set of static labels and values to add to all the metrics gathered by the in-cluster prometheus when exported to central monitoring | +| metrics.prometheus.server.image | string | `"registry.cern.ch/monit/cern-it-monitoring-prometheus:v2.53.3"` | prometheus image to use by the local cluster prometheus | +| metrics.prometheus.server.nodeSelector | object | `{}` | prometheus operator node selectors. If set it will override the metrics.defaultNodeSelector | +| metrics.prometheus.server.relabelings | list | `[]` | allows to drop / relabel node Exporter metrics. 
| +| metrics.prometheus.server.remoteWrite | object | `{}` | remote write prometheus configuration | +| metrics.prometheus.server.resources.limits.cpu | string | `"500m"` | | +| metrics.prometheus.server.resources.limits.memory | string | `"5Gi"` | | +| metrics.prometheus.server.resources.requests.cpu | string | `"100m"` | | +| metrics.prometheus.server.resources.requests.memory | string | `"2Gi"` | | +| metrics.prometheus.server.retention | string | `"24h"` | interval during which local cluster prometheus will store metrics | +| metrics.prometheus.server.scrapeInterval | string | `"10s"` | interval used to self scrape metrics | +| metrics.prometheus.server.scrapeTimeout | string | `"5s"` | timeout for self scraped metrics | +| metrics.prometheus.server.serviceMonitors | list | `[]` | service monitors to be created | +| metrics.prometheus.server.version | string | `"v2.53.3"` | prometheus version to use by the local cluster prometheus | +| metrics.pushgateway.enabled | bool | `false` | pushgateway allows you to send metrics to the monitoring infrastructure by pushing them to the local cluster service it-monit-metrics-collector-pushgateway. | +| metrics.pushgateway.image.pullPolicy | string | `"IfNotPresent"` | | +| metrics.pushgateway.image.repository | string | `"registry.cern.ch/monit/cern-it-monitoring-pushgateway"` | | +| metrics.pushgateway.image.tag | string | `"v1.10.0"` | | +| metrics.pushgateway.ingress.className | string | `""` | | +| metrics.pushgateway.ingress.enabled | bool | `false` | if set to true will install register a new ingress with the given configuration. 
| +| metrics.pushgateway.ingress.hosts | list | `[]` | | +| metrics.pushgateway.ingress.path | string | `"/"` | | +| metrics.pushgateway.ingress.pathType | string | `"ImplementationSpecific"` | | +| metrics.pushgateway.ingress.tls | object | `{}` | | +| metrics.pushgateway.nodeSelector | object | `{}` | if given will override the defaultNodeSelector and install the component only on the nodes that match the given condition. | +| metrics.pushgateway.resources.limits.cpu | float | `0.2` | | +| metrics.pushgateway.resources.limits.memory | string | `"100Mi"` | | +| metrics.pushgateway.resources.requests.cpu | float | `0.2` | | +| metrics.pushgateway.resources.requests.memory | string | `"100Mi"` | | +| metrics.scheduler.serviceMonitor.relabelings | list | `[]` | | +| otlp.endpoint | string | `"monit-otlp.cern.ch"` | otlp endpoint where the otlp receivers are listening | +| otlp.port | int | `4319` | otlp port where the otlp receivers are listening | +| tenant.name | string | `""` | username used for authenitcating in the MONIT infrastructure | +| tenant.password | string | `""` | password (plain) used for authenticating in the MONIT infrastructure | + ## Contributing We welcome contributions! If you're interested in helping improve this project, please review our [contribution guidelines](CONTRIBUTING.md). In brief: @@ -23,10 +161,12 @@ For a full contribution workflow, visit the [contribution guide](CONTRIBUTING.md ## Documentation -Complete documentation for this chart, including setup and configuration details, is available: +The main source of documentation for this chart is this +README. There's some extra details about the components being deployed +in the [docs](docs) directory, though. -- GitLab Repository: [link](docs) -- Project Documentation: [link](https://monit-docs.web.cern.ch) +The [MONIT project documentation](https://monit-docs.web.cern.ch) +contains useful bits complementing this chart. 
## License diff --git a/README.md.gotmpl b/README.md.gotmpl new file mode 100644 index 0000000000000000000000000000000000000000..065dc658af739e89188b1a727895ba96f85a0071 --- /dev/null +++ b/README.md.gotmpl @@ -0,0 +1,45 @@ +{{ template "chart.header" . }} + +## Description + +The **CERN IT Monitoring Kubernetes Helm Chart** provides a solution +for monitoring Kubernetes clusters at CERN. It enables the collection +of **metrics**, **logs**, and future support for **traces**, which are +forwarded to the central CERN monitoring infrastructure. From there +users can consume them using the day-to-day tools that they already +user like [Grafana](https://monit-docs.web.cern.ch/access/grafana/) or +[OpenSearch](https://monit-docs.web.cern.ch/access/opensearch/). + +This Helm chart simplifies the deployment and configuration of +necessary components for observability, making it easier to manage +monitoring across various Kubernetes clusters and their applications. + +## Quick Start + +See [getting started](docs/getting_started.md). + +{{ template "chart.valuesSection" . }} + +## Contributing + +We welcome contributions! If you're interested in helping improve this project, please review our [contribution guidelines](CONTRIBUTING.md). In brief: + +1. **Fork** the repository. +2. Create a **feature branch**. +3. Implement, provide tests and validate your changes. +4. Submit a **Merge Request (MR)** to the `master` branch. + +For a full contribution workflow, visit the [contribution guide](CONTRIBUTING.md). + +## Documentation + +The main source of documentation for this chart is this +README. There's some extra details about the components being deployed +in the [docs](docs) directory, though. + +The [MONIT project documentation](https://monit-docs.web.cern.ch) +contains useful bits complementing this chart. + +## License + +This repository is licensed under the [Apache License 2.0](https://www.apache.org/licenses/LICENSE-2.0). 
See the [LICENSE](LICENSE) file for more information. diff --git a/docs/logs.md b/docs/logs.md index b791898c64983b351e4c3cbf27a2288ce0bd8b49..0ed88b9faa01bf6f6fb2c2bc99fa44948c27d255 100644 --- a/docs/logs.md +++ b/docs/logs.md @@ -25,7 +25,6 @@ Fluentbit collects logs from the `/var/log/containers/` directory and applies a logs: fluentbit: enabled: true # Enable/disable Fluentbit for log collection - scrapeInterval: "15s" # Prometheus scrape interval for Fluentbit metrics resources: requests: cpu: "5m" @@ -141,4 +140,4 @@ Be cautious when enabling Fluentbit for large clusters (100+ nodes). The **Kuber - Adjust Fluentbit resource requests and limits. - Tune the API request rate by modifying the filters. -- Consider other optimization techniques, such as excluding specific log sources. \ No newline at end of file +- Consider other optimization techniques, such as excluding specific log sources. diff --git a/docs/values.md b/docs/values.md deleted file mode 100644 index e57ef01626757b07d81ff2401c1b23e2571bc90d..0000000000000000000000000000000000000000 --- a/docs/values.md +++ /dev/null @@ -1,95 +0,0 @@ -# CERN IT Monitoring Kubernetes Helm Chart Default Values -This file contains the markdown version of the default values that this chart takes. - -| Key | Type | Default | Description | -|-----|------|---------|-------------| -| crds.enabled | bool | `true` | whether to install Prometheus operator's CRDs | -| fluentbit.image.imagePullPolicy | string | `IfNotPresent` | image pull policy applied to all Fluent Bit instances | -| fluentbit.image.repository | string | `registry.cern.ch/monit/cern-it-monitoring-fluent-bit` | image repository applied to all Fluent Bit instances | -| fluentbit.image.tag | string | `3.2.6` | image tag applied to all Fluent Bit instances | -| kubernetes.clusterName | string | - | name of the kubernetes cluster to monitor. 
This value will be appended to every metric and log via k8s_cluster_name label | -| logs.enabled | bool | `false` | indicates if logs components should be enabled or not. If set to false no logs component will be installed nor configured | -| logs.fluentbit.customParsers | string | `""` | | -| logs.fluentbit.enabled | bool | `false` | indicates if fluentbit logs component should be installed or not | -| logs.fluentbit.extraVolumeMounts | list | `[]` | | -| logs.fluentbit.extraVolumes | list | `[]` | | -| logs.fluentbit.filters | string | Kubernetes filter. See `values.yaml` file. | fluentbit filters as a yaml list in a multiline string | -| logs.fluentbit.image.imagePullPolicy | string | `""` | image pull policy applied to Fluent Bit for logs if no global is set | -| logs.fluentbit.image.repository | string | `""` | image repository applied to Fluent Bit for logs if no global is set | -| logs.fluentbit.image.tag | string | `""` | image tag applied to Fluent Bit for logs if no global is set | -| logs.fluentbit.inputs | string | Tail plugin over `/var/log/containers/*.log` files. See `values.yaml` file. | -| logs.fluentbit.outputs | string | OpenTelemetry plugin using `otlp.endpoint`, `otlp.port`, `tenant.username` and `tenant.password`. See `values.yaml`. | fluentbit outputs as a yaml list in a multiline string | -| logs.fluentbit.resources.limits.cpu | string | `"20m"` | | -| logs.fluentbit.resources.limits.memory | string | `"25Mi"` | | -| logs.fluentbit.resources.requests.cpu | string | `"5m"` | | -| logs.fluentbit.resources.requests.memory | string | `"15Mi"` | | -| logs.fluentbit.scrapeInterval | string | `"15s"` | interval used by the local prometheus (if installed) to scrape metrics from logs fluentbits | -| logs.fluentbit.service | string | Daemon mode off listening on port 2020. See `values.yaml`. 
| fluentbit service configuration options in a multiline string | -| metrics.alertmanager.enabled | bool | `false` | if true alertmanager will be installed and prometheus reconfigured to use it as the alerting endpoint | -| metrics.alertmanager.image | string | `"registry.cern.ch/monit/cern-it-monitoring-alertmanager"` | alertmanager image to use by the local cluster alertmanager | -| metrics.alertmanager.ingress.className | string | `""` | class name to be used by the alertmanager ingress | -| metrics.alertmanager.ingress.enabled | bool | `false` | if set to true an ingress will be created for the alertmanager service | -| metrics.alertmanager.ingress.hosts | Array | `[]` | list of hosts for the alertmanager ingress | -| metrics.alertmanager.ingress.path | string | `"/"` | entry path for the alertmanager ingress | -| metrics.alertmanager.ingress.pathType | string | `"ImplementationSpecific"` | path type for the alertmanager ingress | -| metrics.alertmanager.ingress.tls | Hash | `{}` | tls configuration for the alertmanager ingress | -| metrics.alertmanager.nodeSelector | Hash | `{}` | node selector configuration for the alertmanager | -| metrics.alertmanager.pullPolicy | string | `"IfNotPresent"` | pull policy for the alertmanager image | -| metrics.alertmanager.replicas | int | `3` | number of replicas for the alertmanager deployment, defaults 3 for HA | -| metrics.alertmanager.tag | string | `"v0.27.0"` | alertmanager image tag to be used when pulling it | -| metrics.alertmanager.volumeMounts | Array | `[]` | list of volumes to be mounted | -| metrics.alertmanager.volumes | Array | `[]` | list of volumes to be declared | -| metrics.defaultNodeSelector | map | `{}` | if set will be used as `nodeSelector` for those components that allow one | -| metrics.enabled | bool | `true` | indicates if all metrics components should be enabled or not. 
If set to false no metrics component will be installed nor configured | -| metrics.fluentbit.diskMaxCache | string | `5G` | max size for in-disk storage for fluent-bit | -| metrics.fluentbit.enabled | bool | `true` | if true fluentbit will be installed | -| metrics.fluentbit.extraVolumeMounts | list | `[]` | | -| metrics.fluentbit.extraVolumes | list | `[]` | | -| metrics.fluentbit.filters | string | `"nil"` | fluentbit filters as a yaml list in a multiline string | -| metrics.fluentbit.image.imagePullPolicy | string | `""` | image pull policy applied to Fluent Bit for metrics if no global is set | -| metrics.fluentbit.image.repository | string | `""` | image repository applied to Fluent Bit for metrics if no global is set | -| metrics.fluentbit.image.tag | string | `""` | image tag applied to Fluent Bit for metrics if no global is set | -| metrics.fluentbit.inputs | string | Configuration to scrape local prometheus. See `values.yaml`. | fluentbit inputs as a yaml list in a multiline string | -| metrics.fluentbit.matchQuery | string | `"match[]={job!=\"\"}"` | Query parameter to apply to the federate Prometheus URL, use this to filter and send only specific metrics | -| metrics.fluentbit.nodeSelector | hash | `"nil"` | fluentbit statefulset node selectors | -| metrics.fluentbit.prometheusScrapeBufferMaxSize | string | `"100M"` | fluentbit buffer size. The more metrics to send the bigger needs to be | -| metrics.fluentbit.prometheusScrapeInterval | string | `"60s"` | interval used by fluentbit to scrape metrics from prometheus | -| metrics.fluentbit.resources.limits.cpu | string | `"1"` | | -| metrics.fluentbit.resources.limits.memory | string | `"1Gi"` | | -| metrics.fluentbit.resources.requests.cpu | string | `"1"` | | -| metrics.fluentbit.resources.requests.memory | string | `"150Mi"` | | -| metrics.fluentbit.service | string | Daemon mode off listening on port 2020. See `values.yaml`. 
| fluentbit service configuration options in a multiline string | -| metrics.kubeState.enabled | bool | `true` | if true kube state will be installed together with a service monitor | -| metrics.kubeState.nodeSelector | hash | `"nil"` | kubeState deployment node selectors | -| metrics.kubeState.resources.limits.cpu | string | `"20m"` | | -| metrics.kubeState.resources.limits.memory | string | `"25Mi"` | | -| metrics.kubeState.resources.requests.cpu | string | `"5m"` | | -| metrics.kubeState.resources.requests.memory | string | `"15Mi"` | | -| metrics.kubeState.scrapeInterval | string | `"15s"` | indicates how often kube state will be scraped by the local prometheus | -| metrics.nodeExporter.enabled | bool | `true` | if true node exporter will be installed as a daemon set together with a pod monitor | -| metrics.nodeExporter.resources.limits.cpu | string | `"20m"` | | -| metrics.nodeExporter.resources.limits.memory | string | `"25Mi"` | | -| metrics.nodeExporter.resources.requests.cpu | string | `"5m"` | | -| metrics.nodeExporter.resources.requests.memory | string | `"15Mi"` | | -| metrics.nodeExporter.scrapeInterval | string | `"15s"` | indicates how often node exporter will be scraped by the local prometheus | -| metrics.prometheus.enabled | bool | `true` | if true prometheus operator and a prometheus server will be installed | -| metrics.prometheus.operator | object | Resources configuration. See `values.yaml`. 
| specific configuration for the prometheus operator | -| metrics.prometheus.operator.nodeSelector | hash | `"nil"` | prometheus operator node selectors | -| metrics.prometheus.server.extraLabelsForMetrics | hash | `{}` | set of static labels and values to add to all the metrics gathered by the in-cluster prometheus when exported to central monitoring | -| metrics.prometheus.server.image | string | `"registry.cern.ch/monit/cern-it-monitoring-prometheus:v2.50.0"` | prometheus image to use by the local cluster prometheus | -| metrics.prometheus.server.nodeSelector | hash | `"nil"` | prometheus server node selectors | -| metrics.prometheus.server.remoteWrite | object | `{}` | remote write prometheus configuration | -| metrics.prometheus.server.resources.limits.cpu | string | `"500m"` | | -| metrics.prometheus.server.resources.limits.memory | string | `"5Gi"` | | -| metrics.prometheus.server.resources.requests.cpu | string | `"100m"` | | -| metrics.prometheus.server.resources.requests.memory | string | `"2Gi"` | | -| metrics.prometheus.server.retention | string | `"24h"` | interval during which local cluster prometheus will store metrics | -| metrics.prometheus.server.scrapeInterval | string | `"10s"` | interval used to self scrape metrics | -| metrics.prometheus.server.scrapeTimeout | string | `"5s"` | timeout for self scraped metrics | -| metrics.prometheus.server.version | string | `"v2.50.0"` | prometheus version to use by the local cluster prometheus | -| otlp.endpoint | string | `"monit-otlp.cern.ch"` | otlp endpoint where the otlp receivers are listening | -| otlp.port | int | `4319` | otlp port where the otlp receivers are listening | -| tenant.name | string | - | username used for authenticating with the MONIT infrastructure | -| tenant.password | string | - | password (plain) used for authenticating with the MONIT infrastructure | - ----------------------------------------------- diff --git a/templates/kube_state/servicemonitor.yaml 
b/templates/kube_state/servicemonitor.yaml index c55b1c66e5e77314641ba7c5421ecb1ff4e74d4c..5f9811c2bf26b883e682943c91b77ddb975d3931 100644 --- a/templates/kube_state/servicemonitor.yaml +++ b/templates/kube_state/servicemonitor.yaml @@ -10,7 +10,7 @@ spec: endpoints: - port: http-metrics scheme: http - interval: 30s + interval: {{ .Values.metrics.kubeState.scrapeInterval }} path: /metrics {{- if and .Values.metrics.kubeState .Values.metrics.kubeState.serviceMonitor.relabelings }} relabelings: @@ -18,7 +18,7 @@ spec: {{- end }} - port: telemetry scheme: http - interval: 30s + interval: {{ .Values.metrics.kubeState.scrapeInterval }} {{- if and .Values.metrics.kubeState .Values.metrics.kubeState.relabelings }} relabelings: {{- .Values.metrics.kubeState.relabelings | toYaml | nindent 4}} diff --git a/templates/node_exporter/podmonitor.yaml b/templates/node_exporter/podmonitor.yaml index a30bad9e283223e767666f0e7b6b25ad84c15f6f..fbf48f723e6cbfb7b4bb3be2d6ff753e8e655044 100644 --- a/templates/node_exporter/podmonitor.yaml +++ b/templates/node_exporter/podmonitor.yaml @@ -13,6 +13,9 @@ spec: app.kubernetes.io/name: node-exporter podMetricsEndpoints: - targetPort: 9100 + {{- if .Values.metrics.nodeExporter.scrapeInterval }} + interval: {{ .Values.metrics.nodeExporter.scrapeInterval }} + {{- end }} relabelings: - action: replace sourceLabels: diff --git a/values.yaml b/values.yaml index 0a7ebcbc9b0e89a97b87608bbf66542106bfff66..bad6971ef14a18959eff32846008b14eb1aa324b 100644 --- a/values.yaml +++ b/values.yaml @@ -1,5 +1,6 @@ # CRDs crds: + # -- whether to install Prometheus operator's CRDs enabled: true # OTLP default configuration. @@ -11,25 +12,26 @@ otlp: # Tenant configuration. Username and Password are provided via CERN Central IT # Monitoring service. 
This bit is required if fluentbit is enabled (default) -# tenant: +tenant: # -- username used for authenitcating in the MONIT infrastructure - # name: example - # -- password (plain) used for authenitcating in the MONIT infrastructure - # password: example + name: '' + # -- password (plain) used for authenticating in the MONIT infrastructure + password: '' # Kubernetes configuration. -# kubernetes: - # -- name of the kubernetes cluster to monitor. This value will be appended to very metric and log via k8sClusterName label. This bit is required if fluentbit is enabled (default) - # clusterName: nil +kubernetes: + # -- name of the kubernetes cluster to monitor. This value will be appended to very metric and log via k8s_cluster_name label. This bit is required if fluentbit is enabled (default) + clusterName: '' +# Global Fluent Bit configuration fluentbit: # Default image used by all Fluent Bit instances image: - # -- If defined, a repository applied to all Fluent Bit instances - repository: registry.cern.ch/monit/cern-it-monitoring-fluent-bit - # -- If defined, a tag applied to all Fluent Bit instances + # -- image repository applied to all Fluent Bit instances + repository : registry.cern.ch/monit/cern-it-monitoring-fluent-bit + # -- image tag applied to all Fluent Bit instances tag: 3.2.6 - # -- If defined, an image pull policy applied to all Fluent Bit instances + # -- image pull policy applied to all Fluent Bit instances imagePullPolicy: IfNotPresent # The metrics section includes all the components meant to produce, scrape, @@ -46,8 +48,8 @@ metrics: nodeExporter: # -- if true node exporter will be installed as a daemon set together with a pod monitor enabled: true - # -- indicates how often node exporter will be scraped by the local prometheus - scrapeInterval: "15s" + # -- indicates how often this exporter will be scraped by the local prometheus + scrapeInterval: '' resources: requests: cpu: "5m" @@ -65,8 +67,8 @@ metrics: nodeSelector: {} # -- if true kube 
state will be installed together with a service monitor enabled: true - # -- indicates how often node exporter will be scraped by the local prometheus - scrapeInterval: "15s" + # -- indicates how often this exporter will be scraped by the local prometheus + scrapeInterval: "30s" resources: requests: cpu: "5m" @@ -82,7 +84,7 @@ metrics: prometheus: # -- if true prometheus operator and a prometheus server will be installed enabled: true - # -- specific configuration for the prometheus operator + # Specific configuration for the prometheus operator operator: # If set it will override the metrics.defaultNodeSelector. nodeSelector: {} @@ -94,10 +96,11 @@ metrics: cpu: "100m" memory: "100Mi" server: - # If set it will override the metrics.defaultNodeSelector. + # -- prometheus operator node selectors. If set it will override the metrics.defaultNodeSelector nodeSelector: {} - # -- prometheus version to use by the local cluster prometheus. Make sure this version exists in the indicated image repository. + # -- prometheus image to use by the local cluster prometheus image: registry.cern.ch/monit/cern-it-monitoring-prometheus:v2.53.3 + # -- prometheus version to use by the local cluster prometheus version: "v2.53.3" # -- interval used to self scrape metrics scrapeInterval: "10s" @@ -120,7 +123,8 @@ metrics: limits: cpu: "500m" memory: "5Gi" - # Service Monitors to be created by the helm chart install / upgrade. Ex: + # -- service monitors to be created + serviceMonitors: [] # serviceMonitors: # - name: my-nginx-sm # spec: @@ -135,16 +139,15 @@ metrics: # app.kubernetes.io/component: controller # app.kubernetes.io/instance: cern-magnum # app.kubernetes.io/name: ingress-nginx - serviceMonitors: [] - # Allows to drop / relabel node Exporter metrics. + # -- allows to drop / relabel node Exporter metrics. 
+ relabelings: [] # More info on: https://prometheus.io/docs/prometheus/latest/configuration/configuration/#relabel_config # Example: # - action: drop # sourceLabels: # - __name__ # regex: "my_custom_counter_total|my_custom_counter_sum|my_custom_gauge" - relabelings: [] - # This fluentbit is used to allow scraping and fordwarding metrics from + # This fluentbit is used to allow scraping and forwarding metrics from # the local prometheus and send them to Open Telemetry Collector. # If the local .Values.metrics.prometheus enable=false will not be able # to scrape from local prometheus. Provide different inputs then. @@ -153,14 +156,11 @@ metrics: enabled: true ## Fluent Bit for metrics image image: - # -- Repository to use for Fluent Bit (metrics) - # @default -- `""` (defaults to fluentbit.image.repository) + # -- repository to use for Fluent Bit (metrics) repository: "" - # -- Tag to use for Fluent Bit (metrics) - # @default -- `""` (defaults to fluentbit.image.tag) + # -- tag to use for Fluent Bit (metrics) tag: "" - # -- Image pull policy for Fluent Bit (metrics) - # @default -- `""` (defaults to fluentbit.image.imagePullPolicy) + # -- image pull policy for Fluent Bit (metrics) imagePullPolicy: "" replicas: 2 # If set it will override the metrics.defaultNodeSelector. @@ -183,6 +183,8 @@ metrics: uri: /api/prom/push threaded: false + # -- extra Lua scripts for user-provided transformations + luaScripts: {} # These scripts are available in the fluentbit /fluent-bit/etc/scripts path. # Include your lua scripts in the following format: # luaScripts: @@ -192,12 +194,12 @@ metrics: # return 2, timestamp, record # end # my_other_lua_script.lua: ... 
- luaScripts: {} # -- max size for in-disk storage for fluent-bit diskMaxCache: "5G" # -- fluentbit service configuration options in a multiline string + # @default -- partly autogenerated -- see values.yaml service: | daemon: off flush: 1 @@ -215,6 +217,7 @@ metrics: storage.max_chunks_up: 507 # -- fluentbit inputs as a yaml list in a multiline string + # @default -- partly autogenerated -- see values.yaml inputs: | - name: prometheus_remote_write tag: {{ .Values.metrics.fluentbit.prometheusRemoteWriteInputConfig.tag }} @@ -228,9 +231,11 @@ metrics: threaded: {{ .Values.metrics.fluentbit.prometheusRemoteWriteInputConfig.threaded }} # -- fluentbit filters as a yaml list in a multiline string + # @default -- partly autogenerated -- see values.yaml filters: | # -- fluentbit outputs as a yaml list in a multiline string + # @default -- partly autogenerated -- see values.yaml outputs: | - name: opentelemetry match: monit.prom.k8s @@ -246,9 +251,9 @@ metrics: storage.total_limit_size: {{ .Values.metrics.fluentbit.diskMaxCache }} header: User-Agent {{ .Chart.Name }}/{{ .Chart.Version }} - # Pushgateway allows you to send metrics to the monitoring infrastructure - # by pushing them to the local cluster service it-monit-metrics-collector-pushgateway. pushgateway: + # -- pushgateway allows you to send metrics to the monitoring infrastructure + # by pushing them to the local cluster service it-monit-metrics-collector-pushgateway. enabled: false image: repository: registry.cern.ch/monit/cern-it-monitoring-pushgateway @@ -261,48 +266,57 @@ metrics: limits: cpu: 0.2 memory: 100Mi - # If set to true will install register a new ingress with the given - # configuration. ingress: + # -- if set to true will install register a new ingress with the given + # configuration. enabled: false className: "" # If no class set then default. 
path: / pathType: ImplementationSpecific hosts: [] tls: {} - # If given will override the defaultNodeSelector and install the component + # -- if given will override the defaultNodeSelector and install the component # only on the nodes that match the given condition. nodeSelector: {} # Alertmanager configuration. If configured the local prometheus will be # automatically configured with this alertmanager as target for alerts. alertmanager: + # -- if true alertmanager will be installed and prometheus reconfigured to use it as the alerting endpoint enabled: false + # -- alertmanager image to use by the local cluster alertmanager image: registry.cern.ch/monit/cern-it-monitoring-alertmanager + # -- alertmanager image tag to be used when pulling it tag: v0.27.0 + # -- pull policy for the alertmanager image pullPolicy: IfNotPresent + # -- number of replicas for the alertmanager deployment replicas: 3 # If set to true will install register a new ingress with the given # configuration. ingress: + # -- if set to true an ingress will be created for the alertmanager service enabled: false - className: "" # If no class set then default. + # -- class name to be used by the alertmanager ingress + className: "" + # -- entry path for the alertmanager ingress path: / + # -- path type for the alertmanager ingress pathType: ImplementationSpecific + # -- list of hosts for the alertmanager ingress hosts: [] + # -- tls configuration for the alertmanager ingress tls: {} - # If given will override the defaultNodeSelector and install the component - # only on the nodes that match the given condition. 
+ # -- node selector configuration for the alertmanager nodeSelector: {} + # -- list of volumes to be declared volumes: [] + # -- list of volumes to be mounted volumeMounts: [] apiServer: serviceMonitor: relabelings: [] - scheduler: - serviceMonitor: - relabelings: [] coredns: serviceMonitor: relabelings: [] @@ -338,17 +352,12 @@ logs: enabled: false ## Fluent Bit for logs image image: - # -- Repository to use for Fluent Bit (logs) - # @default -- `""` (defaults to fluentbit.image.repository) + # -- repository to use for Fluent Bit (logs) repository: "" - # -- Tag to use for Fluent Bit (logs) - # @default -- `""` (defaults to fluentbit.image.tag) + # -- tag to use for Fluent Bit (logs) tag: "" - # -- Image pull policy for Fluent Bit (logs) - # @default -- `""` (defaults to fluentbit.image.imagePullPolicy) + # -- image pull policy for Fluent Bit (logs) imagePullPolicy: "" - # -- interval used by the local prometheus (if installed) to scrape metrics from logs fluentbits - scrapeInterval: "15s" resources: requests: cpu: "5m" @@ -358,6 +367,7 @@ logs: memory: "25Mi" # -- fluentbit service configuration options in a multiline string + # @default -- partly autogenerated -- see values.yaml service: | [SERVICE] Daemon Off @@ -371,6 +381,7 @@ logs: Health_Check On # -- fluentbit inputs as a yaml list in a multiline string + # @default -- partly autogenerated -- see values.yaml inputs: | [INPUT] Name tail @@ -381,6 +392,7 @@ logs: Skip_Long_Lines Off # -- fluentbit filters as a yaml list in a multiline string + # @default -- partly autogenerated -- see values.yaml filters: | [FILTER] Name kubernetes @@ -436,6 +448,7 @@ logs: call add_timestamp_from_time # -- fluentbit outputs as a yaml list in a multiline string + # @default -- partly autogenerated -- see values.yaml outputs: | [OUTPUT] name opentelemetry @@ -461,6 +474,8 @@ logs: ## -- extra volumes to mount in the fluentbits, can be used to scrape metrics from pvcs extraVolumeMounts: [] + # -- extra Lua scripts for 
user-provided transformations + luaScripts: {} # These scripts are available in the fluentbit /fluent-bit/etc/scripts path. # Include your lua scripts in the following format: # luaScripts: @@ -470,4 +485,3 @@ logs: # return 2, timestamp, record # end # my_other_lua_script.lua: ... - luaScripts: {}