From 3fb857fe4ce35abdf248974482cce7054be3819b Mon Sep 17 00:00:00 2001
From: Guillermo Facundo Colunga <guillermo.facundo.colunga@cern.ch>
Date: Wed, 27 Nov 2024 16:10:44 +0100
Subject: [PATCH] otel-metrics: enable remote write from prom to fb

Some users reported that the current implementation of the metrics flow
is causing errors in the fluent bit components that scrape the metrics
from the local prometheus and forward them to open telemetry.

This commit inverts the paradigm: prometheus now does the remote write
into the local fluent bit. Fluent bit then does a remote write into the
monitoring infra, as it was doing previously.

Reported-at: https://its.cern.ch/jira/browse/MONIT-4077
Signed-off-by: Guillermo Facundo Colunga <guillermo.facundo.colunga@gmail.com>
Signed-off-by: Guillermo Facundo Colunga <guillermo.facundo.colunga@cern.ch>
---
 templates/fluentbit-metrics/service.yaml     | 16 +++++++
 templates/fluentbit-metrics/statefulset.yaml |  2 +-
 templates/prometheus/prometheus.yaml         | 45 +++++++++++---------
 values.yaml                                  | 35 +++++++++------
 4 files changed, 64 insertions(+), 34 deletions(-)
 create mode 100644 templates/fluentbit-metrics/service.yaml

diff --git a/templates/fluentbit-metrics/service.yaml b/templates/fluentbit-metrics/service.yaml
new file mode 100644
index 0000000..194a684
--- /dev/null
+++ b/templates/fluentbit-metrics/service.yaml
@@ -0,0 +1,16 @@
+{{- if and .Values.metrics.enabled .Values.metrics.fluentbit.enabled -}}
+apiVersion: v1
+kind: Service
+metadata:
+  name: it-monit-metrics-fluentbit  # host name used by the Prometheus remoteWrite url
+  namespace: {{ .Release.Namespace }}
+  labels:
+    name: it-monit-metrics-collector-fluentbit
+spec:
+  clusterIP: None  # headless Service: no virtual IP is allocated
+  ports:
+  - name: http
+    port: {{ .Values.metrics.fluentbit.prometheusRemoteWriteInputConfig.port }}  # same value as the input's listen port
+  selector:
+    name: it-monit-metrics-collector-fluentbit  # matches the fluentbit StatefulSet pod label
+{{- end -}}
diff --git a/templates/fluentbit-metrics/statefulset.yaml b/templates/fluentbit-metrics/statefulset.yaml
index 15f0f23..32eff77 100644
--- a/templates/fluentbit-metrics/statefulset.yaml
+++ b/templates/fluentbit-metrics/statefulset.yaml
@@ -8,7 +8,7 @@ spec:
   selector:
       matchLabels:
         name: it-monit-metrics-collector-fluentbit
-  replicas: 1
+  replicas: {{ .Values.metrics.fluentbit.replicas }}  # configurable via values.yaml (metrics.fluentbit.replicas)
   template:
     metadata:
       name: it-monit-metrics-collector-fluentbit
diff --git a/templates/prometheus/prometheus.yaml b/templates/prometheus/prometheus.yaml
index 65382e9..10b2573 100644
--- a/templates/prometheus/prometheus.yaml
+++ b/templates/prometheus/prometheus.yaml
@@ -37,24 +37,31 @@ spec:
   scrapeConfigNamespaceSelector: {}
   ruleSelector: {}
   ruleNamespaceSelector: {}
-  {{if .Values.metrics.prometheus.server.remoteWrite.endpoint }}
   remoteWrite:
-  - url: {{ .Values.metrics.prometheus.server.remoteWrite.endpoint }}
-    tlsConfig:
-      insecureSkipVerify: true
-    basicAuth:
-      username:
-        name: it-monit-metrics-collector-prometheus
-        key: username
-      password:
-        name: it-monit-metrics-collector-prometheus
-        key: password
-  {{ end }}
-  {{- if .Values.metrics.alertmanager.enabled }}
-  alerting:
-    alertmanagers:
-      - namespace: {{ .Release.Namespace }}
-        name: it-monit-alertmanager
-        port: http
-  {{- end }}
+    - url: "http://it-monit-metrics-fluentbit:{{ .Values.metrics.fluentbit.prometheusRemoteWriteInputConfig.port }}{{ .Values.metrics.fluentbit.prometheusRemoteWriteInputConfig.uri }}"  # local fluentbit input; port/uri follow values
+      # messageVersion: V2.0  # NOTE(review): CR spelling of protobuf_message; enable only after confirming the fluentbit input decodes remote-write 2.0
+      queueConfig:  # Prometheus CR fields are camelCase (cf. tlsConfig/basicAuth below)
+        capacity: 5000
+        maxSamplesPerSend: 1000
+        batchSendDeadline: 5s
+      writeRelabelConfigs:
+        - sourceLabels: [__name__]  # drop every series whose metric name starts with "temp"
+          regex: 'temp.*'
+          action: drop
+        - regex: '(id|uuid)'  # strip the id/uuid labels before sending
+          action: labeldrop
+      metadataConfig:
+        send: false
+    {{- if .Values.metrics.prometheus.server.remoteWrite.endpoint }}
+    - url: {{ .Values.metrics.prometheus.server.remoteWrite.endpoint | quote }}  # optional additional remote endpoint
+      tlsConfig:
+        insecureSkipVerify: true
+      basicAuth:
+        username:
+          name: it-monit-metrics-collector-prometheus
+          key: username
+        password:
+          name: it-monit-metrics-collector-prometheus
+          key: password
+    {{- end }}
 {{- end -}}
diff --git a/values.yaml b/values.yaml
index a49c9ee..c5e5dd6 100644
--- a/values.yaml
+++ b/values.yaml
@@ -125,6 +125,7 @@ metrics:
   fluentbit:
     # -- if true fluentbit metrics forwarder will be installed
     enabled: true
+    replicas: 2  # -- number of fluentbit metrics forwarder replicas (StatefulSet spec.replicas)
     # If set it will override the metrics.defaultNodeSelector.
     nodeSelector: {}
     resources:
@@ -134,12 +135,16 @@ metrics:
       limits:
         cpu: "1"
         memory: "1Gi"
-
-    matchQuery: "match[]={job!=\"\"}"
-    # -- interval used by fluentbit to scrape metrics from prometheus
-    prometheusScrapeInterval: "60s"
-    # -- fluentbit buffer size. The more metrics to send the bigger needs to be
-    prometheusScrapeBufferMaxSize: "100M"
+    prometheusRemoteWriteInputConfig:  # -- options rendered into the prometheus_remote_write entry of `inputs`
+      listen: "0.0.0.0"  # rendered as the input's `listen` option
+      port: 8080  # also used as the it-monit-metrics-fluentbit Service port
+      bufferMaxSize: "2G"  # NOTE(review): larger than the 1Gi memory limit above — confirm this is intended
+      bufferChunkSize: "128M"  # rendered as `buffer_chunk_size`
+      successfulResponseCode: 201  # rendered as `successful_response_code`
+      tagFromUri: false  # rendered as `tag_from_uri`
+      tag: "monit.prom.k8s"  # rendered as the input's `tag`
+      uri: "/api/prom/push"  # rendered as the input's `uri`
+      threaded: false  # rendered as the input's `threaded` option
 
     # -- max size for in-disk storage for fluent-bit
     diskMaxCache: "5G"
@@ -163,14 +168,16 @@ metrics:
 
     # -- fluentbit inputs as a yaml list in a multiline string
     inputs: |
-      - name: prometheus_scrape
-        tag: monit.prom.k8s
-        host: prometheus-operated.{{ .Release.Namespace }}.svc.cluster.local
-        port: 9090
-        storage.type: filesystem
-        scrape_interval: {{ .Values.metrics.fluentbit.prometheusScrapeInterval }}
-        metrics_path: /federate?{{ .Values.metrics.fluentbit.matchQuery }}
-        buffer_max_size: {{ .Values.metrics.fluentbit.prometheusScrapeBufferMaxSize }}
+      - name: prometheus_remote_write
+        tag: {{ .Values.metrics.fluentbit.prometheusRemoteWriteInputConfig.tag }}
+        listen: {{ .Values.metrics.fluentbit.prometheusRemoteWriteInputConfig.listen }}
+        port: {{ .Values.metrics.fluentbit.prometheusRemoteWriteInputConfig.port }}
+        uri: {{ .Values.metrics.fluentbit.prometheusRemoteWriteInputConfig.uri }}
+        buffer_max_size: {{ .Values.metrics.fluentbit.prometheusRemoteWriteInputConfig.bufferMaxSize }}
+        buffer_chunk_size: {{ .Values.metrics.fluentbit.prometheusRemoteWriteInputConfig.bufferChunkSize }}
+        successful_response_code: {{ .Values.metrics.fluentbit.prometheusRemoteWriteInputConfig.successfulResponseCode }}
+        tag_from_uri: {{ .Values.metrics.fluentbit.prometheusRemoteWriteInputConfig.tagFromUri }}
+        threaded: {{ .Values.metrics.fluentbit.prometheusRemoteWriteInputConfig.threaded }}
 
     # -- fluentbit filters as a yaml list in a multiline string
     filters: |
-- 
GitLab