fix(infra-logging): migrate from logging-operator to vector-agent

This commit is contained in:
WrenIX 2025-04-18 00:33:48 +02:00
parent a87841325d
commit abcc9442b2
Signed by: wrenix
GPG key ID: 7AFDB012974B1BB5
14 changed files with 130 additions and 408 deletions

View file

@ -0,0 +1,53 @@
# Vector agent configuration (rendered through Helm `tpl`, mounted via the
# "vector" ConfigMap). Collects pod logs + host/internal metrics, exposes a
# Prometheus exporter, and optionally ships logs to Loki.
data_dir: /vector-data-dir
api:
  enabled: true
  # API bound to loopback only; playground disabled for non-interactive use.
  address: 127.0.0.1:8686
  playground: false
sources:
  kubernetes_logs:
    type: kubernetes_logs
  host_metrics:
    filesystem:
      # binfmt_misc pseudo-filesystems produce no useful capacity metrics.
      devices:
        excludes: [binfmt_misc]
      filesystems:
        excludes: [binfmt_misc]
      mountpoints:
        excludes: ["*/proc/sys/fs/binfmt_misc"]
    type: host_metrics
  internal_metrics:
    type: internal_metrics
transforms:
  logs:
    type: remap
    inputs:
      - kubernetes_logs
    # Backfill fields so downstream label templating never hits a missing path.
    # NOTE(review): this guards top-level .pod_namespace, but the loki labels
    # below read kubernetes.pod_namespace — confirm the intended field path.
    source: |-
      if !exists(.pod_namespace) {
        .pod_namespace = "unknown"
      }
      if !exists(.metadata) {
        .metadata = {
          "not found": "unknown"
        }
      }
sinks:
  prom_exporter:
    type: prometheus_exporter
    inputs: [host_metrics, internal_metrics]
    # Scraped via the chart's containerPorts/podMonitor on port 9090.
    address: 0.0.0.0:9090
{{- if .Values.loki.enabled }}
  loki:
    type: loki
    inputs:
      - logs
    endpoint: http://loki:3100
    encoding:
      codec: logfmt
    labels:
      # Raw-string block so Vector's own {{ }} templating survives Helm `tpl`.
      {{`
      "pod_labels_*": "{{ kubernetes.pod_labels }}"
      "namespace": "{{ kubernetes.pod_namespace }}"
      "*": "{{ metadata }}"
      `}}
{{- end }}

View file

@ -6,9 +6,8 @@ metadata:
namespace: "{{ .Values.init.namespace }}"
data:
{{- if and
(.Capabilities.APIVersions.Has "monitoring.coreos.com/v1/ServiceMonitor")
(.Capabilities.APIVersions.Has "monitoring.coreos.com/v1/PodMonitor")
(.Capabilities.APIVersions.Has "monitoring.coreos.com/v1/PrometheusRule")
(.Capabilities.APIVersions.Has "logging.banzaicloud.io/v1beta1/Flow")
}}
init: "-1"
{{- else }}

View file

@ -1,45 +0,0 @@
{{- if (.Capabilities.APIVersions.Has "logging.banzaicloud.io/v1beta1/Flow") }}
---
apiVersion: logging.banzaicloud.io/v1beta1
kind: Flow
metadata:
name: coredns
namespace: kube-system
spec:
match:
- select:
labels:
k8s-app: "coredns"
filters:
- tag_normaliser: {}
- parser:
reserve_data: true
remove_key_name_field: true
parse:
type: "multi_format"
patterns:
- format: "regexp"
expression: '^\[(?<log.level>.*)\] \[?(?<source.address>.*)\]?:(?<source.port>.*) - (?<dns.id>.*) "(?<dns.question.type>.*) (?<dns.question.class>.*) (?<dns.question.name>.*)\.? (?<network.transport>.*) (?<coredns.query.size>.*) (?<coredns.dnssec_ok>.*) (?<bufsize>.*)" (?<dns.response_code>.*) (?<dns.header_flags>.*) (?<coredns.response.size>.*) (?<coredns.duration>.*)s'
types: "source.port:integer,dns.id:integer,coredns.query.size:integer,coredns.dnssec_ok:bool,bufsize:integer,dns.header_flags:array,coredns.response.size:integer,coredns.duration:float"
- format: "none"
- record_transformer:
enable_ruby: true
records:
- source.ip: '${ record["source.address"] }'
dns.header_flags: '${ !(record["dns.header_flags"].nil?) ? record["dns.header_flags"].map(&:upcase) : nil }'
event.duration: '${ !(record["coredns.duration"].nil?) ? record["coredns.duration"] * 1000000000 : nil }'
event.kind: "event"
event.category: "network"
event.type: "protocol"
event.outcome: '${ record["dns.response_code"] == "NOERROR" ? "success" : "failure" }'
event.protocol: "dns"
event.module: "coredns"
related.ip: '${ record["source.address"] }'
# for dashboard
fileset.name: "kubernetes"
coredns.query.name: '${ record["dns.question.name"] }'
remove_keys: "coredns.duration,coredns.dnssec_ok"
globalOutputRefs:
- "default"
{{- end }}

View file

@ -1,52 +0,0 @@
{{- if (.Capabilities.APIVersions.Has "logging.banzaicloud.io/v1beta1/Flow") }}
---
apiVersion: logging.banzaicloud.io/v1beta1
kind: Flow
metadata:
name: klog
namespace: kube-system
spec:
match:
- select:
labels:
k8s-app: "konnectivity-agent"
- select:
labels:
k8s-app: "kube-proxy"
- select:
labels:
app: "snapshot-validation-webhook"
filters:
- tag_normaliser: {}
- parser:
hash_value_field: "klog"
reserve_data: true
remove_key_name_field: true
parse:
type: "multi_format"
patterns:
- format: "regexp"
expression: '(?<log_level>[A-Z])(?<month>\d{2})(?<day>\d{2})\s+(?<time>\d{2}:\d{2}:\d{2}(|\.\d+))\s+(?<threadid>\d+)\s+(?<file>[^ ]*):(?<line>\d+)\]\s("(?<msg>([^"\\]*(?:\\.[^"\\]*)*))"(|\s+(?<kv>.*))|(?<greedy_msg>.*))$'
types: "month:integer,day:integer,threadid:integer"
- format: "none"
- record_transformer:
enable_ruby: true
records:
- timestamp: '${time.strftime("%Y")}-${ record["klog"]["month"] }-${ record["klog"]["day"] }T${ record["klog"]["time"] }Z'
message: '${ !(record["klog"]["greedy_msg"].nil?) ? record["klog"]["greedy_msg"] : record["klog"]["msg"] }'
log.level: '${ record["klog"]["log_level"].gsub("I", "info").gsub("W", "warn").gsub("E", "error").gsub("F", "fatal") }'
klog_kv: '${ !(record["klog"]["kv"].nil?) ? record["klog"]["kv"] : "" }'
remove_keys: "$['klog']['month'],$['klog']['day'],$['klog']['time'],$['klog']['log_level'],$['klog']['msg'],$['klog']['greedy_msg'],$['klog']['kv']"
- parser:
key_name: "klog_kv"
hash_value_field: "klog.fields"
reserve_data: true
remove_key_name_field: true
parse:
type: "multi_format"
patterns:
- format: "logfmt"
- format: "none"
globalOutputRefs:
- "default"
{{- end }}

View file

@ -1,29 +0,0 @@
{{- if (.Capabilities.APIVersions.Has "logging.banzaicloud.io/v1beta1/Flow") }}
---
apiVersion: logging.banzaicloud.io/v1beta1
kind: Flow
metadata:
name: event-tailer
spec:
match:
- select:
labels:
"app.kubernetes.io/name": "event-tailer"
filters:
- tag_normaliser: {}
- parser:
hash_value_field: "kubernetes"
remove_key_name_field: true
reserve_data: true
parse:
type: "json"
- record_transformer:
enable_ruby: true
records:
- event.module: "kubernetes"
message: '${ record["kubernetes"]["event"]["message"] }'
remove_keys: "$['kubernetes']['event']['message']"
globalOutputRefs:
- "default"
{{- end }}

View file

@ -1,36 +0,0 @@
{{- if (.Capabilities.APIVersions.Has "logging.banzaicloud.io/v1beta1/Flow") }}
---
apiVersion: logging.banzaicloud.io/v1beta1
kind: Flow
metadata:
name: fluentbit
spec:
match:
- select:
labels:
"app.kubernetes.io/name": "fluentbit"
filters:
- tag_normaliser: {}
- parser:
hash_value_field: "fluentbit"
reserve_data: true
remove_key_name_field: true
parse:
type: "regexp"
expression: '^\[(?<timestamp>.*)\] \[(?<log.level>.*)\] \[(?<component>.*)\] (?<message>.*)'
types: "timestamp:string,log.level:string,component:string,message:string"
time_key: "timestamp"
time_type: "string"
time_format: "%Y/%m/%d %H:%M:%S"
- record_transformer:
enable_ruby: true
records:
- event.kind: "event"
event.module: "fluentbit"
message: '${record["fluentbit"]["message"]}'
log.level: '${record["fluentbit"]["log.level"]}'
remove_keys: "$['fluentbit']['log']['level'],$['fluentbit']['message']"
globalOutputRefs:
- default
{{- end }}

View file

@ -1,34 +0,0 @@
{{- if (.Capabilities.APIVersions.Has "logging.banzaicloud.io/v1beta1/Flow") }}
---
apiVersion: logging.banzaicloud.io/v1beta1
kind: Flow
metadata:
name: logging-operator
spec:
match:
- select:
labels:
"app.kubernetes.io/name": "logging-operator"
filters:
- tag_normaliser: {}
- parser:
hash_value_field: "logging-operator"
reserve_data: true
remove_key_name_field: true
parse:
type: "json"
time_key: "ts"
time_type: "string"
time_format: "%iso8601"
- record_transformer:
enable_ruby: true
records:
- event.kind: "event"
event.module: "logging-operator"
message: '${record["logging-operator"]["msg"]}'
log.level: '${record["logging-operator"]["level"]}'
remove_keys: "$['logging-operator']['level'],$['logging-operator']['msg']"
globalOutputRefs:
- default
{{- end }}

View file

@ -1,202 +0,0 @@
---
apiVersion: helm.toolkit.fluxcd.io/v2
kind: HelmRelease
metadata:
name: logging-operator
spec:
chart:
spec:
sourceRef:
kind: HelmRepository
name: kube-logging
chart: logging-operator
install:
{{- toYaml .Values.commons.helm.release.install | nindent 4 }}
test:
{{- toYaml .Values.commons.helm.release.test | nindent 4 }}
upgrade:
{{- toYaml .Values.commons.helm.release.upgrade | nindent 4 }}
driftDetection:
{{- toYaml .Values.commons.helm.release.driftDetection | nindent 4 }}
interval: 10m
values:
monitoring:
serviceMonitor:
enabled: {{ (.Capabilities.APIVersions.Has "monitoring.coreos.com/v1/ServiceMonitor") }}
additionalLabels:
{{- toYaml .Values.commons.prometheus.monitor.labels | nindent 10 }}
# resources for logging-operator
resources:
limits:
memory: 3Gi
requests:
cpu: 100m
memory: 128Mi
logging:
enabled: true
# fluentbit is used to collect data on nodes (so it is usefull to use hostPath)
fluentbit:
bufferStorageVolume:
hostPath:
path: "/var/lib/kube-logging/fluentbit/buffer"
positiondb:
hostPath:
path: "/var/lib/kube-logging/fluentbit/positiondb"
metrics:
prometheusRules: {{ (.Capabilities.APIVersions.Has "monitoring.coreos.com/v1/PrometheusRule") }}
serviceMonitor: {{ (.Capabilities.APIVersions.Has "monitoring.coreos.com/v1/ServiceMonitor") }}
serviceMonitorConfig:
additionalLabels:
{{- toYaml .Values.commons.prometheus.monitor.labels | nindent 14 }}
# fluentd is used to recieve data from fluentbit, filter (e.g. parse, grep) and forward output (e.g. loki)
fluentd:
scaling:
replicas: {{ .Values.fluentd.replicas }}
# resources for fluentd
resources:
limits:
memory: "2400M"
requests:
cpu: "500m"
memory: "200M"
metrics:
prometheusRules: {{ (.Capabilities.APIVersions.Has "monitoring.coreos.com/v1/PrometheusRule") }}
serviceMonitor: {{ (.Capabilities.APIVersions.Has "monitoring.coreos.com/v1/ServiceMonitor") }}
serviceMonitorConfig:
additionalLabels:
{{- toYaml .Values.commons.prometheus.monitor.labels | nindent 14 }}
bufferVolumeMetrics:
prometheusRules: {{ (.Capabilities.APIVersions.Has "monitoring.coreos.com/v1/PrometheusRule") }}
serviceMonitor: {{ (.Capabilities.APIVersions.Has "monitoring.coreos.com/v1/ServiceMonitor") }}
serviceMonitorConfig:
additionalLabels:
{{- toYaml .Values.commons.prometheus.monitor.labels | nindent 14 }}
tls:
# make problems on reinstall (maybe try it sometime again)
enabled: false
# allow clusteroutput from flow in other namespace
allowClusterResourcesFromAllNamespaces: true
enableRecreateWorkloadOnImmutableFieldChange: true
# log kubernetes events
eventTailer:
name: "default"
# forward errors to output
errorOutputRef: "default"
# if no (cluster)flow exits for pods:
# filter: drop log messages if they contains "debug"
# send logs: clusterOutput "default"
defaultFlow:
filters:
- grep:
exclude:
- key: "message"
pattern: /.*[Dd]ebug.*/
- prometheus:
metrics:
- name: "logs_defaultflow_count"
desc: "The total number of message in namespace"
type: "counter"
labels:
exported_namespace: "$.kubernetes.namespace_name"
exported_pod: "$.kubernetes.pod_name"
exported_container: "$.kubernetes.container_name"
image: "$.kubernetes.container_image"
app_kubernetes_io_name: "$['kubernetes']['labels']['app.kubernetes.io/name']"
app_kubernetes_io_instance: "$['kubernetes']['labels']['app.kubernetes.io/instance']"
globalOutputRefs:
- "default"
# usefull on elastic e.g. with dedot
globalFilters:
- prometheus:
metrics:
- name: "logs_all_count"
desc: "The total number of messages in namespace"
type: "counter"
labels:
exported_namespace: "$.kubernetes.namespace_name"
exported_pod: "$.kubernetes.pod_name"
exported_container: "$.kubernetes.container_name"
image: "$.kubernetes.container_image"
app_kubernetes_io_name: "$['kubernetes']['labels']['app.kubernetes.io/name']"
app_kubernetes_io_instance: "$['kubernetes']['labels']['app.kubernetes.io/instance']"
# deploy a clusteroutput (which all flows can use)
clusterOutputs:
- name: "default"
spec:
{{- if .Values.loki.enabled }}
# for loki:
# https://kube-logging.dev/docs/configuration/plugins/outputs/loki/
loki:
url: http://loki:3100
buffer:
timekey: 1m
timekey_wait: 30s
timekey_use_utc: true
# do not use configure_kubernetes_labels strip other kubernetes labels
extract_kubernetes_labels: true
labels:
# from configure_kubernetes_labels reimplement
host: $.kubernetes.host
namespace: $.kubernetes.namespace_name
pod: $.kubernetes.pod_name
pod_id: $.kubernetes.pod_id
container: $.kubernetes.container_name
container_id: $.kubernetes.docker_id
{{- else }}
nullout: {}
{{- end }}
# add some usefull default clusterFlows
clusterFlows:
# parse all data with logfmt of pod which contain label: kube_logging_parser=logfmt (and send to ClusterOutput default)
- name: logfmt
spec:
filters:
- parser:
reserve_data: true
remove_key_name_field: true
hash_value_field: "logfmt"
parse:
type: "multi_format"
patterns:
- format: "logfmt"
# fallback, just keep data unparsed
- format: "none"
match:
- select:
labels:
"kube_logging_parser": "logfmt"
globalOutputRefs:
- "default"
# parse all data with json of pod which contain label: kube_logging_parser=json (and send to ClusterOutput default)
- name: json
spec:
filters:
- parser:
reserve_data: true
remove_key_name_field: true
hash_value_field: "json"
parse:
type: "multi_format"
patterns:
- format: "json"
# fallback, just keep data unparsed
- format: "none"
match:
- select:
labels:
"kube_logging_parser": "json"
globalOutputRefs:
- "default"

View file

@ -66,8 +66,8 @@ spec:
enabled: false
grafanaAgent:
installOperator: false
lokiCanary:
enabled: false
lokiCanary:
enabled: false
test:
enabled: false
gateway:

View file

@ -0,0 +1,6 @@
# ConfigMap carrying the Vector agent configuration; referenced by the
# vector-agent HelmRelease via `existingConfigMaps: [vector]`.
apiVersion: v1
kind: ConfigMap
metadata:
  name: vector
data:
  # Rendered through `tpl` so .Values (e.g. loki.enabled) are resolved.
  vector.yaml: {{ tpl (.Files.Get "config/vector.yaml") . | quote }}

View file

@ -0,0 +1,24 @@
{{- /* Alert on any Vector component reporting errors; only rendered when the
       PrometheusRule CRD is available in the cluster. */}}
{{- if (.Capabilities.APIVersions.Has "monitoring.coreos.com/v1/PrometheusRule") }}
{{- /* Aggregate away per-instance labels so one alert fires per component. */}}
{{- $without := "instance,endpoint,container,pod,job,host" }}
---
apiVersion: "monitoring.coreos.com/v1"
kind: "PrometheusRule"
metadata:
  name: "vector"
  labels:
    {{- toYaml .Values.commons.prometheus.rules.labels | nindent 4 }}
spec:
  groups:
    - name: "Vector"
      rules:
        - alert: "Component Errors"
          expr: 'sum(increase(vector_component_errors_total[1h])) without ({{ $without }}) > 0'
          for: "1m"
          labels:
            severity: "warning"
          # Raw-string block so the {{ $labels }}/{{ $value }} alert templating
          # reaches Prometheus untouched by Helm.
          # Fixed: "compnent_type" typo; "request_failed" is not a label on
          # vector_component_errors_total, use $value (the error count) instead.
          annotations:
            {{`
            summary: "{{ $labels.component_kind }} {{ $labels.component_id }} ({{ $labels.component_type }}) has {{ $value }} errors in stage {{ $labels.stage }}"
            `}}
{{- end }}{{/* end-if */}}

View file

@ -0,0 +1,42 @@
---
# Flux HelmRelease deploying the upstream Vector chart in Agent (DaemonSet)
# role, using the config from the "vector" ConfigMap.
apiVersion: helm.toolkit.fluxcd.io/v2
kind: HelmRelease
metadata:
  name: vector-agent
spec:
  chart:
    spec:
      sourceRef:
        kind: HelmRepository
        name: vector
      chart: vector
  install:
    {{- toYaml .Values.commons.helm.release.install | nindent 4 }}
  test:
    {{- toYaml .Values.commons.helm.release.test | nindent 4 }}
  upgrade:
    {{- toYaml .Values.commons.helm.release.upgrade | nindent 4 }}
  driftDetection:
    {{- toYaml .Values.commons.helm.release.driftDetection | nindent 4 }}
  interval: 10m
  values:
    role: Agent
    # NOTE(review): DaemonSet updateStrategy expects rollingUpdate.maxSurge —
    # verify the chart accepts maxSurge at this level.
    updateStrategy:
      maxSurge: 1
    dataDir: "/vector-data-dir"
    logLevel: "warn"
    # Chart-managed config is replaced by our own ConfigMap (see configmap.yaml).
    existingConfigMaps:
      - vector
    # Expose the prometheus_exporter sink configured in vector.yaml.
    containerPorts:
      - name: prom-exporter
        containerPort: 9090
        protocol: TCP
    service:
      ports:
        - name: prom-exporter
          port: 9090
          protocol: TCP
    podMonitor:
      # Only enable when the PodMonitor CRD exists in the cluster.
      enabled: {{ (.Capabilities.APIVersions.Has "monitoring.coreos.com/v1/PodMonitor") }}
      additionalLabels:
        {{- toYaml .Values.commons.prometheus.monitor.labels | nindent 10 }}

View file

@ -2,8 +2,7 @@
apiVersion: source.toolkit.fluxcd.io/v1
kind: HelmRepository
metadata:
name: kube-logging
name: vector
spec:
url: oci://ghcr.io/kube-logging/helm-charts
type: oci
url: https://helm.vector.dev
interval: 10m

View file

@ -45,8 +45,5 @@ grafana:
# -- annotations of grafana dashboard configmap
annotations: {}
fluentd:
replicas: 1
loki:
enabled: true