From 2cbd22d4738ebe9c3cf4925ad4200b793d5ebf58 Mon Sep 17 00:00:00 2001
From: WrenIX
Date: Mon, 29 Jan 2024 14:50:23 +0100
Subject: [PATCH] fix(infra-monitoring): add alerts for blackbox-exporter

---
 .../exporter/blackbox/prometheus-rules.yaml | 138 ++++++++++++++++++
 1 file changed, 138 insertions(+)
 create mode 100644 infra-monitoring/templates/exporter/blackbox/prometheus-rules.yaml

diff --git a/infra-monitoring/templates/exporter/blackbox/prometheus-rules.yaml b/infra-monitoring/templates/exporter/blackbox/prometheus-rules.yaml
new file mode 100644
index 0000000..bf9d002
--- /dev/null
+++ b/infra-monitoring/templates/exporter/blackbox/prometheus-rules.yaml
@@ -0,0 +1,138 @@
+{{- /*
+PrometheusRule with alerts on blackbox-exporter probe metrics.
+Rendered only when the blackbox exporter is enabled in the values and the
+cluster offers the monitoring.coreos.com/v1 PrometheusRule API.
+$filter is the label matcher shared by all rules; it keeps series whose job
+label starts with "probe/".
+Annotation bodies sit in backtick raw strings so that Helm emits the
+Prometheus alert templating verbatim instead of evaluating it itself.
+*/ -}}
+{{- if and .Values.prometheus.exporter.blackbox.enabled (.Capabilities.APIVersions.Has "monitoring.coreos.com/v1/PrometheusRule") }}
+{{- $filter := `job=~"probe/.*"` }}
+---
+apiVersion: "monitoring.coreos.com/v1"
+kind: "PrometheusRule"
+metadata:
+  name: prometheus-blackbox-exporter
+  labels:
+    {{- toYaml .Values.commons.prometheus.rules.labels | nindent 4 }}
+spec:
+  groups:
+    # Exporter reachability (up == 0) and failed probes of non-"server" targets.
+    - name: "Exporter It-Self"
+      rules:
+        - alert: "Blackbox-Exporter down"
+          expr: 'up{ {{ $filter }} } == 0'
+          for: "5m"
+          labels:
+            severity: "critical"
+            detectedBy: "Blackbox"
+          annotations:
+            {{`
+            summary: "Blackbox-Exporter to probe {{ $labels.instance | stripPort }} is not reachable for more than 5 minutes."
+            `}}
+
+        - alert: "BlackboxProbeFailed"
+          expr: 'probe_success{ {{ $filter }}, instancetype!="server" } == 0'
+          for: "5m"
+          labels:
+            severity: "critical"
+            detectedBy: "Blackbox"
+          annotations:
+            {{`
+            summary: "Probe of the target {{ $labels.instance | stripPort }} failed."
+            `}}
+
+    # Failed probes of instancetype="server" targets plus ICMP timing checks.
+    - name: "ICMP"
+      rules:
+        - alert: "BlackboxProbeIcmpFailed"
+          expr: 'probe_success{ {{ $filter }}, instancetype="server" } == 0'
+          for: "5m"
+          labels:
+            severity: "critical"
+            detectedBy: "Blackbox"
+          annotations:
+            {{`
+            summary: "Icmp request of the target {{ $labels.instance | stripPort }} failed for more than 5 minutes."
+            `}}
+
+        # NOTE(review): presumably the resolve phase stays at 0 when the DNS
+        # lookup fails - confirm against the blackbox-exporter ICMP prober.
+        - alert: "HostnameCouldNotBeResolvedByDNS"
+          expr: 'probe_icmp_duration_seconds{ {{ $filter }}, phase="resolve"} == 0'
+          for: "5m"
+          labels:
+            severity: "critical"
+            detectedBy: "Blackbox"
+          annotations:
+            {{`
+            summary: "Instance {{ $labels.instance | stripPort }} could not be resolved by blackbox-exporter while ICMP requests."
+            description: "Could not resolve hostname for more than 5 minutes."
+            `}}
+
+        # One-minute average ICMP duration above 1 s, sustained for 5 minutes.
+        - alert: "SlowPing"
+          expr: 'avg_over_time(probe_icmp_duration_seconds{ {{ $filter }} }[1m]) > 1'
+          for: "5m"
+          labels:
+            severity: "warning"
+            detectedBy: "Blackbox"
+          annotations:
+            {{`
+            summary: "ping duration is {{ $value }} s for {{ $labels.instance | stripPort }}."
+            `}}
+
+    # Both rules share one alert name and differ only in threshold/severity:
+    # critical at <= 1 day (86400 s) left, warning below 6 days left.
+    - name: "SSL Certificates"
+      rules:
+        - alert: "CertificateAboutToExpire"
+          expr: 'probe_ssl_earliest_cert_expiry{ {{ $filter }} } - time() <= 86400'
+          for: "1m"
+          labels:
+            severity: "critical"
+            detectedBy: "Blackbox"
+          annotations:
+            {{`
+            summary: "SSL certificate for domain {{ $labels.instance }} has almost expired"
+            `}}
+
+        - alert: "CertificateAboutToExpire"
+          expr: 'probe_ssl_earliest_cert_expiry{ {{ $filter }} } - time() < 86400 * 6'
+          for: "1m"
+          labels:
+            severity: "warning"
+            detectedBy: "Blackbox"
+          annotations:
+            {{`
+            summary: "SSL certificate for domain {{ $labels.instance }} will expire in {{ $value | humanizeDuration }}."
+            `}}
+
+    # HTTP status and latency checks.
+    - name: "HTTP Probes"
+      rules:
+        # Any status outside 200-399 alerts; module "http_404" is excluded
+        # (presumably probes that expect a 404 - TODO confirm).
+        - alert: "HttpStatusCode"
+          expr: 'probe_http_status_code{ {{ $filter }}, module != "http_404" } <= 199 OR probe_http_status_code{ {{ $filter }}, module != "http_404" } >= 400'
+          for: "5m"
+          labels:
+            severity: "critical"
+            detectedBy: "Blackbox"
+          annotations:
+            {{`
+            summary: "HTTP status code for domain {{ $labels.instance | stripPort }} is {{ $value }}"
+            `}}
+
+        - alert: "HttpSlowRequests"
+          expr: 'avg_over_time(probe_http_duration_seconds{ {{ $filter }} }[1m]) > 1'
+          for: "5m"
+          labels:
+            severity: "warning"
+            detectedBy: "Blackbox"
+          annotations:
+            {{`
+            summary: 'HTTP response time for domain {{ $labels.instance | stripPort }} is {{ printf "%.0f" $value }} s.'
+            `}}
+{{- end }}