{{- /*
PrometheusRule with the alerts for the Prometheus blackbox exporter.

Rendered only when .Values.prometheus.exporter.blackbox.enabled is set and the
cluster advertises the monitoring.coreos.com/v1 PrometheusRule API.

$filter is the label matcher shared by every expression below; it restricts
the rules to series whose job label starts with "probe/".

Every annotations block is wrapped in a backtick raw string so that the
double-curly placeholders inside it ($labels, $value, stripPort,
humanizeDuration) are emitted verbatim for Prometheus instead of being
evaluated by Helm.
*/ -}}
{{- if and .Values.prometheus.exporter.blackbox.enabled (.Capabilities.APIVersions.Has "monitoring.coreos.com/v1/PrometheusRule") }}
{{- $filter := `job=~"probe/.*"` }}
---
apiVersion: "monitoring.coreos.com/v1"
kind: "PrometheusRule"
metadata:
  name: prometheus-blackbox-exporter
  labels:
    {{- toYaml .Values.commons.prometheus.rules.labels | nindent 4 }}
spec:
  groups:
    # Health of the exporter itself and of probes against targets whose
    # instancetype label is not "server".
    - name: "Exporter It-Self"
      rules:
        # The scrape of a probe job has been failing (up == 0) for 5 minutes.
        - alert: "Blackbox-Exporter down"
          expr: 'up{ {{ $filter }} } == 0'
          for: "5m"
          labels:
            severity: "critical"
            detectedBy: "Blackbox"
          annotations:
          {{`
            summary: "Blackbox-Exporter to probe {{ $labels.instance | stripPort }} is not reachable for more than 5 minutes."
          `}}

        # Less than half of the probe_success series of a target (aggregated
        # over the job label) report success for 5 minutes.
        - alert: "BlackboxProbeFailed"
          expr: 'sum(probe_success{ {{ $filter }}, instancetype!="server"}) without (job) / count(probe_success{ {{ $filter }}, instancetype!="server"}) without (job) < 0.5'
          for: "5m"
          labels:
            severity: "critical"
            detectedBy: "Blackbox"
          annotations:
          {{`
            summary: "Probe of the target {{ $labels.instance | stripPort }} failed."
          `}}

    # Rules for targets labelled instancetype="server" and for the
    # probe_icmp_duration_seconds metric.
    # NOTE(review): presumably "server" targets are the ones probed via ICMP;
    # confirm against the probe configuration.
    - name: "ICMP"
      rules:
        # Same success-ratio check as BlackboxProbeFailed, restricted to
        # instancetype="server".
        - alert: "BlackboxProbeIcmpFailed"
          expr: 'sum(probe_success{ {{ $filter }}, instancetype="server"}) without (job) / count(probe_success{ {{ $filter }}, instancetype="server"}) without (job) < 0.5'
          for: "5m"
          labels:
            severity: "critical"
            detectedBy: "Blackbox"
          annotations:
          {{`
            summary: "Icmp request of the target {{ $labels.instance | stripPort }} failed for more than 5 minutes."
          `}}

        # The "resolve" phase of the ICMP probe reports a duration of exactly
        # 0 for 5 minutes, which this rule treats as a failed DNS lookup.
        - alert: "HostnameCouldNotBeResolvedByDNS"
          expr: 'probe_icmp_duration_seconds{ {{ $filter }}, phase="resolve"} == 0'
          for: "5m"
          labels:
            severity: "critical"
            detectedBy: "Blackbox"
          annotations:
          {{`
            summary: "Instance {{ $labels.instance | stripPort }} could not be resolved by blackbox-exporter while ICMP requests."
            description: "Could not resolve hostname for more than 5 minutes."
          `}}

        # The 1-minute average of the ICMP probe duration stays above
        # 1 second for 5 minutes.
        - alert: "SlowPing"
          expr: 'avg_over_time(probe_icmp_duration_seconds{ {{ $filter }} }[1m]) > 1'
          for: "5m"
          labels:
            severity: "warning"
            detectedBy: "Blackbox"
          annotations:
          {{`
            summary: "ping duration is {{ $value }} s for {{ $labels.instance | stripPort }}."
          `}}

    # Certificate lifetime checks. Both rules deliberately keep the same alert
    # name and differ only in threshold and severity; note that the warning
    # expression also matches while the critical one is firing.
    - name: "SSL Certificates"
      rules:
        # Remaining certificate lifetime is at most 86400 s (one day).
        - alert: "CertificateAboutToExpire"
          expr: 'probe_ssl_earliest_cert_expiry{ {{ $filter }} } - time() <= 86400'
          for: "1m"
          labels:
            severity: "critical"
            detectedBy: "Blackbox"
          annotations:
          {{`
            summary: "SSL certificate for domain {{ $labels.instance }} has almost expired"
          `}}

        # Remaining certificate lifetime is below 6 * 86400 s (six days).
        - alert: "CertificateAboutToExpire"
          expr: 'probe_ssl_earliest_cert_expiry{ {{ $filter }} } - time() < 86400 * 6'
          for: "1m"
          labels:
            severity: "warning"
            detectedBy: "Blackbox"
          annotations:
          {{`
            summary: "SSL certificate for domain {{ $labels.instance }} will expire in {{ $value | humanizeDuration }}."
          `}}

    - name: "HTTP Probes"
      rules:
        # Status code outside 200-399 for 5 minutes. Probes using the
        # "http_404" module are excluded.
        # NOTE(review): presumably that module expects a 404 answer; confirm
        # against the blackbox module configuration.
        - alert: "HttpStatusCode"
          expr: 'probe_http_status_code{ {{ $filter }}, module != "http_404" } <= 199 OR probe_http_status_code{ {{ $filter }}, module != "http_404" } >= 400'
          for: "5m"
          labels:
            severity: "critical"
            detectedBy: "Blackbox"
          annotations:
          {{`
            summary: "HTTP status code for domain {{ $labels.instance | stripPort }} is {{ $value }}"
          `}}

        # The 1-minute average of the HTTP probe duration stays above
        # 1 second for 5 minutes.
        - alert: "HttpSlowRequests"
          expr: 'avg_over_time(probe_http_duration_seconds{ {{ $filter }} }[1m]) > 1'
          for: "5m"
          labels:
            severity: "warning"
            detectedBy: "Blackbox"
          annotations:
          {{`
            summary: 'HTTP response time for domain {{ $labels.instance | stripPort }} is {{ printf "%.0f" $value }} s.'
          `}}
{{- end }}