{{- /*
PrometheusRule with alerts for targets probed through the blackbox exporter.

Rendered only when the exporter is enabled in the chart values AND the cluster
advertises the monitoring.coreos.com/v1 PrometheusRule kind.

Annotation values contain Prometheus template expressions. Each value is
wrapped in a Go raw string so Helm emits it verbatim instead of trying to
evaluate it; only the value is wrapped, the YAML keys stay plain.
*/ -}}
{{- if and .Values.prometheus.exporter.blackbox.enabled (.Capabilities.APIVersions.Has "monitoring.coreos.com/v1/PrometheusRule") }}
{{- /* Label matcher shared by every expression: jobs whose name begins with probe/ */ -}}
{{- $filter := `job=~"probe/.*"` }}
---
apiVersion: "monitoring.coreos.com/v1"
kind: "PrometheusRule"
metadata:
  name: prometheus-blackbox-exporter
  labels:
    {{- toYaml .Values.commons.prometheus.rules.labels | nindent 4 }}
spec:
  groups:
    # Health of the probe jobs themselves.
    - name: "Exporter It-Self"
      rules:
        # A scrape of a probe job has been failing for 5 minutes.
        - alert: "Blackbox-Exporter down"
          expr: 'up{ {{ $filter }} } == 0'
          for: "5m"
          labels:
            severity: "critical"
            detectedBy: "Blackbox"
          annotations:
            summary: {{`"Blackbox-Exporter to probe {{ $labels.instance | stripPort }} is not reachable for more than 5 minutes."`}}

        # Less than half of the probe series of a non-server target
        # (aggregated over the job label) report success.
        - alert: "BlackboxProbeFailed"
          expr: 'sum(probe_success{ {{ $filter }}, instancetype!="server"}) without (job) / count(probe_success{ {{ $filter }}, instancetype!="server"}) without (job) < 0.5'
          for: "5m"
          labels:
            severity: "critical"
            detectedBy: "Blackbox"
          annotations:
            summary: {{`"Probe of the target {{ $labels.instance | stripPort }} failed."`}}

    - name: "ICMP"
      rules:
        # Same success-ratio check as BlackboxProbeFailed, restricted to
        # targets labelled instancetype="server".
        - alert: "BlackboxProbeIcmpFailed"
          expr: 'sum(probe_success{ {{ $filter }}, instancetype="server"}) without (job) / count(probe_success{ {{ $filter }}, instancetype="server"}) without (job) < 0.5'
          for: "5m"
          labels:
            severity: "critical"
            detectedBy: "Blackbox"
          annotations:
            summary: {{`"Icmp request of the target {{ $labels.instance | stripPort }} failed for more than 5 minutes."`}}

        # The "resolve" phase of the ICMP probe reports a duration of exactly 0.
        - alert: "HostnameCouldNotBeResolvedByDNS"
          expr: 'probe_icmp_duration_seconds{ {{ $filter }}, phase="resolve"} == 0'
          for: "5m"
          labels:
            severity: "critical"
            detectedBy: "Blackbox"
          annotations:
            summary: {{`"Instance {{ $labels.instance | stripPort }} could not be resolved by blackbox-exporter while ICMP requests."`}}
            description: {{`"Could not resolve hostname for more than 5 minutes."`}}

        # One-minute average ICMP duration stays above 1 second.
        - alert: "SlowPing"
          expr: 'avg_over_time(probe_icmp_duration_seconds{ {{ $filter }} }[1m]) > 1'
          for: "5m"
          labels:
            severity: "warning"
            detectedBy: "Blackbox"
          annotations:
            summary: {{`"ping duration is {{ $value }} s for {{ $labels.instance | stripPort }}."`}}

    # Both rules share one alert name and differ only in threshold and
    # severity; the warning expression also matches while the critical fires.
    - name: "SSL Certificates"
      rules:
        # Earliest certificate expiry is at most 86400 s (1 day) away.
        - alert: "CertificateAboutToExpire"
          expr: 'probe_ssl_earliest_cert_expiry{ {{ $filter }} } - time() <= 86400'
          for: "1m"
          labels:
            severity: "critical"
            detectedBy: "Blackbox"
          annotations:
            summary: {{`"SSL certificate for domain {{ $labels.instance }} has almost expired"`}}

        # Earliest certificate expiry is less than 6 * 86400 s (6 days) away.
        - alert: "CertificateAboutToExpire"
          expr: 'probe_ssl_earliest_cert_expiry{ {{ $filter }} } - time() < 86400 * 6'
          for: "1m"
          labels:
            severity: "warning"
            detectedBy: "Blackbox"
          annotations:
            summary: {{`"SSL certificate for domain {{ $labels.instance }} will expire in {{ $value | humanizeDuration }}."`}}

    - name: "HTTP Probes"
      rules:
        # Status code outside 200-399; probes using the http_404 module are
        # excluded from this check.
        - alert: "HttpStatusCode"
          expr: 'probe_http_status_code{ {{ $filter }}, module != "http_404" } <= 199 OR probe_http_status_code{ {{ $filter }}, module != "http_404" } >= 400'
          for: "5m"
          labels:
            severity: "critical"
            detectedBy: "Blackbox"
          annotations:
            summary: {{`"HTTP status code for domain {{ $labels.instance | stripPort }} is {{ $value }}"`}}

        # One-minute average HTTP duration stays above 1 second.
        # The summary is single-quoted because it embeds a double-quoted
        # printf format string.
        - alert: "HttpSlowRequests"
          expr: 'avg_over_time(probe_http_duration_seconds{ {{ $filter }} }[1m]) > 1'
          for: "5m"
          labels:
            severity: "warning"
            detectedBy: "Blackbox"
          annotations:
            summary: {{`'HTTP response time for domain {{ $labels.instance | stripPort }} is {{ printf "%.0f" $value }} s.'`}}
{{- end }}