fix(infra-certificates): improve monitoring

This commit is contained in:
WrenIX 2024-03-14 22:22:22 +01:00
parent 6539a4b62b
commit 41c880d22a
Signed by: wrenix
GPG key ID: 7AFDB012974B1BB5
6 changed files with 1291 additions and 1 deletions

View file

@ -0,0 +1,4 @@
= Cert-Manager
Dashboard downloaded from:
https://gitlab.com/uneeq-oss/cert-manager-mixin/-/blob/eae22f642aaa5d422e4766f6811df2158fc05539/dashboards/cert-manager.json

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,15 @@
{{- /*
Emit one ConfigMap per JSON file found under grafana_dashboards/ in the chart.
Each ConfigMap is named after the file's base name and carries the file as its
only data entry. Labels and optional annotations come from
.Values.commons.grafana.dashboards (presumably the selector a Grafana
dashboard sidecar watches for -- confirm against the Grafana deployment).
*/ -}}
{{- range $file, $_ := $.Files.Glob "grafana_dashboards/*.json" }}
{{- /* Looked up inside the loop so a chart without dashboards never touches these values. */}}
{{- $dashboards := $.Values.commons.grafana.dashboards }}
---
apiVersion: v1
kind: ConfigMap
metadata:
  name: grafana-dashboards-{{ base $file }}
  labels:
    {{- toYaml $dashboards.labels | nindent 4 }}
  {{- with $dashboards.annotations }}
  annotations:
    {{- toYaml . | nindent 4 }}
  {{- end }}
data:
  {{- /* Re-glob the single path so AsConfig renders "<basename>: <content>" for just this file. */}}
  {{- ($.Files.Glob $file).AsConfig | nindent 2 }}
{{- end }}

View file

@ -5,7 +5,10 @@ metadata:
name: {{ .Release.Name }}-init name: {{ .Release.Name }}-init
namespace: "{{ .Values.init.namespace }}" namespace: "{{ .Values.init.namespace }}"
data: data:
{{- $isMonitoring := (.Capabilities.APIVersions.Has "monitoring.coreos.com/v1/ServiceMonitor") }} {{- $isMonitoring := and
(.Capabilities.APIVersions.Has "monitoring.coreos.com/v1/ServiceMonitor")
(.Capabilities.APIVersions.Has "monitoring.coreos.com/v1/PrometheusRule")
}}
monitoring: {{ $isMonitoring | quote }} monitoring: {{ $isMonitoring | quote }}
{{- $isCertManager := (.Capabilities.APIVersions.Has "cert-manager.io/v1/ClusterIssuer") }} {{- $isCertManager := (.Capabilities.APIVersions.Has "cert-manager.io/v1/ClusterIssuer") }}
certmanager: {{ $isCertManager | quote }} certmanager: {{ $isCertManager | quote }}

View file

@ -0,0 +1,55 @@
{{- /*
PrometheusRule holding the cert-manager alerts. Rendered only when the cluster
advertises the monitoring.coreos.com/v1 PrometheusRule API.

The summary annotations contain Prometheus template expressions
({{ $labels.* }}, {{ $value }}); each one is wrapped in a Go raw string so Helm
passes it through untouched.
*/ -}}
{{- if .Capabilities.APIVersions.Has "monitoring.coreos.com/v1/PrometheusRule" }}
{{- /* Labels removed from every aggregation below via "without (...)". */}}
{{- $dropLabels := "instance,endpoint,container,pod,service,job,namespace" }}
---
apiVersion: 'monitoring.coreos.com/v1'
kind: 'PrometheusRule'
metadata:
  name: 'cert-manager'
  labels:
    {{- toYaml .Values.commons.prometheus.rules.labels | nindent 4 }}
spec:
  groups:
    - name: 'CertManager'
      rules:
        {{- /* Critical: remaining lifetime is below 86400 seconds (one day). */}}
        - alert: 'CertificateAboutToExpire'
          expr: '(min(certmanager_certificate_expiration_timestamp_seconds - time()) without ({{ $dropLabels }}) < 86400)'
          for: '1m'
          labels:
            severity: 'critical'
            detectedBy: 'CertManager'
          annotations:
            summary: {{ `"SSL certificate {{ $labels.name }} in namespace {{ $labels.exported_namespace }} by {{ $labels.issuer_kind }} {{ $labels.issuer_name }} will expire in {{ $value | humanizeDuration }}"` }}
        {{- /* Warning: same check with a threshold of 86400 * 6 seconds (six days). */}}
        - alert: 'CertificateAboutToExpire'
          expr: '(min(certmanager_certificate_expiration_timestamp_seconds - time()) without ({{ $dropLabels }}) < 86400 * 6)'
          for: '1m'
          labels:
            severity: 'warning'
            detectedBy: 'CertManager'
          annotations:
            summary: {{ `"SSL certificate {{ $labels.name }} in namespace {{ $labels.exported_namespace }} by {{ $labels.issuer_kind }} {{ $labels.issuer_name }} will expire in {{ $value | humanizeDuration }}."` }}
        {{- /* Fires when any ready-status series whose condition is not "True" sums above zero. */}}
        - alert: 'CertManager CertificateReady'
          expr: '(sum(certmanager_certificate_ready_status{condition!="True"}) without ({{ $dropLabels }}, condition) > 0)'
          for: '1m'
          labels:
            severity: 'critical'
          annotations:
            summary: {{ `"Certificate {{ $labels.name }} in namespace {{ $labels.exported_namespace }} is not ready by {{ $labels.issuer_kind }} {{ $labels.issuer_name }}"` }}
        {{- /* Fires when the 5m rate of ACME client requests answered with HTTP 429 is above zero. */}}
        - alert: 'CertManager HittingRateLimits'
          expr: '(sum (rate(certmanager_http_acme_client_request_count{status="429"}[5m])) without ({{ $dropLabels }}) > 0)'
          for: '1m'
          labels:
            severity: 'critical'
          annotations:
            summary: {{ `"Cert manager hitting rate limits for {{ $labels.host }}"` }}
{{- end }}

View file

@ -13,5 +13,15 @@ commons:
prometheus: prometheus:
monitor: monitor:
labels: {} labels: {}
rules:
labels: {}
grafana:
datasource:
labels:
grafana_datasource: "1"
dashboards:
labels:
grafana_dashboard: "1"
email: "an@example.org" email: "an@example.org"