fix(infra-certificates): improve monitoring
This commit is contained in:
parent
6539a4b62b
commit
41c880d22a
6 changed files with 1291 additions and 1 deletions
4
infra-certificates/grafana_dashboards/README.adoc
Normal file
4
infra-certificates/grafana_dashboards/README.adoc
Normal file
|
@ -0,0 +1,4 @@
|
|||
= Cert-Manager
|
||||
|
||||
Dashboard downloaded from:
|
||||
https://gitlab.com/uneeq-oss/cert-manager-mixin/-/blob/eae22f642aaa5d422e4766f6811df2158fc05539/dashboards/cert-manager.json
|
1203
infra-certificates/grafana_dashboards/certmanager.json
Normal file
1203
infra-certificates/grafana_dashboards/certmanager.json
Normal file
File diff suppressed because it is too large
Load diff
|
@ -0,0 +1,15 @@
|
|||
{{- /*
  Emits one ConfigMap per JSON file matched by grafana_dashboards/*.json.
  Labels and (optional) annotations are read from
  .Values.commons.grafana.dashboards.
*/ -}}
{{- $dashboards := $.Values.commons.grafana.dashboards }}
{{- range $file, $_ := $.Files.Glob "grafana_dashboards/*.json" }}
---
apiVersion: v1
kind: ConfigMap
metadata:
  # The ConfigMap name is derived from the dashboard file name,
  # extension included.
  name: grafana-dashboards-{{ base $file }}
  labels: {{- toYaml $dashboards.labels | nindent 4 }}
  {{- if $dashboards.annotations }}
  annotations: {{- toYaml $dashboards.annotations | nindent 4 }}
  {{- end }}
# Re-glob the single file so AsConfig renders it as a "<file name>: <content>"
# entry under data.
data: {{- ($.Files.Glob $file).AsConfig | nindent 2 }}
{{- end }}
|
|
@ -5,7 +5,10 @@ metadata:
|
|||
name: {{ .Release.Name }}-init
|
||||
namespace: "{{ .Values.init.namespace }}"
|
||||
data:
|
||||
{{- $isMonitoring := (.Capabilities.APIVersions.Has "monitoring.coreos.com/v1/ServiceMonitor") }}
|
||||
{{- $isMonitoring := and
|
||||
(.Capabilities.APIVersions.Has "monitoring.coreos.com/v1/ServiceMonitor")
|
||||
(.Capabilities.APIVersions.Has "monitoring.coreos.com/v1/PrometheusRule")
|
||||
}}
|
||||
monitoring: {{ $isMonitoring | quote }}
|
||||
{{- $isCertManager := (.Capabilities.APIVersions.Has "cert-manager.io/v1/ClusterIssuer") }}
|
||||
certmanager: {{ $isCertManager | quote }}
|
||||
|
|
55
infra-certificates/templates/prometheus-rule.yaml
Normal file
55
infra-certificates/templates/prometheus-rule.yaml
Normal file
|
@ -0,0 +1,55 @@
|
|||
{{- if .Capabilities.APIVersions.Has "monitoring.coreos.com/v1/PrometheusRule" }}
{{- /*
  cert-manager alerting rules. The whole document is rendered only when the
  cluster advertises the PrometheusRule API.
  $dropped lists the labels removed from every aggregation below.
  Alert summaries are wrapped in Go-template raw strings so that the
  Prometheus placeholders are emitted verbatim instead of being evaluated
  by Helm.
*/ -}}
{{- $dropped := "instance,endpoint,container,pod,service,job,namespace" }}
---
apiVersion: "monitoring.coreos.com/v1"
kind: "PrometheusRule"
metadata:
  name: "cert-manager"
  labels: {{- toYaml .Values.commons.prometheus.rules.labels | nindent 4 }}
spec:
  groups:
    - name: "CertManager"
      rules:
        # Critical: less than 86400 seconds (one day) left before expiry.
        - alert: "CertificateAboutToExpire"
          expr: '(min(certmanager_certificate_expiration_timestamp_seconds - time()) without ({{ $dropped }}) < 86400)'
          for: "1m"
          labels:
            severity: "critical"
            detectedBy: "CertManager"
          annotations:
            summary: {{ `"SSL certificate {{ $labels.name }} in namespace {{ $labels.exported_namespace }} by {{ $labels.issuer_kind }} {{ $labels.issuer_name }} will expire in {{ $value | humanizeDuration }}"` }}

        # Warning: same alert name, threshold widened to 86400 * 6 seconds
        # (six days).
        - alert: "CertificateAboutToExpire"
          expr: '(min(certmanager_certificate_expiration_timestamp_seconds - time()) without ({{ $dropped }}) < 86400 * 6)'
          for: "1m"
          labels:
            severity: "warning"
            detectedBy: "CertManager"
          annotations:
            summary: {{ `"SSL certificate {{ $labels.name }} in namespace {{ $labels.exported_namespace }} by {{ $labels.issuer_kind }} {{ $labels.issuer_name }} will expire in {{ $value | humanizeDuration }}."` }}

        # Fires when any ready-status series whose condition is not "True"
        # sums to more than zero.
        - alert: "CertManager CertificateReady"
          expr: '(sum(certmanager_certificate_ready_status{condition!="True"}) without ({{ $dropped }}, condition) > 0)'
          for: "1m"
          labels:
            severity: "critical"
          annotations:
            summary: {{ `"Certificate {{ $labels.name }} in namespace {{ $labels.exported_namespace }} is not ready by {{ $labels.issuer_kind }} {{ $labels.issuer_name }}"` }}

        # Fires when ACME client requests with status 429 occurred within
        # the last five minutes.
        - alert: "CertManager HittingRateLimits"
          expr: '(sum (rate(certmanager_http_acme_client_request_count{status="429"}[5m])) without ({{ $dropped }}) > 0)'
          for: "1m"
          labels:
            severity: "critical"
          annotations:
            summary: {{ `"Cert manager hitting rate limits for {{ $labels.host }}"` }}
{{- end }}
|
|
@ -13,5 +13,15 @@ commons:
|
|||
prometheus:
|
||||
monitor:
|
||||
labels: {}
|
||||
rules:
|
||||
labels: {}
|
||||
|
||||
grafana:
|
||||
datasource:
|
||||
labels:
|
||||
grafana_datasource: "1"
|
||||
dashboards:
|
||||
labels:
|
||||
grafana_dashboard: "1"
|
||||
|
||||
email: "an@example.org"
|
||||
|
|
Loading…
Add table
Reference in a new issue