fix(infra-certificates): improve monitoring
This commit is contained in:
parent
6539a4b62b
commit
41c880d22a
6 changed files with 1291 additions and 1 deletions
4
infra-certificates/grafana_dashboards/README.adoc
Normal file
4
infra-certificates/grafana_dashboards/README.adoc
Normal file
|
@ -0,0 +1,4 @@
|
||||||
|
= Cert-Manager
|
||||||
|
|
||||||
|
Dashboard downloaded from:
|
||||||
|
https://gitlab.com/uneeq-oss/cert-manager-mixin/-/blob/eae22f642aaa5d422e4766f6811df2158fc05539/dashboards/cert-manager.json
|
1203
infra-certificates/grafana_dashboards/certmanager.json
Normal file
1203
infra-certificates/grafana_dashboards/certmanager.json
Normal file
File diff suppressed because it is too large
Load diff
|
@ -0,0 +1,15 @@
|
||||||
|
{{- /*
Emits one ConfigMap per JSON file matched by grafana_dashboards/*.json.
Labels (and, when set, annotations) come from
.Values.commons.grafana.dashboards; the file itself becomes the single data
entry of its ConfigMap.
*/ -}}
{{- range $file, $_ := $.Files.Glob "grafana_dashboards/*.json" }}
{{- $dashboards := $.Values.commons.grafana.dashboards }}
---
apiVersion: v1
kind: ConfigMap
metadata:
  name: "grafana-dashboards-{{ base $file }}"
  labels:
    {{- $dashboards.labels | toYaml | nindent 4 }}
  {{- with $dashboards.annotations }}
  annotations:
    {{- . | toYaml | nindent 4 }}
  {{- end }}
data:
  {{- /* Glob again on the exact path so AsConfig renders only this file. */}}
  {{- ($.Files.Glob $file).AsConfig | nindent 2 }}
{{- end }}
|
|
@ -5,7 +5,10 @@ metadata:
|
||||||
name: {{ .Release.Name }}-init
|
name: {{ .Release.Name }}-init
|
||||||
namespace: "{{ .Values.init.namespace }}"
|
namespace: "{{ .Values.init.namespace }}"
|
||||||
data:
|
data:
|
||||||
{{- $isMonitoring := (.Capabilities.APIVersions.Has "monitoring.coreos.com/v1/ServiceMonitor") }}
|
{{- $isMonitoring := and
|
||||||
|
(.Capabilities.APIVersions.Has "monitoring.coreos.com/v1/ServiceMonitor")
|
||||||
|
(.Capabilities.APIVersions.Has "monitoring.coreos.com/v1/PrometheusRule")
|
||||||
|
}}
|
||||||
monitoring: {{ $isMonitoring | quote }}
|
monitoring: {{ $isMonitoring | quote }}
|
||||||
{{- $isCertManager := (.Capabilities.APIVersions.Has "cert-manager.io/v1/ClusterIssuer") }}
|
{{- $isCertManager := (.Capabilities.APIVersions.Has "cert-manager.io/v1/ClusterIssuer") }}
|
||||||
certmanager: {{ $isCertManager | quote }}
|
certmanager: {{ $isCertManager | quote }}
|
||||||
|
|
55
infra-certificates/templates/prometheus-rule.yaml
Normal file
55
infra-certificates/templates/prometheus-rule.yaml
Normal file
|
@ -0,0 +1,55 @@
|
||||||
|
{{- /*
PrometheusRule carrying the cert-manager alerts. Rendered only when the
cluster advertises the monitoring.coreos.com/v1 PrometheusRule API.
*/ -}}
{{- if .Capabilities.APIVersions.Has "monitoring.coreos.com/v1/PrometheusRule" }}
{{- /* Labels removed from every aggregation in the expressions below. */}}
{{- $dropped := "instance,endpoint,container,pod,service,job,namespace" }}
---
apiVersion: "monitoring.coreos.com/v1"
kind: "PrometheusRule"
metadata:
  name: "cert-manager"
  labels:
    {{- .Values.commons.prometheus.rules.labels | toYaml | nindent 4 }}
spec:
  groups:
    - name: "CertManager"
      rules:
        # Critical: expiration timestamp is less than 86400 s (one day) away.
        # The summary is wrapped in a raw string so Helm leaves the
        # Prometheus {{ ... }} placeholders untouched.
        - alert: "CertificateAboutToExpire"
          expr: >-
            (min(certmanager_certificate_expiration_timestamp_seconds - time())
            without ({{ $dropped }}) < 86400)
          for: "1m"
          labels:
            severity: "critical"
            detectedBy: "CertManager"
          annotations:
            summary: {{ `"SSL certificate {{ $labels.name }} in namespace {{ $labels.exported_namespace }} by {{ $labels.issuer_kind }} {{ $labels.issuer_name }} will expire in {{ $value | humanizeDuration }}"` }}

        # Warning: same check with a threshold of 86400 * 6 s (six days).
        - alert: "CertificateAboutToExpire"
          expr: >-
            (min(certmanager_certificate_expiration_timestamp_seconds - time())
            without ({{ $dropped }}) < 86400 * 6)
          for: "1m"
          labels:
            severity: "warning"
            detectedBy: "CertManager"
          annotations:
            summary: {{ `"SSL certificate {{ $labels.name }} in namespace {{ $labels.exported_namespace }} by {{ $labels.issuer_kind }} {{ $labels.issuer_name }} will expire in {{ $value | humanizeDuration }}."` }}

        # Fires when the ready-status series with a condition other than
        # "True" sum to more than zero for a certificate.
        - alert: "CertManager CertificateReady"
          expr: >-
            (sum(certmanager_certificate_ready_status{condition!="True"})
            without ({{ $dropped }}, condition) > 0)
          for: "1m"
          labels:
            severity: "critical"
          annotations:
            summary: {{ `"Certificate {{ $labels.name }} in namespace {{ $labels.exported_namespace }} is not ready by {{ $labels.issuer_kind }} {{ $labels.issuer_name }}"` }}

        # Fires when ACME client requests with status "429" have a non-zero
        # rate over the last five minutes.
        - alert: "CertManager HittingRateLimits"
          expr: >-
            (sum (rate(certmanager_http_acme_client_request_count{status="429"}[5m]))
            without ({{ $dropped }}) > 0)
          for: "1m"
          labels:
            severity: "critical"
          annotations:
            summary: {{ `"Cert manager hitting rate limits for {{ $labels.host }}"` }}
{{- end }}
|
|
@ -13,5 +13,15 @@ commons:
|
||||||
prometheus:
|
prometheus:
|
||||||
monitor:
|
monitor:
|
||||||
labels: {}
|
labels: {}
|
||||||
|
rules:
|
||||||
|
labels: {}
|
||||||
|
|
||||||
|
grafana:
|
||||||
|
datasource:
|
||||||
|
labels:
|
||||||
|
grafana_datasource: "1"
|
||||||
|
dashboards:
|
||||||
|
labels:
|
||||||
|
grafana_dashboard: "1"
|
||||||
|
|
||||||
email: "an@example.org"
|
email: "an@example.org"
|
||||||
|
|
Loading…
Add table
Reference in a new issue