feat(postgresql): add monitoring
This commit is contained in:
parent
b3a05ca3d8
commit
1efbbcfd15
10 changed files with 1657 additions and 5 deletions
|
@ -4,7 +4,7 @@ name: "postgresql"
|
|||
description: "A Helm chart for running PostgreSQL (Postgres) database"
|
||||
icon: https://wiki.postgresql.org/images/a/a4/PostgreSQL_logo.3colors.svg
|
||||
type: "application"
|
||||
version: "0.2.6"
|
||||
version: "0.3.0"
|
||||
# renovate: image=docker.io/library/postgres
|
||||
appVersion: "17.4-alpine"
|
||||
maintainers:
|
||||
|
|
|
@ -7,7 +7,7 @@ description: "A Helm chart for running PostgreSQL (Postgres) database"
|
|||
|
||||
# postgresql
|
||||
|
||||
  
|
||||
  
|
||||
|
||||
A Helm chart for running PostgreSQL (Postgres) database
|
||||
|
||||
|
@ -41,6 +41,26 @@ helm uninstall postgresql-release
|
|||
|
||||
## Values
|
||||
|
||||
### Monitoring
|
||||
|
||||
| Key | Type | Default | Description |
|
||||
|-----|------|---------|-------------|
|
||||
| grafana.dashboards.annotations | object | `{}` | annotations of configmap |
|
||||
| grafana.dashboards.enabled | bool | `false` | deploy grafana dashboard in configmap |
|
||||
| grafana.dashboards.labels | object | `{"grafana_dashboard":"1"}` | label of configmap |
|
||||
| prometheus.enabled | bool | `false` | add prometheus exporter sidecar |
|
||||
| prometheus.image.pullPolicy | string | `"IfNotPresent"` | This sets the pull policy for images. (could be overwritten by global.image.pullPolicy) |
|
||||
| prometheus.image.registry | string | `"docker.io"` | image registry (could be overwritten by global.image.registry) |
|
||||
| prometheus.image.repository | string | `"prometheuscommunity/postgres-exporter"` | image repository |
|
||||
| prometheus.image.tag | string | `"v0.17.1"` | image tag |
|
||||
| prometheus.rules.additionalRules | list | `[]` | add own rules to prometheusrules (rendered as an additional rule group next to the default alert rules) |
|
||||
| prometheus.rules.enabled | bool | `false` | deploy prometheusrules |
|
||||
| prometheus.rules.labels | object | `{}` | labels of prometheusrule |
|
||||
| prometheus.servicemonitor.enabled | bool | `false` | deploy servicemonitor |
|
||||
| prometheus.servicemonitor.labels | object | `{}` | label of servicemonitor |
|
||||
|
||||
### Other Values
|
||||
|
||||
| Key | Type | Default | Description |
|
||||
|-----|------|---------|-------------|
|
||||
| affinity | object | `{}` | |
|
||||
|
|
1412
postgresql/grafana_dashboards/postgres-overview.json
Normal file
1412
postgresql/grafana_dashboards/postgres-overview.json
Normal file
File diff suppressed because it is too large
Load diff
14
postgresql/templates/configmap_grafana_dashboards.yaml
Normal file
14
postgresql/templates/configmap_grafana_dashboards.yaml
Normal file
|
@ -0,0 +1,14 @@
|
|||
{{- if .Values.grafana.dashboards.enabled }}
|
||||
---
|
||||
apiVersion: v1
|
||||
kind: ConfigMap
|
||||
metadata:
|
||||
name: {{ include "postgresql.fullname" . }}-grafana-dashboards
|
||||
labels:
|
||||
{{- include "postgresql.labels" . | nindent 4 }}
|
||||
{{- /* Only render extra labels when some are set: toYaml on an empty map emits "{}", which is invalid after the chart labels above. */}}
{{- with .Values.grafana.dashboards.labels }}
{{- toYaml . | nindent 4 }}
{{- end }}
|
||||
annotations:
|
||||
{{- toYaml .Values.grafana.dashboards.annotations | nindent 4 }}
|
||||
data:
|
||||
{{- (.Files.Glob "grafana_dashboards/*.json" ).AsConfig | nindent 2 }}
|
||||
{{- end }}
|
83
postgresql/templates/prometheus-rules.yaml
Normal file
83
postgresql/templates/prometheus-rules.yaml
Normal file
|
@ -0,0 +1,83 @@
|
|||
{{- /* Render the PrometheusRule only when rules are enabled (single-argument "and" was redundant). */}}
{{- if .Values.prometheus.rules.enabled }}
|
||||
{{- $fullname := include "postgresql.fullname" . }}
|
||||
{{- $filter := printf `namespace="%s",service=~"%s.*"` .Release.Namespace $fullname }}
|
||||
---
|
||||
apiVersion: monitoring.coreos.com/v1
|
||||
kind: PrometheusRule
|
||||
metadata:
|
||||
name: {{ $fullname }}
|
||||
labels:
|
||||
{{- include "postgresql.labels" . | nindent 4 }}
|
||||
{{- with .Values.prometheus.rules.labels }}
|
||||
{{- toYaml . | nindent 4 }}
|
||||
{{- end }}
|
||||
spec:
|
||||
groups:
|
||||
- name: {{ $fullname }}-Default
|
||||
rules:
|
||||
- alert: "PostgreSQLDown"
|
||||
expr: 'avg(pg_up{ {{ $filter }} }) by (job, service, namespace) != 1'
|
||||
for: 1m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
{{`
|
||||
summary: "PostgreSQL is not processing queries: {{ $labels.service }}.{{ $labels.namespace }}"
|
||||
description: "{{ $labels.service }}.{{ $labels.namespace }} is rejecting query requests from the exporter, and thus probably not serving client queries either. User services should not be affected provided at least 1 node is still alive."
|
||||
`}}
|
||||
- alert: "PostgreSQLHighConnections"
|
||||
expr: 'sum(pg_stat_activity_count{ {{ $filter }} }) by (job, service, namespace) >= sum(pg_settings_max_connections{ {{ $filter }} }) by (job, service, namespace) - sum(pg_settings_superuser_reserved_connections{ {{ $filter }} }) by (job, service, namespace)'
|
||||
for: 1m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
{{`
|
||||
summary: "{{ $labels.service }}.{{ $labels.namespace }} has maxed out Postgres connections."
|
||||
description: "{{ $labels.service }}.{{ $labels.namespace }} is exceeding the currently configured maximum Postgres connection limit (current value: {{ $value }}). Services may be degraded - please take immediate action (you probably need to increase max_connections in the Docker image and re-deploy)."
|
||||
`}}
|
||||
- alert: "PostgreSQLHighConnections"
|
||||
expr: 'sum(pg_stat_activity_count{ {{ $filter }} }) by (job, service, namespace) >= (sum(pg_settings_max_connections{ {{ $filter }} }) by (job, service, namespace) - sum(pg_settings_superuser_reserved_connections{ {{ $filter }} }) by (job, service, namespace) ) * 0.8'
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
{{`
|
||||
summary: "{{ $labels.service }}.{{ $labels.namespace }} is over 80% of max Postgres connections."
|
||||
description: "{{ $labels.service }}.{{ $labels.namespace }} is exceeding 80% of the currently configured maximum Postgres connection limit (current value: {{ $value }}). Please check utilization graphs and confirm if this is normal service growth, abuse or an otherwise temporary condition or if new resources need to be provisioned (or the limits increased, which is most likely)."
|
||||
`}}
|
||||
- alert: "PostgreSQLSlowQueries"
|
||||
expr: 'avg(rate(pg_stat_activity_max_tx_duration{ {{ $filter }},datname!~"template.*" }[2m])) by (job, service, namespace, datname) > 2 * 60'
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
{{`
|
||||
summary: "PostgreSQL high number of slow queries on {{ $labels.service }}.{{ $labels.namespace }} for database {{ $labels.datname }}"
|
||||
description: "PostgreSQL high number of slow queries {{ $labels.service }}.{{ $labels.namespace }} for database {{ $labels.datname }} with a value of {{ $value }}"
|
||||
`}}
|
||||
- alert: "PostgreSQLQPS"
|
||||
expr: 'avg(irate(pg_stat_database_xact_commit{ {{ $filter }},datname!~"template.*"}[5m])+irate(pg_stat_database_xact_rollback{ {{ $filter }},datname!~"template.*"}[5m])) by (job, service, namespace, datname) > 10000'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
{{`
|
||||
description: "PostgreSQL high number of queries per second on {{ $labels.service }}.{{ $labels.namespace }} for database {{ $labels.datname }} with a value of {{ $value }}"
|
||||
summary: "PostgreSQL high number of queries per second {{ $labels.service }}.{{ $labels.namespace }} for database {{ $labels.datname }}"
|
||||
`}}
|
||||
- alert: "PostgreSQLCacheHitRatio"
|
||||
expr: 'avg(rate(pg_stat_database_blks_hit{ {{ $filter }},datname!~"template.*" }[5m])/(rate(pg_stat_database_blks_hit{ {{ $filter }},datname!~"template.*" }[5m])+rate(pg_stat_database_blks_read{ {{ $filter }},datname!~"template.*" }[5m]))) by (job,service, namespace, datname) < 0.98'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
{{`
|
||||
summary: "PostgreSQL low cache hit rate on {{ $labels.service }}.{{ $labels.namespace }} for database {{ $labels.datname }}"
|
||||
description: "PostgreSQL low cache hit rate on {{ $labels.service }}.{{ $labels.namespace }} for database {{ $labels.datname }} with a value of {{ $value }}"
|
||||
`}}
|
||||
{{- with .Values.prometheus.rules.additionalRules }}
|
||||
- name: {{ $fullname }}-Additional
|
||||
rules:
|
||||
{{- toYaml . | nindent 8 }}
|
||||
{{- end }}
|
||||
{{- end }}
|
10
postgresql/templates/secret.yaml
Normal file
10
postgresql/templates/secret.yaml
Normal file
|
@ -0,0 +1,10 @@
|
|||
---
|
||||
apiVersion: "v1"
|
||||
kind: "Secret"
|
||||
metadata:
|
||||
name: {{ include "postgresql.fullname" . }}
|
||||
labels:
|
||||
{{- include "postgresql.labels" . | nindent 4 }}
|
||||
data:
|
||||
{{- /* Quote the encoded value so an empty user renders as "" instead of a YAML null. */}}
user: {{ .Values.postgres.user | b64enc | quote }}
|
||||
{{- /* Quote the encoded value so an empty password renders as "" instead of a YAML null. */}}
password: {{ .Values.postgres.password | b64enc | quote }}
|
|
@ -11,5 +11,11 @@ spec:
|
|||
targetPort: postgresql
|
||||
protocol: TCP
|
||||
name: postgresql
|
||||
{{- if .Values.prometheus.enabled }}
|
||||
- port: 9187
|
||||
targetPort: metrics
|
||||
protocol: TCP
|
||||
name: metrics
|
||||
{{- end }}
|
||||
selector:
|
||||
{{- include "postgresql.selectorLabels" . | nindent 4 }}
|
||||
|
|
27
postgresql/templates/servicemonitor.yaml
Normal file
27
postgresql/templates/servicemonitor.yaml
Normal file
|
@ -0,0 +1,27 @@
|
|||
{{- /* Render the ServiceMonitor only when it is enabled (single-argument "and" was redundant). */}}
{{- if .Values.prometheus.servicemonitor.enabled }}
|
||||
apiVersion: monitoring.coreos.com/v1
|
||||
kind: ServiceMonitor
|
||||
metadata:
|
||||
name: {{ include "postgresql.fullname" . }}
|
||||
labels:
|
||||
{{- include "postgresql.labels" . | nindent 4 }}
|
||||
{{- with .Values.prometheus.servicemonitor.labels }}
|
||||
{{- toYaml . | nindent 4 }}
|
||||
{{- end }}
|
||||
spec:
|
||||
selector:
|
||||
matchLabels:
|
||||
{{- include "postgresql.selectorLabels" . | nindent 6 }}
|
||||
endpoints:
|
||||
- port: metrics
|
||||
path: "/metrics"
|
||||
{{- with .Values.prometheus.servicemonitor }}
|
||||
{{- with .interval }}
|
||||
interval: {{ . }}
|
||||
{{- end }}
|
||||
{{- with .scrapeTimeout }}
|
||||
scrapeTimeout: {{ . }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
|
|
@ -56,10 +56,16 @@ spec:
|
|||
imagePullPolicy: {{ coalesce $.Values.global.image.pullPolicy .pullPolicy }}
|
||||
{{- end }}
|
||||
env:
|
||||
- name: "POSTGRES_PASSWORD"
|
||||
value: {{ .Values.postgres.password | quote }}
|
||||
- name: "POSTGRES_USER"
|
||||
value: {{ .Values.postgres.user | quote }}
|
||||
valueFrom:
|
||||
secretKeyRef:
|
||||
name: {{ include "postgresql.fullname" . }}
|
||||
key: user
|
||||
- name: "POSTGRES_PASSWORD"
|
||||
valueFrom:
|
||||
secretKeyRef:
|
||||
name: {{ include "postgresql.fullname" . }}
|
||||
key: password
|
||||
- name: "PGPORT"
|
||||
value: {{ .Values.service.port | quote }}
|
||||
- name: "PGDATA"
|
||||
|
@ -85,6 +91,32 @@ spec:
|
|||
volumeMounts:
|
||||
- name: "data"
|
||||
mountPath: "/var/lib/postgresql/data"
|
||||
{{- if .Values.prometheus.enabled }}
|
||||
- name: "exporter"
|
||||
{{- with .Values.prometheus.image }}
|
||||
image: "{{ coalesce $.Values.global.image.registry .registry }}/{{ .repository }}:{{ .tag }}"
|
||||
imagePullPolicy: {{ coalesce $.Values.global.image.pullPolicy .pullPolicy }}
|
||||
{{- end }}
|
||||
args:
|
||||
- --collector.stat_statements
|
||||
env:
|
||||
- name: "DATA_SOURCE_USER"
|
||||
valueFrom:
|
||||
secretKeyRef:
|
||||
name: {{ include "postgresql.fullname" . }}
|
||||
key: user
|
||||
- name: "DATA_SOURCE_PASS"
|
||||
valueFrom:
|
||||
secretKeyRef:
|
||||
name: {{ include "postgresql.fullname" . }}
|
||||
key: password
|
||||
- name: "DATA_SOURCE_URI"
|
||||
{{- /* Cast the port with int: it is a float64 when read from values.yaml but an int64 when passed via --set, and %.0f breaks on the latter. */}}
value: {{ printf "127.0.0.1:%d/postgres?sslmode=disable" (int .Values.service.port) | quote }}
|
||||
ports:
|
||||
- name: metrics
|
||||
containerPort: 9187
|
||||
protocol: TCP
|
||||
{{- end }}
|
||||
{{- with .Values.nodeSelector }}
|
||||
nodeSelector:
|
||||
{{- toYaml . | nindent 8 }}
|
||||
|
|
|
@ -137,3 +137,51 @@ autoupgrade:
|
|||
# readOnlyRootFilesystem: true
|
||||
# runAsNonRoot: true
|
||||
# runAsUser: 1000
|
||||
|
||||
prometheus:
|
||||
# -- add prometheus exporter sidecar
|
||||
# @section -- Monitoring
|
||||
enabled: false
|
||||
servicemonitor:
|
||||
# -- deploy servicemonitor
|
||||
# @section -- Monitoring
|
||||
enabled: false
|
||||
# -- label of servicemonitor
|
||||
# @section -- Monitoring
|
||||
labels: {}
|
||||
rules:
|
||||
# -- deploy prometheusrules
|
||||
# @section -- Monitoring
|
||||
enabled: false
|
||||
# -- labels of prometheusrule
|
||||
# @section -- Monitoring
|
||||
labels: {}
|
||||
# -- add own rules to prometheusrules (rendered as an additional rule group next to the default alert rules)
|
||||
# @section -- Monitoring
|
||||
additionalRules: []
|
||||
image:
|
||||
# -- image registry (could be overwritten by global.image.registry)
|
||||
# @section -- Monitoring
|
||||
registry: docker.io
|
||||
# -- image repository
|
||||
# @section -- Monitoring
|
||||
repository: prometheuscommunity/postgres-exporter
|
||||
# -- This sets the pull policy for images. (could be overwritten by global.image.pullPolicy)
|
||||
# @section -- Monitoring
|
||||
pullPolicy: IfNotPresent
|
||||
# -- image tag
|
||||
# @section -- Monitoring
|
||||
tag: v0.17.1
|
||||
|
||||
grafana:
|
||||
dashboards:
|
||||
# -- deploy grafana dashboard in configmap
|
||||
# @section -- Monitoring
|
||||
enabled: false
|
||||
# -- label of configmap
|
||||
# @section -- Monitoring
|
||||
labels:
|
||||
grafana_dashboard: "1"
|
||||
# -- annotations of configmap
|
||||
# @section -- Monitoring
|
||||
annotations: {}
|
||||
|
|
Loading…
Add table
Reference in a new issue