feat(postgresql): add monitoring

This commit is contained in:
WrenIX 2025-04-11 10:08:12 +02:00
parent b3a05ca3d8
commit 1efbbcfd15
Signed by: wrenix
GPG key ID: 7AFDB012974B1BB5
10 changed files with 1657 additions and 5 deletions

View file

@ -4,7 +4,7 @@ name: "postgresql"
description: "A Helm chart for running PostgreSQL (Postgres) database"
icon: https://wiki.postgresql.org/images/a/a4/PostgreSQL_logo.3colors.svg
type: "application"
version: "0.2.6"
version: "0.3.0"
# renovate: image=docker.io/library/postgres
appVersion: "17.4-alpine"
maintainers:

View file

@ -7,7 +7,7 @@ description: "A Helm chart for running PostgreSQL (Postgres) database"
# postgresql
![Version: 0.2.6](https://img.shields.io/badge/Version-0.2.6-informational?style=flat-square) ![Type: application](https://img.shields.io/badge/Type-application-informational?style=flat-square) ![AppVersion: 17.4-alpine](https://img.shields.io/badge/AppVersion-17.4--alpine-informational?style=flat-square)
![Version: 0.3.0](https://img.shields.io/badge/Version-0.3.0-informational?style=flat-square) ![Type: application](https://img.shields.io/badge/Type-application-informational?style=flat-square) ![AppVersion: 17.4-alpine](https://img.shields.io/badge/AppVersion-17.4--alpine-informational?style=flat-square)
A Helm chart for running PostgreSQL (Postgres) database
@ -41,6 +41,26 @@ helm uninstall postgresql-release
## Values
### Monitoring
| Key | Type | Default | Description |
|-----|------|---------|-------------|
| grafana.dashboards.annotations | object | `{}` | annotations of configmap |
| grafana.dashboards.enabled | bool | `false` | deploy grafana dashboard in configmap |
| grafana.dashboards.labels | object | `{"grafana_dashboard":"1"}` | label of configmap |
| prometheus.enabled | bool | `false` | add prometheus exporter sidecar |
| prometheus.image.pullPolicy | string | `"IfNotPresent"` | This sets the pull policy for images. (could be overwritten by global.image.pullPolicy) |
| prometheus.image.registry | string | `"docker.io"` | image registry (could be overwritten by global.image.registry) |
| prometheus.image.repository | string | `"prometheuscommunity/postgres-exporter"` | image repository |
| prometheus.image.tag | string | `"v0.17.1"` | image tag |
| prometheus.rules.additionalRules | list | `[]` | add own rules to prometheusrules (rendered as an additional rule group next to the default alert rules) |
| prometheus.rules.enabled | bool | `false` | deploy prometheusrules |
| prometheus.rules.labels | object | `{}` | labels of prometheusrule |
| prometheus.servicemonitor.enabled | bool | `false` | deploy servicemonitor |
| prometheus.servicemonitor.labels | object | `{}` | label of servicemonitor |
### Other Values
| Key | Type | Default | Description |
|-----|------|---------|-------------|
| affinity | object | `{}` | |

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,14 @@
{{- /*
ConfigMap carrying the Grafana dashboards shipped with the chart.
Rendered only when grafana.dashboards.enabled is true; every file matching
grafana_dashboards/*.json becomes one entry under `data`.
*/ -}}
{{- if .Values.grafana.dashboards.enabled }}
---
apiVersion: v1
kind: ConfigMap
metadata:
  name: {{ include "postgresql.fullname" . }}-grafana-dashboards
  labels:
    {{- include "postgresql.labels" . | nindent 4 }}
    {{- /* `with` skips an empty map, which would otherwise render a stray `{}` below the chart labels */}}
    {{- with .Values.grafana.dashboards.labels }}
    {{- toYaml . | nindent 4 }}
    {{- end }}
  {{- with .Values.grafana.dashboards.annotations }}
  annotations:
    {{- toYaml . | nindent 4 }}
  {{- end }}
data:
  {{- (.Files.Glob "grafana_dashboards/*.json").AsConfig | nindent 2 }}
{{- end }}

View file

@ -0,0 +1,83 @@
{{- /*
PrometheusRule with the default PostgreSQL alerts of this chart.
Rendered only when prometheus.rules.enabled is true.

$filter restricts every expression to series of this release: same namespace
and a `service` label starting with the chart fullname.

Alert annotations are wrapped in raw strings so that Helm leaves the
Prometheus placeholders for labels and value untouched.
*/ -}}
{{- if .Values.prometheus.rules.enabled }}
{{- $fullname := include "postgresql.fullname" . }}
{{- $filter := printf `namespace="%s",service=~"%s.*"` .Release.Namespace $fullname }}
---
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: {{ $fullname }}
  labels:
    {{- include "postgresql.labels" . | nindent 4 }}
    {{- with .Values.prometheus.rules.labels }}
    {{- toYaml . | nindent 4 }}
    {{- end }}
spec:
  groups:
    - name: {{ $fullname }}-Default
      rules:
        - alert: "PostgreSQLDown"
          expr: 'avg(pg_up{ {{ $filter }} }) by (job, service, namespace) != 1'
          for: 1m
          labels:
            severity: critical
          annotations:
            {{`
            summary: "PostgreSQL is not processing queries: {{ $labels.service }}.{{ $labels.namespace }}"
            description: "{{ $labels.service }}.{{ $labels.namespace }} is rejecting query requests from the exporter, and thus probably not allowing client queries to work either. User services should not be affected provided at least 1 node is still alive."
            `}}
        # Critical: active connections reached max_connections minus the superuser-reserved ones.
        - alert: "PostgreSQLHighConnections"
          expr: 'sum(pg_stat_activity_count{ {{ $filter }} }) by (job, service, namespace) >= sum(pg_settings_max_connections{ {{ $filter }} }) by (job, service, namespace) - sum(pg_settings_superuser_reserved_connections{ {{ $filter }} }) by (job, service, namespace)'
          for: 1m
          labels:
            severity: critical
          annotations:
            {{`
            summary: "{{ $labels.service }}.{{ $labels.namespace }} has maxed out Postgres connections."
            description: "{{ $labels.service }}.{{ $labels.namespace }} is exceeding the currently configured maximum Postgres connection limit (current value: {{ $value }}). Services may be degraded - please take immediate action (you probably need to increase max_connections in the Docker image and re-deploy)."
            `}}
        # Warning: same alert name as above, but fires at 80% of the usable connections.
        - alert: "PostgreSQLHighConnections"
          expr: 'sum(pg_stat_activity_count{ {{ $filter }} }) by (job, service, namespace) >= (sum(pg_settings_max_connections{ {{ $filter }} }) by (job, service, namespace) - sum(pg_settings_superuser_reserved_connections{ {{ $filter }} }) by (job, service, namespace) ) * 0.8'
          for: 10m
          labels:
            severity: warning
          annotations:
            {{`
            summary: "{{ $labels.service }}.{{ $labels.namespace }} is over 80% of max Postgres connections."
            description: "{{ $labels.service }}.{{ $labels.namespace }} is exceeding 80% of the currently configured maximum Postgres connection limit (current value: {{ $value }}). Please check utilization graphs and confirm if this is normal service growth, abuse or an otherwise temporary condition or if new resources need to be provisioned (or the limits increased, which is most likely)."
            `}}
        # NOTE(review): rate() is applied to pg_stat_activity_max_tx_duration, which looks like a gauge - confirm the expression is intended.
        - alert: "PostgreSQLSlowQueries"
          expr: 'avg(rate(pg_stat_activity_max_tx_duration{ {{ $filter }},datname!~"template.*" }[2m])) by (job, service, namespace, datname) > 2 * 60'
          for: 2m
          labels:
            severity: warning
          annotations:
            {{`
            summary: "PostgreSQL high number of slow queries on {{ $labels.service }}.{{ $labels.namespace }} for database {{ $labels.datname }}"
            description: "PostgreSQL high number of slow queries on {{ $labels.service }}.{{ $labels.namespace }} for database {{ $labels.datname }} with a value of {{ $value }}"
            `}}
        - alert: "PostgreSQLQPS"
          expr: 'avg(irate(pg_stat_database_xact_commit{ {{ $filter }},datname!~"template.*"}[5m])+irate(pg_stat_database_xact_rollback{ {{ $filter }},datname!~"template.*"}[5m])) by (job, service, namespace, datname) > 10000'
          for: 5m
          labels:
            severity: warning
          annotations:
            {{`
            description: "PostgreSQL high number of queries per second on {{ $labels.service }}.{{ $labels.namespace }} for database {{ $labels.datname }} with a value of {{ $value }}"
            summary: "PostgreSQL high number of queries per second on {{ $labels.service }}.{{ $labels.namespace }} for database {{ $labels.datname }}"
            `}}
        - alert: "PostgreSQLCacheHitRatio"
          expr: 'avg(rate(pg_stat_database_blks_hit{ {{ $filter }},datname!~"template.*" }[5m])/(rate(pg_stat_database_blks_hit{ {{ $filter }},datname!~"template.*" }[5m])+rate(pg_stat_database_blks_read{ {{ $filter }},datname!~"template.*" }[5m]))) by (job,service, namespace, datname) < 0.98'
          for: 5m
          labels:
            severity: warning
          annotations:
            {{`
            summary: "PostgreSQL low cache hit rate on {{ $labels.service }}.{{ $labels.namespace }} for database {{ $labels.datname }}"
            description: "PostgreSQL low cache hit rate on {{ $labels.service }}.{{ $labels.namespace }} for database {{ $labels.datname }} with a value of {{ $value }}"
            `}}
    {{- /* user supplied rules go into their own group, only when any are configured */}}
    {{- with .Values.prometheus.rules.additionalRules }}
    - name: {{ $fullname }}-Additional
      rules:
        {{- toYaml . | nindent 8 }}
    {{- end }}
{{- end }}

View file

@ -0,0 +1,10 @@
{{- /*
Secret holding the PostgreSQL credentials taken from postgres.user and
postgres.password; the keys `user` and `password` are read via secretKeyRef.
*/ -}}
---
apiVersion: "v1"
kind: "Secret"
metadata:
  name: {{ include "postgresql.fullname" . }}
  labels:
    {{- include "postgresql.labels" . | nindent 4 }}
data:
  {{- /* toString: b64enc only accepts strings, so a value parsed as a number (e.g. an all-digit password) would abort rendering */}}
  user: {{ .Values.postgres.user | toString | b64enc | quote }}
  password: {{ .Values.postgres.password | toString | b64enc | quote }}

View file

@ -11,5 +11,11 @@ spec:
targetPort: postgresql
protocol: TCP
name: postgresql
{{- /* Extra service port "metrics" (9187), exposed only when prometheus.enabled is set; it targets the container port named "metrics". */}}
{{- if .Values.prometheus.enabled }}
- port: 9187
targetPort: metrics
protocol: TCP
name: metrics
{{- end }}
selector:
{{- include "postgresql.selectorLabels" . | nindent 4 }}

View file

@ -0,0 +1,27 @@
{{- /*
ServiceMonitor scraping the "metrics" port of the chart's Service on /metrics.
Rendered only when prometheus.servicemonitor.enabled is true.
NOTE(review): the Service only exposes the "metrics" port when
prometheus.enabled is set as well - confirm whether this template should also
be gated on prometheus.enabled.
*/ -}}
{{- if .Values.prometheus.servicemonitor.enabled }}
---
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
  name: {{ include "postgresql.fullname" . }}
  labels:
    {{- include "postgresql.labels" . | nindent 4 }}
    {{- with .Values.prometheus.servicemonitor.labels }}
    {{- toYaml . | nindent 4 }}
    {{- end }}
spec:
  selector:
    matchLabels:
      {{- include "postgresql.selectorLabels" . | nindent 6 }}
  endpoints:
    - port: metrics
      path: "/metrics"
      {{- /* interval and scrapeTimeout are optional and emitted only when set */}}
      {{- with .Values.prometheus.servicemonitor }}
      {{- with .interval }}
      interval: {{ . | quote }}
      {{- end }}
      {{- with .scrapeTimeout }}
      scrapeTimeout: {{ . | quote }}
      {{- end }}
      {{- end }}
{{- end }}

View file

@ -56,10 +56,16 @@ spec:
imagePullPolicy: {{ coalesce $.Values.global.image.pullPolicy .pullPolicy }}
{{- end }}
env:
- name: "POSTGRES_PASSWORD"
value: {{ .Values.postgres.password | quote }}
- name: "POSTGRES_USER"
value: {{ .Values.postgres.user | quote }}
valueFrom:
secretKeyRef:
name: {{ include "postgresql.fullname" . }}
key: user
- name: "POSTGRES_PASSWORD"
valueFrom:
secretKeyRef:
name: {{ include "postgresql.fullname" . }}
key: password
- name: "PGPORT"
value: {{ .Values.service.port | quote }}
- name: "PGDATA"
@ -85,6 +91,32 @@ spec:
volumeMounts:
- name: "data"
mountPath: "/var/lib/postgresql/data"
{{- /*
Optional exporter sidecar, added when prometheus.enabled is set.
It reads the credentials from the chart's Secret (keys `user` / `password`)
and connects to 127.0.0.1 on service.port; metrics are served on
container port 9187 ("metrics").
*/}}
{{- if .Values.prometheus.enabled }}
        - name: "exporter"
          {{- with .Values.prometheus.image }}
          image: "{{ coalesce $.Values.global.image.registry .registry }}/{{ .repository }}:{{ .tag }}"
          imagePullPolicy: {{ coalesce $.Values.global.image.pullPolicy .pullPolicy }}
          {{- end }}
          args:
            - --collector.stat_statements
          env:
            - name: "DATA_SOURCE_USER"
              valueFrom:
                secretKeyRef:
                  name: {{ include "postgresql.fullname" . }}
                  key: user
            - name: "DATA_SOURCE_PASS"
              valueFrom:
                secretKeyRef:
                  name: {{ include "postgresql.fullname" . }}
                  key: password
            - name: "DATA_SOURCE_URI"
              {{- /* `int` + %d: the port may arrive as float, int (e.g. via --set) or string; %.0f only formats floats */}}
              value: {{ printf "127.0.0.1:%d/postgres?sslmode=disable" (int .Values.service.port) | quote }}
          ports:
            - name: metrics
              containerPort: 9187
              protocol: TCP
{{- end }}
{{- with .Values.nodeSelector }}
nodeSelector:
{{- toYaml . | nindent 8 }}

View file

@ -137,3 +137,51 @@ autoupgrade:
# readOnlyRootFilesystem: true
# runAsNonRoot: true
# runAsUser: 1000
# Monitoring: exporter sidecar, ServiceMonitor and PrometheusRule settings.
prometheus:
# -- add prometheus exporter sidecar
# @section -- Monitoring
enabled: false
servicemonitor:
# -- deploy servicemonitor
# @section -- Monitoring
enabled: false
# -- labels of servicemonitor
# @section -- Monitoring
labels: {}
rules:
# -- deploy prometheusrules
# @section -- Monitoring
enabled: false
# -- labels of prometheusrule
# @section -- Monitoring
labels: {}
# -- add own rules to prometheusrules (rendered as an additional rule group next to the default alert rules)
# @section -- Monitoring
additionalRules: []
image:
# -- image registry (could be overwritten by global.image.registry)
# @section -- Monitoring
registry: docker.io
# -- image repository
# @section -- Monitoring
repository: prometheuscommunity/postgres-exporter
# -- This sets the pull policy for images. (could be overwritten by global.image.pullPolicy)
# @section -- Monitoring
pullPolicy: IfNotPresent
# -- image tag
# @section -- Monitoring
tag: v0.17.1
# Grafana dashboards shipped as a ConfigMap.
grafana:
dashboards:
# -- deploy grafana dashboard in configmap
# @section -- Monitoring
enabled: false
# -- labels of configmap
# @section -- Monitoring
labels:
grafana_dashboard: "1"
# -- annotations of configmap
# @section -- Monitoring
annotations: {}