From 1efbbcfd15e8edfeee4f257c8f360b41ee0fa2f1 Mon Sep 17 00:00:00 2001 From: WrenIX Date: Fri, 11 Apr 2025 10:08:12 +0200 Subject: [PATCH] feat(postgresql): add monitoring --- postgresql/Chart.yaml | 2 +- postgresql/README.md | 22 +- .../grafana_dashboards/postgres-overview.json | 1412 +++++++++++++++++ .../configmap_grafana_dashboards.yaml | 14 + postgresql/templates/prometheus-rules.yaml | 83 + postgresql/templates/secret.yaml | 10 + postgresql/templates/service.yaml | 6 + postgresql/templates/servicemonitor.yaml | 27 + postgresql/templates/statefulset.yaml | 38 +- postgresql/values.yaml | 48 + 10 files changed, 1657 insertions(+), 5 deletions(-) create mode 100644 postgresql/grafana_dashboards/postgres-overview.json create mode 100644 postgresql/templates/configmap_grafana_dashboards.yaml create mode 100644 postgresql/templates/prometheus-rules.yaml create mode 100644 postgresql/templates/secret.yaml create mode 100644 postgresql/templates/servicemonitor.yaml diff --git a/postgresql/Chart.yaml b/postgresql/Chart.yaml index 1597f80..537000b 100644 --- a/postgresql/Chart.yaml +++ b/postgresql/Chart.yaml @@ -4,7 +4,7 @@ name: "postgresql" description: "A Helm chart for running PostgreSQL (Postgres) database" icon: https://wiki.postgresql.org/images/a/a4/PostgreSQL_logo.3colors.svg type: "application" -version: "0.2.6" +version: "0.3.0" # renovate: image=docker.io/library/postgres appVersion: "17.4-alpine" maintainers: diff --git a/postgresql/README.md b/postgresql/README.md index 3b3dd88..0a1a6db 100644 --- a/postgresql/README.md +++ b/postgresql/README.md @@ -7,7 +7,7 @@ description: "A Helm chart for running PostgreSQL (Postgres) database" # postgresql -![Version: 0.2.6](https://img.shields.io/badge/Version-0.2.6-informational?style=flat-square) ![Type: application](https://img.shields.io/badge/Type-application-informational?style=flat-square) ![AppVersion: 17.4-alpine](https://img.shields.io/badge/AppVersion-17.4--alpine-informational?style=flat-square) 
+![Version: 0.3.0](https://img.shields.io/badge/Version-0.3.0-informational?style=flat-square) ![Type: application](https://img.shields.io/badge/Type-application-informational?style=flat-square) ![AppVersion: 17.4-alpine](https://img.shields.io/badge/AppVersion-17.4--alpine-informational?style=flat-square) A Helm chart for running PostgreSQL (Postgres) database @@ -41,6 +41,26 @@ helm uninstall postgresql-release ## Values +### Monitoring + +| Key | Type | Default | Description | +|-----|------|---------|-------------| +| grafana.dashboards.annotations | object | `{}` | annotations of configmap | +| grafana.dashboards.enabled | bool | `false` | deploy grafana dashboard in configmap | +| grafana.dashboards.labels | object | `{"grafana_dashboard":"1"}` | labels of configmap | +| prometheus.enabled | bool | `false` | add prometheus exporter sidecar | +| prometheus.image.pullPolicy | string | `"IfNotPresent"` | This sets the pull policy for images. (could be overwritten by global.image.pullPolicy) | +| prometheus.image.registry | string | `"docker.io"` | image registry (could be overwritten by global.image.registry) | +| prometheus.image.repository | string | `"prometheuscommunity/postgres-exporter"` | image repository | +| prometheus.image.tag | string | `"v0.17.1"` | image tag | +| prometheus.rules.additionalRules | list | `[]` | add own rules to prometheusrules (deployed in a separate group in addition to the default alert rules) | +| prometheus.rules.enabled | bool | `false` | deploy prometheusrules | +| prometheus.rules.labels | object | `{}` | labels of prometheusrule | +| prometheus.servicemonitor.enabled | bool | `false` | deploy servicemonitor | +| prometheus.servicemonitor.labels | object | `{}` | labels of servicemonitor | + +### Other Values + | Key | Type | Default | Description | |-----|------|---------|-------------| | affinity | object | `{}` | | diff --git a/postgresql/grafana_dashboards/postgres-overview.json b/postgresql/grafana_dashboards/postgres-overview.json new file mode 100644 
index 0000000..9bf41be --- /dev/null +++ b/postgresql/grafana_dashboards/postgres-overview.json @@ -0,0 +1,1412 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": "-- Grafana --", + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "description": "Performance metrics for Postgres", + "editable": true, + "gnetId": 455, + "graphTooltip": 0, + "id": 1, + "iteration": 1603191461722, + "links": [], + "panels": [ + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "editable": true, + "error": false, + "fieldConfig": { + "defaults": { + "custom": {} + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "grid": {}, + "gridPos": { + "h": 7, + "w": 20, + "x": 0, + "y": 0 + }, + "hiddenSeries": false, + "id": 1, + "isNew": true, + "legend": { + "alignAsTable": true, + "avg": true, + "current": false, + "max": true, + "min": true, + "rightSide": true, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "connected", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.2.1", + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "alias": "fetched", + "dsType": "prometheus", + "expr": "sum(irate(pg_stat_database_tup_fetched{datname=~\"$db\",job=~\"$job\",instance=~\"$instance\"}[$__rate_interval]))", + "format": "time_series", + "groupBy": [ + { + "params": [ + "$interval" + ], + "type": "time" + }, + { + "params": [ + "null" + ], + "type": "fill" + } + ], + "intervalFactor": 2, + "legendFormat": "fetched", + "measurement": "postgresql", + "policy": "default", + "refId": "A", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "tup_fetched" + ], + "type": 
"field" + }, + { + "params": [], + "type": "mean" + }, + { + "params": [ + "10s" + ], + "type": "non_negative_derivative" + } + ] + ], + "step": 120, + "tags": [ + { + "key": "instance", + "operator": "=~", + "value": "/^$instance$/" + } + ] + }, + { + "alias": "fetched", + "dsType": "prometheus", + "expr": "sum(irate(pg_stat_database_tup_returned{datname=~\"$db\",job=~\"$job\",instance=~\"$instance\"}[$__rate_interval]))", + "format": "time_series", + "groupBy": [ + { + "params": [ + "$interval" + ], + "type": "time" + }, + { + "params": [ + "null" + ], + "type": "fill" + } + ], + "intervalFactor": 2, + "legendFormat": "returned", + "measurement": "postgresql", + "policy": "default", + "refId": "B", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "tup_fetched" + ], + "type": "field" + }, + { + "params": [], + "type": "mean" + }, + { + "params": [ + "10s" + ], + "type": "non_negative_derivative" + } + ] + ], + "step": 120, + "tags": [ + { + "key": "instance", + "operator": "=~", + "value": "/^$instance$/" + } + ] + }, + { + "alias": "fetched", + "dsType": "prometheus", + "expr": "sum(irate(pg_stat_database_tup_inserted{datname=~\"$db\",job=~\"$job\",instance=~\"$instance\"}[$__rate_interval]))", + "format": "time_series", + "groupBy": [ + { + "params": [ + "$interval" + ], + "type": "time" + }, + { + "params": [ + "null" + ], + "type": "fill" + } + ], + "intervalFactor": 2, + "legendFormat": "inserted", + "measurement": "postgresql", + "policy": "default", + "refId": "C", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "tup_fetched" + ], + "type": "field" + }, + { + "params": [], + "type": "mean" + }, + { + "params": [ + "10s" + ], + "type": "non_negative_derivative" + } + ] + ], + "step": 120, + "tags": [ + { + "key": "instance", + "operator": "=~", + "value": "/^$instance$/" + } + ] + }, + { + "alias": "fetched", + "dsType": "prometheus", + "expr": 
"sum(irate(pg_stat_database_tup_updated{datname=~\"$db\",job=~\"$job\",instance=~\"$instance\"}[$__rate_interval]))", + "format": "time_series", + "groupBy": [ + { + "params": [ + "$interval" + ], + "type": "time" + }, + { + "params": [ + "null" + ], + "type": "fill" + } + ], + "intervalFactor": 2, + "legendFormat": "updated", + "measurement": "postgresql", + "policy": "default", + "refId": "D", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "tup_fetched" + ], + "type": "field" + }, + { + "params": [], + "type": "mean" + }, + { + "params": [ + "10s" + ], + "type": "non_negative_derivative" + } + ] + ], + "step": 120, + "tags": [ + { + "key": "instance", + "operator": "=~", + "value": "/^$instance$/" + } + ] + }, + { + "alias": "fetched", + "dsType": "prometheus", + "expr": "sum(irate(pg_stat_database_tup_deleted{datname=~\"$db\",job=~\"$job\",instance=~\"$instance\"}[$__rate_interval]))", + "format": "time_series", + "groupBy": [ + { + "params": [ + "$interval" + ], + "type": "time" + }, + { + "params": [ + "null" + ], + "type": "fill" + } + ], + "intervalFactor": 2, + "legendFormat": "deleted", + "measurement": "postgresql", + "policy": "default", + "refId": "E", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "tup_fetched" + ], + "type": "field" + }, + { + "params": [], + "type": "mean" + }, + { + "params": [ + "10s" + ], + "type": "non_negative_derivative" + } + ] + ], + "step": 120, + "tags": [ + { + "key": "instance", + "operator": "=~", + "value": "/^$instance$/" + } + ] + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Rows", + "tooltip": { + "msResolution": true, + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true 
+ }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(245, 54, 54, 0.9)", + "rgba(237, 129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)" + ], + "datasource": "$datasource", + "decimals": 0, + "editable": true, + "error": false, + "fieldConfig": { + "defaults": { + "custom": {} + }, + "overrides": [] + }, + "format": "none", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "gridPos": { + "h": 3, + "w": 4, + "x": 20, + "y": 0 + }, + "height": "55px", + "id": 11, + "interval": null, + "isNew": true, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": true, + "lineColor": "rgb(31, 120, 193)", + "show": true + }, + "tableColumn": "", + "targets": [ + { + "dsType": "prometheus", + "expr": "sum(irate(pg_stat_database_xact_commit{datname=~\"$db\",job=~\"$job\",instance=~\"$instance\"}[$__rate_interval])) + sum(irate(pg_stat_database_xact_rollback{datname=~\"$db\",job=~\"$job\",instance=~\"$instance\"}[$__rate_interval]))", + "format": "time_series", + "groupBy": [ + { + "params": [ + "$interval" + ], + "type": "time" + }, + { + "params": [ + "null" + ], + "type": "fill" + } + ], + "intervalFactor": 2, + "measurement": "postgresql", + "policy": "default", + "refId": "A", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "xact_commit" + ], + "type": 
"field" + }, + { + "params": [], + "type": "mean" + }, + { + "params": [ + "10s" + ], + "type": "non_negative_derivative" + } + ] + ], + "step": 1800, + "tags": [ + { + "key": "instance", + "operator": "=~", + "value": "/^$instance$/" + } + ] + } + ], + "thresholds": "", + "title": "QPS", + "transparent": true, + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "avg" + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "decimals": 1, + "editable": true, + "error": false, + "fieldConfig": { + "defaults": { + "custom": {} + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "grid": {}, + "gridPos": { + "h": 7, + "w": 12, + "x": 0, + "y": 7 + }, + "hiddenSeries": false, + "id": 2, + "isNew": true, + "legend": { + "alignAsTable": true, + "avg": true, + "current": false, + "hideZero": true, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "connected", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.2.1", + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "alias": "Buffers Allocated", + "dsType": "prometheus", + "expr": "irate(pg_stat_bgwriter_buffers_alloc{job=~\"$job\",instance=~\"$instance\"}[$__rate_interval])", + "format": "time_series", + "groupBy": [ + { + "params": [ + "$interval" + ], + "type": "time" + }, + { + "params": [ + "null" + ], + "type": "fill" + } + ], + "intervalFactor": 2, + "legendFormat": "buffers_alloc", + "measurement": "postgresql", + "policy": "default", + "refId": "A", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "buffers_alloc" + ], + "type": "field" + }, + { + 
"params": [], + "type": "mean" + }, + { + "params": [], + "type": "difference" + } + ] + ], + "step": 240, + "tags": [ + { + "key": "instance", + "operator": "=~", + "value": "/^$instance$/" + } + ] + }, + { + "alias": "Buffers Allocated", + "dsType": "prometheus", + "expr": "irate(pg_stat_bgwriter_buffers_backend_fsync{job=~\"$job\",instance=~\"$instance\"}[$__rate_interval])", + "format": "time_series", + "groupBy": [ + { + "params": [ + "$interval" + ], + "type": "time" + }, + { + "params": [ + "null" + ], + "type": "fill" + } + ], + "intervalFactor": 2, + "legendFormat": "buffers_backend_fsync", + "measurement": "postgresql", + "policy": "default", + "refId": "B", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "buffers_alloc" + ], + "type": "field" + }, + { + "params": [], + "type": "mean" + }, + { + "params": [], + "type": "difference" + } + ] + ], + "step": 240, + "tags": [ + { + "key": "instance", + "operator": "=~", + "value": "/^$instance$/" + } + ] + }, + { + "alias": "Buffers Allocated", + "dsType": "prometheus", + "expr": "irate(pg_stat_bgwriter_buffers_backend{job=~\"$job\",instance=~\"$instance\"}[$__rate_interval])", + "format": "time_series", + "groupBy": [ + { + "params": [ + "$interval" + ], + "type": "time" + }, + { + "params": [ + "null" + ], + "type": "fill" + } + ], + "intervalFactor": 2, + "legendFormat": "buffers_backend", + "measurement": "postgresql", + "policy": "default", + "refId": "C", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "buffers_alloc" + ], + "type": "field" + }, + { + "params": [], + "type": "mean" + }, + { + "params": [], + "type": "difference" + } + ] + ], + "step": 240, + "tags": [ + { + "key": "instance", + "operator": "=~", + "value": "/^$instance$/" + } + ] + }, + { + "alias": "Buffers Allocated", + "dsType": "prometheus", + "expr": "irate(pg_stat_bgwriter_buffers_clean{job=~\"$job\",instance=~\"$instance\"}[$__rate_interval])", + "format": "time_series", + "groupBy": [ 
+ { + "params": [ + "$interval" + ], + "type": "time" + }, + { + "params": [ + "null" + ], + "type": "fill" + } + ], + "intervalFactor": 2, + "legendFormat": "buffers_clean", + "measurement": "postgresql", + "policy": "default", + "refId": "D", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "buffers_alloc" + ], + "type": "field" + }, + { + "params": [], + "type": "mean" + }, + { + "params": [], + "type": "difference" + } + ] + ], + "step": 240, + "tags": [ + { + "key": "instance", + "operator": "=~", + "value": "/^$instance$/" + } + ] + }, + { + "alias": "Buffers Allocated", + "dsType": "prometheus", + "expr": "irate(pg_stat_bgwriter_buffers_checkpoint{job=~\"$job\",instance=~\"$instance\"}[$__rate_interval])", + "format": "time_series", + "groupBy": [ + { + "params": [ + "$interval" + ], + "type": "time" + }, + { + "params": [ + "null" + ], + "type": "fill" + } + ], + "intervalFactor": 2, + "legendFormat": "buffers_checkpoint", + "measurement": "postgresql", + "policy": "default", + "refId": "E", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "buffers_alloc" + ], + "type": "field" + }, + { + "params": [], + "type": "mean" + }, + { + "params": [], + "type": "difference" + } + ] + ], + "step": 240, + "tags": [ + { + "key": "instance", + "operator": "=~", + "value": "/^$instance$/" + } + ] + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Buffers", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + 
"aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "editable": true, + "error": false, + "fieldConfig": { + "defaults": { + "custom": {} + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "grid": {}, + "gridPos": { + "h": 7, + "w": 12, + "x": 12, + "y": 7 + }, + "hiddenSeries": false, + "id": 3, + "isNew": true, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "connected", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.2.1", + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "alias": "conflicts", + "dsType": "prometheus", + "expr": "sum(rate(pg_stat_database_deadlocks{datname=~\"$db\",job=~\"$job\",instance=~\"$instance\"}[$__rate_interval]))", + "format": "time_series", + "groupBy": [ + { + "params": [ + "$interval" + ], + "type": "time" + }, + { + "params": [ + "null" + ], + "type": "fill" + } + ], + "intervalFactor": 2, + "legendFormat": "deadlocks", + "measurement": "postgresql", + "policy": "default", + "refId": "A", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "conflicts" + ], + "type": "field" + }, + { + "params": [], + "type": "mean" + }, + { + "params": [], + "type": "difference" + } + ] + ], + "step": 240, + "tags": [ + { + "key": "instance", + "operator": "=~", + "value": "/^$instance$/" + } + ] + }, + { + "alias": "deadlocks", + "dsType": "prometheus", + "expr": "sum(rate(pg_stat_database_conflicts{datname=~\"$db\",job=~\"$job\",instance=~\"$instance\"}[$__rate_interval]))", + "format": "time_series", + "groupBy": [ + { + "params": [ + "$interval" + ], + "type": "time" + }, + { + "params": [ + "null" + ], + "type": "fill" + } + ], + 
"intervalFactor": 2, + "legendFormat": "conflicts", + "measurement": "postgresql", + "policy": "default", + "refId": "B", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "deadlocks" + ], + "type": "field" + }, + { + "params": [], + "type": "mean" + }, + { + "params": [], + "type": "difference" + } + ] + ], + "step": 240, + "tags": [ + { + "key": "instance", + "operator": "=~", + "value": "/^$instance$/" + } + ] + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Conflicts/Deadlocks", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "editable": true, + "error": false, + "fieldConfig": { + "defaults": { + "custom": {} + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "grid": {}, + "gridPos": { + "h": 7, + "w": 12, + "x": 0, + "y": 14 + }, + "hiddenSeries": false, + "id": 12, + "isNew": true, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "connected", + "options": { + "alertThreshold": true + }, + "percentage": true, + "pluginVersion": "7.2.1", + "pointradius": 1, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum by (datname) 
(rate(pg_stat_database_blks_hit{datname=~\"$db\",job=~\"$job\",instance=~\"$instance\"}[$__rate_interval])) / (sum by (datname)(rate(pg_stat_database_blks_hit{datname=~\"$db\",job=~\"$job\",instance=~\"$instance\"}[$__rate_interval])) + sum by (datname)(rate(pg_stat_database_blks_read{datname=~\"$db\",job=~\"$job\",instance=~\"$instance\"}[$__rate_interval])))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{datname}} - cache hit rate", + "refId": "A", + "step": 240 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Cache hit ratio", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "percentunit", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "editable": true, + "error": false, + "fieldConfig": { + "defaults": { + "custom": {} + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "grid": {}, + "gridPos": { + "h": 7, + "w": 12, + "x": 12, + "y": 14 + }, + "hiddenSeries": false, + "id": 13, + "isNew": true, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "connected", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.2.1", + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, 
+ "targets": [ + { + "expr": "pg_stat_database_numbackends{datname=~\"$db\",job=~\"$job\",instance=~\"$instance\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{datname}} - {{__name__}}", + "refId": "A", + "step": 240 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Number of active connections", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + } + ], + "refresh": false, + "schemaVersion": 26, + "style": "dark", + "tags": [ + "postgres" + ], + "templating": { + "list": [ + { + "hide": 0, + "includeAll": false, + "label": "Data Source", + "multi": false, + "name": "datasource", + "options": [], + "query": "prometheus", + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "type": "datasource" + }, + { + "allValue": ".+", + "datasource": "$datasource", + "definition": "label_values(pg_up, job)", + "hide": 0, + "includeAll": true, + "label": "job", + "multi": true, + "name": "job", + "options": [], + "query": "label_values(pg_up, job)", + "refresh": 0, + "regex": "", + "skipUrlSync": false, + "sort": 0, + "tagValuesQuery": "", + "tags": [], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": ".+", + "datasource": "$datasource", + "definition": "", + "hide": 0, + "includeAll": true, + "label": "instance", + "multi": true, + "name": "instance", + "options": [], + "query": "label_values(up{job=~\"$job\"},instance)", + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 0, + "tagValuesQuery": 
"", + "tags": [], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": ".+", + "datasource": "$datasource", + "definition": "label_values(pg_stat_database_tup_fetched{instance=~\"$instance\",datname!~\"template.*|postgres\"},datname)", + "hide": 0, + "includeAll": true, + "label": "db", + "multi": false, + "name": "db", + "options": [], + "query": "label_values(pg_stat_database_tup_fetched{instance=~\"$instance\",datname!~\"template.*|postgres\"},datname)", + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 0, + "tagValuesQuery": "", + "tags": [], + "tagsQuery": "", + "type": "query", + "useTags": false + } + ] + }, + "time": { + "from": "now-1h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "browser", + "title": "Postgres Overview", + "uid": "wGgaPlciz", + "version": 5 +} diff --git a/postgresql/templates/configmap_grafana_dashboards.yaml b/postgresql/templates/configmap_grafana_dashboards.yaml new file mode 100644 index 0000000..525c7d3 --- /dev/null +++ b/postgresql/templates/configmap_grafana_dashboards.yaml @@ -0,0 +1,14 @@ +{{- if .Values.grafana.dashboards.enabled }} +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: {{ include "postgresql.fullname" . }}-grafana-dashboards + labels: + {{- include "postgresql.labels" . 
| nindent 4 }} + {{- toYaml .Values.grafana.dashboards.labels | nindent 4 }} + annotations: + {{- toYaml .Values.grafana.dashboards.annotations | nindent 4 }} +data: + {{- (.Files.Glob "grafana_dashboards/*.json" ).AsConfig | nindent 2 }} +{{- end }} diff --git a/postgresql/templates/prometheus-rules.yaml b/postgresql/templates/prometheus-rules.yaml new file mode 100644 index 0000000..3e570f4 --- /dev/null +++ b/postgresql/templates/prometheus-rules.yaml @@ -0,0 +1,83 @@ +{{- if and .Values.prometheus.rules.enabled }} +{{- $fullname := include "postgresql.fullname" . }} +{{- $filter := printf `namespace="%s",service=~"%s.*"` .Release.Namespace $fullname }} +--- +apiVersion: monitoring.coreos.com/v1 +kind: PrometheusRule +metadata: + name: {{ $fullname }} + labels: + {{- include "postgresql.labels" . | nindent 4 }} + {{- with .Values.prometheus.rules.labels }} + {{- toYaml . | nindent 4 }} + {{- end }} +spec: + groups: + - name: {{ $fullname }}-Default + rules: + - alert: "PostgreSQLDown" + expr: 'avg(pg_up{ {{ $filter }} }) by (job, service, namespace) != 1' + for: 1m + labels: + severity: critical + annotations: + {{` + summary: "PostgreSQL is not processing queries: {{ $labels.service }}.{{ $labels.namespace }}" + description: "{{ $labels.service }}.{{ $labels.namespace }} is rejecting query requests from the exporter, and thus probably not serving client queries either. User services should not be affected provided at least 1 node is still alive." + `}} + - alert: "PostgreSQLHighConnections" + expr: 'sum(pg_stat_activity_count{ {{ $filter }} }) by (job, service, namespace) >= sum(pg_settings_max_connections{ {{ $filter }} }) by (job, service, namespace) - sum(pg_settings_superuser_reserved_connections{ {{ $filter }} }) by (job, service, namespace)' + for: 1m + labels: + severity: critical + annotations: + {{` + summary: "{{ $labels.service }}.{{ $labels.namespace }} has maxed out Postgres connections." 
+ description: "{{ $labels.service }}.{{ $labels.namespace }} is exceeding the currently configured maximum Postgres connection limit (current value: {{ $value }}). Services may be degraded - please take immediate action (you probably need to increase max_connections in the Docker image and re-deploy)." + `}} + - alert: "PostgreSQLHighConnections" + expr: 'sum(pg_stat_activity_count{ {{ $filter }} }) by (job, service, namespace) >= (sum(pg_settings_max_connections{ {{ $filter }} }) by (job, service, namespace) - sum(pg_settings_superuser_reserved_connections{ {{ $filter }} }) by (job, service, namespace) ) * 0.8' + for: 10m + labels: + severity: warning + annotations: + {{` + summary: "{{ $labels.service }}.{{ $labels.namespace }} is over 80% of max Postgres connections." + description: "{{ $labels.service }}.{{ $labels.namespace }} is exceeding 80% of the currently configured maximum Postgres connection limit (current value: {{ $value }}). Please check utilization graphs and confirm if this is normal service growth, abuse or an otherwise temporary condition or if new resources need to be provisioned (or the limits increased, which is most likely)." 
+ `}} + - alert: "PostgreSQLSlowQueries" + expr: 'avg(rate(pg_stat_activity_max_tx_duration{ {{ $filter }},datname!~"template.*" }[2m])) by (job, service, namespace, datname) > 2 * 60' + for: 2m + labels: + severity: warning + annotations: + {{` + summary: "PostgreSQL high number of slow queries on {{ $labels.service }}.{{ $labels.namespace }} for database {{ $labels.datname }}" + description: "PostgreSQL high number of slow queries on {{ $labels.service }}.{{ $labels.namespace }} for database {{ $labels.datname }} with a value of {{ $value }}" + `}} + - alert: "PostgreSQLQPS" + expr: 'avg(irate(pg_stat_database_xact_commit{ {{ $filter }},datname!~"template.*"}[5m])+irate(pg_stat_database_xact_rollback{ {{ $filter }},datname!~"template.*"}[5m])) by (job, service, namespace, datname) > 10000' + for: 5m + labels: + severity: warning + annotations: + {{` + description: "PostgreSQL high number of queries per second on {{ $labels.service }}.{{ $labels.namespace }} for database {{ $labels.datname }} with a value of {{ $value }}" + summary: "PostgreSQL high number of queries per second on {{ $labels.service }}.{{ $labels.namespace }} for database {{ $labels.datname }}" + `}} + - alert: "PostgreSQLCacheHitRatio" + expr: 'avg(rate(pg_stat_database_blks_hit{ {{ $filter }},datname!~"template.*" }[5m])/(rate(pg_stat_database_blks_hit{ {{ $filter }},datname!~"template.*" }[5m])+rate(pg_stat_database_blks_read{ {{ $filter }},datname!~"template.*" }[5m]))) by (job,service, namespace, datname) < 0.98' + for: 5m + labels: + severity: warning + annotations: + {{` + summary: "PostgreSQL low cache hit rate on {{ $labels.service }}.{{ $labels.namespace }} for database {{ $labels.datname }}" + description: "PostgreSQL low cache hit rate on {{ $labels.service }}.{{ $labels.namespace }} for database {{ $labels.datname }} with a value of {{ $value }}" + `}} + {{- with .Values.prometheus.rules.additionalRules }} + - name: {{ $fullname }}-Additional + rules: + {{- toYaml . 
| nindent 8 }} + {{- end }} +{{- end }} diff --git a/postgresql/templates/secret.yaml b/postgresql/templates/secret.yaml new file mode 100644 index 0000000..b68ebb1 --- /dev/null +++ b/postgresql/templates/secret.yaml @@ -0,0 +1,10 @@ +--- +apiVersion: "v1" +kind: "Secret" +metadata: + name: {{ include "postgresql.fullname" . }} + labels: + {{- include "postgresql.labels" . | nindent 4 }} +data: + user: {{ .Values.postgres.user | toString | b64enc | quote }} + password: {{ .Values.postgres.password | toString | b64enc | quote }} diff --git a/postgresql/templates/service.yaml b/postgresql/templates/service.yaml index cb3296e..9f4e252 100644 --- a/postgresql/templates/service.yaml +++ b/postgresql/templates/service.yaml @@ -11,5 +11,11 @@ spec: targetPort: postgresql protocol: TCP name: postgresql + {{- if .Values.prometheus.enabled }} + - port: 9187 + targetPort: metrics + protocol: TCP + name: metrics + {{- end }} selector: {{- include "postgresql.selectorLabels" . | nindent 4 }} diff --git a/postgresql/templates/servicemonitor.yaml b/postgresql/templates/servicemonitor.yaml new file mode 100644 index 0000000..33cab71 --- /dev/null +++ b/postgresql/templates/servicemonitor.yaml @@ -0,0 +1,27 @@ +{{- if and .Values.prometheus.enabled .Values.prometheus.servicemonitor.enabled }} +apiVersion: monitoring.coreos.com/v1 +kind: ServiceMonitor +metadata: + name: {{ include "postgresql.fullname" . }} + labels: + {{- include "postgresql.labels" . | nindent 4 }} + {{- with .Values.prometheus.servicemonitor.labels }} + {{- toYaml . | nindent 4 }} + {{- end }} +spec: + selector: + matchLabels: + {{- include "postgresql.selectorLabels" . | nindent 6 }} + endpoints: + - port: metrics + path: "/metrics" + {{- with .Values.prometheus.servicemonitor }} + {{- with .interval }} + interval: {{ . }} + {{- end }} + {{- with .scrapeTimeout }} + scrapeTimeout: {{ . 
}} + {{- end }} + {{- end }} +{{- end }} + diff --git a/postgresql/templates/statefulset.yaml b/postgresql/templates/statefulset.yaml index 5c2f284..2c92025 100644 --- a/postgresql/templates/statefulset.yaml +++ b/postgresql/templates/statefulset.yaml @@ -56,10 +56,16 @@ spec: imagePullPolicy: {{ coalesce $.Values.global.image.pullPolicy .pullPolicy }} {{- end }} env: - - name: "POSTGRES_PASSWORD" - value: {{ .Values.postgres.password | quote }} - name: "POSTGRES_USER" - value: {{ .Values.postgres.user | quote }} + valueFrom: + secretKeyRef: + name: {{ include "postgresql.fullname" . }} + key: user + - name: "POSTGRES_PASSWORD" + valueFrom: + secretKeyRef: + name: {{ include "postgresql.fullname" . }} + key: password - name: "PGPORT" value: {{ .Values.service.port | quote }} - name: "PGDATA" @@ -85,6 +91,32 @@ spec: volumeMounts: - name: "data" mountPath: "/var/lib/postgresql/data" + {{- if .Values.prometheus.enabled }} + - name: "exporter" + {{- with .Values.prometheus.image }} + image: "{{ coalesce $.Values.global.image.registry .registry }}/{{ .repository }}:{{ .tag }}" + imagePullPolicy: {{ coalesce $.Values.global.image.pullPolicy .pullPolicy }} + {{- end }} + args: + - --collector.stat_statements + env: + - name: "DATA_SOURCE_USER" + valueFrom: + secretKeyRef: + name: {{ include "postgresql.fullname" . }} + key: user + - name: "DATA_SOURCE_PASS" + valueFrom: + secretKeyRef: + name: {{ include "postgresql.fullname" . }} + key: password + - name: "DATA_SOURCE_URI" + value: {{ printf "127.0.0.1:%d/postgres?sslmode=disable" (int .Values.service.port) | quote }} + ports: + - name: metrics + containerPort: 9187 + protocol: TCP + {{- end }} {{- with .Values.nodeSelector }} nodeSelector: {{- toYaml . 
| nindent 8 }} diff --git a/postgresql/values.yaml b/postgresql/values.yaml index 89370a7..0090eb0 100644 --- a/postgresql/values.yaml +++ b/postgresql/values.yaml @@ -137,3 +137,51 @@ autoupgrade: # readOnlyRootFilesystem: true # runAsNonRoot: true # runAsUser: 1000 + +prometheus: + # -- add prometheus exporter sidecar + # @section -- Monitoring + enabled: false + servicemonitor: + # -- deploy servicemonitor + # @section -- Monitoring + enabled: false + # -- additional labels of servicemonitor + # @section -- Monitoring + labels: {} + rules: + # -- deploy prometheusrules + # @section -- Monitoring + enabled: false + # -- labels of prometheusrule + # @section -- Monitoring + labels: {} + # -- own rules, rendered as an extra rule group of the prometheusrule (in addition to the built-in alert rules) + # @section -- Monitoring + additionalRules: [] + image: + # -- image registry (could be overwritten by global.image.registry) + # @section -- Monitoring + registry: docker.io + # -- image repository + # @section -- Monitoring + repository: prometheuscommunity/postgres-exporter + # -- This sets the pull policy for images. (could be overwritten by global.image.pullPolicy) + # @section -- Monitoring + pullPolicy: IfNotPresent + # -- image tag + # @section -- Monitoring + tag: v0.17.1 + +grafana: + dashboards: + # -- deploy grafana dashboard in configmap + # @section -- Monitoring + enabled: false + # -- labels of configmap + # @section -- Monitoring + labels: + grafana_dashboard: "1" + # -- annotations of configmap + # @section -- Monitoring + annotations: {}