diff --git a/infra-monitoring/templates/exporter/node/prometheus-rules.yaml b/infra-monitoring/templates/exporter/node/prometheus-rules.yaml
new file mode 100644
index 0000000..327dfaf
--- /dev/null
+++ b/infra-monitoring/templates/exporter/node/prometheus-rules.yaml
@@ -0,0 +1,60 @@
+{{- if (.Capabilities.APIVersions.Has "monitoring.coreos.com/v1/PrometheusRule") }}
+---
+# Alerting rules evaluated on node-exporter metrics. This manifest is rendered
+# only when the cluster API advertises the PrometheusRule kind.
+apiVersion: "monitoring.coreos.com/v1"
+kind: "PrometheusRule"
+metadata:
+  name: prometheus-node-exporter
+  # Labels are taken from the chart values and re-indented below this key.
+  labels:
+    {{- toYaml .Values.commons.prometheus.rules.labels | nindent 4 }}
+spec:
+  groups:
+    # Annotation blocks are wrapped in a Helm raw string (backticks) so that the
+    # $labels / $value placeholders pass through Helm untouched and reach
+    # Prometheus as written.
+    - name: "Node disk rate"
+      rules:
+        # Per-instance read throughput: bytes/s divided by 1024 twice (MiB/s),
+        # compared against 50. The join on node_uname_info adds the nodename label.
+        - alert: HostUnusualDiskReadRate
+          expr: '(sum by (instance) (rate(node_disk_read_bytes_total[2m])) / 1024 / 1024 > 50) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
+          for: "5m"
+          labels:
+            severity: "warning"
+            detectedBy: "NodeExporter"
+          annotations:
+            {{`
+            summary: "Host unusual disk read rate (instance {{ $labels.instance }})"
+            description: "Disk is probably reading too much data (> 50 MB/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+            `}}
+
+        # Same expression shape as the read-rate rule, applied to written bytes.
+        - alert: HostUnusualDiskWriteRate
+          expr: '(sum by (instance) (rate(node_disk_written_bytes_total[2m])) / 1024 / 1024 > 50) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
+          for: "2m"
+          labels:
+            severity: "warning"
+            detectedBy: "NodeExporter"
+          annotations:
+            {{`
+            summary: "Host unusual disk write rate (instance {{ $labels.instance }})"
+            description: "Disk is probably writing too much data (> 50 MB/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+            `}}
+
+    - name: "ZFS"
+      rules:
+        # Fires for any zpool state series other than "online" with a value above 0.
+        - alert: ZFSOfflinePool
+          expr: 'node_zfs_zpool_state{state!="online"} > 0'
+          for: "1m"
+          labels:
+            severity: "critical"
+            detectedBy: "NodeExporter"
+          annotations:
+            {{`
+            summary: "ZFS offline pool (instance {{ $labels.instance }})"
+            description: "A ZFS zpool is in an unexpected state: {{ $labels.state }}.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+            `}}
+{{- end }}