fix(infra-monitoring): add some rules for node-exporter, e.g. ZFS
This commit is contained in:
parent
d8f8677531
commit
ea31d567fd
1 changed files with 50 additions and 0 deletions
|
@@ -0,0 +1,50 @@
|
||||||
|
{{- if (.Capabilities.APIVersions.Has "monitoring.coreos.com/v1/PrometheusRule") }}
---
# PrometheusRule carrying node-exporter alerts: disk read/write throughput
# and ZFS pool state. The surrounding template guard renders this manifest
# only when the cluster advertises the monitoring.coreos.com/v1
# PrometheusRule API.
apiVersion: "monitoring.coreos.com/v1"
kind: "PrometheusRule"
metadata:
  name: prometheus-node-exporter
  # Labels are copied verbatim from .Values.commons.prometheus.rules.labels.
  labels:
    {{- toYaml .Values.commons.prometheus.rules.labels | nindent 4 }}
spec:
  groups:
    - name: "Node disk rate"
      rules:
        # Per-instance sum of the 2m read rate, converted from bytes by
        # dividing by 1024 twice, compared against 50. The join on instance
        # with node_uname_info copies the nodename label onto the result.
        # NOTE(review): the description text says "MB/s" while the divisor is
        # 1024 * 1024; confirm which unit is intended.
        - alert: HostUnusualDiskReadRate
          expr: '(sum by (instance) (rate(node_disk_read_bytes_total[2m])) / 1024 / 1024 > 50) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
          for: "5m"
          labels:
            severity: "warning"
            detectedBy: "NodeExporter"
          # The backtick raw string makes Helm emit the alert placeholders
          # literally instead of evaluating them as chart template actions.
          annotations:
            {{`
            summary: "Host unusual disk read rate (instance {{ $labels.instance }})"
            description: "Disk is probably reading too much data (> 50 MB/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
            `}}

        # Same shape as the read-rate alert, but on written bytes and with a
        # shorter pending window (2m instead of 5m).
        - alert: HostUnusualDiskWriteRate
          expr: '(sum by (instance) (rate(node_disk_written_bytes_total[2m])) / 1024 / 1024 > 50) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
          for: "2m"
          labels:
            severity: "warning"
            detectedBy: "NodeExporter"
          # Raw string: placeholders are passed through Helm untouched.
          annotations:
            {{`
            summary: "Host unusual disk write rate (instance {{ $labels.instance }})"
            description: "Disk is probably writing too much data (> 50 MB/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
            `}}

    - name: "ZFS"
      rules:
        # Matches any node_zfs_zpool_state series whose state label is not
        # "online" and whose value is above 0, sustained for 1m. Despite the
        # alert name, this covers every non-online state, not only "offline".
        - alert: ZFSOfflinePool
          expr: 'node_zfs_zpool_state{state!="online"} > 0'
          for: "1m"
          labels:
            severity: "critical"
            detectedBy: "NodeExporter"
          # Raw string: placeholders are passed through Helm untouched.
          annotations:
            {{`
            summary: "ZFS offline pool (instance {{ $labels.instance }})"
            description: "A ZFS zpool is in an unexpected state: {{ $labels.state }}.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
            `}}
{{- end }}
|
Loading…
Add table
Reference in a new issue