fix(infra-monitoring): add node-exporter alert for temperature of hardware
This commit is contained in:
parent
52e5f8e479
commit
c8208bd150
1 changed files with 25 additions and 0 deletions
|
@ -47,4 +47,29 @@ spec:
|
||||||
summary: "ZFS offline pool (instance {{ $labels.instance }})"
|
summary: "ZFS offline pool (instance {{ $labels.instance }})"
|
||||||
description: "A ZFS zpool is in a unexpected state: {{ $labels.state }}.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
description: "A ZFS zpool is in a unexpected state: {{ $labels.state }}.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
`}}
|
`}}
|
||||||
|
|
||||||
|
- name: "Hardware Temperature"
|
||||||
|
rules:
|
||||||
|
# Critical alert on the hardware-monitor critical-temperature alarm metric.
# Named distinctly from the warning-level HostPhysicalComponentTooHot rule in
# this group so that the two conditions do not share one alertname.
- alert: HostNodeOvertemperatureAlarm
  # The alarm metric is joined with node_uname_info on instance to attach the
  # nodename label, then averaged with the scrape-related labels dropped.
  # The rule matches when that average is greater than 0.
  expr: >-
    avg(node_hwmon_temp_crit_alarm_celsius
    * on(instance) group_left (nodename) node_uname_info{nodename=~".+"})
    without(container,endpoint,instance,pod,service,namespace,job) > 0
  # No pending period: the alert fires on the first evaluation that matches.
  for: "0m"
  labels:
    severity: "critical"
    detectedBy: "NodeExporter"
  annotations:
    # The Helm raw-string delimiters below pass the Prometheus template
    # placeholders through to the rendered rule without Helm evaluating them.
    {{`
    summary: "Host node overtemperature alarm (node {{ $labels.nodename }})"
    description: "Physical node temperature alarm triggered\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
    `}}
|
||||||
|
# Warning alert for sensors whose current temperature reading is above the
# maximum temperature reported for the same sensor.
- alert: HostPhysicalComponentTooHot
  # Readings above their max value are joined with the sensor-label series
  # (ignoring the label dimension) and with node_uname_info on instance to
  # attach nodename, then averaged with the scrape-related labels dropped.
  expr: >-
    avg((node_hwmon_temp_celsius > node_hwmon_temp_max_celsius)
    * ignoring(label) group_left(instance,chip,sensor) node_hwmon_sensor_label{}
    * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
    ) without(container,endpoint,instance,pod,service,namespace,job)
  # The condition must hold for five minutes before the alert fires.
  for: "5m"
  labels:
    severity: "warning"
    detectedBy: "NodeExporter"
  annotations:
    # The Helm raw-string delimiters below pass the Prometheus template
    # placeholders through to the rendered rule without Helm evaluating them.
    {{`
    summary: "Host physical component too hot (node {{ $labels.nodename }})"
    description: "Physical hardware component too hot\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
    `}}
|
||||||
{{- end }}
|
{{- end }}
|
||||||
|
|
Loading…
Add table
Reference in a new issue