fix(infra-monitoring): add node-exporter alert for temperature of hardware

This commit is contained in:
WrenIX 2024-08-25 05:30:31 +02:00
parent 52e5f8e479
commit c8208bd150
Signed by: wrenix
GPG key ID: 7AFDB012974B1BB5

View file

@ -47,4 +47,29 @@ spec:
summary: "ZFS offline pool (instance {{ $labels.instance }})" summary: "ZFS offline pool (instance {{ $labels.instance }})"
description: "A ZFS zpool is in a unexpected state: {{ $labels.state }}.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" description: "A ZFS zpool is in a unexpected state: {{ $labels.state }}.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
`}} `}}
# Rule group: hardware temperature alerts built from node-exporter hwmon
# metrics. Both rules multiply by node_uname_info (group_left on instance) to
# attach the "nodename" label, then average the series while dropping the
# scrape-specific labels (container, endpoint, instance, pod, service,
# namespace, job).
#
# The annotation bodies are wrapped in a Go-template raw string (the
# backtick-delimited action) so the Prometheus placeholders inside them are
# emitted verbatim instead of being evaluated by the chart renderer.
- name: "Hardware Temperature"
  rules:
    # Critical: the sensor's own critical-alarm flag is non-zero. Fires
    # immediately (for: 0m). Named differently from the threshold rule below
    # so routing, inhibition and silences on "alertname" can tell them apart.
    - alert: HostNodeOvertemperatureAlarm
      expr: 'avg(node_hwmon_temp_crit_alarm_celsius * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}) without(container,endpoint,instance,pod,service,namespace,job) > 0'
      for: "0m"
      labels:
        severity: "critical"
        detectedBy: "NodeExporter"
      annotations:
{{`
        summary: "Host node overtemperature alarm (node {{ $labels.nodename }})"
        description: "Physical node temperature alarm triggered\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
`}}
    # Warning: a sensor reading has stayed above that sensor's reported
    # maximum for 5 minutes.
    # NOTE(review): the join with node_hwmon_sensor_label presumably drops
    # sensors that expose no label series - confirm this is intended.
    - alert: HostPhysicalComponentTooHot
      expr: 'avg((node_hwmon_temp_celsius > node_hwmon_temp_max_celsius) * ignoring(label) group_left(instance,chip,sensor) node_hwmon_sensor_label{} * on(instance) group_left (nodename) node_uname_info{nodename=~".+"} ) without(container,endpoint,instance,pod,service,namespace,job)'
      for: "5m"
      labels:
        severity: "warning"
        detectedBy: "NodeExporter"
      annotations:
{{`
        summary: "Host physical component too hot (node {{ $labels.nodename }})"
        description: "Physical hardware component too hot\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
`}}
{{- end }} {{- end }}