From c8208bd1504ba877fac75b1e31e6f1577f354d9a Mon Sep 17 00:00:00 2001
From: WrenIX
Date: Sun, 25 Aug 2024 05:30:31 +0200
Subject: [PATCH] fix(infra-monitoring): add node-exporter alert for temperature
 of hardware

---
NOTE(review): leading whitespace in this patch was reconstructed from a
whitespace-collapsed copy; confirm the context lines against the target file
(or apply with `git apply --ignore-whitespace`) before merging.

 .../exporter/node/prometheus-rules.yaml       | 28 +++++++++++++++++++
 1 file changed, 28 insertions(+)

diff --git a/infra-monitoring/templates/exporter/node/prometheus-rules.yaml b/infra-monitoring/templates/exporter/node/prometheus-rules.yaml
index 327dfaf..38e8b97 100644
--- a/infra-monitoring/templates/exporter/node/prometheus-rules.yaml
+++ b/infra-monitoring/templates/exporter/node/prometheus-rules.yaml
@@ -47,4 +47,32 @@ spec:
             summary: "ZFS offline pool (instance {{ $labels.instance }})"
             description: "A ZFS zpool is in a unexpected state: {{ $labels.state }}.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
             `}}
+
+    # Temperature alerts built from node-exporter hwmon metrics.
+    - name: "Hardware Temperature"
+      rules:
+        # Critical: fires at once when the averaged critical-alarm metric is above 0.
+        - alert: HostNodeOvertemperatureAlarm
+          expr: 'avg(node_hwmon_temp_crit_alarm_celsius * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}) without(container,endpoint,instance,pod,service,namespace,job) > 0'
+          for: "0m"
+          labels:
+            severity: "critical"
+            detectedBy: "NodeExporter"
+          annotations:
+            {{`
+            summary: "Host node overtemperature alarm (node {{ $labels.nodename }})"
+            description: "Physical node temperature alarm triggered\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+            `}}
+        # Warning: a sensor reading stays above its reported maximum for 5 minutes.
+        - alert: HostPhysicalComponentTooHot
+          expr: 'avg((node_hwmon_temp_celsius > node_hwmon_temp_max_celsius) * ignoring(label) group_left(instance,chip,sensor) node_hwmon_sensor_label{} * on(instance) group_left (nodename) node_uname_info{nodename=~".+"} ) without(container,endpoint,instance,pod,service,namespace,job)'
+          for: "5m"
+          labels:
+            severity: "warning"
+            detectedBy: "NodeExporter"
+          annotations:
+            {{`
+            summary: "Host physical component too hot (node {{ $labels.nodename }})"
+            description: "Physical hardware component too hot\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+            `}}
 {{- end }}