diff --git a/roles/prometheus/files/node.rules.yml b/roles/prometheus/files/node.rules.yml
index 710c11dccad65b2e5ec7d8c94d02b0aca725af1e..15aa6fb1d0f62b557d7d1439e14108b57fb24ad5 100644
--- a/roles/prometheus/files/node.rules.yml
+++ b/roles/prometheus/files/node.rules.yml
@@ -21,7 +21,8 @@ groups:
           description: "Swap is filling up (>80%)\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
 
       - alert: HostOutOfMemory
-        expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 10
+        # Build hosts are excluded here; BuildHostOutOfMemory covers them.
+        expr: node_memory_MemAvailable_bytes{instance!~"build.archlinux.org|repro3.pkgbuild.com|repro2.pkgbuild.com"} / node_memory_MemTotal_bytes * 100 < 10
         for: 5m
         labels:
           severity: warning
@@ -29,6 +30,18 @@ groups:
           summary: "Host out of memory (instance {{ $labels.instance }})"
           description: "Node memory is filling up (< 10% left)\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
 
+      # Same check as HostOutOfMemory, but only for the build hosts and with a
+      # 12h "for". Matchers on one label are ANDed, so the hosts are listed as a
+      # single "=~" alternation; separate matchers could never all match.
+      - alert: BuildHostOutOfMemory
+        expr: node_memory_MemAvailable_bytes{instance=~"build.archlinux.org|repro3.pkgbuild.com|repro2.pkgbuild.com"} / node_memory_MemTotal_bytes * 100 < 10
+        for: 12h
+        labels:
+          severity: warning
+        annotations:
+          summary: "Build host out of memory for a long time (instance {{ $labels.instance }})"
+          description: "Node memory is filling up (< 10% left)\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
+
       - alert: HostMemoryUnderMemoryPressure
         expr: rate(node_vmstat_pgmajfault[1m]) > 1000
         for: 5m