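# node.rules.yml: Prometheus alerting rules covering the node, prometheus, pacman,
# borg, systemd_unit, gitlab, blackbox and rebuilderd groups.
# After editing, the file can be checked with: promtool check rules node.rules.yml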
groups:
- name: node_common
  interval: 60s
  rules:

  - alert: HostHighCpuLoad
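    # CPU busy percentage (100 minus the per-instance idle rate); dragon.archlinux.org
    # and the repro*.pkgbuild.com hosts are excluded from this alert.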
    expr: 100 - (avg by(instance) (irate(node_cpu_seconds_total{mode="idle",instance!~"dragon.archlinux.org|repro1.pkgbuild.com|repro2.pkgbuild.com"}[5m])) * 100) > 80
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Host high CPU load (instance {{ $labels.instance }})"
      description: "CPU load is > 80%\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"

  - alert: HostSwapIsFillingUp
    expr: (1 - (node_memory_SwapFree_bytes / node_memory_SwapTotal_bytes)) * 100 > 80
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Host swap is filling up (instance {{ $labels.instance }})"
      description: "Swap is filling up (>80%)\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"

  - alert: HostOutOfMemory
    expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 10
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Host out of memory (instance {{ $labels.instance }})"
      description: "Node memory is filling up (< 10% left)\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"

  - alert: HostMemoryUnderMemoryPressure
    expr: rate(node_vmstat_pgmajfault[1m]) > 1000
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Host memory under memory pressure (instance {{ $labels.instance }})"
      description: "The node is under heavy memory pressure. High rate of major page faults\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"

  - alert: HostUnusualNetworkThroughputIn
    expr: sum by (instance) (irate(node_network_receive_bytes_total[2m])) / 1024 / 1024 > 100
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Host unusual network throughput in (instance {{ $labels.instance }})"
      description: "Host network interfaces are probably receiving too much data (> 100 MB/s)\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"

  - alert: HostUnusualNetworkThroughputOut
    expr: sum by (instance) (irate(node_network_transmit_bytes_total[2m])) / 1024 / 1024 > 100
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Host unusual network throughput out (instance {{ $labels.instance }})"
      description: "Host network interfaces are probably sending too much data (> 100 MB/s)\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"

  - alert: HostOutOfDiskSpace
    expr: (node_filesystem_avail_bytes{mountpoint="/rootfs"}  * 100) / node_filesystem_size_bytes{mountpoint="/rootfs"} < 10
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Host out of disk space (instance {{ $labels.instance }})"
      description: "Disk is almost full (< 20% left)\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"

  - alert: HostDiskWillFillIn4Hours
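    # predict_linear extrapolates the last hour of free-space samples 4 hours
    # (4 * 3600 s) ahead; tmpfs filesystems and /backup are excluded.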
    expr: predict_linear(node_filesystem_free_bytes{fstype!~"tmpfs",mountpoint!~"/backup"}[1h], 4 * 3600) < 0
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Host disk will fill in 4 hours (instance {{ $labels.instance }})"
      description: "Disk will fill in 4 hours at current write rate\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"

  - alert: HostOutOfInodes
    expr: node_filesystem_files_free{mountpoint ="/rootfs"} / node_filesystem_files{mountpoint ="/rootfs"} * 100 < 10
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Host out of inodes (instance {{ $labels.instance }})"
      description: "Disk is almost running out of available inodes (< 10% left)\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"

  - alert: HostOomKillDetected
    expr: increase(node_vmstat_oom_kill[5m]) > 0
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Host OOM kill detected (instance {{ $labels.instance }})"
      description: "OOM kill detected\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"

- name: prometheus
  interval: 60s
  rules:
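  # Self-monitoring: Prometheus targets, Alertmanager connectivity, rule evaluation,
  # the notification pipeline and TSDB health.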
  - alert: PrometheusTargetMissing
    expr: up == 0
    for: 5m
    labels:
      severity: critical
    annotations:
      summary: "Prometheus target missing (instance {{ $labels.instance }})"
      description: "A Prometheus target {{ $value }} has disappeared. An exporter might have crashed."
  - alert: PrometheusTooManyRestarts
    expr: changes(process_start_time_seconds{job=~"prometheus|pushgateway|alertmanager"}[15m]) > 2
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Prometheus too many restarts (instance {{ $labels.instance }})"
      description: "Prometheus has restarted more than twice in the last 15 minutes. It might be crashlooping.\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"
  - alert: PrometheusNotConnectedToAlertmanager
    expr: prometheus_notifications_alertmanagers_discovered < 1
    for: 5m
    labels:
      severity: critical
    annotations:
      summary: "Prometheus not connected to alertmanager (instance {{ $labels.instance }})"
      description: "Prometheus cannot connect the alertmanager\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"
  - alert: PrometheusRuleEvaluationFailures
    expr: increase(prometheus_rule_evaluation_failures_total[3m]) > 0
    for: 5m
    labels:
      severity: critical
    annotations:
      summary: "Prometheus rule evaluation failures (instance {{ $labels.instance }})"
      description: "Prometheus encountered {{ $value }} rule evaluation failures, leading to potentially ignored alerts.\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"
  - alert: PrometheusTemplateTextExpansionFailures
    expr: increase(prometheus_template_text_expansion_failures_total[3m]) > 0
    for: 5m
    labels:
      severity: critical
    annotations:
      summary: "Prometheus template text expansion failures (instance {{ $labels.instance }})"
      description: "Prometheus encountered {{ $value }} template text expansion failures\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"
  - alert: PrometheusNotificationsBacklog
    expr: min_over_time(prometheus_notifications_queue_length[10m]) > 0
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Prometheus notifications backlog (instance {{ $labels.instance }})"
      description: "The Prometheus notification queue has not been empty for 10 minutes\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"
  - alert: PrometheusAlertmanagerNotificationFailing
    expr: rate(alertmanager_notifications_failed_total[1m]) > 0
    for: 5m
    labels:
      severity: critical
    annotations:
      summary: "Prometheus AlertManager notification failing (instance {{ $labels.instance }})"
      description: "Alertmanager is failing sending notifications\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"
  - alert: PrometheusTargetScrapingSlow
    expr: prometheus_target_interval_length_seconds{quantile="0.9"} > 60
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Prometheus target scraping slow (instance {{ $labels.instance }})"
      description: "Prometheus is scraping exporters slowly\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"
  - alert: PrometheusLargeScrape
    expr: increase(prometheus_target_scrapes_exceeded_sample_limit_total[10m]) > 10
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Prometheus large scrape (instance {{ $labels.instance }})"
      description: "Prometheus has many scrapes that exceed the sample limit\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"
  - alert: PrometheusTsdbCheckpointCreationFailures
    expr: increase(prometheus_tsdb_checkpoint_creations_failed_total[3m]) > 0
    for: 5m
    labels:
      severity: critical
    annotations:
      summary: "Prometheus TSDB checkpoint creation failures (instance {{ $labels.instance }})"
      description: "Prometheus encountered {{ $value }} checkpoint creation failures\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"
  - alert: PrometheusTsdbCompactionsFailed
    expr: increase(prometheus_tsdb_compactions_failed_total[3m]) > 0
    for: 5m
    labels:
      severity: critical
    annotations:
      summary: "Prometheus TSDB compactions failed (instance {{ $labels.instance }})"
      description: "Prometheus encountered {{ $value }} TSDB compactions failures\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"
  - alert: PrometheusTsdbWalCorruptions
    expr: increase(prometheus_tsdb_wal_corruptions_total[3m]) > 0
    for: 5m
    labels:
      severity: critical
    annotations:
      summary: "Prometheus TSDB WAL corruptions (instance {{ $labels.instance }})"
      description: "Prometheus encountered {{ $value }} TSDB WAL corruptions\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"
  - alert: PrometheusTsdbWalTruncationsFailed
    expr: increase(prometheus_tsdb_wal_truncations_failed_total[3m]) > 0
    for: 5m
    labels:
      severity: critical
    annotations:
      summary: "Prometheus TSDB WAL truncations failed (instance {{ $labels.instance }})"
      description: "Prometheus encountered {{ $value }} TSDB WAL truncation failures\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"

- name: pacman
  interval: 2m
  rules:
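  # pacman_updates_pending reports the number of pending package updates per host
  # (assumed to come from a custom exporter or a node_exporter textfile collector).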
  - alert: pacman_updates_pending
    expr: pacman_updates_pending > 50
    for: 15m
    labels:
      severity: warning
    annotations:
      description: 'Host {{ $labels.instance }} has out-of-date packages'
      summary: '{{ $labels.instance }} has {{ $value }} (> 50) out-of-date packages'

- name: borg
  interval: 60s
  rules:
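  # borg_*_last_archive_timestamp is the Unix time of the most recent archive;
  # 86400 * 1.2 allows roughly 24 hours plus 20% slack before alerting.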
  - alert: BorgHetznerMissingBackup
    expr: time() - borg_hetzner_last_archive_timestamp > 86400 * 1.2
    for: 2m
    labels:
      severity: critical
    annotations:
      summary: 'Borg Hetzner missing backup (instance {{ $labels.instance }})'
      description: 'Borg has not backed up for more than 24 hours. The last backup was {{ $value | humanizeDuration }} ago'
  - alert: BorgOffsiteMissingBackup
    expr: time() - borg_offsite_last_archive_timestamp > 86400 * 1.2
    for: 2m
    labels:
      severity: critical
    annotations:
      summary: 'Borg Offsite missing backup (instance {{ $labels.instance }})'
      description: 'Borg has not backed up for more than 24 hours. The last backup was {{ $value | humanizeDuration }} ago'

- name: systemd_unit
  interval: 15s
  rules:
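  # node_systemd_unit_state requires node_exporter's systemd collector (--collector.systemd).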
  - alert: systemd_unit_failed
    expr: |
      node_systemd_unit_state{state="failed"} > 0
    for: 3m
    labels:
      severity: critical
    annotations:
      description: 'Instance {{ $labels.instance }}: Service {{ $labels.name }} failed'
      summary: 'Systemd unit failed'

  - alert: systemd_unit_flapping
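    # Flapping: more than 5 active-state changes within 5 minutes, or more than 15
    # within an hour unless fewer than 7 occurred in the last 30 minutes.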
    expr: |
      changes(node_systemd_unit_state{state="active"}[5m]) > 5 or (changes(node_systemd_unit_state{state="active"}[60m]) > 15 unless changes(node_systemd_unit_state{state="active"}[30m]) < 7)
    labels:
      severity: critical
    annotations:
      description: 'Instance {{ $labels.instance }}: Service {{ $labels.name }} flapping'
      summary: 'Systemd unit flapping'

- name: gitlab
  interval: 15s
  rules:
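  # These alerts follow the rules shipped with GitLab Omnibus; the recording rules they
  # reference (e.g. instance:puma_utilization:ratio, job_grpc:grpc_server_handled_total:rate5m)
  # are assumed to be defined by GitLab's bundled Prometheus configuration.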
  - alert: ServiceDown
    expr: avg_over_time(up[5m]) * 100 < 50
    annotations:
      description: The service {{ $labels.job }} instance {{ $labels.instance }} is
        not responding for more than 50% of the time for 5 minutes.
      summary: The service {{ $labels.job }} is not responding
  - alert: RedisDown
    expr: avg_over_time(redis_up[5m]) * 100 < 50
    annotations:
      description: The Redis service {{ $labels.job }} instance {{ $labels.instance
        }} is not responding for more than 50% of the time for 5 minutes.
      summary: The Redis service {{ $labels.job }} is not responding
  - alert: PostgresDown
    expr: avg_over_time(pg_up[5m]) * 100 < 50
    annotations:
      description: The Postgres service {{ $labels.job }} instance {{ $labels.instance
        }} is not responding for more than 50% of the time for 5 minutes.
      summary: The Postgres service {{ $labels.job }} is not responding
  - alert: UnicornQueueing
    expr: avg_over_time(unicorn_queued_connections[30m]) > 1
    annotations:
      description: Unicorn instance {{ $labels.instance }} is queueing requests with
        an average of {{ $value | printf "%.1f" }} over the last 30 minutes.
      summary: Unicorn is queueing requests
  - alert: PumaQueueing
    expr: avg_over_time(puma_queued_connections[30m]) > 1
    annotations:
      description: Puma instance {{ $labels.instance }} is queueing requests with
        an average of {{ $value | printf "%.1f" }} over the last 30 minutes.
      summary: Puma is queueing requests
  - alert: HighUnicornUtilization
    expr: instance:unicorn_utilization:ratio * 100 > 90
    for: 60m
    annotations:
      description: Unicorn instance {{ $labels.instance }} has more than 90% worker utilization ({{ $value | printf "%.1f" }}%) over the last 60 minutes.
      summary: Unicorn has high utilization
  - alert: HighPumaUtilization
    expr: instance:puma_utilization:ratio * 100 > 90
    for: 60m
    annotations:
      description: Puma instance {{ $labels.instance }} has more than 90% thread utilization ({{ $value | printf "%.1f" }}%) over the last 60 minutes.
      summary: Puma has high utilization
  - alert: SidekiqJobsQueuing
    expr: sum by (name) (sidekiq_queue_size) > 0
    for: 60m
    annotations:
      summary: Sidekiq has jobs queued
      description: Sidekiq queue {{ $labels.name }} has {{ $value }} jobs queued for 60 minutes.
  - alert: HighgRPCResourceExhaustedRate
    expr: >
      sum without (grpc_code) (
        job_grpc:grpc_server_handled_total:rate5m{grpc_code="ResourceExhausted"}
      ) /
      sum without (grpc_code) (
        job_grpc:grpc_server_handled_total:rate5m
      ) * 100 > 1
    for: 60m
    annotations:
      summary: High gRPC ResourceExhausted error rate
      description: gRPC is returning more than 1% ({{ $value | printf "%.1f" }}%) ResourceExhausted errors over the last 60 minutes.
  - alert: PostgresDatabaseDeadlocks
    expr: increase(pg_stat_database_deadlocks[5m]) > 0
    annotations:
      summary: Postgres database has deadlocks
      description: Postgres database {{ $labels.instance }} had {{ $value | printf "%.0f" }} deadlocks in the last 5 minutes.
  - alert: PostgresDatabaseDeadlockCancels
    expr: increase(pg_stat_database_deadlocks[5m]) > 0
    annotations:
      summary: Postgres database has queries canceled due to deadlocks
      description: Postgres database {{ $labels.instance }} had {{ $value | printf "%.0f" }} queries canceled due to deadlocks in the last 5 minutes.
  # Low-traffic - < 10 QPS (600 RPM)
  - alert: WorkhorseHighErrorRate
    expr: >
      (
        sum without (job, code) (
          job_route_method_code:gitlab_workhorse_http_request_duration_seconds_count:rate5m{code=~"5.."}
        ) /
        sum without (job,code) (
          job_route_method_code:gitlab_workhorse_http_request_duration_seconds_count:rate5m
        ) < 10
      ) * 100 > 50
    annotations:
      summary: Workhorse has high error rates
      description: Workhorse route {{ $labels.route }} method {{ $labels.method }} has more than 50% errors ({{ $value | printf "%.1f" }}%) for the last 60 minutes.
  # High-traffic - >= 10 QPS (600 RPM)
  - alert: WorkhorseHighErrorRate
    expr: >
      (
        sum without (job, code) (
          job_route_method_code:gitlab_workhorse_http_request_duration_seconds_count:rate5m{code=~"5.."}
        ) /
        sum without (job,code) (
          job_route_method_code:gitlab_workhorse_http_request_duration_seconds_count:rate5m
        ) > 10
      ) * 100 > 10
    annotations:
      summary: Workhorse has high error rates
      description: Workhorse route {{ $labels.route }} method {{ $labels.method }} has more than 10% errors ({{ $value | printf "%.1f" }}%) for the last 60 minutes.

- name: blackbox
  interval: 15s
  rules:
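  # Probes exported by blackbox_exporter (probe_success, probe_http_status_code,
  # probe_ssl_earliest_cert_expiry).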
  - alert: BlackboxProbeFailed
    expr: probe_success == 0
    for: 5m
    labels:
      severity: critical
    annotations:
      summary: "Blackbox probe failed (instance {{ $labels.instance }})"
      description: "Probe failed\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"
  - alert: BlackboxProbeHttpFailure
    expr: probe_http_status_code <= 199 or probe_http_status_code >= 400
    for: 5m
    labels:
      severity: critical
    annotations:
      summary: "Blackbox probe HTTP failure (instance {{ $labels.instance }})"
      description: "HTTP status code is not 200-399\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"
  - alert: BlackboxSslCertificateWillExpireSoon
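    # 86400 * 25: fire when less than 25 days of certificate validity remain.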
    expr: probe_ssl_earliest_cert_expiry - time() < 86400 * 25
    for: 5m
    labels:
      severity: critical
    annotations:
      summary: "Blackbox SSL certificate will expire soon (instance {{ $labels.instance }})"
      description: "SSL certificate expires in 25 days\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"

- name: rebuilderd
  interval: 15m
  rules:
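  # rebuilderd verifies reproducible builds; rebuilderd_queue_length and
  # rebuilderd_workers are assumed to come from a rebuilderd status exporter.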
  - alert: RebuilderdQueueNotEmpty
    expr: rebuilderd_queue_length > 2000
    for: 24h
    labels:
      severity: warning
      service: rebuilderd
    annotations:
      summary: "Rebuilderd queue length is not empty {{ $labels.instance }})"
      description: "Rebuilderd's queue length is now: {{ $value }}"
  - alert: RebuilderdWorkersOffline
    expr: rebuilderd_workers < 3
    for: 5m
    labels:
      severity: warning
      service: rebuilderd
    annotations:
      summary: "Rebuilderd workers offline {{ $labels.instance }})"
      description: "Not all rebuilder-workers are online, currently {{ $value }} workers are online"