groups:
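  # Host-level alerts based on node_exporter metrics (CPU, memory, swap,
  # network, disk space, inodes and OOM kills).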
  - name: node_common
    interval: 60s
    rules:
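      # build.archlinux.org and the repro*.pkgbuild.com hosts are excluded here,
      # presumably because long package builds are expected to peg their CPUs.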
      - alert: HostHighCpuLoad
        expr: 100 - (avg by(instance) (irate(node_cpu_seconds_total{mode="idle",instance!~"build.archlinux.org|repro1.pkgbuild.com|repro2.pkgbuild.com"}[5m])) * 100) > 80
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Host high CPU load (instance {{ $labels.instance }})"
          description: "CPU load is > 80%\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"

      - alert: HostSwapIsFillingUp
        expr: (1 - (node_memory_SwapFree_bytes / node_memory_SwapTotal_bytes)) * 100 > 80
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Host swap is filling up (instance {{ $labels.instance }})"
          description: "Swap is filling up (>80%)\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"

      - alert: HostOutOfMemory
        expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 10
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Host out of memory (instance {{ $labels.instance }})"
          description: "Node memory is filling up (< 10% left)\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"

      - alert: HostMemoryUnderMemoryPressure
        expr: rate(node_vmstat_pgmajfault[1m]) > 1000
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Host memory under memory pressure (instance {{ $labels.instance }})"
          description: "The node is under heavy memory pressure. High rate of major page faults\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"

      - alert: HostUnusualNetworkThroughputIn
        expr: sum by (instance) (irate(node_network_receive_bytes_total[2m])) / 1024 / 1024 > 100
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Host unusual network throughput in (instance {{ $labels.instance }})"
          description: "Host network interfaces are probably receiving too much data (> 100 MB/s)\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"

      - alert: HostUnusualNetworkThroughputOut
        expr: sum by (instance) (irate(node_network_transmit_bytes_total[2m])) / 1024 / 1024 > 100
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Host unusual network throughput out (instance {{ $labels.instance }})"
          description: "Host network interfaces are probably sending too much data (> 100 MB/s)\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"

      - alert: HostOutOfDiskSpace
        expr: (node_filesystem_avail_bytes{mountpoint="/rootfs"}  * 100) / node_filesystem_size_bytes{mountpoint="/rootfs"} < 10
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Host out of disk space (instance {{ $labels.instance }})"
          description: "Disk is almost full (< 20% left)\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"

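      # predict_linear() extrapolates the free-space trend of the last hour four
      # hours into the future; a negative prediction means the filesystem would
      # fill up within that window. tmpfs and /backup mounts are ignored.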
      - alert: HostDiskWillFillIn4Hours
        expr: predict_linear(node_filesystem_free_bytes{fstype!~"tmpfs",mountpoint!~"/backup"}[1h], 4 * 3600) < 0
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Host disk will fill in 4 hours (instance {{ $labels.instance }})"
          description: "Disk will fill in 4 hours at current write rate\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"

      - alert: HostOutOfInodes
        expr: node_filesystem_files_free{mountpoint="/rootfs"} / node_filesystem_files{mountpoint="/rootfs"} * 100 < 10
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Host out of inodes (instance {{ $labels.instance }})"
          description: "Disk is almost running out of available inodes (< 10% left)\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"

      - alert: HostOomKillDetected
        expr: increase(node_vmstat_oom_kill[5m]) > 0
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Host OOM kill detected (instance {{ $labels.instance }})"
          description: "OOM kill detected\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"

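  # Health of the monitoring stack itself: scrape targets, restarts,
  # Alertmanager connectivity, rule evaluation and the TSDB.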
  - name: prometheus
    interval: 60s
    rules:
      - alert: PrometheusTargetMissing
        expr: up == 0
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "Prometheus target missing (instance {{ $labels.instance }})"
          description: "A Prometheus target {{ $value }} has disappeared. An exporter might have crashed."
      - alert: PrometheusTooManyRestarts
        expr: changes(process_start_time_seconds{job=~"prometheus|pushgateway|alertmanager"}[15m]) > 2
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Prometheus too many restarts (instance {{ $labels.instance }})"
          description: "Prometheus has restarted more than twice in the last 15 minutes. It might be crashlooping.\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"
      - alert: PrometheusNotConnectedToAlertmanager
        expr: prometheus_notifications_alertmanagers_discovered < 1
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "Prometheus not connected to alertmanager (instance {{ $labels.instance }})"
          description: "Prometheus cannot connect the alertmanager\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"
      - alert: PrometheusRuleEvaluationFailures
        expr: increase(prometheus_rule_evaluation_failures_total[3m]) > 0
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "Prometheus rule evaluation failures (instance {{ $labels.instance }})"
          description: "Prometheus encountered {{ $value }} rule evaluation failures, leading to potentially ignored alerts.\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"
      - alert: PrometheusTemplateTextExpansionFailures
        expr: increase(prometheus_template_text_expansion_failures_total[3m]) > 0
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "Prometheus template text expansion failures (instance {{ $labels.instance }})"
          description: "Prometheus encountered {{ $value }} template text expansion failures\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"
      - alert: PrometheusNotificationsBacklog
        expr: min_over_time(prometheus_notifications_queue_length[10m]) > 0
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Prometheus notifications backlog (instance {{ $labels.instance }})"
          description: "The Prometheus notification queue has not been empty for 10 minutes\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"
      - alert: PrometheusAlertmanagerNotificationFailing
        expr: rate(alertmanager_notifications_failed_total[1m]) > 0
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "Prometheus AlertManager notification failing (instance {{ $labels.instance }})"
          description: "Alertmanager is failing sending notifications\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"
      - alert: PrometheusTargetScrapingSlow
        expr: prometheus_target_interval_length_seconds{quantile="0.9"} > 60
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Prometheus target scraping slow (instance {{ $labels.instance }})"
          description: "Prometheus is scraping exporters slowly\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"
      - alert: PrometheusLargeScrape
        expr: increase(prometheus_target_scrapes_exceeded_sample_limit_total[10m]) > 10
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Prometheus large scrape (instance {{ $labels.instance }})"
          description: "Prometheus has many scrapes that exceed the sample limit\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"
      - alert: PrometheusTsdbCheckpointCreationFailures
        expr: increase(prometheus_tsdb_checkpoint_creations_failed_total[3m]) > 0
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "Prometheus TSDB checkpoint creation failures (instance {{ $labels.instance }})"
          description: "Prometheus encountered {{ $value }} checkpoint creation failures\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"
      - alert: PrometheusTsdbCompactionsFailed
        expr: increase(prometheus_tsdb_compactions_failed_total[3m]) > 0
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "Prometheus TSDB compactions failed (instance {{ $labels.instance }})"
          description: "Prometheus encountered {{ $value }} TSDB compactions failures\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"
      - alert: PrometheusTsdbWalCorruptions
        expr: increase(prometheus_tsdb_wal_corruptions_total[3m]) > 0
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "Prometheus TSDB WAL corruptions (instance {{ $labels.instance }})"
          description: "Prometheus encountered {{ $value }} TSDB WAL corruptions\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"
      - alert: PrometheusTsdbWalTruncationsFailed
        expr: increase(prometheus_tsdb_wal_truncations_failed_total[3m]) > 0
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "Prometheus TSDB WAL truncations failed (instance {{ $labels.instance }})"
          description: "Prometheus encountered {{ $value }} TSDB WAL truncation failures\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"

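  # Warn when a host accumulates more than 50 pending pacman package updates.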
  - name: pacman
    interval: 2m
    rules:
      - alert: pacman_updates_pending
        expr: pacman_updates_pending > 50
        for: 15m
        labels:
          severity: warning
        annotations:
          description: 'host {{ $labels.instance }} has out-of-date packages'
          summary: '{{ $labels.instance }} has {{ $value }} (> 50) out-of-date packages'

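  # Btrfs device error counters (corruption, write, read and flush errors).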
  - name: btrfs
    interval: 2m
    rules:
      - alert: btrfs_corruption_errs
        expr: btrfs_corruption_errs > 1
        for: 15m
        labels:
          severity: warning
        annotations:
          description: 'host {{ $labels.instance }} has btrfs corruption errors'
          summary: '{{ $labels.instance }} has {{ $value }} btrfs_corruption_errs'
      - alert: btrfs_write_io_errs
        expr: btrfs_write_io_errs > 1
        for: 15m
        labels:
          severity: warning
        annotations:
          description: 'host {{ $labels.instance }} has btrfs write_io errors'
          summary: '{{ $labels.instance }} has {{ $value }} btrfs_write_io_errs'
      - alert: btrfs_read_io_errs
        expr: btrfs_read_io_errs > 1
        for: 15m
        labels:
          severity: warning
        annotations:
          description: 'host {{ $labels.instance }} has btrfs read_io errors'
          summary: '{{ $labels.instance }} has {{ $value }} btrfs_read_io_errs'
      - alert: btrfs_flush_io_errs
        expr: btrfs_flush_io_errs > 1
        for: 15m
        labels:
          severity: warning
        annotations:
          description: 'host {{ $labels.instance }} has btrfs flush_io errors'
          summary: '{{ $labels.instance }} has {{ $value }} btrfs_flush_io_errs'

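  # Backup freshness: alert when the newest Borg archive is older than
  # 1.5 days (86400 * 1.5 seconds).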
  - name: borg
    interval: 60s
    rules:
      - alert: BorgHetznerMissingBackup
        expr: time() - borg_hetzner_last_archive_timestamp > 86400 * 1.5
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: 'Borg Hetzner missing backup (instance {{ $labels.instance }})'
          description: 'Borg has not backed up for more than 36 hours. Last backup was made {{ $value | humanizeDuration }} ago'
      - alert: BorgOffsiteMissingBackup
        expr: time() - borg_offsite_last_archive_timestamp > 86400 * 1.5
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: 'Borg Offsite missing backup (instance {{ $labels.instance }})'
          description: 'Borg has not backed up for more than 36 hours. Last backup was made {{ $value | humanizeDuration }} ago'

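  # Systemd units that have entered the failed state or are flapping.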
  - name: systemd_unit
    interval: 15s
    rules:
      - alert: systemd_unit_failed
        expr: |
          node_systemd_unit_state{state="failed"} > 0
        for: 3m
        labels:
          severity: critical
        annotations:
          description: 'Instance {{ $labels.instance }}: Service {{ $labels.name }} failed'
          summary: 'Systemd unit failed'

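      # Flapping: more than 5 active-state changes within 5 minutes, or more than
      # 15 within the last hour unless there were fewer than 7 in the last 30 minutes.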
      - alert: systemd_unit_flapping
        expr: |
          changes(node_systemd_unit_state{state="active"}[5m]) > 5 or (changes(node_systemd_unit_state{state="active"}[60m]) > 15 unless changes(node_systemd_unit_state{state="active"}[30m]) < 7)
        labels:
          severity: critical
        annotations:
          description: 'Instance {{ $labels.instance }}: Service {{ $labels.name }} flapping'
          summary: 'Systemd unit flapping'

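  # GitLab service health: web, Redis, Postgres, Unicorn/Puma, Sidekiq,
  # gRPC and Workhorse.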
  - name: gitlab
    interval: 15s
    rules:
      - alert: ServiceDown
        expr: avg_over_time(up[5m]) * 100 < 50
        annotations:
          description: The service {{ $labels.job }} instance {{ $labels.instance }} is
            not responding for more than 50% of the time for 5 minutes.
          summary: The service {{ $labels.job }} is not responding
      - alert: RedisDown
        expr: avg_over_time(redis_up[5m]) * 100 < 50
        annotations:
          description: The Redis service {{ $labels.job }} instance {{ $labels.instance
            }} is not responding for more than 50% of the time for 5 minutes.
          summary: The Redis service {{ $labels.job }} is not responding
      - alert: PostgresDown
        expr: avg_over_time(pg_up[5m]) * 100 < 50
        annotations:
          description: The Postgres service {{ $labels.job }} instance {{ $labels.instance
            }} is not responding for more than 50% of the time for 5 minutes.
          summary: The Postgres service {{ $labels.job }} is not responding
      - alert: UnicornQueueing
        expr: avg_over_time(unicorn_queued_connections[30m]) > 1
        annotations:
          description: Unicorn instance {{ $labels.instance }} is queueing requests with
            an average of {{ $value | printf "%.1f" }} over the last 30 minutes.
          summary: Unicorn is queueing requests
      - alert: PumaQueueing
        expr: avg_over_time(puma_queued_connections[30m]) > 1
        annotations:
          description: Puma instance {{ $labels.instance }} is queueing requests with
            an average of {{ $value | printf "%.1f" }} over the last 30 minutes.
          summary: Puma is queueing requests
      - alert: HighUnicornUtilization
        expr: instance:unicorn_utilization:ratio * 100 > 90
        for: 60m
        annotations:
          description: Unicorn instance {{ $labels.instance }} has more than 90% worker utilization ({{ $value | printf "%.1f" }}%) over the last 60 minutes.
          summary: Unicorn has high utilization
      - alert: HighPumaUtilization
        expr: instance:puma_utilization:ratio * 100 > 90
        for: 60m
        annotations:
          description: Puma instance {{ $labels.instance }} has more than 90% thread utilization ({{ $value | printf "%.1f" }}%) over the last 60 minutes.
          summary: Puma has high utilization
      - alert: SidekiqJobsQueuing
        expr: sum by (name) (sidekiq_queue_size) > 0
        for: 60m
        annotations:
          summary: Sidekiq has jobs queued
          description: Sidekiq queue {{ $labels.name }} has {{ $value }} jobs queued for 60 minutes.
      - alert: HighgRPCResourceExhaustedRate
        expr: >
          sum without (grpc_code) (
            job_grpc:grpc_server_handled_total:rate5m{grpc_code="ResourceExhausted"}
          ) /
          sum without (grpc_code) (
            job_grpc:grpc_server_handled_total:rate5m
          ) * 100 > 1
        for: 60m
        annotations:
          summary: High gRPC ResourceExhausted error rate
          description: gRPC is returning more than 1% ({{ $value | printf "%.1f" }}%) ResourceExhausted errors over the last 60 minutes.
      - alert: PostgresDatabaseDeadlocks
        expr: increase(pg_stat_database_deadlocks[5m]) > 0
        annotations:
          summary: Postgres database has deadlocks
          description: Postgres database {{ $labels.instance }} had {{ $value | printf "%.0f" }} deadlocks in the last 5 minutes.
      - alert: PostgresDatabaseDeadlockCancels
        expr: increase(pg_stat_database_deadlocks[5m]) > 0
        annotations:
          summary: Postgres database has queries canceled due to deadlocks
          description: Postgres database {{ $labels.instance }} had {{ $value | printf "%.0f" }} queries canceled due to deadlocks in the last 5 minutes.
      # Low-traffic - < 10 QPS (600 RPM)
      - alert: WorkhorseHighErrorRate
        expr: >
          (
            sum without (job, code) (
              job_route_method_code:gitlab_workhorse_http_request_duration_seconds_count:rate5m{code=~"5.."}
            ) /
            sum without (job,code) (
              job_route_method_code:gitlab_workhorse_http_request_duration_seconds_count:rate5m
            ) < 10
          ) * 100 > 50
        annotations:
          summary: Workhorse has high error rates
          description: Workhorse route {{ $labels.route }} method {{ $labels.method }} has more than 50% errors ({{ $value | printf "%.1f" }}%) for the last 60 minutes.
      # High-traffic - >= 10 QPS (600 RPM)
      - alert: WorkhorseHighErrorRate
        expr: >
          (
            sum without (job, code) (
              job_route_method_code:gitlab_workhorse_http_request_duration_seconds_count:rate5m{code=~"5.."}
            ) /
            sum without (job,code) (
              job_route_method_code:gitlab_workhorse_http_request_duration_seconds_count:rate5m
            ) > 10
          ) * 100 > 10
        annotations:
          summary: Workhorse has high error rates
          description: Workhorse route {{ $labels.route }} method {{ $labels.method }} has more than 10% errors ({{ $value | printf "%.1f" }}%) for the last 60 minutes.

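  # Blackbox exporter probes: endpoint reachability, HTTP status codes and
  # TLS certificate expiry.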
  - name: blackbox
    interval: 15s
    rules:
      - alert: BlackboxProbeFailed
        expr: probe_success == 0
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "Blackbox probe failed (instance {{ $labels.instance }})"
          description: "Probe failed\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"
      - alert: BlackboxProbeHttpFailure
        expr: probe_http_status_code <= 199 or probe_http_status_code >= 400
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "Blackbox probe HTTP failure (instance {{ $labels.instance }})"
          description: "HTTP status code is not 200-399\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"
      - alert: BlackboxSslCertificateWillExpireSoon
        expr: probe_ssl_earliest_cert_expiry - time() < 86400 * 25
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "Blackbox SSL certificate will expire soon (instance {{ $labels.instance }})"
          description: "SSL certificate expires in 25 days\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"

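  # Rebuilderd queue backlog and worker availability.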
  - name: rebuilderd
    interval: 15m
    rules:
      - alert: RebuilderdQueueNotEmpty
        expr: rebuilderd_queue_length > 2000
        for: 24h
        labels:
          severity: warning
          service: rebuilderd
        annotations:
          summary: "Rebuilderd queue length is not empty {{ $labels.instance }})"
          description: "Rebuilderd's queue length is now: {{ $value }}"
      - alert: RebuilderdWorkersOffline
        expr: rebuilderd_workers < 3
        for: 5m
        labels:
          severity: warning
          service: rebuilderd
        annotations:
          summary: "Rebuilderd workers offline {{ $labels.instance }})"
          description: "Not all rebuilder-workers are online, currently {{ $value }} workers are online"