Verified Commit 8b3c68e5 authored by Jelle van der Waa

Add prometheus role for the prometheus/alertmanager server

Introduce a new monitoring server with prometheus and alertmanager for
monitoring all our boxes.
parent fa89191e
Pipeline #1466 passed with stage in 46 seconds
@@ -257,6 +257,15 @@ Medium-fast-ish packet.net box with Debian on it. Is currently maintained manual
#### Services
- GitLab runner
### monitoring.archlinux.org
Prometheus server which collects performance metrics from our services and runs Alertmanager.
#### Services
- Prometheus
- Alertmanager
## Ansible repo workflows
### Replace vault password and change vaulted passwords
@@ -9,6 +9,9 @@ arch_groups:
- docker-image-sudo
arch_users:
alertmanager:
name: ""
groups: []
aaron:
name: "Aaron Griffin"
ssh_key: aaron.pub
---
filesystem: btrfs
ipv4_address: 95.217.220.31
$ANSIBLE_VAULT;1.1;AES256
66633663616636326339373764306333386330353631643734333633663361633437613432323836
6164623837303336343161653838396434623139353939340a386234616563396433393564613665
36613238396137633132313737303166393265393363386538373833316636373964366561303335
3935653864343131350a376236363834383865306566346462646566646439363162393730643831
36346631313335666262643136613734333239366530303365353432306663333265316162636534
33393134643363383433336635366439643465333639346164336362643662666632336336346466
663635323638393661393764666364646530
@@ -97,3 +97,6 @@ repro3.pkgbuild.com
apollo.archlinux.org
aur.archlinux.org
aur-dev.archlinux.org
[prometheus]
monitoring.archlinux.org
- name: set up prometheus server
hosts: monitoring.archlinux.org
remote_user: root
roles:
- { role: firewalld }
- { role: common }
- { role: tools }
- { role: sshd }
- { role: root_ssh }
- { role: hardening }
- { role: borg_client, tags: ["borg"], when: "'borg_clients' in group_names" }
- { role: prometheus }
- { role: certbot }
- { role: nginx }
monitoring_domain: monitoring.archlinux.org
groups:
- name: node_common
interval: 60s
rules:
- alert: HostHighCpuLoad
expr: 100 - (avg by(instance) (irate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 80
for: 5m
labels:
severity: warning
annotations:
summary: "Host high CPU load (instance {{ $labels.instance }})"
description: "CPU load is > 80%\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: HostSwapIsFillingUp
expr: (1 - (node_memory_SwapFree_bytes / node_memory_SwapTotal_bytes)) * 100 > 80
for: 5m
labels:
severity: warning
annotations:
summary: "Host swap is filling up (instance {{ $labels.instance }})"
description: "Swap is filling up (>80%)\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: HostOutOfMemory
expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 10
for: 5m
labels:
severity: warning
annotations:
summary: "Host out of memory (instance {{ $labels.instance }})"
description: "Node memory is filling up (< 10% left)\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: HostMemoryUnderMemoryPressure
expr: rate(node_vmstat_pgmajfault[1m]) > 1000
for: 5m
labels:
severity: warning
annotations:
summary: "Host memory under memory pressure (instance {{ $labels.instance }})"
description: "The node is under heavy memory pressure. High rate of major page faults\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: HostUnusualNetworkThroughputIn
expr: sum by (instance) (irate(node_network_receive_bytes_total[2m])) / 1024 / 1024 > 100
for: 5m
labels:
severity: warning
annotations:
summary: "Host unusual network throughput in (instance {{ $labels.instance }})"
description: "Host network interfaces are probably receiving too much data (> 100 MB/s)\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: HostUnusualNetworkThroughputOut
expr: sum by (instance) (irate(node_network_transmit_bytes_total[2m])) / 1024 / 1024 > 100
for: 5m
labels:
severity: warning
annotations:
summary: "Host unusual network throughput out (instance {{ $labels.instance }})"
description: "Host network interfaces are probably sending too much data (> 100 MB/s)\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: HostOutOfDiskSpace
expr: (node_filesystem_avail_bytes{mountpoint="/rootfs"} * 100) / node_filesystem_size_bytes{mountpoint="/rootfs"} < 10
for: 5m
labels:
severity: warning
annotations:
summary: "Host out of disk space (instance {{ $labels.instance }})"
description: "Disk is almost full (< 20% left)\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: HostDiskWillFillIn4Hours
expr: predict_linear(node_filesystem_free_bytes{fstype!~"tmpfs",mountpoint!~"/backup"}[1h], 4 * 3600) < 0
for: 5m
labels:
severity: warning
annotations:
summary: "Host disk will fill in 4 hours (instance {{ $labels.instance }})"
description: "Disk will fill in 4 hours at current write rate\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: HostOutOfInodes
expr: node_filesystem_files_free{mountpoint="/rootfs"} / node_filesystem_files{mountpoint="/rootfs"} * 100 < 10
for: 5m
labels:
severity: warning
annotations:
summary: "Host out of inodes (instance {{ $labels.instance }})"
description: "Disk is almost running out of available inodes (< 10% left)\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: HostOomKillDetected
expr: increase(node_vmstat_oom_kill[5m]) > 0
for: 5m
labels:
severity: warning
annotations:
summary: "Host OOM kill detected (instance {{ $labels.instance }})"
description: "OOM kill detected\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- name: prometheus
interval: 60s
rules:
- alert: PrometheusTargetMissing
expr: up == 0
for: 5m
labels:
severity: critical
annotations:
summary: "Prometheus target missing (instance {{ $labels.instance }})"
description: "A Prometheus target {{ $value }} has disappeared. An exporter might have crashed."
- alert: PrometheusTooManyRestarts
expr: changes(process_start_time_seconds{job=~"prometheus|pushgateway|alertmanager"}[15m]) > 2
for: 5m
labels:
severity: warning
annotations:
summary: "Prometheus too many restarts (instance {{ $labels.instance }})"
description: "Prometheus has restarted more than twice in the last 15 minutes. It might be crashlooping.\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: PrometheusNotConnectedToAlertmanager
expr: prometheus_notifications_alertmanagers_discovered < 1
for: 5m
labels:
severity: critical
annotations:
summary: "Prometheus not connected to alertmanager (instance {{ $labels.instance }})"
description: "Prometheus cannot connect the alertmanager\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: PrometheusRuleEvaluationFailures
expr: increase(prometheus_rule_evaluation_failures_total[3m]) > 0
for: 5m
labels:
severity: critical
annotations:
summary: "Prometheus rule evaluation failures (instance {{ $labels.instance }})"
description: "Prometheus encountered {{ $value }} rule evaluation failures, leading to potentially ignored alerts.\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: PrometheusTemplateTextExpansionFailures
expr: increase(prometheus_template_text_expansion_failures_total[3m]) > 0
for: 5m
labels:
severity: critical
annotations:
summary: "Prometheus template text expansion failures (instance {{ $labels.instance }})"
description: "Prometheus encountered {{ $value }} template text expansion failures\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: PrometheusNotificationsBacklog
expr: min_over_time(prometheus_notifications_queue_length[10m]) > 0
for: 5m
labels:
severity: warning
annotations:
summary: "Prometheus notifications backlog (instance {{ $labels.instance }})"
description: "The Prometheus notification queue has not been empty for 10 minutes\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: PrometheusAlertmanagerNotificationFailing
expr: rate(alertmanager_notifications_failed_total[1m]) > 0
for: 5m
labels:
severity: critical
annotations:
summary: "Prometheus AlertManager notification failing (instance {{ $labels.instance }})"
description: "Alertmanager is failing sending notifications\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: PrometheusTargetScrapingSlow
expr: prometheus_target_interval_length_seconds{quantile="0.9"} > 60
for: 5m
labels:
severity: warning
annotations:
summary: "Prometheus target scraping slow (instance {{ $labels.instance }})"
description: "Prometheus is scraping exporters slowly\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: PrometheusLargeScrape
expr: increase(prometheus_target_scrapes_exceeded_sample_limit_total[10m]) > 10
for: 5m
labels:
severity: warning
annotations:
summary: "Prometheus large scrape (instance {{ $labels.instance }})"
description: "Prometheus has many scrapes that exceed the sample limit\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: PrometheusTsdbCheckpointCreationFailures
expr: increase(prometheus_tsdb_checkpoint_creations_failed_total[3m]) > 0
for: 5m
labels:
severity: critical
annotations:
summary: "Prometheus TSDB checkpoint creation failures (instance {{ $labels.instance }})"
description: "Prometheus encountered {{ $value }} checkpoint creation failures\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: PrometheusTsdbCompactionsFailed
expr: increase(prometheus_tsdb_compactions_failed_total[3m]) > 0
for: 5m
labels:
severity: critical
annotations:
summary: "Prometheus TSDB compactions failed (instance {{ $labels.instance }})"
description: "Prometheus encountered {{ $value }} TSDB compactions failures\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: PrometheusTsdbWalCorruptions
expr: increase(prometheus_tsdb_wal_corruptions_total[3m]) > 0
for: 5m
labels:
severity: critical
annotations:
summary: "Prometheus TSDB WAL corruptions (instance {{ $labels.instance }})"
description: "Prometheus encountered {{ $value }} TSDB WAL corruptions\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: PrometheusTsdbWalTruncationsFailed
expr: increase(prometheus_tsdb_wal_truncations_failed_total[3m]) > 0
for: 5m
labels:
severity: critical
annotations:
summary: "Prometheus TSDB WAL truncations failed (instance {{ $labels.instance }})"
description: "Prometheus encountered {{ $value }} TSDB WAL truncation failures\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- name: arch
interval: 60s
rules:
- alert: pacman_security_updates_pending
expr: pacman_security_updates_pending > 0
for: 1m
labels:
severity: warning
annotations:
description: 'host {{ $labels.instance }} has vulnerable but updatable packages'
summary: '{{ $labels.instance }} has {{ $value }} vulnerable packages'
- alert: pacman_updates_pending
expr: pacman_updates_pending > 50
for: 1m
labels:
severity: warning
annotations:
description: 'host {{ $labels.instance }} has out of date packages'
summary: '{{ $labels.instance }} has {{ $value }} out of date packages'
- name: borg
interval: 60s
rules:
- alert: BorgHetznerMissingBackup
expr: time() - borg_hetzner_last_archive_timestamp > 86400 * 1.2
for: 2m
labels:
severity: critical
annotations:
summary: 'Borg Hetzner missing backup (instance {{ $labels.instance }})'
description: 'Borg has not made a backup for more than 24 hours. Last backup was {{ $value | humanizeDuration }} ago'
- alert: BorgOffsiteMissingBackup
expr: time() - borg_offsite_last_archive_timestamp > 86400 * 1.2
for: 2m
labels:
severity: critical
annotations:
summary: 'Borg Offsite missing backup (instance {{ $labels.instance }})'
description: 'Borg has not made a backup for more than 24 hours. Last backup was {{ $value | humanizeDuration }} ago'
- name: systemd_unit
interval: 15s
rules:
- alert: systemd_unit_failed
expr: |
node_systemd_unit_state{state="failed"} > 0
for: 3m
labels:
severity: critical
annotations:
description: 'Instance {{ $labels.instance }}: Service {{ $labels.name }} failed'
summary: 'Systemd unit failed'
- alert: systemd_unit_flapping
expr: |
changes(node_systemd_unit_state{state="active"}[5m]) > 5 or (changes(node_systemd_unit_state{state="active"}[60m]) > 15 unless changes(node_systemd_unit_state{state="active"}[30m]) < 7)
labels:
severity: critical
annotations:
description: 'Instance {{ $labels.instance }}: Service {{ $labels.name }} flapping'
summary: 'Systemd unit flapping'
- name: gitlab
interval: 15s
rules:
- alert: ServiceDown
expr: avg_over_time(up[5m]) * 100 < 50
annotations:
description: The service {{ $labels.job }} instance {{ $labels.instance }} is
not responding for more than 50% of the time for 5 minutes.
summary: The service {{ $labels.job }} is not responding
- alert: RedisDown
expr: avg_over_time(redis_up[5m]) * 100 < 50
annotations:
description: The Redis service {{ $labels.job }} instance {{ $labels.instance
}} is not responding for more than 50% of the time for 5 minutes.
summary: The Redis service {{ $labels.job }} is not responding
- alert: PostgresDown
expr: avg_over_time(pg_up[5m]) * 100 < 50
annotations:
description: The Postgres service {{ $labels.job }} instance {{ $labels.instance
}} is not responding for more than 50% of the time for 5 minutes.
summary: The Postgres service {{ $labels.job }} is not responding
- alert: UnicornQueueing
expr: avg_over_time(unicorn_queued_connections[30m]) > 1
annotations:
description: Unicorn instance {{ $labels.instance }} is queueing requests with
an average of {{ $value | printf "%.1f" }} over the last 30 minutes.
summary: Unicorn is queueing requests
- alert: PumaQueueing
expr: avg_over_time(puma_queued_connections[30m]) > 1
annotations:
description: Puma instance {{ $labels.instance }} is queueing requests with
an average of {{ $value | printf "%.1f" }} over the last 30 minutes.
summary: Puma is queueing requests
- alert: HighUnicornUtilization
expr: instance:unicorn_utilization:ratio * 100 > 90
for: 60m
annotations:
description: Unicorn instance {{ $labels.instance }} has more than 90% worker utilization ({{ $value | printf "%.1f" }}%) over the last 60 minutes.
summary: Unicorn has high utilization
- alert: HighPumaUtilization
expr: instance:puma_utilization:ratio * 100 > 90
for: 60m
annotations:
description: Puma instance {{ $labels.instance }} has more than 90% thread utilization ({{ $value | printf "%.1f" }}%) over the last 60 minutes.
summary: Puma has high utilization
- alert: SidekiqJobsQueuing
expr: sum by (name) (sidekiq_queue_size) > 0
for: 60m
annotations:
summary: Sidekiq has jobs queued
description: Sidekiq queue {{ $labels.name }} has {{ $value }} jobs queued for 60 minutes.
- alert: HighgRPCResourceExhaustedRate
expr: >
sum without (grpc_code) (
job_grpc:grpc_server_handled_total:rate5m{grpc_code="ResourceExhausted"}
) /
sum without (grpc_code) (
job_grpc:grpc_server_handled_total:rate5m
) * 100 > 1
for: 60m
annotations:
summary: High gRPC ResourceExhausted error rate
description: gRPC is returning more than 1% ({{ $value | printf "%.1f" }}%) ResourceExhausted errors over the last 60 minutes.
- alert: PostgresDatabaseDeadlocks
expr: increase(pg_stat_database_deadlocks[5m]) > 0
annotations:
summary: Postgres database has deadlocks
description: Postgres database {{ $labels.instance }} had {{ $value | printf "%.0f" }} deadlocks in the last 5 minutes.
- alert: PostgresDatabaseDeadlockCancels
expr: increase(pg_stat_database_deadlocks[5m]) > 0
annotations:
summary: Postgres database has queries canceled due to deadlocks
description: Postgres database {{ $labels.instance }} had {{ $value | printf "%.0f" }} queries canceled due to deadlocks in the last 5 minutes.
# Low-traffic - < 10 QPS (600 RPM)
- alert: WorkhorseHighErrorRate
expr: >
(
sum without (job, code) (
job_route_method_code:gitlab_workhorse_http_request_duration_seconds_count:rate5m{code=~"5.."}
) /
sum without (job,code) (
job_route_method_code:gitlab_workhorse_http_request_duration_seconds_count:rate5m
) < 10
) * 100 > 50
annotations:
summary: Workhorse has high error rates
description: Workhorse route {{ $labels.route }} method {{ $labels.method }} has more than 50% errors ({{ $value | printf "%.1f" }}%) for the last 60 minutes.
# High-traffic - >= 10 QPS (600 RPM)
- alert: WorkhorseHighErrorRate
expr: >
(
sum without (job, code) (
job_route_method_code:gitlab_workhorse_http_request_duration_seconds_count:rate5m{code=~"5.."}
) /
sum without (job,code) (
job_route_method_code:gitlab_workhorse_http_request_duration_seconds_count:rate5m
) > 10
) * 100 > 10
annotations:
summary: Workhorse has high error rates
description: Workhorse route {{ $labels.route }} method {{ $labels.method }} has more than 10% errors ({{ $value | printf "%.1f" }}%) for the last 60 minutes.
---
- name: reload prometheus
service: name=prometheus state=reloaded
- name: reload alertmanager
service: name=alertmanager state=reloaded
---
- name: install prometheus and alertmanager
pacman: name=prometheus,alertmanager state=present
- name: install prometheus configuration
template: src=prometheus.yml.j2 dest=/etc/prometheus/prometheus.yml owner=root group=root mode=644
notify: reload prometheus
- name: install prometheus alert configuration
copy: src=node.rules.yml dest=/etc/prometheus/node.rules.yml owner=root group=root mode=644
notify: reload prometheus
- name: install alertmanager configuration
template: src=alertmanager.yml.j2 dest=/etc/alertmanager/alertmanager.yml owner=root group=root mode=644
notify: reload alertmanager
- name: enable prometheus server service
systemd: name=prometheus enabled=yes daemon_reload=yes state=started
- name: enable alertmanager server service
systemd: name=alertmanager enabled=yes daemon_reload=yes state=started
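The tasks above copy the rule and configuration files and reload the services without checking the files first. As a minimal sketch only (not part of this commit), and assuming promtool and amtool are shipped by the prometheus and alertmanager packages installed above, the copy/template tasks could validate the files before putting them in place:

# illustrative variant with validation; not in this commit
- name: install prometheus alert configuration
  copy:
    src: node.rules.yml
    dest: /etc/prometheus/node.rules.yml
    owner: root
    group: root
    mode: '0644'
    validate: promtool check rules %s
  notify: reload prometheus

- name: install alertmanager configuration
  template:
    src: alertmanager.yml.j2
    dest: /etc/alertmanager/alertmanager.yml
    owner: root
    group: root
    mode: '0644'
    validate: amtool check-config %s
  notify: reload alertmanager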
global:
resolve_timeout: 5m
smtp_smarthost: 'mail.archlinux.org:587'
smtp_from: 'alertmanager@archlinux.org'
smtp_require_tls: true
smtp_auth_username: alertmanager
smtp_auth_password: {{ vault_monitoring_alertmanager_smtp_pass }}
route:
group_by: ['instance', 'severity']
group_wait: 30s
group_interval: 5m
repeat_interval: 1h
receiver: 'devops'
receivers:
- name: 'devops'
email_configs:
- to: 'alerts@archlinux.org'
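Every alert is grouped by instance and severity and mailed to the single 'devops' receiver. Purely as a hypothetical illustration (not in this commit), Alertmanager's routing tree also allows child routes, for example to give critical alerts a shorter repeat interval:

# hypothetical child route keyed on the severity label; the 30m interval is made up
route:
  group_by: ['instance', 'severity']
  group_wait: 30s
  group_interval: 5m
  repeat_interval: 1h
  receiver: 'devops'
  routes:
    - match:
        severity: critical
      repeat_interval: 30m
      receiver: 'devops'

Alerts matching severity: critical would then be re-sent every 30 minutes while everything else keeps the 1 hour default.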
global:
scrape_interval: 60s
# Load rules once and periodically evaluate them according to the global 'evaluation_interval'.
rule_files:
- node.rules.yml
# Alertmanager configuration
alerting:
alertmanagers:
- static_configs:
- targets:
- localhost:9093
scrape_configs:
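The template is cut off here, so the actual scrape jobs are not shown. As a rough, illustrative sketch only (not the repo's real template), a static node_exporter job under scrape_configs could look like:

  # hypothetical static scrape job; in practice targets would be templated from the Ansible inventory
  - job_name: node
    static_configs:
      - targets:
          - 'localhost:9100'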