Verified Commit cd4b2844 authored by Jelle van der Waa's avatar Jelle van der Waa 🚧
Browse files

Add rebuilderd build queue length textcollector

Record the rebuilderd queue length in Prometheus so we can generate an
alert when the queue length keeps rising, as this could be an
indication that the rebuilders have builds which are stuck.
parent 3fd36ddb
Pipeline #1641 passed with stage
in 49 seconds
......@@ -93,6 +93,9 @@ secure-runner2.archlinux.org
repro1.pkgbuild.com
repro3.pkgbuild.com
# Members of the rebuilderd inventory group; tasks guarded by
# "'rebuilderd' in group_names" run on these hosts.
[rebuilderd]
reproducible.archlinux.org
[memcached]
apollo.archlinux.org
aur.archlinux.org
......
......@@ -394,3 +394,25 @@ groups:
annotations:
summary: "Blackbox SSL certificate will expire soon (instance {{ $labels.instance }})"
description: "SSL certificate expires in 3 days\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
# Alerts on the rebuilderd_queue_length and rebuilderd_workers gauges.
# The group is evaluated every 15 minutes.
- name: rebuilderd
  interval: 15m
  rules:
    # Fires once the queue has stayed above 2000 entries for 24 hours.
    - alert: RebuilderdQueueNotEmpty
      expr: rebuilderd_queue_length > 2000
      for: 24h
      labels:
        severity: warning
        service: rebuilderd
      annotations:
        # Parenthesised instance suffix matches the other alerts' summaries.
        summary: "Rebuilderd queue length is not empty (instance {{ $labels.instance }})"
        description: "Rebuilderd's queue length is now: {{ $value }}"
    # Fires when fewer than 4 workers have been reported for 5 minutes.
    - alert: RebuilderdWorkersOffline
      expr: rebuilderd_workers < 4
      for: 5m
      labels:
        severity: warning
        service: rebuilderd
      annotations:
        summary: "Rebuilderd workers offline (instance {{ $labels.instance }})"
        description: "Not all rebuilder-workers are online, currently {{ $value }} workers are online"
......@@ -11,9 +11,16 @@ route:
group_wait: 30s
group_interval: 5m
repeat_interval: 24h
receiver: 'devops'
receiver: 'default-receiver'
routes:
- receiver: 'rebuilderd'
match_re:
service: rebuilderd
receivers:
- name: 'devops'
- name: 'default-receiver'
email_configs:
- to: 'alerts@archlinux.org'
- name: 'rebuilderd'
email_configs:
- to: 'jelle@archlinux.org'
#!/bin/bash
#
# Write rebuilderd metrics in Prometheus text exposition format.
#
# Usage: rebuilderd-textcollector.sh <textfile-collector-dir>
#
# Produces <textfile-collector-dir>/rebuilderd.prom with two gauges:
#   rebuilderd_queue_length  number of entries in `rebuildctl queue ls --json`
#   rebuilderd_workers       number of lines printed by `rebuildctl status`
#
# The file is assembled under a temporary name and renamed into place, so a
# reader never sees a partially written file. Exits non-zero without touching
# the existing .prom file if rebuildctl/jq fail or return non-numeric output.

set -o errexit
set -o nounset
# Without pipefail a failing rebuildctl is hidden by jq/wc exiting 0, and a
# bogus (empty or zero) sample would be published.
set -o pipefail

if (( $# != 1 )); then
  echo "Missing textcollector directory argument" >&2
  exit 1
fi

HOSTNAME=$(hostname)
TEXTFILE_COLLECTOR_DIR=${1}
PROM_FILE=$TEXTFILE_COLLECTOR_DIR/rebuilderd.prom
TMP_FILE=$PROM_FILE.$$

# Start from a clean temp file and make sure it never outlives this run.
# Single quotes defer expansion until the trap fires; the inner double quotes
# keep paths containing whitespace intact.
rm -f -- "$TMP_FILE"
trap 'rm -f -- "$TMP_FILE"' EXIT

queuelength=$(rebuildctl queue ls --json | jq '.queue | length')
workers=$(rebuildctl status | wc -l)
# Some wc implementations pad the count with blanks.
workers=${workers//[[:space:]]/}

# Refuse to publish anything that is not a plain non-negative integer.
for sample in "$queuelength" "$workers"; do
  if ! [[ $sample =~ ^[0-9]+$ ]]; then
    echo "Unexpected non-numeric metric value: '${sample}'" >&2
    exit 1
  fi
done

{
  echo "# HELP rebuilderd_queue_length number of packages in rebuilderd queue"
  echo "# TYPE rebuilderd_queue_length gauge"
  echo "rebuilderd_queue_length{host=\"${HOSTNAME}\"} $queuelength"
  echo "# HELP rebuilderd_workers number of rebuilderd-workers available"
  echo "# TYPE rebuilderd_workers gauge"
  echo "rebuilderd_workers{host=\"${HOSTNAME}\"} $workers"
} > "$TMP_FILE"

mv -f -- "$TMP_FILE" "$PROM_FILE"
......@@ -11,6 +11,14 @@
pacman: name=prometheus-memcached-exporter state=present
when: "'memcached' in group_names"
# Installs the jq package on hosts in the rebuilderd group.
- name: install jq for rebuilderd-textcollector
  pacman:
    name: jq
    state: present
  when: "'rebuilderd' in group_names"
# Adds the node_exporter user to the rebuilderd group; append keeps the
# user's existing supplementary groups.
- name: add node_exporter to rebuilderd group
  user:
    name: node_exporter
    groups: rebuilderd
    append: true
  when: "'rebuilderd' in group_names"
# Installs the MySQL exporter package on hosts in the mysql_servers group.
- name: install prometheus-mysqld-exporter
  pacman:
    name: prometheus-mysqld-exporter
    state: present
  when: "'mysql_servers' in group_names"
......@@ -47,6 +55,7 @@
with_items:
- arch-textcollector.sh
- borg-textcollector.sh
- rebuilderd-textcollector.sh
- name: install arch textcollector service
template: src=prometheus-arch-textcollector.service.j2 dest=/etc/systemd/system/prometheus-arch-textcollector.service owner=root group=root mode=600
......@@ -73,6 +82,18 @@
copy: src=blackbox.yml dest=/etc/prometheus/blackbox.yml owner=root group=root mode=0755
when: "'prometheus' in group_names"
# Renders the rebuilderd textcollector service unit on rebuilderd hosts.
- name: install rebuilderd textcollector service
  template:
    src: prometheus-rebuilderd-textcollector.service.j2
    dest: /etc/systemd/system/prometheus-rebuilderd-textcollector.service
    owner: root
    group: root
    # Quoted so the mode reaches Ansible as a string, as in key=value form.
    mode: "600"
  when: "'rebuilderd' in group_names"
# Renders the rebuilderd textcollector timer unit on rebuilderd hosts.
- name: install rebuilderd textcollector timer
  template:
    src: prometheus-rebuilderd-textcollector.timer.j2
    dest: /etc/systemd/system/prometheus-rebuilderd-textcollector.timer
    owner: root
    group: root
    # Quoted so the mode reaches Ansible as a string, as in key=value form.
    mode: "600"
  when: "'rebuilderd' in group_names"
# Reloads systemd, then enables and starts the textcollector timer on
# rebuilderd hosts.
- name: enable and start prometheus rebuilderd textcollector timer
  systemd:
    name: prometheus-rebuilderd-textcollector.timer
    enabled: true
    daemon_reload: true
    state: started
  when: "'rebuilderd' in group_names"
- name: enable prometheus-node-exporter service
systemd: name=prometheus-node-exporter enabled=yes daemon_reload=yes state=started
......
# One-shot unit that runs rebuilderd-textcollector.sh as node_exporter,
# writing into the Prometheus textfile directory.
[Unit]
Description=Prometheus Rebuilderd Exporter
After=network.target

[Service]
Type=oneshot
User=node_exporter
ExecStart=/usr/local/bin/rebuilderd-textcollector.sh {{ prometheus_textfile_dir }}
UMask=077

# Filesystem: everything is read-only except the textfile directory.
ProtectSystem=strict
ReadWritePaths={{ prometheus_textfile_dir }}
ProtectHome=true
PrivateTmp=true
PrivateDevices=true

# Privileges and process restrictions.
NoNewPrivileges=true
CapabilityBoundingSet=
RestrictSUIDSGID=true
RestrictNamespaces=true
RestrictRealtime=true
LockPersonality=true
MemoryDenyWriteExecute=true
RemoveIPC=true
SystemCallArchitectures=native

# Sockets: netlink and packet address families are denied.
RestrictAddressFamilies=~AF_NETLINK
RestrictAddressFamilies=~AF_PACKET

# Kernel and host state.
ProtectHostname=true
ProtectClock=true
ProtectControlGroups=true
ProtectKernelLogs=true
ProtectKernelModules=true
ProtectKernelTunables=true
# Timer for the rebuilderd textcollector: first run 15 minutes after boot,
# then 60 minutes after each activation, with up to 1 minute of random delay.
[Unit]
Description=Prometheus Rebuilderd Exporter TextCollector Timer

[Timer]
OnBootSec=15min
OnUnitActiveSec=60m
RandomizedDelaySec=1min

[Install]
WantedBy=timers.target
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment