Verified Commit 8ea35153 authored by Jelle van der Waa's avatar Jelle van der Waa 🚧
Browse files

Add a btrfs prometheus exporter

Collect prometheus btrfs errors from the btrfs command from btrfs-progs
which since 5.10 supports json output for device stats. The collected
errors will in the future trigger an alert when the errors reach a
certain threshold.
parent c0f0f566
Pipeline #4857 passed with stage
in 46 seconds
......@@ -11,7 +11,7 @@ To access our monitoring system, go to https://monitoring.archlinux and log in v
### System
For general system performance monitoring [prometheus-node-exporter](https://github.com/prometheus/node_exporter) is used in combination with the textfile collector for Arch Linux specific metrics. A systemd service/timer 'prometheus-arch-textcollector' writes the amount of out of date packages and security updates. When running the prometheus_exporters role the node-exporter and arch textcollector is automatically added.
For general system performance monitoring [prometheus-node-exporter](https://github.com/prometheus/node_exporter) is used in combination with a textfile collector for Arch Linux specific and btrfs metrics. A systemd service/timer 'prometheus-arch-textcollector' writes the number of out of date packages and security updates. For btrfs monitoring, `btrfs device stats` is executed on all btrfs devices on the system and all error stats are recorded. When running the prometheus_exporters role, the node-exporter, arch textcollector and btrfs textcollector are automatically added.
### memcached
......
......@@ -213,6 +213,50 @@ groups:
description: 'host {{ $labels.instance }} has out of date packages'
summary: '{{ $labels.instance }} has {{ $value }} > 50 out of date packages'
# One alert per btrfs device error counter: write_io, read_io, flush_io,
# corruption and generation. Each rule fires with severity "warning" once its
# counter has been above 1 for 15 minutes.
- name: btrfs
  interval: 2m
  rules:
    - alert: btrfs_corruption_errs
      expr: btrfs_corruption_errs > 1
      for: 15m
      labels:
        severity: warning
      annotations:
        description: 'host {{ $labels.instance }} btrfs corruption errors'
        summary: '{{ $labels.instance }} has {{ $value }} btrfs_corruption_errs'
    - alert: btrfs_write_io_errs
      expr: btrfs_write_io_errs > 1
      for: 15m
      labels:
        severity: warning
      annotations:
        description: 'host {{ $labels.instance }} btrfs write_io errors'
        summary: '{{ $labels.instance }} has {{ $value }} btrfs_write_io_errs'
    - alert: btrfs_read_io_errs
      expr: btrfs_read_io_errs > 1
      for: 15m
      labels:
        severity: warning
      annotations:
        description: 'host {{ $labels.instance }} btrfs read_io errors'
        summary: '{{ $labels.instance }} has {{ $value }} btrfs_read_io_errs'
    - alert: btrfs_flush_io_errs
      expr: btrfs_flush_io_errs > 1
      for: 15m
      labels:
        severity: warning
      annotations:
        description: 'host {{ $labels.instance }} btrfs flush_io errors'
        summary: '{{ $labels.instance }} has {{ $value }} btrfs_flush_io_errs'
    # Previously a second copy of btrfs_corruption_errs; generation errors
    # had no alert at all.
    - alert: btrfs_generation_errs
      expr: btrfs_generation_errs > 1
      for: 15m
      labels:
        severity: warning
      annotations:
        description: 'host {{ $labels.instance }} btrfs generation errors'
        summary: '{{ $labels.instance }} has {{ $value }} btrfs_generation_errs'
- name: borg
interval: 60s
rules:
......
#!/bin/bash
#
# Prometheus textfile collector for btrfs device error counters.
#
# Usage: btrfs-textcollector.sh <textfile-collector-dir>
#
# Lists every device reported by `btrfs filesystem show`, reads its error
# counters with `btrfs --format json device stats` and writes them as
#   btrfs_<counter>{device="<device>"} <value>
# samples to <textfile-collector-dir>/btrfs.prom. The output is assembled in a
# temporary file next to the target and moved into place at the very end.
#
# sudoers rules match command arguments, so keep the arguments of the
# `sudo btrfs ...` invocations unchanged unless the sudoers file is updated too.

set -o errexit
set -o nounset

if (( $# != 1 )); then
  echo "Missing textcollector directory argument" >&2
  exit 1
fi

TEXTFILE_COLLECTOR_DIR=${1}
PROM_FILE=$TEXTFILE_COLLECTOR_DIR/btrfs.prom
TMP_FILE=$PROM_FILE.$$

# Single quotes: TMP_FILE is expanded (and stays quoted) when the trap fires.
trap 'rm -f "$TMP_FILE"' EXIT

# Captured without a pipeline so a failing sudo/btrfs aborts via errexit.
filesystem_show=$(sudo btrfs filesystem show)

# The last field of every " path " line is the device path.
mapfile -t btrfs_devices < <(awk '/ path /{print $NF}' <<< "$filesystem_show")

# Errors output by btrfs device stats
btrfs_errors=(write_io_errs read_io_errs flush_io_errs corruption_errs generation_errs)

# Query each device once and keep the JSON; a device whose stats cannot be
# read is skipped so it does not produce a malformed sample line.
device_stats=()
for btrfs_device in "${btrfs_devices[@]}"; do
  if stats_json=$(sudo btrfs --format json device stats "$btrfs_device"); then
    device_stats+=("$stats_json")
  else
    echo "Skipping ${btrfs_device}: btrfs device stats failed" >&2
  fi
done

# Metric-major order: HELP/TYPE are written exactly once per metric and all
# samples of that metric follow in one group.
{
  for btrfs_error in "${btrfs_errors[@]}"; do
    echo "# HELP btrfs_${btrfs_error} error"
    echo "# TYPE btrfs_${btrfs_error} gauge"
    for stats_json in "${device_stats[@]}"; do
      # One output line per entry in "device-stats".
      jq -r --arg err "$btrfs_error" \
        '.["device-stats"][] | "btrfs_\($err){device=\"\(.device)\"} \(.[$err])"' \
        <<< "$stats_json"
    done
  done
} > "$TMP_FILE"

mv -f "$TMP_FILE" "$PROM_FILE"
# Commands node_exporter may run through sudo without a password:
# device stats for "/" or for a path starting with "/<letter>", and the
# filesystem listing.
# NOTE(review): per the sudoers manual, wildcards in command arguments also
# match white space, so "/[a-zA-Z]*" may accept extra arguments after the
# path -- confirm this is acceptable.
Cmnd_Alias EXPORTER = /usr/bin/btrfs --format json device stats /,\
                      /usr/bin/btrfs --format json device stats /[a-zA-Z]*,\
                      /usr/bin/btrfs filesystem show

node_exporter ALL=(ALL) NOPASSWD: EXPORTER
---
- name: install prometheus-node-exporter
pacman: name=prometheus-node-exporter,arch-audit,pacman-contrib state=present
pacman: name=prometheus-node-exporter,arch-audit,pacman-contrib,jq state=present
- name: install prometheus-blackbox-exporter
pacman: name=prometheus-blackbox-exporter state=present
......@@ -11,10 +11,6 @@
pacman: name=prometheus-memcached-exporter state=present
when: "'memcached' in group_names"
- name: install jq for rebuilderd-textcollector
pacman: name=jq state=present
when: "'rebuilderd' in group_names"
- name: add node_exporter to rebuilderd group
user: name=node_exporter groups=rebuilderd append=yes
when: "'rebuilderd' in group_names"
......@@ -59,6 +55,7 @@
- rebuilderd-textcollector.sh
- rebuilderd-status-textcollector.py
- archive-textcollector.sh
- btrfs-textcollector.sh
- name: install arch textcollector service
template: src=prometheus-arch-textcollector.service.j2 dest=/etc/systemd/system/prometheus-arch-textcollector.service owner=root group=root mode=644
......@@ -109,6 +106,22 @@
systemd: name=prometheus-archive-textcollector.timer enabled=yes daemon_reload=yes state=started
when: "'archive_mirrors' in group_names or inventory_hostname == 'gemini.archlinux.org'"
# Check the file with visudo before it is put in place: a syntactically
# invalid drop-in under /etc/sudoers.d can break sudo on the host.
- name: install sudoers for btrfs
  copy: src=sudoers dest=/etc/sudoers.d/node_exporter owner=root group=root mode=0440 validate='visudo -cf %s'
  when: filesystem == "btrfs"
# Renders the btrfs textcollector service unit on hosts whose `filesystem`
# variable is "btrfs".
- name: install btrfs textcollector service
  template:
    src: prometheus-btrfs-textcollector.service.j2
    dest: /etc/systemd/system/prometheus-btrfs-textcollector.service
    owner: root
    group: root
    mode: '644'
  when: filesystem == "btrfs"
# Renders the timer unit that triggers the btrfs textcollector service on
# hosts whose `filesystem` variable is "btrfs".
- name: install btrfs textcollector timer
  template:
    src: prometheus-btrfs-textcollector.timer.j2
    dest: /etc/systemd/system/prometheus-btrfs-textcollector.timer
    owner: root
    group: root
    mode: '644'
  when: filesystem == "btrfs"
# Reloads systemd, then enables and starts the btrfs textcollector timer on
# hosts whose `filesystem` variable is "btrfs".
- name: enable and start prometheus btrfs textcollector timer
  systemd:
    name: prometheus-btrfs-textcollector.timer
    enabled: true
    daemon_reload: true
    state: started
  when: filesystem == "btrfs"
- name: enable prometheus-node-exporter service
systemd: name=prometheus-node-exporter enabled=yes daemon_reload=yes state=started
......
# Service template: one run of the btrfs textcollector script.
[Unit]
Description=Prometheus btrfs Exporter

[Service]
# The script runs to completion on every activation.
Type=oneshot
User=node_exporter
# The service gets no network access.
PrivateNetwork=true
# The script writes its output into the textfile collector directory.
ReadWritePaths={{ prometheus_textfile_dir }}
ExecStart=/usr/local/bin/btrfs-textcollector.sh {{ prometheus_textfile_dir }}
# Timer template for the btrfs textcollector.
[Unit]
Description=Prometheus Btrfs Exporter TextCollector Timer

[Timer]
# First run 15 minutes after boot, then one day after the last activation of
# the triggered unit; each elapse is delayed by a random amount of up to
# one minute.
OnBootSec=15min
OnUnitActiveSec=1d
RandomizedDelaySec=1min

[Install]
WantedBy=timers.target
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment