From 23564b29a6eec7b477f5441661c1a45adf99c77c Mon Sep 17 00:00:00 2001 From: Jelle van der Waa Date: Mon, 31 Aug 2020 18:18:53 +0200 Subject: [PATCH 1/3] Introduce prometheus exporters role for collection Add a new role called prometheus_exporters which should be run on every machine we have and starts different collectors depending on what group the machine is in. Currently supported are the gitlab runner exporter, rebuilder textcollector, mysqld-exporter, borg textcollector and a node/arch exporter. The arch exporter monitors the security status and pacman out of date packages gauge. --- group_vars/all/common.yml | 3 + group_vars/all/vault_gitlab.yml | 177 +++++++++--------- group_vars/gitlab_runners.yml | 2 + hosts | 10 + playbooks/aur.archlinux.org.yml | 1 + playbooks/gitlab-runners.yml | 1 + playbooks/gitlab.archlinux.org.yml | 1 + playbooks/monitoring.archlinux.org.yml | 1 + playbooks/reproducible.archlinux.org.yml | 1 + roles/gitlab_runner/tasks/main.yml | 7 + roles/prometheus/defaults/main.yml | 1 + roles/prometheus/templates/prometheus.yml.j2 | 66 +++++++ roles/prometheus_exporters/defaults/main.yml | 10 + .../files/arch-textcollector.sh | 31 +++ .../files/borg-textcollector.sh | 46 +++++ roles/prometheus_exporters/tasks/main.yml | 88 +++++++++ .../prometheus-arch-textcollector.service.j2 | 37 ++++ .../prometheus-arch-textcollector.timer.j2 | 10 + .../prometheus-borg-textcollector.service.j2 | 35 ++++ .../prometheus-borg-textcollector.timer.j2 | 10 + .../templates/prometheus-mysqld-exporter.j2 | 3 + .../templates/prometheus-node-exporter.env.j2 | 1 + 22 files changed, 457 insertions(+), 85 deletions(-) create mode 100644 roles/prometheus_exporters/defaults/main.yml create mode 100755 roles/prometheus_exporters/files/arch-textcollector.sh create mode 100755 roles/prometheus_exporters/files/borg-textcollector.sh create mode 100644 roles/prometheus_exporters/tasks/main.yml create mode 100644 
roles/prometheus_exporters/templates/prometheus-arch-textcollector.service.j2 create mode 100644 roles/prometheus_exporters/templates/prometheus-arch-textcollector.timer.j2 create mode 100644 roles/prometheus_exporters/templates/prometheus-borg-textcollector.service.j2 create mode 100644 roles/prometheus_exporters/templates/prometheus-borg-textcollector.timer.j2 create mode 100644 roles/prometheus_exporters/templates/prometheus-mysqld-exporter.j2 create mode 100644 roles/prometheus_exporters/templates/prometheus-node-exporter.env.j2 diff --git a/group_vars/all/common.yml b/group_vars/all/common.yml index 56a6bf87..664f5ddc 100644 --- a/group_vars/all/common.yml +++ b/group_vars/all/common.yml @@ -10,3 +10,6 @@ zabbix_agent_templates: # this is used by the maintenance role to get the ip address # of the machine running the playbook maintenance_remote_machine: "{{ hostvars[inventory_hostname]['ansible_env'].SSH_CLIENT.split(' ')[0] }}" + +# prometheus-node-exporter port +prometheus_exporter_port: '9100' diff --git a/group_vars/all/vault_gitlab.yml b/group_vars/all/vault_gitlab.yml index 8f4e0e01..64145b7e 100644 --- a/group_vars/all/vault_gitlab.yml +++ b/group_vars/all/vault_gitlab.yml @@ -1,86 +1,93 @@ $ANSIBLE_VAULT;1.1;AES256 -35393938326563366437646365633563303031393034626433333163373838613535333136356132 -3139393330383337376633313739643431636337343263310a616263613665356437383862663134 -34643230613832643332323634613561313634386636373937373533653338313030633339653235 -6330646665656530350aa326136373934633930656538363633 +39636137336436363233643038663935386633383433353533383134636532353139303239326332 +3465626434646334620adiff --git a/group_vars/gitlab_runners.yml b/group_vars/gitlab_runners.yml index 1c98fb00..80777b00 100644 --- a/group_vars/gitlab_runners.yml +++ b/group_vars/gitlab_runners.yml @@ -1,3 +1,5 @@ +gitlab_runner_exporter_port: 9252 + fail2ban_jails: sshd: true postfix: false diff --git a/hosts b/hosts index 43a89c29..bc91b96b 100644 --- a/hosts +++ 
b/hosts @@ -100,3 +100,13 @@ aur-dev.archlinux.org [prometheus] monitoring.archlinux.org + +[node_exporters] +aur.archlinux.org +monitoring.archlinux.org +gitlab.archlinux.org +reproducible.archlinux.org +runner1.archlinux.org +runner2.archlinux.org +secure-runner1.archlinux.org +secure-runner2.archlinux.org diff --git a/playbooks/aur.archlinux.org.yml b/playbooks/aur.archlinux.org.yml index e2fcabf5..6a13585a 100644 --- a/playbooks/aur.archlinux.org.yml +++ b/playbooks/aur.archlinux.org.yml @@ -8,6 +8,7 @@ - { role: tools } - { role: sshd, sshd_enable_includes: true } - { role: root_ssh } + - { role: prometheus_exporters } - { role: certbot } - { role: nginx } - { role: mariadb, mariadb_innodb_buffer_pool_size: '64M', mariadb_table_open_cache: '256', mariadb_query_cache_type: '0', diff --git a/playbooks/gitlab-runners.yml b/playbooks/gitlab-runners.yml index b2d09a37..3980aed0 100644 --- a/playbooks/gitlab-runners.yml +++ b/playbooks/gitlab-runners.yml @@ -9,4 +9,5 @@ - { role: sshd } - { role: root_ssh } - { role: fail2ban } + - { role: prometheus_exporters } - { role: gitlab_runner } diff --git a/playbooks/gitlab.archlinux.org.yml b/playbooks/gitlab.archlinux.org.yml index 3f47c753..d5e7eeee 100644 --- a/playbooks/gitlab.archlinux.org.yml +++ b/playbooks/gitlab.archlinux.org.yml @@ -11,3 +11,4 @@ - { role: root_ssh } - { role: gitlab, gitlab_domain: "gitlab.archlinux.org" } - { role: borg_client, tags: ["borg"] } + - { role: prometheus_exporters } diff --git a/playbooks/monitoring.archlinux.org.yml b/playbooks/monitoring.archlinux.org.yml index ec5aa794..2fc94d7b 100644 --- a/playbooks/monitoring.archlinux.org.yml +++ b/playbooks/monitoring.archlinux.org.yml @@ -10,5 +10,6 @@ - { role: hardening } - { role: borg_client, tags: ["borg"], when: "'borg_clients' in group_names" } - { role: prometheus } + - { role: prometheus_exporters } - { role: certbot } - { role: nginx } diff --git a/playbooks/reproducible.archlinux.org.yml 
b/playbooks/reproducible.archlinux.org.yml index d194e102..ce800050 100644 --- a/playbooks/reproducible.archlinux.org.yml +++ b/playbooks/reproducible.archlinux.org.yml @@ -14,3 +14,4 @@ - { role: certbot } - { role: nginx } - { role: rebuilderd } + - { role: prometheus_exporters } diff --git a/roles/gitlab_runner/tasks/main.yml b/roles/gitlab_runner/tasks/main.yml index c5e18fab..f1a70b46 100644 --- a/roles/gitlab_runner/tasks/main.yml +++ b/roles/gitlab_runner/tasks/main.yml @@ -40,5 +40,12 @@ line: concurrent = 100 notify: restart gitlab-runner +- name: enable prometheus exporter + lineinfile: + path: /etc/gitlab-runner/config.toml + insertbefore: '^concurrent' + line: listen_address = ":{{ gitlab_runner_exporter_port }}" + notify: restart gitlab-runner + - name: enable and start gitlab runner service systemd: name=gitlab-runner state=started enabled=yes daemon_reload=yes diff --git a/roles/prometheus/defaults/main.yml b/roles/prometheus/defaults/main.yml index 3b75be8a..47660d86 100644 --- a/roles/prometheus/defaults/main.yml +++ b/roles/prometheus/defaults/main.yml @@ -1 +1,2 @@ monitoring_domain: monitoring.archlinux.org +gitlab_runner_exporter_port: '9252' diff --git a/roles/prometheus/templates/prometheus.yml.j2 b/roles/prometheus/templates/prometheus.yml.j2 index c868c754..1da101ed 100644 --- a/roles/prometheus/templates/prometheus.yml.j2 +++ b/roles/prometheus/templates/prometheus.yml.j2 @@ -13,3 +13,69 @@ alerting: - localhost:9093 scrape_configs: + - job_name: 'node_exporter' + static_configs: + {% for host in groups['node_exporters'] %} + + - targets: ['{{ host }}:{{ prometheus_exporter_port }}'] + labels: + instance: "{{ host }}" + + {% endfor %} + + - job_name: 'gitlab_runner_exporter' + static_configs: + {% for host in groups['gitlab_runners'] %} + + - targets: ['{{ host }}:{{ gitlab_runner_exporter_port }}'] + labels: + instance: "{{ host }}" + + {% endfor %} + + - job_name: 'keycloak' + scheme: https + metrics_path: "/auth/realms/master/metrics" + 
basic_auth: + username: "{{ vault_keycloak_nginx_user }}" + password: "{{ vault_keycloak_nginx_passwd }}" + static_configs: + - targets: ['accounts.archlinux.org:443'] + labels: + instance: "accounts.archlinux.org" + + - job_name: 'gitlab_exporter' + scheme: https + metrics_path: "/-/metrics" + params: + token: ["{{ vault_gitlab_prometheus_token }}"] + static_configs: + - targets: ['gitlab.archlinux.org:443'] + labels: + instance: "gitlab.archlinux.org" + + - job_name: 'mysqld_exporter' + static_configs: + + - targets: ['aur.archlinux.org:9104'] + labels: + instance: "aur.archlinux.org" + + - job_name: 'blackbox' + metrics_path: /probe + scrape_interval: 15s + params: + module: [http_prometheus] + static_configs: + - targets: + {% for target in blackbox_targets %} + - {{ target }} + {% endfor %} + + relabel_configs: + - source_labels: [__address__] + target_label: __param_target + - source_labels: [__param_target] + target_label: instance + - target_label: __address__ + replacement: 127.0.0.1:9115 diff --git a/roles/prometheus_exporters/defaults/main.yml b/roles/prometheus_exporters/defaults/main.yml new file mode 100644 index 00000000..b66b5c46 --- /dev/null +++ b/roles/prometheus_exporters/defaults/main.yml @@ -0,0 +1,10 @@ +--- + +prometheus_domain: monitoring.archlinux.org + +prometheus_textfile_dir: /var/lib/node_exporter + +gitlab_runner_exporter_port: '9252' + +prometheus_mysqld_user: mysqld_exporter +prometheus_mysqld_exporter_port: '9104' diff --git a/roles/prometheus_exporters/files/arch-textcollector.sh b/roles/prometheus_exporters/files/arch-textcollector.sh new file mode 100755 index 00000000..963a851a --- /dev/null +++ b/roles/prometheus_exporters/files/arch-textcollector.sh @@ -0,0 +1,31 @@ +#!/bin/bash + +set -o errexit +set -o nounset + +if (( $# != 1 )); then + echo "Missing textcollector directory argument" + exit 1 +fi + +HOSTNAME=$(hostname) +TEXTFILE_COLLECTOR_DIR=${1} +PROM_FILE=$TEXTFILE_COLLECTOR_DIR/pacman.prom + +TMP_FILE=$PROM_FILE.$$ +[ 
-e $TMP_FILE ] && rm -f $TMP_FILE + +trap "rm -f $TMP_FILE" EXIT + +updates=$(/usr/bin/checkupdates | wc -l) +secupdates=$(/usr/bin/arch-audit -u | wc -l) + +echo "# HELP pacman_updates_pending number of pending updates from pacman" >> $TMP_FILE +echo "# TYPE pacman_updates_pending gauge" >> $TMP_FILE +echo "pacman_updates_pending{host=\"${HOSTNAME}\"} $updates" >> $TMP_FILE + +echo "# HELP pacman_security_updates_pending number of pending security updates from arch-audit" >> $TMP_FILE +echo "# TYPE pacman_security_updates_pending gauge" >> $TMP_FILE +echo "pacman_security_updates_pending{host=\"${HOSTNAME}\"} $secupdates" >> $TMP_FILE + +mv -f $TMP_FILE $PROM_FILE diff --git a/roles/prometheus_exporters/files/borg-textcollector.sh b/roles/prometheus_exporters/files/borg-textcollector.sh new file mode 100755 index 00000000..14de6284 --- /dev/null +++ b/roles/prometheus_exporters/files/borg-textcollector.sh @@ -0,0 +1,46 @@ +#!/usr/bin/bash + +set -o errexit +set -o nounset +set -o pipefail + +if (( $# != 1 )); then + echo "Missing textcollector directory argument" + exit 1 +fi + +HOSTNAME=$(hostname) +TEXTFILE_COLLECTOR_DIR=${1} +PROM_FILE=$TEXTFILE_COLLECTOR_DIR/borg.prom + + +TMP_FILE=$PROM_FILE.$$ +[ -e $TMP_FILE ] && rm -f $TMP_FILE + +trap "rm -f $TMP_FILE" EXIT + +# Hetzner borg +if [[ -f /usr/local/bin/borg ]]; then + LAST_ARCHIVE=$(/usr/local/bin/borg list --last 1) + LAST_ARCHIVE_NAME=$(echo $LAST_ARCHIVE | awk '{print $1}') + LAST_ARCHIVE_DATE=$(echo $LAST_ARCHIVE | awk '{print $3" "$4}') + LAST_ARCHIVE_TIMESTAMP=$(date -d "$LAST_ARCHIVE_DATE" +"%s") + + echo "# HELP borg_hetzner_last_archive_timestamp timestamp of last backup in UTC" >> $TMP_FILE + echo "# TYPE borg_hetzner_last_archive_timestamp gauge" >> $TMP_FILE + echo "borg_hetzner_last_archive_timestamp{host=\"${HOSTNAME}\"} $LAST_ARCHIVE_TIMESTAMP" >> $TMP_FILE; +fi + +# rsync.net borg +if [[ -f /usr/local/bin/borg-offsite ]]; then + LAST_ARCHIVE=$(/usr/local/bin/borg-offsite list --last 1) + 
LAST_ARCHIVE_NAME=$(echo $LAST_ARCHIVE | awk '{print $1}') + LAST_ARCHIVE_DATE=$(echo $LAST_ARCHIVE | awk '{print $3" "$4}') + LAST_ARCHIVE_TIMESTAMP=$(date -d "$LAST_ARCHIVE_DATE" +"%s") + + echo "# HELP borg_offsite_last_archive_timestamp timestamp of last backup in UTC" >> $TMP_FILE + echo "# TYPE borg_offsite_last_archive_timestamp gauge" >> $TMP_FILE + echo "borg_offsite_last_archive_timestamp{host=\"${HOSTNAME}\"} $LAST_ARCHIVE_TIMESTAMP" >> $TMP_FILE; +fi + +mv -f $TMP_FILE $PROM_FILE diff --git a/roles/prometheus_exporters/tasks/main.yml b/roles/prometheus_exporters/tasks/main.yml new file mode 100644 index 00000000..60ea1ee3 --- /dev/null +++ b/roles/prometheus_exporters/tasks/main.yml @@ -0,0 +1,88 @@ +--- + +- name: install prometheus-node-exporter + pacman: name=prometheus-node-exporter,arch-audit,pacman-contrib state=present + +- name: install prometheus-memcached-exporter + pacman: name=prometheus-memcached-exporter state=present + when: "'memcached' in group_names" + +- name: install prometheus-mysqld-exporter + pacman: name=prometheus-mysqld-exporter state=present + when: "'mysql_servers' in group_names" + +- name: create prometheus mysqld database user + mysql_user: + name: '{{ prometheus_mysqld_user }}' + password: '{{ vault_prometheus_mysql_password }}' + priv: "*.*:PROCESS,REPLICATION CLIENT" + state: present + when: "'mysql_servers' in group_names" + +# TODO: with ansible 2.10 this can be set by mysql_user https://github.com/ansible/ansible/issues/26581 +- name: set max_user_connections for prometheus mysqld user + command: mysql -u root -e "ALTER USER '{{ prometheus_mysqld_user }}'@'localhost' WITH MAX_USER_CONNECTIONS 3;" + when: "'mysql_servers' in group_names" + +- name: copy prometheus mysqld exporter configuration + template: src=prometheus-mysqld-exporter.j2 dest=/etc/conf.d/prometheus-mysqld-exporter owner=root group=root mode=600 + when: "'mysql_servers' in group_names" + +- name: enable prometheus-mysqld-exporter service + systemd: 
name=prometheus-mysqld-exporter enabled=yes daemon_reload=yes state=started + when: "'mysql_servers' in group_names" + +- name: install node exporter configuration + template: src=prometheus-node-exporter.env.j2 dest=/etc/conf.d/prometheus-node-exporter owner=root group=root mode=600 + +- name: create textcollector directory + file: path="{{ prometheus_textfile_dir }}" state=directory owner=node_exporter group=node_exporter mode=700 + +- name: install node exporter textcollector scripts + copy: src={{ item }} dest=/usr/local/bin/{{ item }} owner=root group=root mode=0755 + with_items: + - arch-textcollector.sh + - borg-textcollector.sh + +- name: install arch textcollector service + template: src=prometheus-arch-textcollector.service.j2 dest=/etc/systemd/system/prometheus-arch-textcollector.service owner=root group=root mode=600 + +- name: install arch textcollector timer + template: src=prometheus-arch-textcollector.timer.j2 dest=/etc/systemd/system/prometheus-arch-textcollector.timer owner=root group=root mode=600 + +- name: enable and start prometheus arch textcollector timer + systemd: name=prometheus-arch-textcollector.timer enabled=yes daemon_reload=yes state=started + +- name: install borg textcollector service + template: src=prometheus-borg-textcollector.service.j2 dest=/etc/systemd/system/prometheus-borg-textcollector.service owner=root group=root mode=600 + when: "'borg_clients' in group_names" + +- name: install borg textcollector timer + template: src=prometheus-borg-textcollector.timer.j2 dest=/etc/systemd/system/prometheus-borg-textcollector.timer owner=root group=root mode=600 + when: "'borg_clients' in group_names" + +- name: enable and start prometheus borg textcollector timer + systemd: name=prometheus-borg-textcollector.timer enabled=yes daemon_reload=yes state=started + when: "'borg_clients' in group_names" + +- name: enable prometheus-node-exporter service + systemd: name=prometheus-node-exporter enabled=yes daemon_reload=yes state=started + 
+- name: enable prometheus-memcached-exporter service + systemd: name=prometheus-memcached-exporter enabled=yes daemon_reload=yes state=started + when: "'memcached' in group_names" + +- name: open prometheus-node-exporter ipv4 port for monitoring.archlinux.org + firewalld: state=enabled permanent=true immediate=yes + rich_rule="rule family=ipv4 source address={{ hostvars['monitoring.archlinux.org']['ipv4_address'] }} port protocol=tcp port={{ prometheus_exporter_port }} accept" + when: "'prometheus' not in group_names" + +- name: open gitlab exporter ipv4 port for monitoring.archlinux.org + firewalld: state=enabled permanent=true immediate=yes + rich_rule="rule family=ipv4 source address={{ hostvars['monitoring.archlinux.org']['ipv4_address'] }} port protocol=tcp port={{ gitlab_runner_exporter_port }} accept" + when: "'gitlab_runners' in group_names" + +- name: open prometheus mysqld exporter ipv4 port for monitoring.archlinux.org + firewalld: state=enabled permanent=true immediate=yes + rich_rule="rule family=ipv4 source address={{ hostvars['monitoring.archlinux.org']['ipv4_address'] }} port protocol=tcp port={{ prometheus_mysqld_exporter_port }} accept" + when: "'mysql_servers' in group_names" diff --git a/roles/prometheus_exporters/templates/prometheus-arch-textcollector.service.j2 b/roles/prometheus_exporters/templates/prometheus-arch-textcollector.service.j2 new file mode 100644 index 00000000..5edb6dc8 --- /dev/null +++ b/roles/prometheus_exporters/templates/prometheus-arch-textcollector.service.j2 @@ -0,0 +1,37 @@ +[Unit] +Description=Prometheus Arch Exporter +After=network.target + +[Service] +Type=oneshot +User=node_exporter +ExecStart=/usr/local/bin/arch-textcollector.sh {{ prometheus_textfile_dir }} + +NoNewPrivileges=true +LockPersonality=true +CapabilityBoundingSet= +UMask=077 + +PrivateDevices=true +PrivateTmp=true +ProtectSystem=strict +ProtectHome=true +ReadWritePaths={{ prometheus_textfile_dir }} + +MemoryDenyWriteExecute=true +RemoveIPC=true 
+RestrictRealtime=true +RestrictNamespaces=true +RestrictSUIDSGID=true + +RestrictAddressFamilies=~AF_NETLINK +RestrictAddressFamilies=~AF_PACKET + +ProtectHostname=true +ProtectControlGroups=true +ProtectKernelLogs=true +ProtectKernelTunables=true +ProtectKernelModules=true +ProtectClock=true + +SystemCallArchitectures=native diff --git a/roles/prometheus_exporters/templates/prometheus-arch-textcollector.timer.j2 b/roles/prometheus_exporters/templates/prometheus-arch-textcollector.timer.j2 new file mode 100644 index 00000000..6afeacf4 --- /dev/null +++ b/roles/prometheus_exporters/templates/prometheus-arch-textcollector.timer.j2 @@ -0,0 +1,10 @@ +[Unit] +Description=Prometheus Arch Exporter TextCollector Timer + +[Timer] +OnUnitActiveSec=60m +OnBootSec=15min +RandomizedDelaySec=1min + +[Install] +WantedBy=timers.target diff --git a/roles/prometheus_exporters/templates/prometheus-borg-textcollector.service.j2 b/roles/prometheus_exporters/templates/prometheus-borg-textcollector.service.j2 new file mode 100644 index 00000000..593a774e --- /dev/null +++ b/roles/prometheus_exporters/templates/prometheus-borg-textcollector.service.j2 @@ -0,0 +1,35 @@ +[Unit] +Description=Prometheus Borg Exporter TextCollector +After=network.target +ConditionPathExistsGlob=!/root/.cache/borg/*/lock.roster + +[Service] +Type=oneshot +ExecStart=/usr/local/bin/borg-textcollector.sh {{ prometheus_textfile_dir }} + +NoNewPrivileges=true +LockPersonality=true + +PrivateDevices=true +PrivateTmp=true +ProtectSystem=strict +ProtectHome=read-only +ReadWritePaths={{ prometheus_textfile_dir }} /root/.cache/borg + +MemoryDenyWriteExecute=true +RemoveIPC=true +RestrictRealtime=true +RestrictNamespaces=true +RestrictSUIDSGID=true + +RestrictAddressFamilies=~AF_PACKET +RestrictAddressFamilies=~AF_NETLINK + +ProtectHostname=true +ProtectControlGroups=true +ProtectKernelLogs=true +ProtectKernelTunables=true +ProtectKernelModules=true +ProtectClock=true + +SystemCallArchitectures=native diff --git 
a/roles/prometheus_exporters/templates/prometheus-borg-textcollector.timer.j2 b/roles/prometheus_exporters/templates/prometheus-borg-textcollector.timer.j2 new file mode 100644 index 00000000..ca8a197e --- /dev/null +++ b/roles/prometheus_exporters/templates/prometheus-borg-textcollector.timer.j2 @@ -0,0 +1,10 @@ +[Unit] +Description=Prometheus Borg Exporter TextCollector Timer + +[Timer] +OnUnitActiveSec=1h +OnBootSec=15min +RandomizedDelaySec=1min + +[Install] +WantedBy=timers.target diff --git a/roles/prometheus_exporters/templates/prometheus-mysqld-exporter.j2 b/roles/prometheus_exporters/templates/prometheus-mysqld-exporter.j2 new file mode 100644 index 00000000..c74feee7 --- /dev/null +++ b/roles/prometheus_exporters/templates/prometheus-mysqld-exporter.j2 @@ -0,0 +1,3 @@ +DATA_SOURCE_NAME="{{ prometheus_mysqld_user }}:{{ vault_prometheus_mysql_password }}@(localhost:3306)/" +# TODO: review these settings +MYSQLD_EXPORTER_ARGS="--collect.binlog_size --collect.info_schema.processlist --collect.info_schema.userstats" diff --git a/roles/prometheus_exporters/templates/prometheus-node-exporter.env.j2 b/roles/prometheus_exporters/templates/prometheus-node-exporter.env.j2 new file mode 100644 index 00000000..88dd42d6 --- /dev/null +++ b/roles/prometheus_exporters/templates/prometheus-node-exporter.env.j2 @@ -0,0 +1 @@ +NODE_EXPORTER_ARGS="--collector.systemd --collector.textfile.directory={{ prometheus_textfile_dir }}" -- GitLab From 3fd36ddb177c42476ee3c194a553f723954507d8 Mon Sep 17 00:00:00 2001 From: Jelle van der Waa Date: Sun, 9 Aug 2020 19:26:10 +0200 Subject: [PATCH 2/3] Add blackbox exporter for https status checking Run the blackbox exporter on monitoring.archlinux.org to monitor the HTTP status of the public services we provide on other machines. It also adds an alert for certificates that are about to expire within 3 days. 
--- roles/prometheus/defaults/main.yml | 11 ++++++++ roles/prometheus/files/node.rules.yml | 28 +++++++++++++++++++ roles/prometheus_exporters/files/blackbox.yml | 5 ++++ roles/prometheus_exporters/tasks/main.yml | 12 ++++++++ 4 files changed, 56 insertions(+) create mode 100644 roles/prometheus_exporters/files/blackbox.yml diff --git a/roles/prometheus/defaults/main.yml b/roles/prometheus/defaults/main.yml index 47660d86..7193311a 100644 --- a/roles/prometheus/defaults/main.yml +++ b/roles/prometheus/defaults/main.yml @@ -1,2 +1,13 @@ monitoring_domain: monitoring.archlinux.org gitlab_runner_exporter_port: '9252' + +blackbox_targets: + - https://archlinux.org + - https://wiki.archlinux.org + - https://gitlab.archlinux.org + - https://bbs.archlinux.org + - https://bugs.archlinux.org + - https://aur.archlinux.org + - https://archive.archlinux.org + - https://mirror.pkgbuild.com + - https://pkgbuild.com diff --git a/roles/prometheus/files/node.rules.yml b/roles/prometheus/files/node.rules.yml index d1eba8ba..4fb0d23f 100644 --- a/roles/prometheus/files/node.rules.yml +++ b/roles/prometheus/files/node.rules.yml @@ -366,3 +366,31 @@ groups: annotations: summary: Workhorse has high error rates description: Workhorse route {{ $labels.route }} method {{ $labels.method }} has more than 10% errors ({{ $value | printf "%.1f" }}%) for the last 60 minutes. 
+ +- name: blackbox + interval: 15s + rules: + - alert: BlackboxProbeFailed + expr: probe_success == 0 + for: 5m + labels: + severity: critical + annotations: + summary: "Blackbox probe failed (instance {{ $labels.instance }})" + description: "Probe failed\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" + - alert: BlackboxProbeHttpFailure + expr: probe_http_status_code <= 199 OR probe_http_status_code >= 400 + for: 5m + labels: + severity: critical + annotations: + summary: "Blackbox probe HTTP failure (instance {{ $labels.instance }})" + description: "HTTP status code is not 200-399\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" + - alert: BlackboxSslCertificateWillExpireSoon + expr: probe_ssl_earliest_cert_expiry - time() < 86400 * 3 + for: 5m + labels: + severity: critical + annotations: + summary: "Blackbox SSL certificate will expire soon (instance {{ $labels.instance }})" + description: "SSL certificate expires in 3 days\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" diff --git a/roles/prometheus_exporters/files/blackbox.yml b/roles/prometheus_exporters/files/blackbox.yml new file mode 100644 index 00000000..a537090b --- /dev/null +++ b/roles/prometheus_exporters/files/blackbox.yml @@ -0,0 +1,5 @@ +modules: + http_prometheus: + prober: http + timeout: 5s + http: diff --git a/roles/prometheus_exporters/tasks/main.yml b/roles/prometheus_exporters/tasks/main.yml index 60ea1ee3..65772aac 100644 --- a/roles/prometheus_exporters/tasks/main.yml +++ b/roles/prometheus_exporters/tasks/main.yml @@ -3,6 +3,10 @@ - name: install prometheus-node-exporter pacman: name=prometheus-node-exporter,arch-audit,pacman-contrib state=present +- name: install prometheus-blackbox-exporter + pacman: name=prometheus-blackbox-exporter state=present + when: "'prometheus' in group_names" + - name: install prometheus-memcached-exporter pacman: name=prometheus-memcached-exporter state=present when: "'memcached' in group_names" @@ -65,9 +69,17 @@ systemd: 
name=prometheus-borg-textcollector.timer enabled=yes daemon_reload=yes state=started when: "'borg_clients' in group_names" +- name: install blackbox exporter configuration + copy: src=blackbox.yml dest=/etc/prometheus/blackbox.yml owner=root group=root mode=0755 + when: "'prometheus' in group_names" + - name: enable prometheus-node-exporter service systemd: name=prometheus-node-exporter enabled=yes daemon_reload=yes state=started +- name: enable prometheus-blackbox-exporter service + systemd: name=prometheus-blackbox-exporter enabled=yes daemon_reload=yes state=started + when: "'prometheus' in group_names" + - name: enable prometheus-memcached-exporter service systemd: name=prometheus-memcached-exporter enabled=yes daemon_reload=yes state=started when: "'memcached' in group_names" -- GitLab From cd4b284446d2f19b0dcf866f7a911fa0eb122cea Mon Sep 17 00:00:00 2001 From: Jelle van der Waa Date: Sat, 15 Aug 2020 14:38:20 +0200 Subject: [PATCH 3/3] Add rebuilderd build queue length textcollector Record the rebuilderd queue length in prometheus so we can generate an alert for when the queue length keeps rising, as this could be an indication that the rebuilders have builds which are stuck. 
--- hosts | 3 ++ roles/prometheus/files/node.rules.yml | 22 +++++++++++ .../prometheus/templates/alertmanager.yml.j2 | 11 +++++- .../files/rebuilderd-textcollector.sh | 31 ++++++++++++++++ roles/prometheus_exporters/tasks/main.yml | 21 +++++++++++ ...etheus-rebuilderd-textcollector.service.j2 | 37 +++++++++++++++++++ ...ometheus-rebuilderd-textcollector.timer.j2 | 10 +++++ 7 files changed, 133 insertions(+), 2 deletions(-) create mode 100644 roles/prometheus_exporters/files/rebuilderd-textcollector.sh create mode 100644 roles/prometheus_exporters/templates/prometheus-rebuilderd-textcollector.service.j2 create mode 100644 roles/prometheus_exporters/templates/prometheus-rebuilderd-textcollector.timer.j2 diff --git a/hosts b/hosts index bc91b96b..bf25cecf 100644 --- a/hosts +++ b/hosts @@ -93,6 +93,9 @@ secure-runner2.archlinux.org repro1.pkgbuild.com repro3.pkgbuild.com +[rebuilderd] +reproducible.archlinux.org + [memcached] apollo.archlinux.org aur.archlinux.org diff --git a/roles/prometheus/files/node.rules.yml b/roles/prometheus/files/node.rules.yml index 4fb0d23f..10989074 100644 --- a/roles/prometheus/files/node.rules.yml +++ b/roles/prometheus/files/node.rules.yml @@ -394,3 +394,25 @@ groups: annotations: summary: "Blackbox SSL certificate will expire soon (instance {{ $labels.instance }})" description: "SSL certificate expires in 3 days\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" + +- name: rebuilderd + interval: 15m + rules: + - alert: RebuilderdQueueNotEmpty + expr: rebuilderd_queue_length > 2000 + for: 24h + labels: + severity: warning + service: rebuilderd + annotations: + summary: "Rebuilderd queue length is not empty (instance {{ $labels.instance }})" + description: "Rebuilderd's queue length is now: {{ $value }}" + - alert: RebuilderdWorkersOffline + expr: rebuilderd_workers < 4 + for: 5m + labels: + severity: warning + service: rebuilderd + annotations: + summary: "Rebuilderd workers offline (instance {{ $labels.instance }})" + description: "Not all rebuilder-workers 
are online, currently {{ $value }} workers are online" diff --git a/roles/prometheus/templates/alertmanager.yml.j2 b/roles/prometheus/templates/alertmanager.yml.j2 index 73083d67..de998095 100644 --- a/roles/prometheus/templates/alertmanager.yml.j2 +++ b/roles/prometheus/templates/alertmanager.yml.j2 @@ -11,9 +11,16 @@ route: group_wait: 30s group_interval: 5m repeat_interval: 24h - receiver: 'devops' + receiver: 'default-receiver' + routes: + - receiver: 'rebuilderd' + match_re: + service: rebuilderd receivers: - - name: 'devops' + - name: 'default-receiver' email_configs: - to: 'alerts@archlinux.org' + - name: 'rebuilderd' + email_configs: + - to: 'jelle@archlinux.org' diff --git a/roles/prometheus_exporters/files/rebuilderd-textcollector.sh b/roles/prometheus_exporters/files/rebuilderd-textcollector.sh new file mode 100644 index 00000000..635d8207 --- /dev/null +++ b/roles/prometheus_exporters/files/rebuilderd-textcollector.sh @@ -0,0 +1,31 @@ +#!/bin/bash + +set -o errexit +set -o nounset + +if (( $# != 1 )); then + echo "Missing textcollector directory argument" + exit 1 +fi + +HOSTNAME=$(hostname) +TEXTFILE_COLLECTOR_DIR=${1} +PROM_FILE=$TEXTFILE_COLLECTOR_DIR/rebuilderd.prom + +TMP_FILE=$PROM_FILE.$$ +[ -e $TMP_FILE ] && rm -f $TMP_FILE + +trap "rm -f $TMP_FILE" EXIT + +queuelength=$(rebuildctl queue ls --json | jq '.queue | length') +workers=$(rebuildctl status | wc -l) + +echo "# HELP rebuilderd_queue_length number of packages in rebuilderd queue" >> $TMP_FILE +echo "# TYPE rebuilderd_queue_length gauge" >> $TMP_FILE +echo "rebuilderd_queue_length{host=\"${HOSTNAME}\"} $queuelength" >> $TMP_FILE + +echo "# HELP rebuilderd_workers number of rebuilderd-workers available" >> $TMP_FILE +echo "# TYPE rebuilderd_workers gauge" >> $TMP_FILE +echo "rebuilderd_workers{host=\"${HOSTNAME}\"} $workers" >> $TMP_FILE + +mv -f $TMP_FILE $PROM_FILE diff --git a/roles/prometheus_exporters/tasks/main.yml b/roles/prometheus_exporters/tasks/main.yml index 65772aac..51c305d1 
100644 --- a/roles/prometheus_exporters/tasks/main.yml +++ b/roles/prometheus_exporters/tasks/main.yml @@ -11,6 +11,14 @@ pacman: name=prometheus-memcached-exporter state=present when: "'memcached' in group_names" +- name: install jq for rebuilderd-textcollector + pacman: name=jq state=present + when: "'rebuilderd' in group_names" + +- name: add node_exporter to rebuilderd group + user: name=node_exporter groups=rebuilderd append=yes + when: "'rebuilderd' in group_names" + - name: install prometheus-mysqld-exporter pacman: name=prometheus-mysqld-exporter state=present when: "'mysql_servers' in group_names" @@ -47,6 +55,7 @@ with_items: - arch-textcollector.sh - borg-textcollector.sh + - rebuilderd-textcollector.sh - name: install arch textcollector service template: src=prometheus-arch-textcollector.service.j2 dest=/etc/systemd/system/prometheus-arch-textcollector.service owner=root group=root mode=600 @@ -73,6 +82,18 @@ copy: src=blackbox.yml dest=/etc/prometheus/blackbox.yml owner=root group=root mode=0755 when: "'prometheus' in group_names" +- name: install rebuilderd textcollector service + template: src=prometheus-rebuilderd-textcollector.service.j2 dest=/etc/systemd/system/prometheus-rebuilderd-textcollector.service owner=root group=root mode=600 + when: "'rebuilderd' in group_names" + +- name: install rebuilderd textcollector timer + template: src=prometheus-rebuilderd-textcollector.timer.j2 dest=/etc/systemd/system/prometheus-rebuilderd-textcollector.timer owner=root group=root mode=600 + when: "'rebuilderd' in group_names" + +- name: enable and start prometheus rebuilderd textcollector timer + systemd: name=prometheus-rebuilderd-textcollector.timer enabled=yes daemon_reload=yes state=started + when: "'rebuilderd' in group_names" + - name: enable prometheus-node-exporter service systemd: name=prometheus-node-exporter enabled=yes daemon_reload=yes state=started diff --git a/roles/prometheus_exporters/templates/prometheus-rebuilderd-textcollector.service.j2 
b/roles/prometheus_exporters/templates/prometheus-rebuilderd-textcollector.service.j2 new file mode 100644 index 00000000..7b219b2a --- /dev/null +++ b/roles/prometheus_exporters/templates/prometheus-rebuilderd-textcollector.service.j2 @@ -0,0 +1,37 @@ +[Unit] +Description=Prometheus Rebuilderd Exporter +After=network.target + +[Service] +Type=oneshot +User=node_exporter +ExecStart=/usr/local/bin/rebuilderd-textcollector.sh {{ prometheus_textfile_dir }} + +NoNewPrivileges=true +LockPersonality=true +CapabilityBoundingSet= +UMask=077 + +PrivateDevices=true +PrivateTmp=true +ProtectSystem=strict +ProtectHome=true +ReadWritePaths={{ prometheus_textfile_dir }} + +MemoryDenyWriteExecute=true +RemoveIPC=true +RestrictRealtime=true +RestrictNamespaces=true +RestrictSUIDSGID=true + +RestrictAddressFamilies=~AF_NETLINK +RestrictAddressFamilies=~AF_PACKET + +ProtectHostname=true +ProtectControlGroups=true +ProtectKernelLogs=true +ProtectKernelTunables=true +ProtectKernelModules=true +ProtectClock=true + +SystemCallArchitectures=native diff --git a/roles/prometheus_exporters/templates/prometheus-rebuilderd-textcollector.timer.j2 b/roles/prometheus_exporters/templates/prometheus-rebuilderd-textcollector.timer.j2 new file mode 100644 index 00000000..5b3115f5 --- /dev/null +++ b/roles/prometheus_exporters/templates/prometheus-rebuilderd-textcollector.timer.j2 @@ -0,0 +1,10 @@ +[Unit] +Description=Prometheus Rebuilderd Exporter TextCollector Timer + +[Timer] +OnUnitActiveSec=60m +OnBootSec=15min +RandomizedDelaySec=1min + +[Install] +WantedBy=timers.target -- GitLab