From 201b1c243c5c197c82d96cbe03f30f9a9b7ffee3 Mon Sep 17 00:00:00 2001 From: Jelle van der Waa <jelle@archlinux.org> Date: Tue, 4 Jan 2022 17:47:24 +0100 Subject: [PATCH] Add Grafana dashboard / alert for SMART errors --- roles/grafana/files/dashboards/smart.json | 324 ++++++++++++++++++++++ roles/prometheus/files/node.rules.yml | 22 ++ 2 files changed, 346 insertions(+) create mode 100644 roles/grafana/files/dashboards/smart.json diff --git a/roles/grafana/files/dashboards/smart.json b/roles/grafana/files/dashboards/smart.json new file mode 100644 index 000000000..588943be8 --- /dev/null +++ b/roles/grafana/files/dashboards/smart.json @@ -0,0 +1,324 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": "-- Grafana --", + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "target": { + "limit": 100, + "matchAny": false, + "tags": [], + "type": "dashboard" + }, + "type": "dashboard" + } + ] + }, + "description": "", + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 0, + "iteration": 1641326230915, + "links": [], + "liveNow": false, + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "description": "", + "fieldConfig": { + "defaults": { + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 1 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 3, + "x": 0, + "y": 0 + }, + "id": 4, + "options": { + "colorMode": "background", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "8.3.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "exemplar": true, + "expr": 
"sum(smart_device_smart_healthy{instance=~\"$instance\"})-sum(smart_device_smart_healthy{instance=~\"$instance\"})", + "interval": "", + "legendFormat": "", + "refId": "A" + } + ], + "title": "Unhealthy disks", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "celsius" + }, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 21, + "x": 3, + "y": 0 + }, + "id": 2, + "options": { + "legend": { + "calcs": [ + "min", + "max" + ], + "displayMode": "table", + "placement": "right" + }, + "tooltip": { + "mode": "single" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "exemplar": true, + "expr": "avg(smart_temperature_celsius{ instance=~\"$instance\", disk=~\"$disk\" }) by (instance, disk)", + "interval": "", + "legendFormat": "{{ disk }}", + "refId": "A" + } + ], + "title": "Temperature", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "description": "", + "fieldConfig": { + "defaults": { + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 1 + } + ] + 
}, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 3, + "x": 0, + "y": 5 + }, + "id": 5, + "options": { + "colorMode": "background", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "8.3.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "exemplar": true, + "expr": "count(smart_device_self_test{instance=~\"$instance\"})-sum(smart_device_self_test{instance=~\"$instance\"})", + "interval": "", + "legendFormat": "", + "refId": "A" + } + ], + "title": "Self test", + "type": "stat" + } + ], + "schemaVersion": 33, + "style": "dark", + "tags": [], + "templating": { + "list": [ + { + "current": { + "selected": false, + "text": "build.archlinux.org", + "value": "build.archlinux.org" + }, + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "definition": "label_values(smart_device_info, instance)", + "hide": 0, + "includeAll": false, + "multi": false, + "name": "instance", + "options": [], + "query": { + "query": "label_values(smart_device_info, instance)", + "refId": "StandardVariableQuery" + }, + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 0, + "type": "query" + }, + { + "current": { + "selected": true, + "text": [ + "All" + ], + "value": [ + "$__all" + ] + }, + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "definition": "label_values(smart_device_info, disk)", + "hide": 0, + "includeAll": true, + "multi": true, + "name": "disk", + "options": [], + "query": { + "query": "label_values(smart_device_info, disk)", + "refId": "StandardVariableQuery" + }, + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 0, + "type": "query" + } + ] + }, + "time": { + "from": "now-6h", + "to": "now" + }, + "timepicker": {}, + "timezone": "", + "title": "S.M.A.R.T. 
disk monitoring", + "uid": "1BFMELA7z", + "version": 1, + "weekStart": "" +} \ No newline at end of file diff --git a/roles/prometheus/files/node.rules.yml b/roles/prometheus/files/node.rules.yml index 5bdbb64df..a60229bfe 100644 --- a/roles/prometheus/files/node.rules.yml +++ b/roles/prometheus/files/node.rules.yml @@ -284,6 +284,28 @@ groups: description: 'host {{ $labels.instance }} btrfs corruption errors' summary: '{{ $labels.instance }} has {{ $value }} btrfs_corruption_errs' + - name: smart + interval: 1m + rules: + - alert: smart_device_smart_healthy + expr: smart_device_smart_healthy == 0 + for: 2m + labels: + severity: critical + service: smart + annotations: + description: 'host {{ $labels.instance }} has an unhealthy disk {{ $labels.disk }}' + summary: '{{ $labels.instance }} has an unhealthy disk {{ $labels.disk }}' + - alert: smart_device_self_test + expr: smart_device_self_test == 0 + for: 2m + labels: + severity: critical + service: smart + annotations: + description: 'host {{ $labels.instance }} has a disk {{ $labels.disk }} not passing its self test' + summary: '{{ $labels.instance }} has a disk {{ $labels.disk }} not passing its self test' + - name: borg interval: 60s rules: -- GitLab