diff options
author | Azat Bahawi <azat@bahawi.net> | 2023-04-12 04:01:46 +0300 |
---|---|---|
committer | Azat Bahawi <azat@bahawi.net> | 2023-04-12 04:01:46 +0300 |
commit | d6368c86bc949371e904eed3d0a6583ebd53b055 (patch) | |
tree | 042db513412ba7f1577b1ac690d4e0e0fac22cbf /modules/nixos/monitoring | |
parent | 2023-04-07 (diff) |
2023-04-12
Diffstat (limited to 'modules/nixos/monitoring')
-rw-r--r-- | modules/nixos/monitoring/default.nix | 195 | ||||
-rw-r--r-- | modules/nixos/monitoring/rules/nginx.yaml | 40 | ||||
-rw-r--r-- | modules/nixos/monitoring/rules/node.yaml | 304 | ||||
-rw-r--r-- | modules/nixos/monitoring/rules/postgres.yaml | 154 | ||||
-rw-r--r-- | modules/nixos/monitoring/rules/redis.yaml | 86 |
5 files changed, 359 insertions, 420 deletions
diff --git a/modules/nixos/monitoring/default.nix b/modules/nixos/monitoring/default.nix index a492a47..37e34d9 100644 --- a/modules/nixos/monitoring/default.nix +++ b/modules/nixos/monitoring/default.nix @@ -124,146 +124,77 @@ in { prometheus = { # It would be nice if these could be generated dynamically. That would # require a complete rework of how configurations are defined, though. - scrapeConfigs = let - mkTargets = hosts: port: map (host: "${host.hostname}:${toString port}") hosts; - in - with my.configurations; - with config.services.prometheus.exporters; [ - { - job_name = "promtail"; - static_configs = [ - { - targets = - mkTargets - [ - manwe - varda - yavanna - ] - config.nixfiles.modules.promtail.port; - } - ]; - } - { - job_name = "ntfy"; - static_configs = [ - { - targets = - mkTargets - [ - manwe - ] - config.nixfiles.modules.ntfy.prometheus.port; - } - ]; - } - { - job_name = "soju"; - static_configs = [ - { - targets = [ - "127.0.0.1:${toString config.nixfiles.modules.soju.prometheus.port}" - ]; - } - ]; - } - { - job_name = "endlessh-go"; - static_configs = [ - { - targets = - mkTargets - [ - manwe - varda - yavanna - ] - config.services.endlessh-go.prometheus.port; - } - ]; - } - { - job_name = "nginx"; - static_configs = [ - { - targets = - mkTargets - [ - manwe - yavanna - ] - nginx.port; - } - ]; - } - { - job_name = "node"; - static_configs = [ - { - targets = - mkTargets - [ - manwe - varda - yavanna - ] - node.port; - } - ]; - } - { - job_name = "postgres"; + scrapeConfigs = with my.configurations; + mapAttrsToList + ( + name: value: { + job_name = name; static_configs = [ { - targets = - mkTargets - [ - manwe - ] - postgres.port; + targets = with value; + map (host: + concatStringsSep ":" [ + ( + if isAttrs host + then host.hostname + else host + ) + (toString port) + ]) + hosts; } ]; - } - { - job_name = "redis"; - static_configs = [ + relabel_configs = [ { - targets = - mkTargets - [ - manwe - ] - redis.port; + source_labels = ["__address__"]; + regex = "([^:]+):\\d+"; + target_label = "instance"; } ]; } - { - job_name = "unbound"; - static_configs = [ - { - targets = - mkTargets - [ - manwe - ] - unbound.port; - } - ]; - } - { - job_name = "wireguard"; - static_configs = [ - { - targets = - mkTargets - [ - manwe - ] - wireguard.port; - } - ]; - } - ]; + ) + { + promtail = { + hosts = [manwe varda yavanna]; + inherit (config.nixfiles.modules.promtail) port; + }; + ntfy = { + hosts = [manwe]; + inherit (config.nixfiles.modules.ntfy.prometheus) port; + }; + soju = { + hosts = ["127.0.0.1"]; + inherit (config.nixfiles.modules.soju.prometheus) port; + }; + endlessh-go = { + hosts = [manwe varda yavanna]; + inherit (config.services.endlessh-go.prometheus) port; + }; + nginx = { + hosts = [manwe yavanna]; + inherit (config.services.prometheus.exporters.nginx) port; + }; + node = { + hosts = [manwe varda yavanna]; + inherit (config.services.prometheus.exporters.node) port; + }; + postgres = { + hosts = [manwe]; + inherit (config.services.prometheus.exporters.postgres) port; + }; + redis = { + hosts = [manwe]; + inherit (config.services.prometheus.exporters.redis) port; + }; + unbound = { + hosts = [manwe]; + inherit (config.services.prometheus.exporters.unbound) port; + }; + wireguard = { + hosts = [manwe]; + inherit (config.services.prometheus.exporters.wireguard) port; + }; + }; ruleFiles = [ ./rules/nginx.yaml diff --git a/modules/nixos/monitoring/rules/nginx.yaml b/modules/nixos/monitoring/rules/nginx.yaml index 59229a8..f00d372 100644 --- a/modules/nixos/monitoring/rules/nginx.yaml +++ b/modules/nixos/monitoring/rules/nginx.yaml @@ -6,47 +6,55 @@ groups: - alert: NginxHighHttp4xxErrorRate expr: >- sum(rate(nginx_http_requests_total{status=~"^4.."}[1m])) - / sum(rate(nginx_http_requests_total[1m])) * 100 + / + sum(rate(nginx_http_requests_total[1m])) * 100 > 5 for: 1m labels: severity: critical annotations: - summary: NGINX high HTTP 4xx error rate (instance {{ $labels.instance }}) + summary: NGINX high HTTP 4xx error rate at {{ $labels.instance }}. description: |- - Too many HTTP requests with status 4xx (> 5%). + Too many HTTP requests with a 4xx status code. - VALUE = {{ $value }} - LABELS = {{ $labels }} + VALUE = {{ $value }} + LABELS = {{ $labels }} - alert: NginxHighHttp5xxErrorRate expr: >- sum(rate(nginx_http_requests_total{status=~"^5.."}[1m])) - / sum(rate(nginx_http_requests_total[1m])) * 100 + / + sum(rate(nginx_http_requests_total[1m])) * 100 > 5 for: 1m labels: severity: critical annotations: - summary: NGINX high HTTP 5xx error rate (instance {{ $labels.instance }}) + summary: NGINX high HTTP 5xx error rate at {{ $labels.instance }}. description: |- - Too many HTTP requests with status 5xx (> 5%). + Too many HTTP requests with a 5xx status code. - VALUE = {{ $value }} - LABELS = {{ $labels }} + VALUE = {{ $value }} + LABELS = {{ $labels }} - alert: NginxLatencyHigh expr: >- - histogram_quantile(0.99, - sum(rate(nginx_http_request_duration_seconds_bucket[2m])) by (host, node)) + histogram_quantile( + 0.99, + sum( + rate( + nginx_http_request_duration_seconds_bucket[2m] + ) + ) by (host, node) + ) > 3 for: 2m labels: severity: warning annotations: - summary: NGINX high latency (instance {{ $labels.instance }}) + summary: NGINX high latency at {{ $labels.instance }}. description: |- - NGINX 99% latency is higher than 3 seconds. + NGINX 99% of latency spikes is higher than 3 seconds. - VALUE = {{ $value }} - LABELS = {{ $labels }} + VALUE = {{ $value }} + LABELS = {{ $labels }} diff --git a/modules/nixos/monitoring/rules/node.yaml b/modules/nixos/monitoring/rules/node.yaml index 81d7810..98217b3 100644 --- a/modules/nixos/monitoring/rules/node.yaml +++ b/modules/nixos/monitoring/rules/node.yaml @@ -13,12 +13,12 @@ groups: labels: severity: warning annotations: - summary: Host out of memory (instance {{ $labels.instance }}) + summary: Host out of memory at {{ $labels.instance }}. description: |- - Node memory is filling up (< 10% left). + Node memory is filling up. - VALUE = {{ $value }} - LABELS = {{ $labels }} + VALUE = {{ $value }} + LABELS = {{ $labels }} - alert: HostMemoryUnderMemoryPressure expr: >- @@ -27,13 +27,12 @@ groups: labels: severity: warning annotations: - summary: Host memory under memory pressure (instance {{ $labels.instance }}) + summary: Host memory under memory pressure at {{ $labels.instance }}. description: |- - The node is under heavy memory pressure. High rate of major page - faults. + The node is under heavy memory pressure. High rate of major page faults. - VALUE = {{ $value }} - LABELS = {{ $labels }} + VALUE = {{ $value }} + LABELS = {{ $labels }} - alert: HostUnusualNetworkThroughputIn expr: >- @@ -47,10 +46,10 @@ groups: annotations: summary: Host unusual network throughput in (instance {{ $labels.instance }}) description: |- - Host network interfaces are probably receiving too much data (> 100 MB/s). + Host network interfaces are probably receiving too much data. - VALUE = {{ $value }} - LABELS = {{ $labels }} + VALUE = {{ $value }} + LABELS = {{ $labels }} - alert: HostUnusualNetworkThroughputOut expr: >- @@ -62,12 +61,12 @@ groups: labels: severity: warning annotations: - summary: Host unusual network throughput out (instance {{ $labels.instance }}) + summary: Host unusual network throughput out at {{ $labels.instance }}. description: |- - Host network interfaces are probably sending too much data (> 100 MB/s). + Host network interfaces are probably sending too much data. - VALUE = {{ $value }} - LABELS = {{ $labels }} + VALUE = {{ $value }} + LABELS = {{ $labels }} - alert: HostUnusualDiskReadRate expr: >- @@ -79,12 +78,12 @@ groups: labels: severity: warning annotations: - summary: Host unusual disk read rate (instance {{ $labels.instance }}) + summary: Host unusual disk read rate at {{ $labels.instance }}. description: |- - Disk is probably reading too much data (> 50 MB/s). + Disk is probably reading too much data. - VALUE = {{ $value }} - LABELS = {{ $labels }} + VALUE = {{ $value }} + LABELS = {{ $labels }} - alert: HostUnusualDiskWriteRate expr: >- @@ -96,12 +95,12 @@ groups: labels: severity: warning annotations: - summary: Host unusual disk write rate (instance {{ $labels.instance }}) + summary: Host unusual disk write rate at {{ $labels.instance }}. description: |- - Disk is probably writing too much data (> 50 MB/s). + Disk is probably writing too much data. - VALUE = {{ $value }} - LABELS = {{ $labels }} + VALUE = {{ $value }} + LABELS = {{ $labels }} - alert: HostOutOfDiskSpace expr: >- @@ -109,18 +108,18 @@ groups: / node_filesystem_size_bytes < 10 and - ON (instance, device, mountpoint) node_filesystem_readonly + on (instance, device, mountpoint) node_filesystem_readonly == 0 for: 2m labels: severity: warning annotations: - summary: Host out of disk space (instance {{ $labels.instance }}) + summary: Host out of disk space at {{ $labels.instance }}. description: |- - Disk is almost full (< 10% left). + Disk is almost full. - VALUE = {{ $value }} - LABELS = {{ $labels }} + VALUE = {{ $value }} + LABELS = {{ $labels }} - alert: HostDiskWillFillIn24Hours expr: >- @@ -136,13 +135,12 @@ groups: labels: severity: warning annotations: - summary: Host disk will fill in 24 hours (instance {{ $labels.instance }}) + summary: Host disk will fill in 24 hours at {{ $labels.instance }}. description: |- - Filesystem is predicted to run out of space within the next 24 hours - at current write rate. + Filesystem is predicted to run out of space within the next 24 hours at current write rate. - VALUE = {{ $value }} - LABELS = {{ $labels }} + VALUE = {{ $value }} + LABELS = {{ $labels }} - alert: HostOutOfInodes expr: >- @@ -156,12 +154,12 @@ groups: labels: severity: warning annotations: - summary: Host out of inodes (instance {{ $labels.instance }}) + summary: Host out of inodes at {{ $labels.instance }}. description: |- - Disk is almost running out of available inodes (< 10% left). + Disk is almost running out of available inodes. - VALUE = {{ $value }} - LABELS = {{ $labels }} + VALUE = {{ $value }} + LABELS = {{ $labels }} - alert: HostInodesWillFillIn24Hours expr: >- @@ -178,13 +176,12 @@ groups: labels: severity: warning annotations: - summary: Host inodes will fill in 24 hours (instance {{ $labels.instance }}) + summary: Host inodes will fill in 24 hours at {{ $labels.instance }}. description: |- - Filesystem is predicted to run out of inodes within the next 24 - hours at current write rate. + Filesystem is predicted to run out of inodes within the next 24 hours at current write rate. - VALUE = {{ $value }} - LABELS = {{ $labels }} + VALUE = {{ $value }} + LABELS = {{ $labels }} - alert: HostUnusualDiskReadLatency expr: >- @@ -198,12 +195,12 @@ groups: labels: severity: warning annotations: - summary: Host unusual disk read latency (instance {{ $labels.instance }}) + summary: Host unusual disk read latency at {{ $labels.instance }}. description: |- - Disk latency is growing (read operations > 100ms). + Disk latency is growing. - VALUE = {{ $value }} - LABELS = {{ $labels }} + VALUE = {{ $value }} + LABELS = {{ $labels }} - alert: HostUnusualDiskWriteLatency expr: >- @@ -217,12 +214,12 @@ groups: labels: severity: warning annotations: - summary: Host unusual disk write latency (instance {{ $labels.instance }}) + summary: Host unusual disk write latency at {{ $labels.instance }}. description: |- - Disk latency is growing (write operations > 100ms). + Disk latency is growing. - VALUE = {{ $value }} - LABELS = {{ $labels }} + VALUE = {{ $value }} + LABELS = {{ $labels }} - alert: HostHighCpuLoad expr: >- @@ -234,23 +231,23 @@ groups: labels: severity: warning annotations: - summary: Host high CPU load (instance {{ $labels.instance }}) + summary: Host high CPU load at {{ $labels.instance }}. description: |- - CPU load is > 80%. + CPU load is high. - VALUE = {{ $value }} - LABELS = {{ $labels }} + VALUE = {{ $value }} + LABELS = {{ $labels }} - alert: HostCpuStealNoisyNeighbor expr: >- avg by(instance) (rate(node_cpu_seconds_total{mode="steal"}[5m])) * 100 - > 10 + > 15 for: 0m labels: severity: warning annotations: - summary: Host CPU steal noisy neighbor (instance {{ $labels.instance }}) + summary: Host CPU steal noisy neighbor at {{ $labels.instance }}. description: |- CPU steal is > 10%. A noisy neighbor is killing VM performances or a spot instance may be out of credit. @@ -262,18 +259,17 @@ groups: expr: |- avg by (instance) (rate(node_cpu_seconds_total{mode="iowait"}[5m])) * 100 - > 10 + > 15 for: 0m labels: severity: warning annotations: - summary: Host CPU high iowait (instance {{ $labels.instance }}) + summary: Host CPU high I/O wait at {{ $labels.instance }}. description: |- - CPU iowait > 10%. A high iowait means that you are disk or network - bound. + CPU I/O wait is high. A high I/O wait means that you are disk or network bound. - VALUE = {{ $value }} - LABELS = {{ $labels }} + VALUE = {{ $value }} + LABELS = {{ $labels }} - alert: HostUnusualDiskIo expr: >- @@ -282,12 +278,12 @@ groups: labels: severity: warning annotations: - summary: Host unusual disk IO (instance {{ $labels.instance }}) + summary: Host unusual disk I/O at {{ $labels.instance }}. description: |- - Time spent in IO is too high on {{ $labels.instance }}. Check storage for issues. + Time spent over I/O is too high. Check storage for issues. - VALUE = {{ $value }} - LABELS = {{ $labels }} + VALUE = {{ $value }} + LABELS = {{ $labels }} - alert: HostSwapIsFillingUp expr: >- @@ -298,12 +294,12 @@ groups: labels: severity: warning annotations: - summary: Host swap is filling up (instance {{ $labels.instance }}) + summary: Host swap is filling up at {{ $labels.instance }}. description: |- - Swap is filling up (> 80%). + Swap is filling up. - VALUE = {{ $value }} - LABELS = {{ $labels }} + VALUE = {{ $value }} + LABELS = {{ $labels }} - alert: HostSystemdServiceCrashed expr: >- @@ -312,12 +308,12 @@ groups: labels: severity: warning annotations: - summary: Host systemd service crashed (instance {{ $labels.instance }}) + summary: Host systemd service crashed at {{ $labels.instance }}. description: |- Systemd service crashed. - VALUE = {{ $value }} - LABELS = {{ $labels }} + VALUE = {{ $value }} + LABELS = {{ $labels }} - alert: HostPhysicalComponentTooHot expr: >- @@ -326,12 +322,12 @@ groups: labels: severity: warning annotations: - summary: Host physical component too hot (instance {{ $labels.instance }}) + summary: Host physical component too hot at {{ $labels.instance }}. description: |- Physical hardware component too hot. - VALUE = {{ $value }} - LABELS = {{ $labels }} + VALUE = {{ $value }} + LABELS = {{ $labels }} - alert: HostNodeOvertemperatureAlarm expr: >- @@ -340,12 +336,12 @@ groups: labels: severity: critical annotations: - summary: Host node overtemperature alarm (instance {{ $labels.instance }}) + summary: Host node overtemperature alarm at {{ $labels.instance }}. description: |- Physical node temperature alarm triggered. - VALUE = {{ $value }} - LABELS = {{ $labels }} + VALUE = {{ $value }} + LABELS = {{ $labels }} - alert: HostRaidArrayGotInactive expr: >- @@ -354,14 +350,12 @@ groups: labels: severity: critical annotations: - summary: Host RAID array got inactive (instance {{ $labels.instance }}) + summary: Host RAID array got inactive at {{ $labels.instance }}. description: |- - RAID array {{ $labels.device }} is in degraded state due to one or - more disks failures. Number of spare drives is insufficient to fix - issue automatically. + RAID array is in a degraded state due to one or more disks failures. Number of spare drives is insufficient to fix the issue automatically. - VALUE = {{ $value }} - LABELS = {{ $labels }} + VALUE = {{ $value }} + LABELS = {{ $labels }} - alert: HostRaidDiskFailure expr: >- @@ -370,14 +364,12 @@ groups: labels: severity: warning annotations: - summary: Host RAID disk failure (instance {{ $labels.instance }}) + summary: Host RAID disk failure at {{ $labels.instance }}. description: |- - At least one device in RAID array on {{ $labels.instance }} failed. - Array {{ $labels.md_device }} needs attention and possibly a disk - swap. + At least one device in RAID array is failed. Possibly, a disk swap is required. - VALUE = {{ $value }} - LABELS = {{ $labels }} + VALUE = {{ $value }} + LABELS = {{ $labels }} - alert: HostOomKillDetected expr: >- @@ -386,12 +378,12 @@ groups: labels: severity: warning annotations: - summary: Host OOM kill detected (instance {{ $labels.instance }}) + summary: Host OOM kill detected at {{ $labels.instance }}. description: |- OOM kill detected. - VALUE = {{ $value }} - LABELS = {{ $labels }} + VALUE = {{ $value }} + LABELS = {{ $labels }} - alert: HostEdacCorrectableErrorsDetected expr: >- @@ -400,13 +392,12 @@ groups: labels: severity: info annotations: - summary: Host EDAC Correctable Errors detected (instance {{ $labels.instance }}) + summary: Host EDAC correctable errors detected at {{ $labels.instance }}. description: |- - Host {{ $labels.instance }} has had {{ printf "%.0f" $value }} - correctable memory errors reported by EDAC in the last 5 minutes. + Host has had {{ printf "%.0f" $value }} correctable memory errors reported by EDAC in the last 5 minutes. - VALUE = {{ $value }} - LABELS = {{ $labels }} + VALUE = {{ $value }} + LABELS = {{ $labels }} - alert: HostEdacUncorrectableErrorsDetected expr: >- @@ -415,66 +406,67 @@ groups: labels: severity: warning annotations: - summary: Host EDAC Uncorrectable Errors detected (instance {{ $labels.instance }}) + summary: Host EDAC uncorrectable errors detected at {{ $labels.instance }}. description: |- - Host {{ $labels.instance }} has had {{ printf "%.0f" $value }} - uncorrectable memory errors reported by EDAC in the last 5 - minutes. + Host has had {{ printf "%.0f" $value }} uncorrectable memory errors reported by EDAC in the last 5 minutes. - VALUE = {{ $value }} - LABELS = {{ $labels }} + VALUE = {{ $value }} + LABELS = {{ $labels }} - alert: HostNetworkReceiveErrors - expr: "rate(node_network_receive_errs_total[2m]) / rate(node_network_receive_packets_total[2m]) > 0.01" + expr: >- + rate(node_network_receive_errs_total{device!~"^wg.*"}[2m]) + / + rate(node_network_receive_packets_total{device!~"^wg.*"}[2m]) + > 0.01 for: 2m labels: severity: warning annotations: - summary: Host Network Receive Errors (instance {{ $labels.instance }}) + summary: Host Network Receive Errors at {{ $labels.instance }}. description: |- - Host {{ $labels.instance }} interface {{ $labels.device }} has - encountered {{ printf "%.0f" $value }} receive errors in the last - two minutes. + Host interface {{ $labels.device }} has encountered {{ printf "%.0f" $value }} receive errors in the last two minutes. - VALUE = {{ $value }} - LABELS = {{ $labels }} + VALUE = {{ $value }} + LABELS = {{ $labels }} - alert: HostNetworkTransmitErrors - expr: "rate(node_network_transmit_errs_total[2m]) / rate(node_network_transmit_packets_total[2m]) > 0.01" + expr: >- + rate(node_network_transmit_errs_total{device!~"^wg.*"}[2m]) + / + rate(node_network_transmit_packets_total{device!~"^wg.*"}[2m]) + > 0.1 for: 2m labels: severity: warning annotations: - summary: Host Network Transmit Errors (instance {{ $labels.instance }}) + summary: Host network transmit errors at {{ $labels.instance }}. description: |- - Host {{ $labels.instance }} interface {{ $labels.device }} has - encountered {{ printf "%.0f" $value }} transmit errors in the last - two minutes. + Host interface {{ $labels.device }} has encountered {{ printf "%.0f" $value }} transmit errors in the last two minutes. - VALUE = {{ $value }} - LABELS = {{ $labels }} + VALUE = {{ $value }} + LABELS = {{ $labels }} - alert: HostNetworkInterfaceSaturated expr: >- ( - rate(node_network_receive_bytes_total{device!~"^tap.*|^vnet.*|^veth.*|^tun.*"}[1m]) + rate(node_network_receive_bytes_total{device!~"^wg.*"}[1m]) + - rate(node_network_transmit_bytes_total{device!~"^tap.*|^vnet.*|^veth.*|^tun.*"}[1m]) + rate(node_network_transmit_bytes_total{device!~"^wg.*"}[1m]) ) - / node_network_speed_bytes{device!~"^tap.*|^vnet.*|^veth.*|^tun.*"} + / node_network_speed_bytes{device!~"^wg.*"} > 0.8 < 10000 for: 1m labels: severity: warning annotations: - summary: Host Network Interface Saturated (instance {{ $labels.instance }}) + summary: Host network interface saturated at {{ $labels.instance }}. description: |- - The network interface "{{ $labels.device }}" on "{{ $labels.instance }}" - is getting overloaded. + The network interface {{ $labels.device }} is getting overloaded. - VALUE = {{ $value }} - LABELS = {{ $labels }} + VALUE = {{ $value }} + LABELS = {{ $labels }} - alert: HostNetworkBondDegraded expr: >- @@ -483,43 +475,53 @@ groups: labels: severity: warning annotations: - summary: Host Network Bond Degraded (instance {{ $labels.instance }}) + summary: Host network bond degraded at {{ $labels.instance }}. description: |- - Bond "{{ $labels.device }}" degraded on "{{ $labels.instance }}". + Bond {{ $labels.device }} degraded. - VALUE = {{ $value }} - LABELS = {{ $labels }} + VALUE = {{ $value }} + LABELS = {{ $labels }} - alert: HostConntrackLimit expr: >- - node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 0.8 + node_nf_conntrack_entries + / + node_nf_conntrack_entries_limit + > 0.8 for: 5m labels: severity: warning annotations: - summary: Host conntrack limit (instance {{ $labels.instance }}) + summary: Host conntrack limit at {{ $labels.instance }}. description: |- The number of conntrack is approaching limit. - VALUE = {{ $value }} - LABELS = {{ $labels }} + VALUE = {{ $value }} + LABELS = {{ $labels }} - alert: HostClockSkew expr: >- - (node_timex_offset_seconds > 0.05 and deriv(node_timex_offset_seconds[5m]) >= 0) + ( + node_timex_offset_seconds > 0.05 + and + deriv(node_timex_offset_seconds[5m]) >= 0 + ) or - (node_timex_offset_seconds < -0.05 and deriv(node_timex_offset_seconds[5m]) <= 0) + ( + node_timex_offset_seconds < -0.05 + and + deriv(node_timex_offset_seconds[5m]) <= 0 + ) for: 2m labels: severity: warning annotations: - summary: Host clock skew (instance {{ $labels.instance }}) + summary: Host clock skew at {{ $labels.instance }}. description: |- - Clock skew detected. Clock is out of sync. Ensure NTP is configured - correctly on this host. + Clock skew is detected and the clock is out of sync. Ensure that NTP is configured correctly on this host. - VALUE = {{ $value }} - LABELS = {{ $labels }} + VALUE = {{ $value }} + LABELS = {{ $labels }} - alert: HostClockNotSynchronising expr: >- @@ -530,12 +532,12 @@ groups: labels: severity: warning annotations: - summary: Host clock not synchronising (instance {{ $labels.instance }}) + summary: Host clock not synchronising at {{ $labels.instance }}. description: |- - Clock not synchronising. Ensure NTP is configured on this host. + Clock is not synchronising. Ensure that NTP is configured correctly on this host. - VALUE = {{ $value }} - LABELS = {{ $labels }} + VALUE = {{ $value }} + LABELS = {{ $labels }} - alert: HostRequiresReboot expr: >- @@ -544,9 +546,9 @@ groups: labels: severity: info annotations: - summary: Host requires reboot (instance {{ $labels.instance }}) + summary: Host requires reboot at {{ $labels.instance }}. description: |- - Instance {{ $labels.instance }} requires a reboot. + Instance requires a reboot. - VALUE = {{ $value }} - LABELS = {{ $labels }} + VALUE = {{ $value }} + LABELS = {{ $labels }} diff --git a/modules/nixos/monitoring/rules/postgres.yaml b/modules/nixos/monitoring/rules/postgres.yaml index 5d360fa..6aee560 100644 --- a/modules/nixos/monitoring/rules/postgres.yaml +++ b/modules/nixos/monitoring/rules/postgres.yaml @@ -10,12 +10,12 @@ groups: labels: severity: critical annotations: - summary: PostgreSQL is down (instance {{ $labels.instance }}) + summary: PostgreSQL is down at {{ $labels.instance }}. description: |- - Postgresql instance is down. + PostgreSQL instance is down. - VALUE = {{ $value }} - LABELS = {{ $labels }} + VALUE = {{ $value }} + LABELS = {{ $labels }} - alert: PostgresqlRestarted expr: >- @@ -24,12 +24,12 @@ groups: labels: severity: critical annotations: - summary: PostgreSQL restarted (instance {{ $labels.instance }}) + summary: PostgreSQL restarted at {{ $labels.instance }}. description: |- PostgreSQL restarted. - VALUE = {{ $value }} - LABELS = {{ $labels }} + VALUE = {{ $value }} + LABELS = {{ $labels }} - alert: PostgresqlExporterError expr: >- @@ -38,12 +38,12 @@ groups: labels: severity: critical annotations: - summary: PostgreSQL exporter error (instance {{ $labels.instance }}) + summary: PostgreSQL exporter errors at {{ $labels.instance }}. description: |- - PostgreSQL exporter is showing errors. A query may be buggy in query.yaml. + PostgreSQL exporter is showing errors. - VALUE = {{ $value }} - LABELS = {{ $labels }} + VALUE = {{ $value }} + LABELS = {{ $labels }} - alert: PostgresqlTableNotAutoVacuumed expr: >- @@ -55,12 +55,12 @@ groups: labels: severity: warning annotations: - summary: PostgreSQL table not auto vacuumed (instance {{ $labels.instance }}) + summary: PostgreSQL table not auto vacuumed at {{ $labels.instance }}. description: |- Table {{ $labels.relname }} has not been auto vacuumed for 10 days. - VALUE = {{ $value }} - LABELS = {{ $labels }} + VALUE = {{ $value }} + LABELS = {{ $labels }} - alert: PostgresqlTableNotAutoAnalyzed expr: >- @@ -72,57 +72,60 @@ groups: labels: severity: warning annotations: - summary: PostgreSQL table not auto analyzed (instance {{ $labels.instance }}) + summary: PostgreSQL table not auto analyzed at {{ $labels.instance }}. description: |- Table {{ $labels.relname }} has not been auto analyzed for 10 days. - VALUE = {{ $value }} - LABELS = {{ $labels }}" + VALUE = {{ $value }} + LABELS = {{ $labels }}" - alert: PostgresqlTooManyConnections expr: >- - sum by (datname) (pg_stat_activity_count{datname!~"template.*|postgres"}) - > pg_settings_max_connections * 0.8 + sum by (datname) ( + pg_stat_activity_count{datname!~"template.*|postgres"} + ) > pg_settings_max_connections * 0.8 for: 2m labels: severity: warning annotations: - summary: Postgresql too many connections (instance {{ $labels.instance }}) + summary: PostgreSQL with too many connections at {{ $labels.instance }}. description: |- - PostgreSQL instance has too many connections (> 80%). + PostgreSQL instance {{ $labels.instance }} has too many connections. - VALUE = {{ $value }} - LABELS = {{ $labels }} + VALUE = {{ $value }} + LABELS = {{ $labels }} - alert: PostgresqlNotEnoughConnections expr: >- - sum by (datname) (pg_stat_activity_count{datname!~"template.*|postgres"}) - < 1 + sum by (datname) ( + pg_stat_activity_count{datname!~"template.*|postgres"} + ) < 1 for: 2m labels: severity: warning annotations: - summary: Postgresql not enough connections (instance {{ $labels.instance }}) + summary: PostgreSQL with not enough connections at {{ $labels.instance }}. description: |- - PostgreSQL instance should have more connections (> 1). + PostgreSQL instance {{ $labels.instance }} should have more connections. - VALUE = {{ $value }} - LABELS = {{ $labels }} + VALUE = {{ $value }} + LABELS = {{ $labels }} - alert: PostgresqlDeadLocks expr: >- - increase(pg_stat_database_deadlocks{datname!~"template.*|postgres"}[1m]) - > 5 + increase( + pg_stat_database_deadlocks{datname!~"template.*|postgres"}[1m] + ) > 5 for: 0m labels: severity: warning annotations: - summary: Postgresql dead locks (instance {{ $labels.instance }}) + summary: PostgreSQL dead-locks at instance {{ $labels.instance }}. description: |- - PostgreSQL has dead-locks. + PostgreSQL shows dead-locks. - VALUE = {{ $value }} - LABELS = {{ $labels }} + VALUE = {{ $value }} + LABELS = {{ $labels }} - alert: PostgresqlHighRollbackRate expr: >- @@ -136,17 +139,17 @@ groups: (rate(pg_stat_database_xact_commit{datname!~"template.*|postgres",datid!="0"}[3m])) ) ) - > 0.02 + > 0.10 for: 0m labels: severity: warning annotations: - summary: PostgreSQL is at a high rollback rate (instance {{ $labels.instance }}) + summary: PostgreSQL at a high rollback rate at {{ $labels.instance }}. description: |- - Ratio of transactions being aborted compared to committed is > 2%. + Ratio of transactions being aborted compared to committed is too big. - VALUE = {{ $value }} - LABELS = {{ $labels }} + VALUE = {{ $value }} + LABELS = {{ $labels }} - alert: PostgresqlCommitRateLow expr: >- @@ -156,12 +159,12 @@ groups: labels: severity: critical annotations: - summary: PostgreSQL commit rate low (instance {{ $labels.instance }}) + summary: PostgreSQL commit rate low at instance {{ $labels.instance }}. description: |- PostgreSQL seems to be processing very few transactions. - VALUE = {{ $value }} - LABELS = {{ $labels }} + VALUE = {{ $value }} + LABELS = {{ $labels }} - alert: PostgresqlLowXidConsumption expr: >- @@ -171,12 +174,12 @@ groups: labels: severity: warning annotations: - summary: PostgreSQL low XID consumption (instance {{ $labels.instance }}) + summary: PostgreSQL low XID consumption at instance {{ $labels.instance }}. description: |- PostgreSQL seems to be consuming transaction IDs very slowly. - VALUE = {{ $value }} - LABELS = {{ $labels }} + VALUE = {{ $value }} + LABELS = {{ $labels }} - alert: PostgresqlHighRateStatementTimeout expr: >- @@ -190,8 +193,8 @@ groups: description: |- PostgreSQL transactions showing high rate of statement timeouts. - VALUE = {{ $value }} - LABELS = {{ $labels }} + VALUE = {{ $value }} + LABELS = {{ $labels }} - alert: PostgresqlHighRateDeadlock expr: >- @@ -201,12 +204,12 @@ groups: labels: severity: critical annotations: - summary: PostgreSQL high rate deadlock (instance {{ $labels.instance }}) + summary: PostgreSQL high rate dead-lock at {{ $labels.instance }}. description: |- - PostgreSQL detected deadlocks. + PostgreSQL has detected dead-locks. - VALUE = {{ $value }} - LABELS = {{ $labels }} + VALUE = {{ $value }} + LABELS = {{ $labels }} - alert: PostgresqlUnusedReplicationSlot expr: >- @@ -215,12 +218,12 @@ groups: labels: severity: warning annotations: - summary: PostgreSQL unused replication slot (instance {{ $labels.instance }}) + summary: PostgreSQL unused replication slot at {{ $labels.instance }}. description: |- - Unused Replication Slots. + Unused replication slots. - VALUE = {{ $value }} - LABELS = {{ $labels }} + VALUE = {{ $value }} + LABELS = {{ $labels }} - alert: PostgresqlTooManyDeadTuples expr: >- @@ -234,12 +237,12 @@ groups: labels: severity: warning annotations: - summary: PostgreSQL too many dead tuples (instance {{ $labels.instance }}) + summary: PostgreSQL too many dead tuples at {{ $labels.instance }}. description: |- PostgreSQL number of dead tuples is too large. - VALUE = {{ $value }} - LABELS = {{ $labels }} + VALUE = {{ $value }} + LABELS = {{ $labels }} - alert: PostgresqlSslCompressionActive expr: >- @@ -248,13 +251,12 @@ groups: labels: severity: critical annotations: - summary: Postgresql SSL compression active (instance {{ $labels.instance }}) + summary: PostgreSQL SSL compression active at {{ $labels.instance }}. description: |- - Database connections with SSL compression is enabled. This may add a - significant jitter in the replication delay. + Database connections with an SSL compression is enabled. This may add a significant jitter in the replication delay. - VALUE = {{ $value }} - LABELS = {{ $labels }} + VALUE = {{ $value }} + LABELS = {{ $labels }} - alert: PostgresqlTooManyLocksAcquired expr: >- @@ -268,12 +270,12 @@ groups: labels: severity: critical annotations: - summary: PostgreSQL too many locks acquired (instance {{ $labels.instance }}) + summary: PostgreSQL too many locks acquired at {{ $labels.instance }}. description: |- Too many locks acquired on the database. - VALUE = {{ $value }} - LABELS = {{ $labels }} + VALUE = {{ $value }} + LABELS = {{ $labels }} - alert: PostgresqlBloatIndexHigh expr: >- @@ -284,13 +286,12 @@ groups: labels: severity: warning annotations: - summary: PostgreSQL bloat index high (> 80%) (instance {{ $labels.instance }}) + summary: PostgreSQL index bloat high at {{ $labels.instance }}. description: |- - The index {{ $labels.idxname }} is bloated. You should execute - `REINDEX INDEX CONCURRENTLY {{ $labels.idxname }};` + The index {{ $labels.idxname }} is bloated. You should execute `REINDEX INDEX CONCURRENTLY {{ $labels.idxname }};`. - VALUE = {{ $value }} - LABELS = {{ $labels }} + VALUE = {{ $value }} + LABELS = {{ $labels }} - alert: PostgresqlBloatTableHigh expr: >- @@ -301,10 +302,9 @@ groups: labels: severity: warning annotations: - summary: PostgreSQL bloat table high (> 80%) (instance {{ $labels.instance }}) + summary: PostgreSQL table bloat high at instance {{ $labels.instance }}. description: |- - The table {{ $labels.relname }} is bloated. You should execute - `VACUUM {{ $labels.relname }};` + The table {{ $labels.relname }} is bloated. You should execute `VACUUM {{ $labels.relname }};`. - VALUE = {{ $value }} - LABELS = {{ $labels }} + VALUE = {{ $value }} + LABELS = {{ $labels }} diff --git a/modules/nixos/monitoring/rules/redis.yaml b/modules/nixos/monitoring/rules/redis.yaml index f6d1fe1..c07c819 100644 --- a/modules/nixos/monitoring/rules/redis.yaml +++ b/modules/nixos/monitoring/rules/redis.yaml @@ -10,12 +10,12 @@ groups: labels: severity: critical annotations: - summary: Redis down (instance {{ $labels.instance }}) + summary: Redis down at {{ $labels.instance }}. description: |- Redis instance is down. - VALUE = {{ $value }} - LABELS = {{ $labels }} + VALUE = {{ $value }} + LABELS = {{ $labels }} - alert: RedisMissingMaster expr: >- @@ -25,12 +25,12 @@ groups: labels: severity: critical annotations: - summary: Redis missing master (instance {{ $labels.instance }}) + summary: Redis missing master at {{ $labels.instance }}). description: |- - Redis cluster has no node marked as master. + Redis cluster has no node marked as a master. - VALUE = {{ $value }} - LABELS = {{ $labels }} + VALUE = {{ $value }} + LABELS = {{ $labels }} - alert: RedisTooManyMasters expr: >- @@ -39,12 +39,12 @@ groups: labels: severity: critical annotations: - summary: Redis too many masters (instance {{ $labels.instance }}) + summary: Redis too many masters at {{ $labels.instance }}. description: |- - Redis cluster has too many nodes marked as master. + Redis cluster has too many nodes marked as a master. - VALUE = {{ $value }} - LABELS = {{ $labels }} + VALUE = {{ $value }} + LABELS = {{ $labels }} - alert: RedisDisconnectedSlaves expr: >- @@ -56,12 +56,12 @@ groups: labels: severity: critical annotations: - summary: Redis disconnected slaves (instance {{ $labels.instance }}) + summary: Redis disconnected slaves at {{ $labels.instance }}. description: |- Redis is not replicating for all slaves. - VALUE = {{ $value }} - LABELS = {{ $labels }} + VALUE = {{ $value }} + LABELS = {{ $labels }} - alert: RedisReplicationBroken expr: >- @@ -70,12 +70,12 @@ groups: labels: severity: critical annotations: - summary: Redis replication broken (instance {{ $labels.instance }}) + summary: Redis replication broken at {{ $labels.instance }}. description: |- Redis instance lost a slave. - VALUE = {{ $value }} - LABELS = {{ $labels }} + VALUE = {{ $value }} + LABELS = {{ $labels }} - alert: RedisClusterFlapping expr: >- @@ -84,14 +84,12 @@ groups: labels: severity: critical annotations: - summary: Redis cluster flapping (instance {{ $labels.instance }}) + summary: Redis cluster flapping at {{ $labels.instance }}. description: |- - Changes have been detected in the Redis replica connection. This can - occur when replica nodes lose connection to the master and reconnect - (a.k.a flapping). + Changes have been detected in the Redis replica connection. This can occur when replica nodes lose connection to the master and reconnect (a.k.a flapping). - VALUE = {{ $value }} - LABELS = {{ $labels }} + VALUE = {{ $value }} + LABELS = {{ $labels }} - alert: RedisMissingBackup expr: >- @@ -101,12 +99,12 @@ groups: labels: severity: critical annotations: - summary: Redis missing backup (instance {{ $labels.instance }}) + summary: Redis missing backup at {{ $labels.instance }}. description: |- Redis has not been backed up for 24 hours. - VALUE = {{ $value }} - LABELS = {{ $labels }} + VALUE = {{ $value }} + LABELS = {{ $labels }} - alert: RedisOutOfSystemMemory expr: >- @@ -118,12 +116,12 @@ groups: labels: severity: warning annotations: - summary: Redis out of system memory (instance {{ $labels.instance }}) + summary: Redis out of system memory at {{ $labels.instance }}. description: |- - Redis is running out of system memory (> 90%). + Redis is running out of system memory. - VALUE = {{ $value }} - LABELS = {{ $labels }} + VALUE = {{ $value }} + LABELS = {{ $labels }} - alert: RedisOutOfConfiguredMaxmemory expr: >- @@ -139,12 +137,12 @@ groups: labels: severity: warning annotations: - summary: Redis out of configured maxmemory (instance {{ $labels.instance }}) + summary: Redis out of configured maxmemory at {{ $labels.instance }}. description: |- - Redis is running out of configured maxmemory (> 90%). + Redis is running out of configured maxmemory. - VALUE = {{ $value }} - LABELS = {{ $labels }} + VALUE = {{ $value }} + LABELS = {{ $labels }} - alert: RedisTooManyConnections expr: >- @@ -153,12 +151,12 @@ groups: labels: severity: warning annotations: - summary: Redis too many connections (instance {{ $labels.instance }}) + summary: Redis too many connections at {{ $labels.instance }}. description: |- Redis instance has too many connections. - VALUE = {{ $value }} - LABELS = {{ $labels }} + VALUE = {{ $value }} + LABELS = {{ $labels }} - alert: RedisNotEnoughConnections expr: >- @@ -167,12 +165,12 @@ groups: labels: severity: warning annotations: - summary: Redis not enough connections (instance {{ $labels.instance }}) + summary: Redis not enough connections at {{ $labels.instance }}. description: |- - Redis instance should have more connections (> 1). + Redis instance should have more connections. - VALUE = {{ $value }} - LABELS = {{ $labels }} + VALUE = {{ $value }} + LABELS = {{ $labels }} - alert: RedisRejectedConnections expr: >- @@ -181,9 +179,9 @@ groups: labels: severity: critical annotations: - summary: Redis rejected connections (instance {{ $labels.instance }}) + summary: Redis rejected connections at {{ $labels.instance }}. description: |- Some connections to Redis have been rejected. - VALUE = {{ $value }} - LABELS = {{ $labels }} + VALUE = {{ $value }} + LABELS = {{ $labels }} |