summaryrefslogtreecommitdiff
path: root/modules/nixos/monitoring/rules
diff options
context:
space:
mode:
Diffstat (limited to 'modules/nixos/monitoring/rules')
-rw-r--r--modules/nixos/monitoring/rules/nginx.yaml40
-rw-r--r--modules/nixos/monitoring/rules/node.yaml304
-rw-r--r--modules/nixos/monitoring/rules/postgres.yaml154
-rw-r--r--modules/nixos/monitoring/rules/redis.yaml86
4 files changed, 296 insertions, 288 deletions
diff --git a/modules/nixos/monitoring/rules/nginx.yaml b/modules/nixos/monitoring/rules/nginx.yaml
index 59229a8..f00d372 100644
--- a/modules/nixos/monitoring/rules/nginx.yaml
+++ b/modules/nixos/monitoring/rules/nginx.yaml
@@ -6,47 +6,55 @@ groups:
- alert: NginxHighHttp4xxErrorRate
expr: >-
sum(rate(nginx_http_requests_total{status=~"^4.."}[1m]))
- / sum(rate(nginx_http_requests_total[1m])) * 100
+ /
+ sum(rate(nginx_http_requests_total[1m])) * 100
> 5
for: 1m
labels:
severity: critical
annotations:
- summary: NGINX high HTTP 4xx error rate (instance {{ $labels.instance }})
+ summary: NGINX high HTTP 4xx error rate at {{ $labels.instance }}.
description: |-
- Too many HTTP requests with status 4xx (> 5%).
+ Too many HTTP requests with a 4xx status code.
- VALUE = {{ $value }}
- LABELS = {{ $labels }}
+ VALUE = {{ $value }}
+ LABELS = {{ $labels }}
- alert: NginxHighHttp5xxErrorRate
expr: >-
sum(rate(nginx_http_requests_total{status=~"^5.."}[1m]))
- / sum(rate(nginx_http_requests_total[1m])) * 100
+ /
+ sum(rate(nginx_http_requests_total[1m])) * 100
> 5
for: 1m
labels:
severity: critical
annotations:
- summary: NGINX high HTTP 5xx error rate (instance {{ $labels.instance }})
+ summary: NGINX high HTTP 5xx error rate at {{ $labels.instance }}.
description: |-
- Too many HTTP requests with status 5xx (> 5%).
+ Too many HTTP requests with a 5xx status code.
- VALUE = {{ $value }}
- LABELS = {{ $labels }}
+ VALUE = {{ $value }}
+ LABELS = {{ $labels }}
- alert: NginxLatencyHigh
expr: >-
- histogram_quantile(0.99,
- sum(rate(nginx_http_request_duration_seconds_bucket[2m])) by (host, node))
+ histogram_quantile(
+ 0.99,
+ sum(
+ rate(
+ nginx_http_request_duration_seconds_bucket[2m]
+ )
+ ) by (host, node)
+ )
> 3
for: 2m
labels:
severity: warning
annotations:
- summary: NGINX high latency (instance {{ $labels.instance }})
+ summary: NGINX high latency at {{ $labels.instance }}.
description: |-
- NGINX 99% latency is higher than 3 seconds.
+ NGINX 99th percentile latency is higher than 3 seconds.
- VALUE = {{ $value }}
- LABELS = {{ $labels }}
+ VALUE = {{ $value }}
+ LABELS = {{ $labels }}
diff --git a/modules/nixos/monitoring/rules/node.yaml b/modules/nixos/monitoring/rules/node.yaml
index 81d7810..98217b3 100644
--- a/modules/nixos/monitoring/rules/node.yaml
+++ b/modules/nixos/monitoring/rules/node.yaml
@@ -13,12 +13,12 @@ groups:
labels:
severity: warning
annotations:
- summary: Host out of memory (instance {{ $labels.instance }})
+ summary: Host out of memory at {{ $labels.instance }}.
description: |-
- Node memory is filling up (< 10% left).
+ Node memory is filling up.
- VALUE = {{ $value }}
- LABELS = {{ $labels }}
+ VALUE = {{ $value }}
+ LABELS = {{ $labels }}
- alert: HostMemoryUnderMemoryPressure
expr: >-
@@ -27,13 +27,12 @@ groups:
labels:
severity: warning
annotations:
- summary: Host memory under memory pressure (instance {{ $labels.instance }})
+ summary: Host memory under memory pressure at {{ $labels.instance }}.
description: |-
- The node is under heavy memory pressure. High rate of major page
- faults.
+ The node is under heavy memory pressure. High rate of major page faults.
- VALUE = {{ $value }}
- LABELS = {{ $labels }}
+ VALUE = {{ $value }}
+ LABELS = {{ $labels }}
- alert: HostUnusualNetworkThroughputIn
expr: >-
@@ -47,10 +46,10 @@ groups:
annotations:
summary: Host unusual network throughput in (instance {{ $labels.instance }})
description: |-
- Host network interfaces are probably receiving too much data (> 100 MB/s).
+ Host network interfaces are probably receiving too much data.
- VALUE = {{ $value }}
- LABELS = {{ $labels }}
+ VALUE = {{ $value }}
+ LABELS = {{ $labels }}
- alert: HostUnusualNetworkThroughputOut
expr: >-
@@ -62,12 +61,12 @@ groups:
labels:
severity: warning
annotations:
- summary: Host unusual network throughput out (instance {{ $labels.instance }})
+ summary: Host unusual network throughput out at {{ $labels.instance }}.
description: |-
- Host network interfaces are probably sending too much data (> 100 MB/s).
+ Host network interfaces are probably sending too much data.
- VALUE = {{ $value }}
- LABELS = {{ $labels }}
+ VALUE = {{ $value }}
+ LABELS = {{ $labels }}
- alert: HostUnusualDiskReadRate
expr: >-
@@ -79,12 +78,12 @@ groups:
labels:
severity: warning
annotations:
- summary: Host unusual disk read rate (instance {{ $labels.instance }})
+ summary: Host unusual disk read rate at {{ $labels.instance }}.
description: |-
- Disk is probably reading too much data (> 50 MB/s).
+ Disk is probably reading too much data.
- VALUE = {{ $value }}
- LABELS = {{ $labels }}
+ VALUE = {{ $value }}
+ LABELS = {{ $labels }}
- alert: HostUnusualDiskWriteRate
expr: >-
@@ -96,12 +95,12 @@ groups:
labels:
severity: warning
annotations:
- summary: Host unusual disk write rate (instance {{ $labels.instance }})
+ summary: Host unusual disk write rate at {{ $labels.instance }}.
description: |-
- Disk is probably writing too much data (> 50 MB/s).
+ Disk is probably writing too much data.
- VALUE = {{ $value }}
- LABELS = {{ $labels }}
+ VALUE = {{ $value }}
+ LABELS = {{ $labels }}
- alert: HostOutOfDiskSpace
expr: >-
@@ -109,18 +108,18 @@ groups:
/ node_filesystem_size_bytes
< 10
and
- ON (instance, device, mountpoint) node_filesystem_readonly
+ on (instance, device, mountpoint) node_filesystem_readonly
== 0
for: 2m
labels:
severity: warning
annotations:
- summary: Host out of disk space (instance {{ $labels.instance }})
+ summary: Host out of disk space at {{ $labels.instance }}.
description: |-
- Disk is almost full (< 10% left).
+ Disk is almost full.
- VALUE = {{ $value }}
- LABELS = {{ $labels }}
+ VALUE = {{ $value }}
+ LABELS = {{ $labels }}
- alert: HostDiskWillFillIn24Hours
expr: >-
@@ -136,13 +135,12 @@ groups:
labels:
severity: warning
annotations:
- summary: Host disk will fill in 24 hours (instance {{ $labels.instance }})
+ summary: Host disk will fill in 24 hours at {{ $labels.instance }}.
description: |-
- Filesystem is predicted to run out of space within the next 24 hours
- at current write rate.
+ Filesystem is predicted to run out of space within the next 24 hours at current write rate.
- VALUE = {{ $value }}
- LABELS = {{ $labels }}
+ VALUE = {{ $value }}
+ LABELS = {{ $labels }}
- alert: HostOutOfInodes
expr: >-
@@ -156,12 +154,12 @@ groups:
labels:
severity: warning
annotations:
- summary: Host out of inodes (instance {{ $labels.instance }})
+ summary: Host out of inodes at {{ $labels.instance }}.
description: |-
- Disk is almost running out of available inodes (< 10% left).
+ Disk is almost running out of available inodes.
- VALUE = {{ $value }}
- LABELS = {{ $labels }}
+ VALUE = {{ $value }}
+ LABELS = {{ $labels }}
- alert: HostInodesWillFillIn24Hours
expr: >-
@@ -178,13 +176,12 @@ groups:
labels:
severity: warning
annotations:
- summary: Host inodes will fill in 24 hours (instance {{ $labels.instance }})
+ summary: Host inodes will fill in 24 hours at {{ $labels.instance }}.
description: |-
- Filesystem is predicted to run out of inodes within the next 24
- hours at current write rate.
+ Filesystem is predicted to run out of inodes within the next 24 hours at current write rate.
- VALUE = {{ $value }}
- LABELS = {{ $labels }}
+ VALUE = {{ $value }}
+ LABELS = {{ $labels }}
- alert: HostUnusualDiskReadLatency
expr: >-
@@ -198,12 +195,12 @@ groups:
labels:
severity: warning
annotations:
- summary: Host unusual disk read latency (instance {{ $labels.instance }})
+ summary: Host unusual disk read latency at {{ $labels.instance }}.
description: |-
- Disk latency is growing (read operations > 100ms).
+ Disk latency is growing.
- VALUE = {{ $value }}
- LABELS = {{ $labels }}
+ VALUE = {{ $value }}
+ LABELS = {{ $labels }}
- alert: HostUnusualDiskWriteLatency
expr: >-
@@ -217,12 +214,12 @@ groups:
labels:
severity: warning
annotations:
- summary: Host unusual disk write latency (instance {{ $labels.instance }})
+ summary: Host unusual disk write latency at {{ $labels.instance }}.
description: |-
- Disk latency is growing (write operations > 100ms).
+ Disk latency is growing.
- VALUE = {{ $value }}
- LABELS = {{ $labels }}
+ VALUE = {{ $value }}
+ LABELS = {{ $labels }}
- alert: HostHighCpuLoad
expr: >-
@@ -234,23 +231,23 @@ groups:
labels:
severity: warning
annotations:
- summary: Host high CPU load (instance {{ $labels.instance }})
+ summary: Host high CPU load at {{ $labels.instance }}.
description: |-
- CPU load is > 80%.
+ CPU load is high.
- VALUE = {{ $value }}
- LABELS = {{ $labels }}
+ VALUE = {{ $value }}
+ LABELS = {{ $labels }}
- alert: HostCpuStealNoisyNeighbor
expr: >-
avg by(instance) (rate(node_cpu_seconds_total{mode="steal"}[5m]))
* 100
- > 10
+ > 15
for: 0m
labels:
severity: warning
annotations:
- summary: Host CPU steal noisy neighbor (instance {{ $labels.instance }})
+ summary: Host CPU steal noisy neighbor at {{ $labels.instance }}.
description: |-
CPU steal is high. A noisy neighbor is killing VM performances or a
spot instance may be out of credit.
@@ -262,18 +259,17 @@ groups:
expr: |-
avg by (instance) (rate(node_cpu_seconds_total{mode="iowait"}[5m]))
* 100
- > 10
+ > 15
for: 0m
labels:
severity: warning
annotations:
- summary: Host CPU high iowait (instance {{ $labels.instance }})
+ summary: Host CPU high I/O wait at {{ $labels.instance }}.
description: |-
- CPU iowait > 10%. A high iowait means that you are disk or network
- bound.
+ CPU I/O wait is high. A high I/O wait means that you are disk or network bound.
- VALUE = {{ $value }}
- LABELS = {{ $labels }}
+ VALUE = {{ $value }}
+ LABELS = {{ $labels }}
- alert: HostUnusualDiskIo
expr: >-
@@ -282,12 +278,12 @@ groups:
labels:
severity: warning
annotations:
- summary: Host unusual disk IO (instance {{ $labels.instance }})
+ summary: Host unusual disk I/O at {{ $labels.instance }}.
description: |-
- Time spent in IO is too high on {{ $labels.instance }}. Check storage for issues.
+ Time spent over I/O is too high. Check storage for issues.
- VALUE = {{ $value }}
- LABELS = {{ $labels }}
+ VALUE = {{ $value }}
+ LABELS = {{ $labels }}
- alert: HostSwapIsFillingUp
expr: >-
@@ -298,12 +294,12 @@ groups:
labels:
severity: warning
annotations:
- summary: Host swap is filling up (instance {{ $labels.instance }})
+ summary: Host swap is filling up at {{ $labels.instance }}.
description: |-
- Swap is filling up (> 80%).
+ Swap is filling up.
- VALUE = {{ $value }}
- LABELS = {{ $labels }}
+ VALUE = {{ $value }}
+ LABELS = {{ $labels }}
- alert: HostSystemdServiceCrashed
expr: >-
@@ -312,12 +308,12 @@ groups:
labels:
severity: warning
annotations:
- summary: Host systemd service crashed (instance {{ $labels.instance }})
+ summary: Host systemd service crashed at {{ $labels.instance }}.
description: |-
Systemd service crashed.
- VALUE = {{ $value }}
- LABELS = {{ $labels }}
+ VALUE = {{ $value }}
+ LABELS = {{ $labels }}
- alert: HostPhysicalComponentTooHot
expr: >-
@@ -326,12 +322,12 @@ groups:
labels:
severity: warning
annotations:
- summary: Host physical component too hot (instance {{ $labels.instance }})
+ summary: Host physical component too hot at {{ $labels.instance }}.
description: |-
Physical hardware component too hot.
- VALUE = {{ $value }}
- LABELS = {{ $labels }}
+ VALUE = {{ $value }}
+ LABELS = {{ $labels }}
- alert: HostNodeOvertemperatureAlarm
expr: >-
@@ -340,12 +336,12 @@ groups:
labels:
severity: critical
annotations:
- summary: Host node overtemperature alarm (instance {{ $labels.instance }})
+ summary: Host node overtemperature alarm at {{ $labels.instance }}.
description: |-
Physical node temperature alarm triggered.
- VALUE = {{ $value }}
- LABELS = {{ $labels }}
+ VALUE = {{ $value }}
+ LABELS = {{ $labels }}
- alert: HostRaidArrayGotInactive
expr: >-
@@ -354,14 +350,12 @@ groups:
labels:
severity: critical
annotations:
- summary: Host RAID array got inactive (instance {{ $labels.instance }})
+ summary: Host RAID array got inactive at {{ $labels.instance }}.
description: |-
- RAID array {{ $labels.device }} is in degraded state due to one or
- more disks failures. Number of spare drives is insufficient to fix
- issue automatically.
+ RAID array is in a degraded state due to one or more disk failures. Number of spare drives is insufficient to fix the issue automatically.
- VALUE = {{ $value }}
- LABELS = {{ $labels }}
+ VALUE = {{ $value }}
+ LABELS = {{ $labels }}
- alert: HostRaidDiskFailure
expr: >-
@@ -370,14 +364,12 @@ groups:
labels:
severity: warning
annotations:
- summary: Host RAID disk failure (instance {{ $labels.instance }})
+ summary: Host RAID disk failure at {{ $labels.instance }}.
description: |-
- At least one device in RAID array on {{ $labels.instance }} failed.
- Array {{ $labels.md_device }} needs attention and possibly a disk
- swap.
+ At least one device in the RAID array has failed. A disk swap may be required.
- VALUE = {{ $value }}
- LABELS = {{ $labels }}
+ VALUE = {{ $value }}
+ LABELS = {{ $labels }}
- alert: HostOomKillDetected
expr: >-
@@ -386,12 +378,12 @@ groups:
labels:
severity: warning
annotations:
- summary: Host OOM kill detected (instance {{ $labels.instance }})
+ summary: Host OOM kill detected at {{ $labels.instance }}.
description: |-
OOM kill detected.
- VALUE = {{ $value }}
- LABELS = {{ $labels }}
+ VALUE = {{ $value }}
+ LABELS = {{ $labels }}
- alert: HostEdacCorrectableErrorsDetected
expr: >-
@@ -400,13 +392,12 @@ groups:
labels:
severity: info
annotations:
- summary: Host EDAC Correctable Errors detected (instance {{ $labels.instance }})
+ summary: Host EDAC correctable errors detected at {{ $labels.instance }}.
description: |-
- Host {{ $labels.instance }} has had {{ printf "%.0f" $value }}
- correctable memory errors reported by EDAC in the last 5 minutes.
+ Host has had {{ printf "%.0f" $value }} correctable memory errors reported by EDAC in the last 5 minutes.
- VALUE = {{ $value }}
- LABELS = {{ $labels }}
+ VALUE = {{ $value }}
+ LABELS = {{ $labels }}
- alert: HostEdacUncorrectableErrorsDetected
expr: >-
@@ -415,66 +406,67 @@ groups:
labels:
severity: warning
annotations:
- summary: Host EDAC Uncorrectable Errors detected (instance {{ $labels.instance }})
+ summary: Host EDAC uncorrectable errors detected at {{ $labels.instance }}.
description: |-
- Host {{ $labels.instance }} has had {{ printf "%.0f" $value }}
- uncorrectable memory errors reported by EDAC in the last 5
- minutes.
+ Host has had {{ printf "%.0f" $value }} uncorrectable memory errors reported by EDAC in the last 5 minutes.
- VALUE = {{ $value }}
- LABELS = {{ $labels }}
+ VALUE = {{ $value }}
+ LABELS = {{ $labels }}
- alert: HostNetworkReceiveErrors
- expr: "rate(node_network_receive_errs_total[2m]) / rate(node_network_receive_packets_total[2m]) > 0.01"
+ expr: >-
+ rate(node_network_receive_errs_total{device!~"^wg.*"}[2m])
+ /
+ rate(node_network_receive_packets_total{device!~"^wg.*"}[2m])
+ > 0.01
for: 2m
labels:
severity: warning
annotations:
- summary: Host Network Receive Errors (instance {{ $labels.instance }})
+ summary: Host network receive errors at {{ $labels.instance }}.
description: |-
- Host {{ $labels.instance }} interface {{ $labels.device }} has
- encountered {{ printf "%.0f" $value }} receive errors in the last
- two minutes.
+ Host interface {{ $labels.device }} has encountered {{ printf "%.0f" $value }} receive errors in the last two minutes.
- VALUE = {{ $value }}
- LABELS = {{ $labels }}
+ VALUE = {{ $value }}
+ LABELS = {{ $labels }}
- alert: HostNetworkTransmitErrors
- expr: "rate(node_network_transmit_errs_total[2m]) / rate(node_network_transmit_packets_total[2m]) > 0.01"
+ expr: >-
+ rate(node_network_transmit_errs_total{device!~"^wg.*"}[2m])
+ /
+ rate(node_network_transmit_packets_total{device!~"^wg.*"}[2m])
+ > 0.1
for: 2m
labels:
severity: warning
annotations:
- summary: Host Network Transmit Errors (instance {{ $labels.instance }})
+ summary: Host network transmit errors at {{ $labels.instance }}.
description: |-
- Host {{ $labels.instance }} interface {{ $labels.device }} has
- encountered {{ printf "%.0f" $value }} transmit errors in the last
- two minutes.
+ Host interface {{ $labels.device }} has encountered {{ printf "%.0f" $value }} transmit errors in the last two minutes.
- VALUE = {{ $value }}
- LABELS = {{ $labels }}
+ VALUE = {{ $value }}
+ LABELS = {{ $labels }}
- alert: HostNetworkInterfaceSaturated
expr: >-
(
- rate(node_network_receive_bytes_total{device!~"^tap.*|^vnet.*|^veth.*|^tun.*"}[1m])
+ rate(node_network_receive_bytes_total{device!~"^wg.*"}[1m])
+
- rate(node_network_transmit_bytes_total{device!~"^tap.*|^vnet.*|^veth.*|^tun.*"}[1m])
+ rate(node_network_transmit_bytes_total{device!~"^wg.*"}[1m])
)
- / node_network_speed_bytes{device!~"^tap.*|^vnet.*|^veth.*|^tun.*"}
+ / node_network_speed_bytes{device!~"^wg.*"}
> 0.8
< 10000
for: 1m
labels:
severity: warning
annotations:
- summary: Host Network Interface Saturated (instance {{ $labels.instance }})
+ summary: Host network interface saturated at {{ $labels.instance }}.
description: |-
- The network interface "{{ $labels.device }}" on "{{ $labels.instance }}"
- is getting overloaded.
+ The network interface {{ $labels.device }} is getting overloaded.
- VALUE = {{ $value }}
- LABELS = {{ $labels }}
+ VALUE = {{ $value }}
+ LABELS = {{ $labels }}
- alert: HostNetworkBondDegraded
expr: >-
@@ -483,43 +475,53 @@ groups:
labels:
severity: warning
annotations:
- summary: Host Network Bond Degraded (instance {{ $labels.instance }})
+ summary: Host network bond degraded at {{ $labels.instance }}.
description: |-
- Bond "{{ $labels.device }}" degraded on "{{ $labels.instance }}".
+ Bond {{ $labels.device }} degraded.
- VALUE = {{ $value }}
- LABELS = {{ $labels }}
+ VALUE = {{ $value }}
+ LABELS = {{ $labels }}
- alert: HostConntrackLimit
expr: >-
- node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 0.8
+ node_nf_conntrack_entries
+ /
+ node_nf_conntrack_entries_limit
+ > 0.8
for: 5m
labels:
severity: warning
annotations:
- summary: Host conntrack limit (instance {{ $labels.instance }})
+ summary: Host conntrack limit at {{ $labels.instance }}.
description: |-
The number of conntrack is approaching limit.
- VALUE = {{ $value }}
- LABELS = {{ $labels }}
+ VALUE = {{ $value }}
+ LABELS = {{ $labels }}
- alert: HostClockSkew
expr: >-
- (node_timex_offset_seconds > 0.05 and deriv(node_timex_offset_seconds[5m]) >= 0)
+ (
+ node_timex_offset_seconds > 0.05
+ and
+ deriv(node_timex_offset_seconds[5m]) >= 0
+ )
or
- (node_timex_offset_seconds < -0.05 and deriv(node_timex_offset_seconds[5m]) <= 0)
+ (
+ node_timex_offset_seconds < -0.05
+ and
+ deriv(node_timex_offset_seconds[5m]) <= 0
+ )
for: 2m
labels:
severity: warning
annotations:
- summary: Host clock skew (instance {{ $labels.instance }})
+ summary: Host clock skew at {{ $labels.instance }}.
description: |-
- Clock skew detected. Clock is out of sync. Ensure NTP is configured
- correctly on this host.
+ Clock skew is detected and the clock is out of sync. Ensure that NTP is configured correctly on this host.
- VALUE = {{ $value }}
- LABELS = {{ $labels }}
+ VALUE = {{ $value }}
+ LABELS = {{ $labels }}
- alert: HostClockNotSynchronising
expr: >-
@@ -530,12 +532,12 @@ groups:
labels:
severity: warning
annotations:
- summary: Host clock not synchronising (instance {{ $labels.instance }})
+ summary: Host clock not synchronising at {{ $labels.instance }}.
description: |-
- Clock not synchronising. Ensure NTP is configured on this host.
+ Clock is not synchronising. Ensure that NTP is configured correctly on this host.
- VALUE = {{ $value }}
- LABELS = {{ $labels }}
+ VALUE = {{ $value }}
+ LABELS = {{ $labels }}
- alert: HostRequiresReboot
expr: >-
@@ -544,9 +546,9 @@ groups:
labels:
severity: info
annotations:
- summary: Host requires reboot (instance {{ $labels.instance }})
+ summary: Host requires reboot at {{ $labels.instance }}.
description: |-
- Instance {{ $labels.instance }} requires a reboot.
+ Instance requires a reboot.
- VALUE = {{ $value }}
- LABELS = {{ $labels }}
+ VALUE = {{ $value }}
+ LABELS = {{ $labels }}
diff --git a/modules/nixos/monitoring/rules/postgres.yaml b/modules/nixos/monitoring/rules/postgres.yaml
index 5d360fa..6aee560 100644
--- a/modules/nixos/monitoring/rules/postgres.yaml
+++ b/modules/nixos/monitoring/rules/postgres.yaml
@@ -10,12 +10,12 @@ groups:
labels:
severity: critical
annotations:
- summary: PostgreSQL is down (instance {{ $labels.instance }})
+ summary: PostgreSQL is down at {{ $labels.instance }}.
description: |-
- Postgresql instance is down.
+ PostgreSQL instance is down.
- VALUE = {{ $value }}
- LABELS = {{ $labels }}
+ VALUE = {{ $value }}
+ LABELS = {{ $labels }}
- alert: PostgresqlRestarted
expr: >-
@@ -24,12 +24,12 @@ groups:
labels:
severity: critical
annotations:
- summary: PostgreSQL restarted (instance {{ $labels.instance }})
+ summary: PostgreSQL restarted at {{ $labels.instance }}.
description: |-
PostgreSQL restarted.
- VALUE = {{ $value }}
- LABELS = {{ $labels }}
+ VALUE = {{ $value }}
+ LABELS = {{ $labels }}
- alert: PostgresqlExporterError
expr: >-
@@ -38,12 +38,12 @@ groups:
labels:
severity: critical
annotations:
- summary: PostgreSQL exporter error (instance {{ $labels.instance }})
+ summary: PostgreSQL exporter errors at {{ $labels.instance }}.
description: |-
- PostgreSQL exporter is showing errors. A query may be buggy in query.yaml.
+ PostgreSQL exporter is showing errors.
- VALUE = {{ $value }}
- LABELS = {{ $labels }}
+ VALUE = {{ $value }}
+ LABELS = {{ $labels }}
- alert: PostgresqlTableNotAutoVacuumed
expr: >-
@@ -55,12 +55,12 @@ groups:
labels:
severity: warning
annotations:
- summary: PostgreSQL table not auto vacuumed (instance {{ $labels.instance }})
+ summary: PostgreSQL table not auto vacuumed at {{ $labels.instance }}.
description: |-
Table {{ $labels.relname }} has not been auto vacuumed for 10 days.
- VALUE = {{ $value }}
- LABELS = {{ $labels }}
+ VALUE = {{ $value }}
+ LABELS = {{ $labels }}
- alert: PostgresqlTableNotAutoAnalyzed
expr: >-
@@ -72,57 +72,60 @@ groups:
labels:
severity: warning
annotations:
- summary: PostgreSQL table not auto analyzed (instance {{ $labels.instance }})
+ summary: PostgreSQL table not auto analyzed at {{ $labels.instance }}.
description: |-
Table {{ $labels.relname }} has not been auto analyzed for 10 days.
- VALUE = {{ $value }}
- LABELS = {{ $labels }}"
+ VALUE = {{ $value }}
+ LABELS = {{ $labels }}
- alert: PostgresqlTooManyConnections
expr: >-
- sum by (datname) (pg_stat_activity_count{datname!~"template.*|postgres"})
- > pg_settings_max_connections * 0.8
+ sum by (datname) (
+ pg_stat_activity_count{datname!~"template.*|postgres"}
+ ) > pg_settings_max_connections * 0.8
for: 2m
labels:
severity: warning
annotations:
- summary: Postgresql too many connections (instance {{ $labels.instance }})
+ summary: PostgreSQL with too many connections at {{ $labels.instance }}.
description: |-
- PostgreSQL instance has too many connections (> 80%).
+ PostgreSQL instance {{ $labels.instance }} has too many connections.
- VALUE = {{ $value }}
- LABELS = {{ $labels }}
+ VALUE = {{ $value }}
+ LABELS = {{ $labels }}
- alert: PostgresqlNotEnoughConnections
expr: >-
- sum by (datname) (pg_stat_activity_count{datname!~"template.*|postgres"})
- < 1
+ sum by (datname) (
+ pg_stat_activity_count{datname!~"template.*|postgres"}
+ ) < 1
for: 2m
labels:
severity: warning
annotations:
- summary: Postgresql not enough connections (instance {{ $labels.instance }})
+ summary: PostgreSQL with not enough connections at {{ $labels.instance }}.
description: |-
- PostgreSQL instance should have more connections (> 1).
+ PostgreSQL instance {{ $labels.instance }} should have more connections.
- VALUE = {{ $value }}
- LABELS = {{ $labels }}
+ VALUE = {{ $value }}
+ LABELS = {{ $labels }}
- alert: PostgresqlDeadLocks
expr: >-
- increase(pg_stat_database_deadlocks{datname!~"template.*|postgres"}[1m])
- > 5
+ increase(
+ pg_stat_database_deadlocks{datname!~"template.*|postgres"}[1m]
+ ) > 5
for: 0m
labels:
severity: warning
annotations:
- summary: Postgresql dead locks (instance {{ $labels.instance }})
+ summary: PostgreSQL dead-locks at instance {{ $labels.instance }}.
description: |-
- PostgreSQL has dead-locks.
+ PostgreSQL shows dead-locks.
- VALUE = {{ $value }}
- LABELS = {{ $labels }}
+ VALUE = {{ $value }}
+ LABELS = {{ $labels }}
- alert: PostgresqlHighRollbackRate
expr: >-
@@ -136,17 +139,17 @@ groups:
(rate(pg_stat_database_xact_commit{datname!~"template.*|postgres",datid!="0"}[3m]))
)
)
- > 0.02
+ > 0.10
for: 0m
labels:
severity: warning
annotations:
- summary: PostgreSQL is at a high rollback rate (instance {{ $labels.instance }})
+ summary: PostgreSQL at a high rollback rate at {{ $labels.instance }}.
description: |-
- Ratio of transactions being aborted compared to committed is > 2%.
+ Ratio of transactions being aborted compared to committed is too big.
- VALUE = {{ $value }}
- LABELS = {{ $labels }}
+ VALUE = {{ $value }}
+ LABELS = {{ $labels }}
- alert: PostgresqlCommitRateLow
expr: >-
@@ -156,12 +159,12 @@ groups:
labels:
severity: critical
annotations:
- summary: PostgreSQL commit rate low (instance {{ $labels.instance }})
+ summary: PostgreSQL commit rate low at instance {{ $labels.instance }}.
description: |-
PostgreSQL seems to be processing very few transactions.
- VALUE = {{ $value }}
- LABELS = {{ $labels }}
+ VALUE = {{ $value }}
+ LABELS = {{ $labels }}
- alert: PostgresqlLowXidConsumption
expr: >-
@@ -171,12 +174,12 @@ groups:
labels:
severity: warning
annotations:
- summary: PostgreSQL low XID consumption (instance {{ $labels.instance }})
+ summary: PostgreSQL low XID consumption at instance {{ $labels.instance }}.
description: |-
PostgreSQL seems to be consuming transaction IDs very slowly.
- VALUE = {{ $value }}
- LABELS = {{ $labels }}
+ VALUE = {{ $value }}
+ LABELS = {{ $labels }}
- alert: PostgresqlHighRateStatementTimeout
expr: >-
@@ -190,8 +193,8 @@ groups:
description: |-
PostgreSQL transactions showing high rate of statement timeouts.
- VALUE = {{ $value }}
- LABELS = {{ $labels }}
+ VALUE = {{ $value }}
+ LABELS = {{ $labels }}
- alert: PostgresqlHighRateDeadlock
expr: >-
@@ -201,12 +204,12 @@ groups:
labels:
severity: critical
annotations:
- summary: PostgreSQL high rate deadlock (instance {{ $labels.instance }})
+ summary: PostgreSQL high rate dead-lock at {{ $labels.instance }}.
description: |-
- PostgreSQL detected deadlocks.
+ PostgreSQL has detected dead-locks.
- VALUE = {{ $value }}
- LABELS = {{ $labels }}
+ VALUE = {{ $value }}
+ LABELS = {{ $labels }}
- alert: PostgresqlUnusedReplicationSlot
expr: >-
@@ -215,12 +218,12 @@ groups:
labels:
severity: warning
annotations:
- summary: PostgreSQL unused replication slot (instance {{ $labels.instance }})
+ summary: PostgreSQL unused replication slot at {{ $labels.instance }}.
description: |-
- Unused Replication Slots.
+ Unused replication slots.
- VALUE = {{ $value }}
- LABELS = {{ $labels }}
+ VALUE = {{ $value }}
+ LABELS = {{ $labels }}
- alert: PostgresqlTooManyDeadTuples
expr: >-
@@ -234,12 +237,12 @@ groups:
labels:
severity: warning
annotations:
- summary: PostgreSQL too many dead tuples (instance {{ $labels.instance }})
+ summary: PostgreSQL too many dead tuples at {{ $labels.instance }}.
description: |-
PostgreSQL number of dead tuples is too large.
- VALUE = {{ $value }}
- LABELS = {{ $labels }}
+ VALUE = {{ $value }}
+ LABELS = {{ $labels }}
- alert: PostgresqlSslCompressionActive
expr: >-
@@ -248,13 +251,12 @@ groups:
labels:
severity: critical
annotations:
- summary: Postgresql SSL compression active (instance {{ $labels.instance }})
+ summary: PostgreSQL SSL compression active at {{ $labels.instance }}.
description: |-
- Database connections with SSL compression is enabled. This may add a
- significant jitter in the replication delay.
+ SSL compression is enabled for database connections. This may add a significant jitter in the replication delay.
- VALUE = {{ $value }}
- LABELS = {{ $labels }}
+ VALUE = {{ $value }}
+ LABELS = {{ $labels }}
- alert: PostgresqlTooManyLocksAcquired
expr: >-
@@ -268,12 +270,12 @@ groups:
labels:
severity: critical
annotations:
- summary: PostgreSQL too many locks acquired (instance {{ $labels.instance }})
+ summary: PostgreSQL too many locks acquired at {{ $labels.instance }}.
description: |-
Too many locks acquired on the database.
- VALUE = {{ $value }}
- LABELS = {{ $labels }}
+ VALUE = {{ $value }}
+ LABELS = {{ $labels }}
- alert: PostgresqlBloatIndexHigh
expr: >-
@@ -284,13 +286,12 @@ groups:
labels:
severity: warning
annotations:
- summary: PostgreSQL bloat index high (> 80%) (instance {{ $labels.instance }})
+ summary: PostgreSQL index bloat high at {{ $labels.instance }}.
description: |-
- The index {{ $labels.idxname }} is bloated. You should execute
- `REINDEX INDEX CONCURRENTLY {{ $labels.idxname }};`
+ The index {{ $labels.idxname }} is bloated. You should execute `REINDEX INDEX CONCURRENTLY {{ $labels.idxname }};`.
- VALUE = {{ $value }}
- LABELS = {{ $labels }}
+ VALUE = {{ $value }}
+ LABELS = {{ $labels }}
- alert: PostgresqlBloatTableHigh
expr: >-
@@ -301,10 +302,9 @@ groups:
labels:
severity: warning
annotations:
- summary: PostgreSQL bloat table high (> 80%) (instance {{ $labels.instance }})
+ summary: PostgreSQL table bloat high at instance {{ $labels.instance }}.
description: |-
- The table {{ $labels.relname }} is bloated. You should execute
- `VACUUM {{ $labels.relname }};`
+ The table {{ $labels.relname }} is bloated. You should execute `VACUUM {{ $labels.relname }};`.
- VALUE = {{ $value }}
- LABELS = {{ $labels }}
+ VALUE = {{ $value }}
+ LABELS = {{ $labels }}
diff --git a/modules/nixos/monitoring/rules/redis.yaml b/modules/nixos/monitoring/rules/redis.yaml
index f6d1fe1..c07c819 100644
--- a/modules/nixos/monitoring/rules/redis.yaml
+++ b/modules/nixos/monitoring/rules/redis.yaml
@@ -10,12 +10,12 @@ groups:
labels:
severity: critical
annotations:
- summary: Redis down (instance {{ $labels.instance }})
+ summary: Redis down at {{ $labels.instance }}.
description: |-
Redis instance is down.
- VALUE = {{ $value }}
- LABELS = {{ $labels }}
+ VALUE = {{ $value }}
+ LABELS = {{ $labels }}
- alert: RedisMissingMaster
expr: >-
@@ -25,12 +25,12 @@ groups:
labels:
severity: critical
annotations:
- summary: Redis missing master (instance {{ $labels.instance }})
+ summary: Redis missing master at {{ $labels.instance }}.
description: |-
- Redis cluster has no node marked as master.
+ Redis cluster has no node marked as a master.
- VALUE = {{ $value }}
- LABELS = {{ $labels }}
+ VALUE = {{ $value }}
+ LABELS = {{ $labels }}
- alert: RedisTooManyMasters
expr: >-
@@ -39,12 +39,12 @@ groups:
labels:
severity: critical
annotations:
- summary: Redis too many masters (instance {{ $labels.instance }})
+ summary: Redis too many masters at {{ $labels.instance }}.
description: |-
- Redis cluster has too many nodes marked as master.
+ Redis cluster has too many nodes marked as a master.
- VALUE = {{ $value }}
- LABELS = {{ $labels }}
+ VALUE = {{ $value }}
+ LABELS = {{ $labels }}
- alert: RedisDisconnectedSlaves
expr: >-
@@ -56,12 +56,12 @@ groups:
labels:
severity: critical
annotations:
- summary: Redis disconnected slaves (instance {{ $labels.instance }})
+ summary: Redis disconnected slaves at {{ $labels.instance }}.
description: |-
Redis is not replicating for all slaves.
- VALUE = {{ $value }}
- LABELS = {{ $labels }}
+ VALUE = {{ $value }}
+ LABELS = {{ $labels }}
- alert: RedisReplicationBroken
expr: >-
@@ -70,12 +70,12 @@ groups:
labels:
severity: critical
annotations:
- summary: Redis replication broken (instance {{ $labels.instance }})
+ summary: Redis replication broken at {{ $labels.instance }}.
description: |-
Redis instance lost a slave.
- VALUE = {{ $value }}
- LABELS = {{ $labels }}
+ VALUE = {{ $value }}
+ LABELS = {{ $labels }}
- alert: RedisClusterFlapping
expr: >-
@@ -84,14 +84,12 @@ groups:
labels:
severity: critical
annotations:
- summary: Redis cluster flapping (instance {{ $labels.instance }})
+ summary: Redis cluster flapping at {{ $labels.instance }}.
description: |-
- Changes have been detected in the Redis replica connection. This can
- occur when replica nodes lose connection to the master and reconnect
- (a.k.a flapping).
+ Changes have been detected in the Redis replica connection. This can occur when replica nodes lose connection to the master and reconnect (a.k.a flapping).
- VALUE = {{ $value }}
- LABELS = {{ $labels }}
+ VALUE = {{ $value }}
+ LABELS = {{ $labels }}
- alert: RedisMissingBackup
expr: >-
@@ -101,12 +99,12 @@ groups:
labels:
severity: critical
annotations:
- summary: Redis missing backup (instance {{ $labels.instance }})
+ summary: Redis missing backup at {{ $labels.instance }}.
description: |-
Redis has not been backed up for 24 hours.
- VALUE = {{ $value }}
- LABELS = {{ $labels }}
+ VALUE = {{ $value }}
+ LABELS = {{ $labels }}
- alert: RedisOutOfSystemMemory
expr: >-
@@ -118,12 +116,12 @@ groups:
labels:
severity: warning
annotations:
- summary: Redis out of system memory (instance {{ $labels.instance }})
+ summary: Redis out of system memory at {{ $labels.instance }}.
description: |-
- Redis is running out of system memory (> 90%).
+ Redis is running out of system memory.
- VALUE = {{ $value }}
- LABELS = {{ $labels }}
+ VALUE = {{ $value }}
+ LABELS = {{ $labels }}
- alert: RedisOutOfConfiguredMaxmemory
expr: >-
@@ -139,12 +137,12 @@ groups:
labels:
severity: warning
annotations:
- summary: Redis out of configured maxmemory (instance {{ $labels.instance }})
+ summary: Redis out of configured maxmemory at {{ $labels.instance }}.
description: |-
- Redis is running out of configured maxmemory (> 90%).
+ Redis is running out of configured maxmemory.
- VALUE = {{ $value }}
- LABELS = {{ $labels }}
+ VALUE = {{ $value }}
+ LABELS = {{ $labels }}
- alert: RedisTooManyConnections
expr: >-
@@ -153,12 +151,12 @@ groups:
labels:
severity: warning
annotations:
- summary: Redis too many connections (instance {{ $labels.instance }})
+ summary: Redis too many connections at {{ $labels.instance }}.
description: |-
Redis instance has too many connections.
- VALUE = {{ $value }}
- LABELS = {{ $labels }}
+ VALUE = {{ $value }}
+ LABELS = {{ $labels }}
- alert: RedisNotEnoughConnections
expr: >-
@@ -167,12 +165,12 @@ groups:
labels:
severity: warning
annotations:
- summary: Redis not enough connections (instance {{ $labels.instance }})
+ summary: Redis not enough connections at {{ $labels.instance }}.
description: |-
- Redis instance should have more connections (> 1).
+ Redis instance should have more connections.
- VALUE = {{ $value }}
- LABELS = {{ $labels }}
+ VALUE = {{ $value }}
+ LABELS = {{ $labels }}
- alert: RedisRejectedConnections
expr: >-
@@ -181,9 +179,9 @@ groups:
labels:
severity: critical
annotations:
- summary: Redis rejected connections (instance {{ $labels.instance }})
+ summary: Redis rejected connections at {{ $labels.instance }}.
description: |-
Some connections to Redis have been rejected.
- VALUE = {{ $value }}
- LABELS = {{ $labels }}
+ VALUE = {{ $value }}
+ LABELS = {{ $labels }}