summaryrefslogtreecommitdiff
path: root/modules/nixos/monitoring/rules/node.yaml
diff options
context:
space:
mode:
author Azat Bahawi <azat@bahawi.net> 2023-04-12 04:01:46 +0300
committer Azat Bahawi <azat@bahawi.net> 2023-04-12 04:01:46 +0300
commit d6368c86bc949371e904eed3d0a6583ebd53b055 (patch)
tree 042db513412ba7f1577b1ac690d4e0e0fac22cbf /modules/nixos/monitoring/rules/node.yaml
parent dae3149a93cab4d1140526e15eb928d275f56128 (diff)
2023-04-12
Diffstat (limited to 'modules/nixos/monitoring/rules/node.yaml')
-rw-r--r-- modules/nixos/monitoring/rules/node.yaml 304
1 files changed, 153 insertions, 151 deletions
diff --git a/modules/nixos/monitoring/rules/node.yaml b/modules/nixos/monitoring/rules/node.yaml
index 81d7810..98217b3 100644
--- a/modules/nixos/monitoring/rules/node.yaml
+++ b/modules/nixos/monitoring/rules/node.yaml
@@ -13,12 +13,12 @@ groups:
labels:
severity: warning
annotations:
- summary: Host out of memory (instance {{ $labels.instance }})
+ summary: Host out of memory at {{ $labels.instance }}.
description: |-
- Node memory is filling up (< 10% left).
+ Node memory is filling up.
- VALUE = {{ $value }}
- LABELS = {{ $labels }}
+ VALUE = {{ $value }}
+ LABELS = {{ $labels }}
- alert: HostMemoryUnderMemoryPressure
expr: >-
@@ -27,13 +27,12 @@ groups:
labels:
severity: warning
annotations:
- summary: Host memory under memory pressure (instance {{ $labels.instance }})
+ summary: Host memory under memory pressure at {{ $labels.instance }}.
description: |-
- The node is under heavy memory pressure. High rate of major page
- faults.
+ The node is under heavy memory pressure. High rate of major page faults.
- VALUE = {{ $value }}
- LABELS = {{ $labels }}
+ VALUE = {{ $value }}
+ LABELS = {{ $labels }}
- alert: HostUnusualNetworkThroughputIn
expr: >-
@@ -47,10 +46,10 @@ groups:
annotations:
summary: Host unusual network throughput in (instance {{ $labels.instance }})
description: |-
- Host network interfaces are probably receiving too much data (> 100 MB/s).
+ Host network interfaces are probably receiving too much data.
- VALUE = {{ $value }}
- LABELS = {{ $labels }}
+ VALUE = {{ $value }}
+ LABELS = {{ $labels }}
- alert: HostUnusualNetworkThroughputOut
expr: >-
@@ -62,12 +61,12 @@ groups:
labels:
severity: warning
annotations:
- summary: Host unusual network throughput out (instance {{ $labels.instance }})
+ summary: Host unusual network throughput out at {{ $labels.instance }}.
description: |-
- Host network interfaces are probably sending too much data (> 100 MB/s).
+ Host network interfaces are probably sending too much data.
- VALUE = {{ $value }}
- LABELS = {{ $labels }}
+ VALUE = {{ $value }}
+ LABELS = {{ $labels }}
- alert: HostUnusualDiskReadRate
expr: >-
@@ -79,12 +78,12 @@ groups:
labels:
severity: warning
annotations:
- summary: Host unusual disk read rate (instance {{ $labels.instance }})
+ summary: Host unusual disk read rate at {{ $labels.instance }}.
description: |-
- Disk is probably reading too much data (> 50 MB/s).
+ Disk is probably reading too much data.
- VALUE = {{ $value }}
- LABELS = {{ $labels }}
+ VALUE = {{ $value }}
+ LABELS = {{ $labels }}
- alert: HostUnusualDiskWriteRate
expr: >-
@@ -96,12 +95,12 @@ groups:
labels:
severity: warning
annotations:
- summary: Host unusual disk write rate (instance {{ $labels.instance }})
+ summary: Host unusual disk write rate at {{ $labels.instance }}.
description: |-
- Disk is probably writing too much data (> 50 MB/s).
+ Disk is probably writing too much data.
- VALUE = {{ $value }}
- LABELS = {{ $labels }}
+ VALUE = {{ $value }}
+ LABELS = {{ $labels }}
- alert: HostOutOfDiskSpace
expr: >-
@@ -109,18 +108,18 @@ groups:
/ node_filesystem_size_bytes
< 10
and
- ON (instance, device, mountpoint) node_filesystem_readonly
+ on (instance, device, mountpoint) node_filesystem_readonly
== 0
for: 2m
labels:
severity: warning
annotations:
- summary: Host out of disk space (instance {{ $labels.instance }})
+ summary: Host out of disk space at {{ $labels.instance }}.
description: |-
- Disk is almost full (< 10% left).
+ Disk is almost full.
- VALUE = {{ $value }}
- LABELS = {{ $labels }}
+ VALUE = {{ $value }}
+ LABELS = {{ $labels }}
- alert: HostDiskWillFillIn24Hours
expr: >-
@@ -136,13 +135,12 @@ groups:
labels:
severity: warning
annotations:
- summary: Host disk will fill in 24 hours (instance {{ $labels.instance }})
+ summary: Host disk will fill in 24 hours at {{ $labels.instance }}.
description: |-
- Filesystem is predicted to run out of space within the next 24 hours
- at current write rate.
+ Filesystem is predicted to run out of space within the next 24 hours at current write rate.
- VALUE = {{ $value }}
- LABELS = {{ $labels }}
+ VALUE = {{ $value }}
+ LABELS = {{ $labels }}
- alert: HostOutOfInodes
expr: >-
@@ -156,12 +154,12 @@ groups:
labels:
severity: warning
annotations:
- summary: Host out of inodes (instance {{ $labels.instance }})
+ summary: Host out of inodes at {{ $labels.instance }}.
description: |-
- Disk is almost running out of available inodes (< 10% left).
+ Disk is almost running out of available inodes.
- VALUE = {{ $value }}
- LABELS = {{ $labels }}
+ VALUE = {{ $value }}
+ LABELS = {{ $labels }}
- alert: HostInodesWillFillIn24Hours
expr: >-
@@ -178,13 +176,12 @@ groups:
labels:
severity: warning
annotations:
- summary: Host inodes will fill in 24 hours (instance {{ $labels.instance }})
+ summary: Host inodes will fill in 24 hours at {{ $labels.instance }}.
description: |-
- Filesystem is predicted to run out of inodes within the next 24
- hours at current write rate.
+ Filesystem is predicted to run out of inodes within the next 24 hours at current write rate.
- VALUE = {{ $value }}
- LABELS = {{ $labels }}
+ VALUE = {{ $value }}
+ LABELS = {{ $labels }}
- alert: HostUnusualDiskReadLatency
expr: >-
@@ -198,12 +195,12 @@ groups:
labels:
severity: warning
annotations:
- summary: Host unusual disk read latency (instance {{ $labels.instance }})
+ summary: Host unusual disk read latency at {{ $labels.instance }}.
description: |-
- Disk latency is growing (read operations > 100ms).
+ Disk latency is growing.
- VALUE = {{ $value }}
- LABELS = {{ $labels }}
+ VALUE = {{ $value }}
+ LABELS = {{ $labels }}
- alert: HostUnusualDiskWriteLatency
expr: >-
@@ -217,12 +214,12 @@ groups:
labels:
severity: warning
annotations:
- summary: Host unusual disk write latency (instance {{ $labels.instance }})
+ summary: Host unusual disk write latency at {{ $labels.instance }}.
description: |-
- Disk latency is growing (write operations > 100ms).
+ Disk latency is growing.
- VALUE = {{ $value }}
- LABELS = {{ $labels }}
+ VALUE = {{ $value }}
+ LABELS = {{ $labels }}
- alert: HostHighCpuLoad
expr: >-
@@ -234,23 +231,23 @@ groups:
labels:
severity: warning
annotations:
- summary: Host high CPU load (instance {{ $labels.instance }})
+ summary: Host high CPU load at {{ $labels.instance }}.
description: |-
- CPU load is > 80%.
+ CPU load is high.
- VALUE = {{ $value }}
- LABELS = {{ $labels }}
+ VALUE = {{ $value }}
+ LABELS = {{ $labels }}
- alert: HostCpuStealNoisyNeighbor
expr: >-
avg by(instance) (rate(node_cpu_seconds_total{mode="steal"}[5m]))
* 100
- > 10
+ > 15
for: 0m
labels:
severity: warning
annotations:
- summary: Host CPU steal noisy neighbor (instance {{ $labels.instance }})
+ summary: Host CPU steal noisy neighbor at {{ $labels.instance }}.
description: |-
CPU steal is > 10%. A noisy neighbor is killing VM performances or a
spot instance may be out of credit.
@@ -262,18 +259,17 @@ groups:
expr: |-
avg by (instance) (rate(node_cpu_seconds_total{mode="iowait"}[5m]))
* 100
- > 10
+ > 15
for: 0m
labels:
severity: warning
annotations:
- summary: Host CPU high iowait (instance {{ $labels.instance }})
+ summary: Host CPU high I/O wait at {{ $labels.instance }}.
description: |-
- CPU iowait > 10%. A high iowait means that you are disk or network
- bound.
+ CPU I/O wait is high. A high I/O wait means that you are disk or network bound.
- VALUE = {{ $value }}
- LABELS = {{ $labels }}
+ VALUE = {{ $value }}
+ LABELS = {{ $labels }}
- alert: HostUnusualDiskIo
expr: >-
@@ -282,12 +278,12 @@ groups:
labels:
severity: warning
annotations:
- summary: Host unusual disk IO (instance {{ $labels.instance }})
+ summary: Host unusual disk I/O at {{ $labels.instance }}.
description: |-
- Time spent in IO is too high on {{ $labels.instance }}. Check storage for issues.
+ Time spent over I/O is too high. Check storage for issues.
- VALUE = {{ $value }}
- LABELS = {{ $labels }}
+ VALUE = {{ $value }}
+ LABELS = {{ $labels }}
- alert: HostSwapIsFillingUp
expr: >-
@@ -298,12 +294,12 @@ groups:
labels:
severity: warning
annotations:
- summary: Host swap is filling up (instance {{ $labels.instance }})
+ summary: Host swap is filling up at {{ $labels.instance }}.
description: |-
- Swap is filling up (> 80%).
+ Swap is filling up.
- VALUE = {{ $value }}
- LABELS = {{ $labels }}
+ VALUE = {{ $value }}
+ LABELS = {{ $labels }}
- alert: HostSystemdServiceCrashed
expr: >-
@@ -312,12 +308,12 @@ groups:
labels:
severity: warning
annotations:
- summary: Host systemd service crashed (instance {{ $labels.instance }})
+ summary: Host systemd service crashed at {{ $labels.instance }}.
description: |-
Systemd service crashed.
- VALUE = {{ $value }}
- LABELS = {{ $labels }}
+ VALUE = {{ $value }}
+ LABELS = {{ $labels }}
- alert: HostPhysicalComponentTooHot
expr: >-
@@ -326,12 +322,12 @@ groups:
labels:
severity: warning
annotations:
- summary: Host physical component too hot (instance {{ $labels.instance }})
+ summary: Host physical component too hot at {{ $labels.instance }}.
description: |-
Physical hardware component too hot.
- VALUE = {{ $value }}
- LABELS = {{ $labels }}
+ VALUE = {{ $value }}
+ LABELS = {{ $labels }}
- alert: HostNodeOvertemperatureAlarm
expr: >-
@@ -340,12 +336,12 @@ groups:
labels:
severity: critical
annotations:
- summary: Host node overtemperature alarm (instance {{ $labels.instance }})
+ summary: Host node overtemperature alarm at {{ $labels.instance }}.
description: |-
Physical node temperature alarm triggered.
- VALUE = {{ $value }}
- LABELS = {{ $labels }}
+ VALUE = {{ $value }}
+ LABELS = {{ $labels }}
- alert: HostRaidArrayGotInactive
expr: >-
@@ -354,14 +350,12 @@ groups:
labels:
severity: critical
annotations:
- summary: Host RAID array got inactive (instance {{ $labels.instance }})
+ summary: Host RAID array got inactive at {{ $labels.instance }}.
description: |-
- RAID array {{ $labels.device }} is in degraded state due to one or
- more disks failures. Number of spare drives is insufficient to fix
- issue automatically.
+ RAID array is in a degraded state due to one or more disk failures. The number of spare drives is insufficient to fix the issue automatically.
- VALUE = {{ $value }}
- LABELS = {{ $labels }}
+ VALUE = {{ $value }}
+ LABELS = {{ $labels }}
- alert: HostRaidDiskFailure
expr: >-
@@ -370,14 +364,12 @@ groups:
labels:
severity: warning
annotations:
- summary: Host RAID disk failure (instance {{ $labels.instance }})
+ summary: Host RAID disk failure at {{ $labels.instance }}.
description: |-
- At least one device in RAID array on {{ $labels.instance }} failed.
- Array {{ $labels.md_device }} needs attention and possibly a disk
- swap.
+ At least one device in the RAID array has failed. A disk swap may be required.
- VALUE = {{ $value }}
- LABELS = {{ $labels }}
+ VALUE = {{ $value }}
+ LABELS = {{ $labels }}
- alert: HostOomKillDetected
expr: >-
@@ -386,12 +378,12 @@ groups:
labels:
severity: warning
annotations:
- summary: Host OOM kill detected (instance {{ $labels.instance }})
+ summary: Host OOM kill detected at {{ $labels.instance }}.
description: |-
OOM kill detected.
- VALUE = {{ $value }}
- LABELS = {{ $labels }}
+ VALUE = {{ $value }}
+ LABELS = {{ $labels }}
- alert: HostEdacCorrectableErrorsDetected
expr: >-
@@ -400,13 +392,12 @@ groups:
labels:
severity: info
annotations:
- summary: Host EDAC Correctable Errors detected (instance {{ $labels.instance }})
+ summary: Host EDAC correctable errors detected at {{ $labels.instance }}.
description: |-
- Host {{ $labels.instance }} has had {{ printf "%.0f" $value }}
- correctable memory errors reported by EDAC in the last 5 minutes.
+ Host has had {{ printf "%.0f" $value }} correctable memory errors reported by EDAC in the last 5 minutes.
- VALUE = {{ $value }}
- LABELS = {{ $labels }}
+ VALUE = {{ $value }}
+ LABELS = {{ $labels }}
- alert: HostEdacUncorrectableErrorsDetected
expr: >-
@@ -415,66 +406,67 @@ groups:
labels:
severity: warning
annotations:
- summary: Host EDAC Uncorrectable Errors detected (instance {{ $labels.instance }})
+ summary: Host EDAC uncorrectable errors detected at {{ $labels.instance }}.
description: |-
- Host {{ $labels.instance }} has had {{ printf "%.0f" $value }}
- uncorrectable memory errors reported by EDAC in the last 5
- minutes.
+ Host has had {{ printf "%.0f" $value }} uncorrectable memory errors reported by EDAC in the last 5 minutes.
- VALUE = {{ $value }}
- LABELS = {{ $labels }}
+ VALUE = {{ $value }}
+ LABELS = {{ $labels }}
- alert: HostNetworkReceiveErrors
- expr: "rate(node_network_receive_errs_total[2m]) / rate(node_network_receive_packets_total[2m]) > 0.01"
+ expr: >-
+ rate(node_network_receive_errs_total{device!~"^wg.*"}[2m])
+ /
+ rate(node_network_receive_packets_total{device!~"^wg.*"}[2m])
+ > 0.01
for: 2m
labels:
severity: warning
annotations:
- summary: Host Network Receive Errors (instance {{ $labels.instance }})
+ summary: Host network receive errors at {{ $labels.instance }}.
description: |-
- Host {{ $labels.instance }} interface {{ $labels.device }} has
- encountered {{ printf "%.0f" $value }} receive errors in the last
- two minutes.
+ Host interface {{ $labels.device }} has a receive error ratio of {{ $value | humanizePercentage }} over the last two minutes.
- VALUE = {{ $value }}
- LABELS = {{ $labels }}
+ VALUE = {{ $value }}
+ LABELS = {{ $labels }}
- alert: HostNetworkTransmitErrors
- expr: "rate(node_network_transmit_errs_total[2m]) / rate(node_network_transmit_packets_total[2m]) > 0.01"
+ expr: >-
+ rate(node_network_transmit_errs_total{device!~"^wg.*"}[2m])
+ /
+ rate(node_network_transmit_packets_total{device!~"^wg.*"}[2m])
+ > 0.1
for: 2m
labels:
severity: warning
annotations:
- summary: Host Network Transmit Errors (instance {{ $labels.instance }})
+ summary: Host network transmit errors at {{ $labels.instance }}.
description: |-
- Host {{ $labels.instance }} interface {{ $labels.device }} has
- encountered {{ printf "%.0f" $value }} transmit errors in the last
- two minutes.
+ Host interface {{ $labels.device }} has a transmit error ratio of {{ $value | humanizePercentage }} over the last two minutes.
- VALUE = {{ $value }}
- LABELS = {{ $labels }}
+ VALUE = {{ $value }}
+ LABELS = {{ $labels }}
- alert: HostNetworkInterfaceSaturated
expr: >-
(
- rate(node_network_receive_bytes_total{device!~"^tap.*|^vnet.*|^veth.*|^tun.*"}[1m])
+ rate(node_network_receive_bytes_total{device!~"^wg.*"}[1m])
+
- rate(node_network_transmit_bytes_total{device!~"^tap.*|^vnet.*|^veth.*|^tun.*"}[1m])
+ rate(node_network_transmit_bytes_total{device!~"^wg.*"}[1m])
)
- / node_network_speed_bytes{device!~"^tap.*|^vnet.*|^veth.*|^tun.*"}
+ / node_network_speed_bytes{device!~"^wg.*"}
> 0.8
< 10000
for: 1m
labels:
severity: warning
annotations:
- summary: Host Network Interface Saturated (instance {{ $labels.instance }})
+ summary: Host network interface saturated at {{ $labels.instance }}.
description: |-
- The network interface "{{ $labels.device }}" on "{{ $labels.instance }}"
- is getting overloaded.
+ The network interface {{ $labels.device }} is getting overloaded.
- VALUE = {{ $value }}
- LABELS = {{ $labels }}
+ VALUE = {{ $value }}
+ LABELS = {{ $labels }}
- alert: HostNetworkBondDegraded
expr: >-
@@ -483,43 +475,53 @@ groups:
labels:
severity: warning
annotations:
- summary: Host Network Bond Degraded (instance {{ $labels.instance }})
+ summary: Host network bond degraded at {{ $labels.instance }}.
description: |-
- Bond "{{ $labels.device }}" degraded on "{{ $labels.instance }}".
+ Bond {{ $labels.device }} degraded.
- VALUE = {{ $value }}
- LABELS = {{ $labels }}
+ VALUE = {{ $value }}
+ LABELS = {{ $labels }}
- alert: HostConntrackLimit
expr: >-
- node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 0.8
+ node_nf_conntrack_entries
+ /
+ node_nf_conntrack_entries_limit
+ > 0.8
for: 5m
labels:
severity: warning
annotations:
- summary: Host conntrack limit (instance {{ $labels.instance }})
+ summary: Host conntrack limit at {{ $labels.instance }}.
description: |-
The number of conntrack is approaching limit.
- VALUE = {{ $value }}
- LABELS = {{ $labels }}
+ VALUE = {{ $value }}
+ LABELS = {{ $labels }}
- alert: HostClockSkew
expr: >-
- (node_timex_offset_seconds > 0.05 and deriv(node_timex_offset_seconds[5m]) >= 0)
+ (
+ node_timex_offset_seconds > 0.05
+ and
+ deriv(node_timex_offset_seconds[5m]) >= 0
+ )
or
- (node_timex_offset_seconds < -0.05 and deriv(node_timex_offset_seconds[5m]) <= 0)
+ (
+ node_timex_offset_seconds < -0.05
+ and
+ deriv(node_timex_offset_seconds[5m]) <= 0
+ )
for: 2m
labels:
severity: warning
annotations:
- summary: Host clock skew (instance {{ $labels.instance }})
+ summary: Host clock skew at {{ $labels.instance }}.
description: |-
- Clock skew detected. Clock is out of sync. Ensure NTP is configured
- correctly on this host.
+ Clock skew is detected and the clock is out of sync. Ensure that NTP is configured correctly on this host.
- VALUE = {{ $value }}
- LABELS = {{ $labels }}
+ VALUE = {{ $value }}
+ LABELS = {{ $labels }}
- alert: HostClockNotSynchronising
expr: >-
@@ -530,12 +532,12 @@ groups:
labels:
severity: warning
annotations:
- summary: Host clock not synchronising (instance {{ $labels.instance }})
+ summary: Host clock not synchronising at {{ $labels.instance }}.
description: |-
- Clock not synchronising. Ensure NTP is configured on this host.
+ Clock is not synchronising. Ensure that NTP is configured correctly on this host.
- VALUE = {{ $value }}
- LABELS = {{ $labels }}
+ VALUE = {{ $value }}
+ LABELS = {{ $labels }}
- alert: HostRequiresReboot
expr: >-
@@ -544,9 +546,9 @@ groups:
labels:
severity: info
annotations:
- summary: Host requires reboot (instance {{ $labels.instance }})
+ summary: Host requires reboot at {{ $labels.instance }}.
description: |-
- Instance {{ $labels.instance }} requires a reboot.
+ Instance requires a reboot.
- VALUE = {{ $value }}
- LABELS = {{ $labels }}
+ VALUE = {{ $value }}
+ LABELS = {{ $labels }}