about summary refs log tree commit diff
path: root/modules/nixos/monitoring/rules/node.yaml
diff options
context:
space:
mode:
author Azat Bahawi <azat@bahawi.net> 2023-04-12 04:01:46 +0300
committer Azat Bahawi <azat@bahawi.net> 2023-04-12 04:01:46 +0300
commit d6368c86bc949371e904eed3d0a6583ebd53b055 (patch)
tree 042db513412ba7f1577b1ac690d4e0e0fac22cbf /modules/nixos/monitoring/rules/node.yaml
parent 2023-04-07 (diff)
2023-04-12
Diffstat (limited to '')
-rw-r--r-- modules/nixos/monitoring/rules/node.yaml 304
1 files changed, 153 insertions, 151 deletions
diff --git a/modules/nixos/monitoring/rules/node.yaml b/modules/nixos/monitoring/rules/node.yaml
index 81d7810..98217b3 100644
--- a/modules/nixos/monitoring/rules/node.yaml
+++ b/modules/nixos/monitoring/rules/node.yaml
@@ -13,12 +13,12 @@ groups:
         labels:
           severity: warning
         annotations:
-          summary: Host out of memory (instance {{ $labels.instance }})
+          summary: Host out of memory at {{ $labels.instance }}.
           description: |-
-            Node memory is filling up (< 10% left).
+            Node memory is filling up.
 
-              VALUE = {{ $value }}
-              LABELS = {{ $labels }}
+            VALUE = {{ $value }}
+            LABELS = {{ $labels }}
 
       - alert: HostMemoryUnderMemoryPressure
         expr: >-
@@ -27,13 +27,12 @@ groups:
         labels:
           severity: warning
         annotations:
-          summary: Host memory under memory pressure (instance {{ $labels.instance }})
+          summary: Host memory under memory pressure at {{ $labels.instance }}.
           description: |-
-            The node is under heavy memory pressure. High rate of major page
-            faults.
+            The node is under heavy memory pressure. High rate of major page faults.
 
-              VALUE = {{ $value }}
-              LABELS = {{ $labels }}
+            VALUE = {{ $value }}
+            LABELS = {{ $labels }}
 
       - alert: HostUnusualNetworkThroughputIn
         expr: >-
@@ -47,10 +46,10 @@ groups:
         annotations:
           summary: Host unusual network throughput in (instance {{ $labels.instance }})
           description: |-
-            Host network interfaces are probably receiving too much data (> 100 MB/s).
+            Host network interfaces are probably receiving too much data.
 
-              VALUE = {{ $value }}
-              LABELS = {{ $labels }}
+            VALUE = {{ $value }}
+            LABELS = {{ $labels }}
 
       - alert: HostUnusualNetworkThroughputOut
         expr: >-
@@ -62,12 +61,12 @@ groups:
         labels:
           severity: warning
         annotations:
-          summary: Host unusual network throughput out (instance {{ $labels.instance }})
+          summary: Host unusual network throughput out at {{ $labels.instance }}.
           description: |-
-            Host network interfaces are probably sending too much data (> 100 MB/s).
+            Host network interfaces are probably sending too much data.
 
-              VALUE = {{ $value }}
-              LABELS = {{ $labels }}
+            VALUE = {{ $value }}
+            LABELS = {{ $labels }}
 
       - alert: HostUnusualDiskReadRate
         expr: >-
@@ -79,12 +78,12 @@ groups:
         labels:
           severity: warning
         annotations:
-          summary: Host unusual disk read rate (instance {{ $labels.instance }})
+          summary: Host unusual disk read rate at {{ $labels.instance }}.
           description: |-
-            Disk is probably reading too much data (> 50 MB/s).
+            Disk is probably reading too much data.
 
-              VALUE = {{ $value }}
-              LABELS = {{ $labels }}
+            VALUE = {{ $value }}
+            LABELS = {{ $labels }}
 
       - alert: HostUnusualDiskWriteRate
         expr: >-
@@ -96,12 +95,12 @@ groups:
         labels:
           severity: warning
         annotations:
-          summary: Host unusual disk write rate (instance {{ $labels.instance }})
+          summary: Host unusual disk write rate at {{ $labels.instance }}.
           description: |-
-            Disk is probably writing too much data (> 50 MB/s).
+            Disk is probably writing too much data.
 
-              VALUE = {{ $value }}
-              LABELS = {{ $labels }}
+            VALUE = {{ $value }}
+            LABELS = {{ $labels }}
 
       - alert: HostOutOfDiskSpace
         expr: >-
@@ -109,18 +108,18 @@ groups:
           / node_filesystem_size_bytes
           < 10
           and
-          ON (instance, device, mountpoint) node_filesystem_readonly
+          on (instance, device, mountpoint) node_filesystem_readonly
           == 0
         for: 2m
         labels:
           severity: warning
         annotations:
-          summary: Host out of disk space (instance {{ $labels.instance }})
+          summary: Host out of disk space at {{ $labels.instance }}.
           description: |-
-            Disk is almost full (< 10% left).
+            Disk is almost full.
 
-              VALUE = {{ $value }}
-              LABELS = {{ $labels }}
+            VALUE = {{ $value }}
+            LABELS = {{ $labels }}
 
       - alert: HostDiskWillFillIn24Hours
         expr: >-
@@ -136,13 +135,12 @@ groups:
         labels:
           severity: warning
         annotations:
-          summary: Host disk will fill in 24 hours (instance {{ $labels.instance }})
+          summary: Host disk will fill in 24 hours at {{ $labels.instance }}.
           description: |-
-            Filesystem is predicted to run out of space within the next 24 hours
-            at current write rate.
+            Filesystem is predicted to run out of space within the next 24 hours at current write rate.
 
-              VALUE = {{ $value }}
-              LABELS = {{ $labels }}
+            VALUE = {{ $value }}
+            LABELS = {{ $labels }}
 
       - alert: HostOutOfInodes
         expr: >-
@@ -156,12 +154,12 @@ groups:
         labels:
           severity: warning
         annotations:
-          summary: Host out of inodes (instance {{ $labels.instance }})
+          summary: Host out of inodes at {{ $labels.instance }}.
           description: |-
-            Disk is almost running out of available inodes (< 10% left).
+            Disk is almost running out of available inodes.
 
-              VALUE = {{ $value }}
-              LABELS = {{ $labels }}
+            VALUE = {{ $value }}
+            LABELS = {{ $labels }}
 
       - alert: HostInodesWillFillIn24Hours
         expr: >-
@@ -178,13 +176,12 @@ groups:
         labels:
           severity: warning
         annotations:
-          summary: Host inodes will fill in 24 hours (instance {{ $labels.instance }})
+          summary: Host inodes will fill in 24 hours at {{ $labels.instance }}.
           description: |-
-            Filesystem is predicted to run out of inodes within the next 24
-            hours at current write rate.
+            Filesystem is predicted to run out of inodes within the next 24 hours at current write rate.
 
-              VALUE = {{ $value }}
-              LABELS = {{ $labels }}
+            VALUE = {{ $value }}
+            LABELS = {{ $labels }}
 
       - alert: HostUnusualDiskReadLatency
         expr: >-
@@ -198,12 +195,12 @@ groups:
         labels:
           severity: warning
         annotations:
-          summary: Host unusual disk read latency (instance {{ $labels.instance }})
+          summary: Host unusual disk read latency at {{ $labels.instance }}.
           description: |-
-            Disk latency is growing (read operations > 100ms).
+            Disk latency is growing.
 
-              VALUE = {{ $value }}
-              LABELS = {{ $labels }}
+            VALUE = {{ $value }}
+            LABELS = {{ $labels }}
 
       - alert: HostUnusualDiskWriteLatency
         expr: >-
@@ -217,12 +214,12 @@ groups:
         labels:
           severity: warning
         annotations:
-          summary: Host unusual disk write latency (instance {{ $labels.instance }})
+          summary: Host unusual disk write latency at {{ $labels.instance }}.
           description: |-
-            Disk latency is growing (write operations > 100ms).
+            Disk latency is growing.
 
-              VALUE = {{ $value }}
-              LABELS = {{ $labels }}
+            VALUE = {{ $value }}
+            LABELS = {{ $labels }}
 
       - alert: HostHighCpuLoad
         expr: >-
@@ -234,23 +231,23 @@ groups:
         labels:
           severity: warning
         annotations:
-          summary: Host high CPU load (instance {{ $labels.instance }})
+          summary: Host high CPU load at {{ $labels.instance }}.
           description: |-
-            CPU load is > 80%.
+            CPU load is high.
 
-              VALUE = {{ $value }}
-              LABELS = {{ $labels }}
+            VALUE = {{ $value }}
+            LABELS = {{ $labels }}
 
       - alert: HostCpuStealNoisyNeighbor
         expr: >-
           avg by(instance) (rate(node_cpu_seconds_total{mode="steal"}[5m]))
           * 100
-          > 10
+          > 15
         for: 0m
         labels:
           severity: warning
         annotations:
-          summary: Host CPU steal noisy neighbor (instance {{ $labels.instance }})
+          summary: Host CPU steal noisy neighbor at {{ $labels.instance }}.
           description: |-
             CPU steal is > 10%. A noisy neighbor is killing VM performances or a
             spot instance may be out of credit.
@@ -262,18 +259,17 @@ groups:
         expr: |-
           avg by (instance) (rate(node_cpu_seconds_total{mode="iowait"}[5m]))
           * 100
-          > 10
+          > 15
         for: 0m
         labels:
           severity: warning
         annotations:
-          summary: Host CPU high iowait (instance {{ $labels.instance }})
+          summary: Host CPU high I/O wait at {{ $labels.instance }}.
           description: |-
-            CPU iowait > 10%. A high iowait means that you are disk or network
-            bound.
+            CPU I/O wait is high. A high I/O wait means that you are disk or network bound.
 
-              VALUE = {{ $value }}
-              LABELS = {{ $labels }}
+            VALUE = {{ $value }}
+            LABELS = {{ $labels }}
 
       - alert: HostUnusualDiskIo
         expr: >-
@@ -282,12 +278,12 @@ groups:
         labels:
           severity: warning
         annotations:
-          summary: Host unusual disk IO (instance {{ $labels.instance }})
+          summary: Host unusual disk I/O at {{ $labels.instance }}.
           description: |-
-            Time spent in IO is too high on {{ $labels.instance }}. Check storage for issues.
+            Time spent on I/O is too high. Check storage for issues.
 
-              VALUE = {{ $value }}
-              LABELS = {{ $labels }}
+            VALUE = {{ $value }}
+            LABELS = {{ $labels }}
 
       - alert: HostSwapIsFillingUp
         expr: >-
@@ -298,12 +294,12 @@ groups:
         labels:
           severity: warning
         annotations:
-          summary: Host swap is filling up (instance {{ $labels.instance }})
+          summary: Host swap is filling up at {{ $labels.instance }}.
           description: |-
-            Swap is filling up (> 80%).
+            Swap is filling up.
 
-              VALUE = {{ $value }}
-              LABELS = {{ $labels }}
+            VALUE = {{ $value }}
+            LABELS = {{ $labels }}
 
       - alert: HostSystemdServiceCrashed
         expr: >-
@@ -312,12 +308,12 @@ groups:
         labels:
           severity: warning
         annotations:
-          summary: Host systemd service crashed (instance {{ $labels.instance }})
+          summary: Host systemd service crashed at {{ $labels.instance }}.
           description: |-
             Systemd service crashed.
 
-              VALUE = {{ $value }}
-              LABELS = {{ $labels }}
+            VALUE = {{ $value }}
+            LABELS = {{ $labels }}
 
       - alert: HostPhysicalComponentTooHot
         expr: >-
@@ -326,12 +322,12 @@ groups:
         labels:
           severity: warning
         annotations:
-          summary: Host physical component too hot (instance {{ $labels.instance }})
+          summary: Host physical component too hot at {{ $labels.instance }}.
           description: |-
             Physical hardware component too hot.
 
-              VALUE = {{ $value }}
-              LABELS = {{ $labels }}
+            VALUE = {{ $value }}
+            LABELS = {{ $labels }}
 
       - alert: HostNodeOvertemperatureAlarm
         expr: >-
@@ -340,12 +336,12 @@ groups:
         labels:
           severity: critical
         annotations:
-          summary: Host node overtemperature alarm (instance {{ $labels.instance }})
+          summary: Host node overtemperature alarm at {{ $labels.instance }}.
           description: |-
             Physical node temperature alarm triggered.
 
-              VALUE = {{ $value }}
-              LABELS = {{ $labels }}
+            VALUE = {{ $value }}
+            LABELS = {{ $labels }}
 
       - alert: HostRaidArrayGotInactive
         expr: >-
@@ -354,14 +350,12 @@ groups:
         labels:
           severity: critical
         annotations:
-          summary: Host RAID array got inactive (instance {{ $labels.instance }})
+          summary: Host RAID array got inactive at {{ $labels.instance }}.
           description: |-
-            RAID array {{ $labels.device }} is in degraded state due to one or
-            more disks failures. Number of spare drives is insufficient to fix
-            issue automatically.
+            RAID array is in a degraded state due to one or more disk failures. Number of spare drives is insufficient to fix the issue automatically.
 
-              VALUE = {{ $value }}
-              LABELS = {{ $labels }}
+            VALUE = {{ $value }}
+            LABELS = {{ $labels }}
 
       - alert: HostRaidDiskFailure
         expr: >-
@@ -370,14 +364,12 @@ groups:
         labels:
           severity: warning
         annotations:
-          summary: Host RAID disk failure (instance {{ $labels.instance }})
+          summary: Host RAID disk failure at {{ $labels.instance }}.
           description: |-
-            At least one device in RAID array on {{ $labels.instance }} failed.
-            Array {{ $labels.md_device }} needs attention and possibly a disk
-            swap.
+            At least one device in RAID array has failed. Possibly, a disk swap is required.
 
-              VALUE = {{ $value }}
-              LABELS = {{ $labels }}
+            VALUE = {{ $value }}
+            LABELS = {{ $labels }}
 
       - alert: HostOomKillDetected
         expr: >-
@@ -386,12 +378,12 @@ groups:
         labels:
           severity: warning
         annotations:
-          summary: Host OOM kill detected (instance {{ $labels.instance }})
+          summary: Host OOM kill detected at {{ $labels.instance }}.
           description: |-
             OOM kill detected.
 
-              VALUE = {{ $value }}
-              LABELS = {{ $labels }}
+            VALUE = {{ $value }}
+            LABELS = {{ $labels }}
 
       - alert: HostEdacCorrectableErrorsDetected
         expr: >-
@@ -400,13 +392,12 @@ groups:
         labels:
           severity: info
         annotations:
-          summary: Host EDAC Correctable Errors detected (instance {{ $labels.instance }})
+          summary: Host EDAC correctable errors detected at {{ $labels.instance }}.
           description: |-
-            Host {{ $labels.instance }} has had {{ printf "%.0f" $value }}
-            correctable memory errors reported by EDAC in the last 5 minutes.
+            Host has had {{ printf "%.0f" $value }} correctable memory errors reported by EDAC in the last 5 minutes.
 
-              VALUE = {{ $value }}
-              LABELS = {{ $labels }}
+            VALUE = {{ $value }}
+            LABELS = {{ $labels }}
 
       - alert: HostEdacUncorrectableErrorsDetected
         expr: >-
@@ -415,66 +406,67 @@ groups:
         labels:
           severity: warning
         annotations:
-          summary: Host EDAC Uncorrectable Errors detected (instance {{ $labels.instance }})
+          summary: Host EDAC uncorrectable errors detected at {{ $labels.instance }}.
           description: |-
-            Host {{ $labels.instance }} has had {{ printf "%.0f" $value }}
-            uncorrectable memory errors reported by EDAC in the last 5
-            minutes.
+            Host has had {{ printf "%.0f" $value }} uncorrectable memory errors reported by EDAC in the last 5 minutes.
 
-              VALUE = {{ $value }}
-              LABELS = {{ $labels }}
+            VALUE = {{ $value }}
+            LABELS = {{ $labels }}
 
       - alert: HostNetworkReceiveErrors
-        expr: "rate(node_network_receive_errs_total[2m]) / rate(node_network_receive_packets_total[2m]) > 0.01"
+        expr: >-
+          rate(node_network_receive_errs_total{device!~"^wg.*"}[2m])
+          /
+          rate(node_network_receive_packets_total{device!~"^wg.*"}[2m])
+          > 0.01
         for: 2m
         labels:
           severity: warning
         annotations:
-          summary: Host Network Receive Errors (instance {{ $labels.instance }})
+          summary: Host network receive errors at {{ $labels.instance }}.
           description: |-
-            Host {{ $labels.instance }} interface {{ $labels.device }} has
-            encountered {{ printf "%.0f" $value }} receive errors in the last
-            two minutes.
+            Host interface {{ $labels.device }} has encountered {{ printf "%.0f" $value }} receive errors in the last two minutes.
 
-              VALUE = {{ $value }}
-              LABELS = {{ $labels }}
+            VALUE = {{ $value }}
+            LABELS = {{ $labels }}
 
       - alert: HostNetworkTransmitErrors
-        expr: "rate(node_network_transmit_errs_total[2m]) / rate(node_network_transmit_packets_total[2m]) > 0.01"
+        expr: >-
+          rate(node_network_transmit_errs_total{device!~"^wg.*"}[2m])
+          /
+          rate(node_network_transmit_packets_total{device!~"^wg.*"}[2m])
+          > 0.1
         for: 2m
         labels:
           severity: warning
         annotations:
-          summary: Host Network Transmit Errors (instance {{ $labels.instance }})
+          summary: Host network transmit errors at {{ $labels.instance }}.
           description: |-
-            Host {{ $labels.instance }} interface {{ $labels.device }} has
-            encountered {{ printf "%.0f" $value }} transmit errors in the last
-            two minutes.
+            Host interface {{ $labels.device }} has encountered {{ printf "%.0f" $value }} transmit errors in the last two minutes.
 
-              VALUE = {{ $value }}
-              LABELS = {{ $labels }}
+            VALUE = {{ $value }}
+            LABELS = {{ $labels }}
 
       - alert: HostNetworkInterfaceSaturated
         expr: >-
           (
-            rate(node_network_receive_bytes_total{device!~"^tap.*|^vnet.*|^veth.*|^tun.*"}[1m])
+            rate(node_network_receive_bytes_total{device!~"^wg.*"}[1m])
             +
-            rate(node_network_transmit_bytes_total{device!~"^tap.*|^vnet.*|^veth.*|^tun.*"}[1m])
+            rate(node_network_transmit_bytes_total{device!~"^wg.*"}[1m])
           )
-          / node_network_speed_bytes{device!~"^tap.*|^vnet.*|^veth.*|^tun.*"}
+          / node_network_speed_bytes{device!~"^wg.*"}
           > 0.8
           < 10000
         for: 1m
         labels:
           severity: warning
         annotations:
-          summary: Host Network Interface Saturated (instance {{ $labels.instance }})
+          summary: Host network interface saturated at {{ $labels.instance }}.
           description: |-
-            The network interface "{{ $labels.device }}" on "{{ $labels.instance }}"
-            is getting overloaded.
+            The network interface {{ $labels.device }} is getting overloaded.
 
-              VALUE = {{ $value }}
-              LABELS = {{ $labels }}
+            VALUE = {{ $value }}
+            LABELS = {{ $labels }}
 
       - alert: HostNetworkBondDegraded
         expr: >-
@@ -483,43 +475,53 @@ groups:
         labels:
           severity: warning
         annotations:
-          summary: Host Network Bond Degraded (instance {{ $labels.instance }})
+          summary: Host network bond degraded at {{ $labels.instance }}.
           description: |-
-            Bond "{{ $labels.device }}" degraded on "{{ $labels.instance }}".
+            Bond {{ $labels.device }} degraded.
 
-              VALUE = {{ $value }}
-              LABELS = {{ $labels }}
+            VALUE = {{ $value }}
+            LABELS = {{ $labels }}
 
       - alert: HostConntrackLimit
         expr: >-
-          node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 0.8
+          node_nf_conntrack_entries
+          /
+          node_nf_conntrack_entries_limit
+          > 0.8
         for: 5m
         labels:
           severity: warning
         annotations:
-          summary: Host conntrack limit (instance {{ $labels.instance }})
+          summary: Host conntrack limit at {{ $labels.instance }}.
           description: |-
             The number of conntrack is approaching limit.
 
-              VALUE = {{ $value }}
-              LABELS = {{ $labels }}
+            VALUE = {{ $value }}
+            LABELS = {{ $labels }}
 
       - alert: HostClockSkew
         expr: >-
-          (node_timex_offset_seconds > 0.05 and deriv(node_timex_offset_seconds[5m]) >= 0)
+          (
+            node_timex_offset_seconds > 0.05
+            and
+            deriv(node_timex_offset_seconds[5m]) >= 0
+          )
           or
-          (node_timex_offset_seconds < -0.05 and deriv(node_timex_offset_seconds[5m]) <= 0)
+          (
+            node_timex_offset_seconds < -0.05
+            and
+            deriv(node_timex_offset_seconds[5m]) <= 0
+          )
         for: 2m
         labels:
           severity: warning
         annotations:
-          summary: Host clock skew (instance {{ $labels.instance }})
+          summary: Host clock skew at {{ $labels.instance }}.
           description: |-
-            Clock skew detected. Clock is out of sync. Ensure NTP is configured
-            correctly on this host.
+            Clock skew is detected and the clock is out of sync. Ensure that NTP is configured correctly on this host.
 
-              VALUE = {{ $value }}
-              LABELS = {{ $labels }}
+            VALUE = {{ $value }}
+            LABELS = {{ $labels }}
 
       - alert: HostClockNotSynchronising
         expr: >-
@@ -530,12 +532,12 @@ groups:
         labels:
           severity: warning
         annotations:
-          summary: Host clock not synchronising (instance {{ $labels.instance }})
+          summary: Host clock not synchronising at {{ $labels.instance }}.
           description: |-
-            Clock not synchronising. Ensure NTP is configured on this host.
+            Clock is not synchronising. Ensure that NTP is configured correctly on this host.
 
-              VALUE = {{ $value }}
-              LABELS = {{ $labels }}
+            VALUE = {{ $value }}
+            LABELS = {{ $labels }}
 
       - alert: HostRequiresReboot
         expr: >-
@@ -544,9 +546,9 @@ groups:
         labels:
           severity: info
         annotations:
-          summary: Host requires reboot (instance {{ $labels.instance }})
+          summary: Host requires reboot at {{ $labels.instance }}.
           description: |-
-            Instance {{ $labels.instance }} requires a reboot.
+            Instance requires a reboot.
 
-              VALUE = {{ $value }}
-              LABELS = {{ $labels }}
+            VALUE = {{ $value }}
+            LABELS = {{ $labels }}

Consider giving Nix/NixOS a try! <3