2023-04-12

author: Azat Bahawi <azat@bahawi.net> 2023-04-12 04:01:46 +0300
committer: Azat Bahawi <azat@bahawi.net> 2023-04-12 04:01:46 +0300
commit: d6368c86bc949371e904eed3d0a6583ebd53b055 (patch)
tree: 042db513412ba7f1577b1ac690d4e0e0fac22cbf /modules/nixos/monitoring/rules
parent: 2023-04-07 (diff)
4 files changed, 296 insertions, 288 deletions
diff --git a/modules/nixos/monitoring/rules/nginx.yaml b/modules/nixos/monitoring/rules/nginx.yaml
index 59229a8..f00d372 100644
--- a/modules/nixos/monitoring/rules/nginx.yaml
+++ b/modules/nixos/monitoring/rules/nginx.yaml
@@ -6,47 +6,55 @@ groups:
       - alert: NginxHighHttp4xxErrorRate
         expr: >-
           sum(rate(nginx_http_requests_total{status=~"^4.."}[1m]))
-          / sum(rate(nginx_http_requests_total[1m])) * 100
+          /
+          sum(rate(nginx_http_requests_total[1m])) * 100
           > 5
         for: 1m
         labels:
           severity: critical
         annotations:
-          summary: NGINX high HTTP 4xx error rate (instance {{ $labels.instance }})
+          summary: NGINX high HTTP 4xx error rate at {{ $labels.instance }}.
           description: |-
-            Too many HTTP requests with status 4xx (> 5%).
+            Too many HTTP requests with a 4xx status code.
 
-              VALUE = {{ $value }}
-              LABELS = {{ $labels }}
+            VALUE = {{ $value }}
+            LABELS = {{ $labels }}
 
       - alert: NginxHighHttp5xxErrorRate
         expr: >-
           sum(rate(nginx_http_requests_total{status=~"^5.."}[1m]))
-          / sum(rate(nginx_http_requests_total[1m])) * 100
+          /
+          sum(rate(nginx_http_requests_total[1m])) * 100
           > 5
         for: 1m
         labels:
           severity: critical
         annotations:
-          summary: NGINX high HTTP 5xx error rate (instance {{ $labels.instance }})
+          summary: NGINX high HTTP 5xx error rate at {{ $labels.instance }}.
           description: |-
-            Too many HTTP requests with status 5xx (> 5%).
+            Too many HTTP requests with a 5xx status code.
 
-              VALUE = {{ $value }}
-              LABELS = {{ $labels }}
+            VALUE = {{ $value }}
+            LABELS = {{ $labels }}
 
       - alert: NginxLatencyHigh
         expr: >-
-          histogram_quantile(0.99,
-            sum(rate(nginx_http_request_duration_seconds_bucket[2m])) by (host, node))
+          histogram_quantile(
+            0.99,
+            sum(
+              rate(
+                nginx_http_request_duration_seconds_bucket[2m]
+              )
+            ) by (host, node)
+          )
           > 3
         for: 2m
         labels:
           severity: warning
         annotations:
-          summary: NGINX high latency (instance {{ $labels.instance }})
+          summary: NGINX high latency at {{ $labels.instance }}.
           description: |-
-            NGINX 99% latency is higher than 3 seconds.
+            NGINX 99th percentile latency is higher than 3 seconds.
 
-              VALUE = {{ $value }}
-              LABELS = {{ $labels }}
+            VALUE = {{ $value }}
+            LABELS = {{ $labels }}
diff --git a/modules/nixos/monitoring/rules/node.yaml b/modules/nixos/monitoring/rules/node.yaml
index 81d7810..98217b3 100644
--- a/modules/nixos/monitoring/rules/node.yaml
+++ b/modules/nixos/monitoring/rules/node.yaml
@@ -13,12 +13,12 @@ groups:
         labels:
           severity: warning
         annotations:
-          summary: Host out of memory (instance {{ $labels.instance }})
+          summary: Host out of memory at {{ $labels.instance }}.
           description: |-
-            Node memory is filling up (< 10% left).
+            Node memory is filling up.
 
-              VALUE = {{ $value }}
-              LABELS = {{ $labels }}
+            VALUE = {{ $value }}
+            LABELS = {{ $labels }}
 
       - alert: HostMemoryUnderMemoryPressure
         expr: >-
@@ -27,13 +27,12 @@ groups:
         labels:
           severity: warning
         annotations:
-          summary: Host memory under memory pressure (instance {{ $labels.instance }})
+          summary: Host memory under memory pressure at {{ $labels.instance }}.
           description: |-
-            The node is under heavy memory pressure. High rate of major page
-            faults.
+            The node is under heavy memory pressure. High rate of major page faults.
 
-              VALUE = {{ $value }}
-              LABELS = {{ $labels }}
+            VALUE = {{ $value }}
+            LABELS = {{ $labels }}
 
       - alert: HostUnusualNetworkThroughputIn
         expr: >-
@@ -47,10 +46,10 @@ groups:
         annotations:
           summary: Host unusual network throughput in (instance {{ $labels.instance }})
           description: |-
-            Host network interfaces are probably receiving too much data (> 100 MB/s).
+            Host network interfaces are probably receiving too much data.
 
-              VALUE = {{ $value }}
-              LABELS = {{ $labels }}
+            VALUE = {{ $value }}
+            LABELS = {{ $labels }}
 
       - alert: HostUnusualNetworkThroughputOut
         expr: >-
@@ -62,12 +61,12 @@ groups:
         labels:
           severity: warning
         annotations:
-          summary: Host unusual network throughput out (instance {{ $labels.instance }})
+          summary: Host unusual network throughput out at {{ $labels.instance }}.
           description: |-
-            Host network interfaces are probably sending too much data (> 100 MB/s).
+            Host network interfaces are probably sending too much data.
 
-              VALUE = {{ $value }}
-              LABELS = {{ $labels }}
+            VALUE = {{ $value }}
+            LABELS = {{ $labels }}
 
       - alert: HostUnusualDiskReadRate
         expr: >-
@@ -79,12 +78,12 @@ groups:
         labels:
           severity: warning
         annotations:
-          summary: Host unusual disk read rate (instance {{ $labels.instance }})
+          summary: Host unusual disk read rate at {{ $labels.instance }}.
           description: |-
-            Disk is probably reading too much data (> 50 MB/s).
+            Disk is probably reading too much data.
 
-              VALUE = {{ $value }}
-              LABELS = {{ $labels }}
+            VALUE = {{ $value }}
+            LABELS = {{ $labels }}
 
       - alert: HostUnusualDiskWriteRate
         expr: >-
@@ -96,12 +95,12 @@ groups:
         labels:
           severity: warning
         annotations:
-          summary: Host unusual disk write rate (instance {{ $labels.instance }})
+          summary: Host unusual disk write rate at {{ $labels.instance }}.
           description: |-
-            Disk is probably writing too much data (> 50 MB/s).
+            Disk is probably writing too much data.
 
-              VALUE = {{ $value }}
-              LABELS = {{ $labels }}
+            VALUE = {{ $value }}
+            LABELS = {{ $labels }}
 
       - alert: HostOutOfDiskSpace
         expr: >-
@@ -109,18 +108,18 @@ groups:
           / node_filesystem_size_bytes
           < 10
           and
-          ON (instance, device, mountpoint) node_filesystem_readonly
+          on (instance, device, mountpoint) node_filesystem_readonly
           == 0
         for: 2m
         labels:
           severity: warning
         annotations:
-          summary: Host out of disk space (instance {{ $labels.instance }})
+          summary: Host out of disk space at {{ $labels.instance }}.
           description: |-
-            Disk is almost full (< 10% left).
+            Disk is almost full.
 
-              VALUE = {{ $value }}
-              LABELS = {{ $labels }}
+            VALUE = {{ $value }}
+            LABELS = {{ $labels }}
 
       - alert: HostDiskWillFillIn24Hours
         expr: >-
@@ -136,13 +135,12 @@ groups:
         labels:
           severity: warning
         annotations:
-          summary: Host disk will fill in 24 hours (instance {{ $labels.instance }})
+          summary: Host disk will fill in 24 hours at {{ $labels.instance }}.
           description: |-
-            Filesystem is predicted to run out of space within the next 24 hours
-            at current write rate.
+            Filesystem is predicted to run out of space within the next 24 hours at current write rate.
 
-              VALUE = {{ $value }}
-              LABELS = {{ $labels }}
+            VALUE = {{ $value }}
+            LABELS = {{ $labels }}
 
       - alert: HostOutOfInodes
         expr: >-
@@ -156,12 +154,12 @@ groups:
         labels:
           severity: warning
         annotations:
-          summary: Host out of inodes (instance {{ $labels.instance }})
+          summary: Host out of inodes at {{ $labels.instance }}.
           description: |-
-            Disk is almost running out of available inodes (< 10% left).
+            Disk is almost running out of available inodes.
 
-              VALUE = {{ $value }}
-              LABELS = {{ $labels }}
+            VALUE = {{ $value }}
+            LABELS = {{ $labels }}
 
       - alert: HostInodesWillFillIn24Hours
         expr: >-
@@ -178,13 +176,12 @@ groups:
         labels:
           severity: warning
         annotations:
-          summary: Host inodes will fill in 24 hours (instance {{ $labels.instance }})
+          summary: Host inodes will fill in 24 hours at {{ $labels.instance }}.
           description: |-
-            Filesystem is predicted to run out of inodes within the next 24
-            hours at current write rate.
+            Filesystem is predicted to run out of inodes within the next 24 hours at current write rate.
 
-              VALUE = {{ $value }}
-              LABELS = {{ $labels }}
+            VALUE = {{ $value }}
+            LABELS = {{ $labels }}
 
       - alert: HostUnusualDiskReadLatency
         expr: >-
@@ -198,12 +195,12 @@ groups:
         labels:
           severity: warning
         annotations:
-          summary: Host unusual disk read latency (instance {{ $labels.instance }})
+          summary: Host unusual disk read latency at {{ $labels.instance }}.
           description: |-
-            Disk latency is growing (read operations > 100ms).
+            Disk latency is growing.
 
-              VALUE = {{ $value }}
-              LABELS = {{ $labels }}
+            VALUE = {{ $value }}
+            LABELS = {{ $labels }}
 
       - alert: HostUnusualDiskWriteLatency
         expr: >-
@@ -217,12 +214,12 @@ groups:
         labels:
           severity: warning
         annotations:
-          summary: Host unusual disk write latency (instance {{ $labels.instance }})
+          summary: Host unusual disk write latency at {{ $labels.instance }}.
           description: |-
-            Disk latency is growing (write operations > 100ms).
+            Disk latency is growing.
 
-              VALUE = {{ $value }}
-              LABELS = {{ $labels }}
+            VALUE = {{ $value }}
+            LABELS = {{ $labels }}
 
       - alert: HostHighCpuLoad
         expr: >-
@@ -234,23 +231,23 @@ groups:
         labels:
           severity: warning
         annotations:
-          summary: Host high CPU load (instance {{ $labels.instance }})
+          summary: Host high CPU load at {{ $labels.instance }}.
           description: |-
-            CPU load is > 80%.
+            CPU load is high.
 
-              VALUE = {{ $value }}
-              LABELS = {{ $labels }}
+            VALUE = {{ $value }}
+            LABELS = {{ $labels }}
 
       - alert: HostCpuStealNoisyNeighbor
         expr: >-
           avg by(instance) (rate(node_cpu_seconds_total{mode="steal"}[5m]))
           * 100
-          > 10
+          > 15
         for: 0m
         labels:
           severity: warning
         annotations:
-          summary: Host CPU steal noisy neighbor (instance {{ $labels.instance }})
+          summary: Host CPU steal noisy neighbor at {{ $labels.instance }}.
           description: |-
             CPU steal is > 10%. A noisy neighbor is killing VM performances or a
             spot instance may be out of credit.
@@ -262,18 +259,17 @@ groups:
         expr: |-
           avg by (instance) (rate(node_cpu_seconds_total{mode="iowait"}[5m]))
           * 100
-          > 10
+          > 15
         for: 0m
         labels:
           severity: warning
         annotations:
-          summary: Host CPU high iowait (instance {{ $labels.instance }})
+          summary: Host CPU high I/O wait at {{ $labels.instance }}.
           description: |-
-            CPU iowait > 10%. A high iowait means that you are disk or network
-            bound.
+            CPU I/O wait is high. A high I/O wait means that you are disk or network bound.
 
-              VALUE = {{ $value }}
-              LABELS = {{ $labels }}
+            VALUE = {{ $value }}
+            LABELS = {{ $labels }}
 
       - alert: HostUnusualDiskIo
         expr: >-
@@ -282,12 +278,12 @@ groups:
         labels:
           severity: warning
         annotations:
-          summary: Host unusual disk IO (instance {{ $labels.instance }})
+          summary: Host unusual disk I/O at {{ $labels.instance }}.
           description: |-
-            Time spent in IO is too high on {{ $labels.instance }}. Check storage for issues.
+            Time spent on I/O is too high. Check storage for issues.
 
-              VALUE = {{ $value }}
-              LABELS = {{ $labels }}
+            VALUE = {{ $value }}
+            LABELS = {{ $labels }}
 
       - alert: HostSwapIsFillingUp
         expr: >-
@@ -298,12 +294,12 @@ groups:
         labels:
           severity: warning
         annotations:
-          summary: Host swap is filling up (instance {{ $labels.instance }})
+          summary: Host swap is filling up at {{ $labels.instance }}.
           description: |-
-            Swap is filling up (> 80%).
+            Swap is filling up.
 
-              VALUE = {{ $value }}
-              LABELS = {{ $labels }}
+            VALUE = {{ $value }}
+            LABELS = {{ $labels }}
 
       - alert: HostSystemdServiceCrashed
         expr: >-
@@ -312,12 +308,12 @@ groups:
         labels:
           severity: warning
         annotations:
-          summary: Host systemd service crashed (instance {{ $labels.instance }})
+          summary: Host systemd service crashed at {{ $labels.instance }}.
           description: |-
             Systemd service crashed.
 
-              VALUE = {{ $value }}
-              LABELS = {{ $labels }}
+            VALUE = {{ $value }}
+            LABELS = {{ $labels }}
 
       - alert: HostPhysicalComponentTooHot
         expr: >-
@@ -326,12 +322,12 @@ groups:
         labels:
           severity: warning
         annotations:
-          summary: Host physical component too hot (instance {{ $labels.instance }})
+          summary: Host physical component too hot at {{ $labels.instance }}.
           description: |-
             Physical hardware component too hot.
 
-              VALUE = {{ $value }}
-              LABELS = {{ $labels }}
+            VALUE = {{ $value }}
+            LABELS = {{ $labels }}
 
       - alert: HostNodeOvertemperatureAlarm
         expr: >-
@@ -340,12 +336,12 @@ groups:
         labels:
           severity: critical
         annotations:
-          summary: Host node overtemperature alarm (instance {{ $labels.instance }})
+          summary: Host node overtemperature alarm at {{ $labels.instance }}.
           description: |-
             Physical node temperature alarm triggered.
 
-              VALUE = {{ $value }}
-              LABELS = {{ $labels }}
+            VALUE = {{ $value }}
+            LABELS = {{ $labels }}
 
       - alert: HostRaidArrayGotInactive
         expr: >-
@@ -354,14 +350,12 @@ groups:
         labels:
           severity: critical
         annotations:
-          summary: Host RAID array got inactive (instance {{ $labels.instance }})
+          summary: Host RAID array got inactive at {{ $labels.instance }}.
           description: |-
-            RAID array {{ $labels.device }} is in degraded state due to one or
-            more disks failures. Number of spare drives is insufficient to fix
-            issue automatically.
+            RAID array is in a degraded state due to one or more disk failures. Number of spare drives is insufficient to fix the issue automatically.
 
-              VALUE = {{ $value }}
-              LABELS = {{ $labels }}
+            VALUE = {{ $value }}
+            LABELS = {{ $labels }}
 
       - alert: HostRaidDiskFailure
         expr: >-
@@ -370,14 +364,12 @@ groups:
         labels:
           severity: warning
         annotations:
-          summary: Host RAID disk failure (instance {{ $labels.instance }})
+          summary: Host RAID disk failure at {{ $labels.instance }}.
           description: |-
-            At least one device in RAID array on {{ $labels.instance }} failed.
-            Array {{ $labels.md_device }} needs attention and possibly a disk
-            swap.
+            At least one device in the RAID array has failed. Possibly, a disk swap is required.
 
-              VALUE = {{ $value }}
-              LABELS = {{ $labels }}
+            VALUE = {{ $value }}
+            LABELS = {{ $labels }}
 
       - alert: HostOomKillDetected
         expr: >-
@@ -386,12 +378,12 @@ groups:
         labels:
           severity: warning
         annotations:
-          summary: Host OOM kill detected (instance {{ $labels.instance }})
+          summary: Host OOM kill detected at {{ $labels.instance }}.
           description: |-
             OOM kill detected.
 
-              VALUE = {{ $value }}
-              LABELS = {{ $labels }}
+            VALUE = {{ $value }}
+            LABELS = {{ $labels }}
 
       - alert: HostEdacCorrectableErrorsDetected
         expr: >-
@@ -400,13 +392,12 @@ groups:
         labels:
           severity: info
         annotations:
-          summary: Host EDAC Correctable Errors detected (instance {{ $labels.instance }})
+          summary: Host EDAC correctable errors detected at {{ $labels.instance }}.
           description: |-
-            Host {{ $labels.instance }} has had {{ printf "%.0f" $value }}
-            correctable memory errors reported by EDAC in the last 5 minutes.
+            Host has had {{ printf "%.0f" $value }} correctable memory errors reported by EDAC in the last 5 minutes.
 
-              VALUE = {{ $value }}
-              LABELS = {{ $labels }}
+            VALUE = {{ $value }}
+            LABELS = {{ $labels }}
 
       - alert: HostEdacUncorrectableErrorsDetected
         expr: >-
@@ -415,66 +406,67 @@ groups:
         labels:
           severity: warning
         annotations:
-          summary: Host EDAC Uncorrectable Errors detected (instance {{ $labels.instance }})
+          summary: Host EDAC uncorrectable errors detected at {{ $labels.instance }}.
           description: |-
-            Host {{ $labels.instance }} has had {{ printf "%.0f" $value }}
-            uncorrectable memory errors reported by EDAC in the last 5
-            minutes.
+            Host has had {{ printf "%.0f" $value }} uncorrectable memory errors reported by EDAC in the last 5 minutes.
 
-              VALUE = {{ $value }}
-              LABELS = {{ $labels }}
+            VALUE = {{ $value }}
+            LABELS = {{ $labels }}
 
       - alert: HostNetworkReceiveErrors
-        expr: "rate(node_network_receive_errs_total[2m]) / rate(node_network_receive_packets_total[2m]) > 0.01"
+        expr: >-
+          rate(node_network_receive_errs_total{device!~"^wg.*"}[2m])
+          /
+          rate(node_network_receive_packets_total{device!~"^wg.*"}[2m])
+          > 0.01
         for: 2m
         labels:
           severity: warning
         annotations:
-          summary: Host Network Receive Errors (instance {{ $labels.instance }})
+          summary: Host network receive errors at {{ $labels.instance }}.
           description: |-
-            Host {{ $labels.instance }} interface {{ $labels.device }} has
-            encountered {{ printf "%.0f" $value }} receive errors in the last
-            two minutes.
+            Host interface {{ $labels.device }} has encountered {{ printf "%.0f" $value }} receive errors in the last two minutes.
 
-              VALUE = {{ $value }}
-              LABELS = {{ $labels }}
+            VALUE = {{ $value }}
+            LABELS = {{ $labels }}
 
       - alert: HostNetworkTransmitErrors
-        expr: "rate(node_network_transmit_errs_total[2m]) / rate(node_network_transmit_packets_total[2m]) > 0.01"
+        expr: >-
+          rate(node_network_transmit_errs_total{device!~"^wg.*"}[2m])
+          /
+          rate(node_network_transmit_packets_total{device!~"^wg.*"}[2m])
+          > 0.1
         for: 2m
         labels:
           severity: warning
         annotations:
-          summary: Host Network Transmit Errors (instance {{ $labels.instance }})
+          summary: Host network transmit errors at {{ $labels.instance }}.
           description: |-
-            Host {{ $labels.instance }} interface {{ $labels.device }} has
-            encountered {{ printf "%.0f" $value }} transmit errors in the last
-            two minutes.
+            Host interface {{ $labels.device }} has encountered {{ printf "%.0f" $value }} transmit errors in the last two minutes.
 
-              VALUE = {{ $value }}
-              LABELS = {{ $labels }}
+            VALUE = {{ $value }}
+            LABELS = {{ $labels }}
 
       - alert: HostNetworkInterfaceSaturated
         expr: >-
           (
-            rate(node_network_receive_bytes_total{device!~"^tap.*|^vnet.*|^veth.*|^tun.*"}[1m])
+            rate(node_network_receive_bytes_total{device!~"^wg.*"}[1m])
             +
-            rate(node_network_transmit_bytes_total{device!~"^tap.*|^vnet.*|^veth.*|^tun.*"}[1m])
+            rate(node_network_transmit_bytes_total{device!~"^wg.*"}[1m])
           )
-          / node_network_speed_bytes{device!~"^tap.*|^vnet.*|^veth.*|^tun.*"}
+          / node_network_speed_bytes{device!~"^wg.*"}
           > 0.8
           < 10000
         for: 1m
         labels:
           severity: warning
         annotations:
-          summary: Host Network Interface Saturated (instance {{ $labels.instance }})
+          summary: Host network interface saturated at {{ $labels.instance }}.
           description: |-
-            The network interface "{{ $labels.device }}" on "{{ $labels.instance }}"
-            is getting overloaded.
+            The network interface {{ $labels.device }} is getting overloaded.
 
-              VALUE = {{ $value }}
-              LABELS = {{ $labels }}
+            VALUE = {{ $value }}
+            LABELS = {{ $labels }}
 
       - alert: HostNetworkBondDegraded
         expr: >-
@@ -483,43 +475,53 @@ groups:
         labels:
           severity: warning
         annotations:
-          summary: Host Network Bond Degraded (instance {{ $labels.instance }})
+          summary: Host network bond degraded at {{ $labels.instance }}.
           description: |-
-            Bond "{{ $labels.device }}" degraded on "{{ $labels.instance }}".
+            Bond {{ $labels.device }} degraded.
 
-              VALUE = {{ $value }}
-              LABELS = {{ $labels }}
+            VALUE = {{ $value }}
+            LABELS = {{ $labels }}
 
       - alert: HostConntrackLimit
         expr: >-
-          node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 0.8
+          node_nf_conntrack_entries
+          /
+          node_nf_conntrack_entries_limit
+          > 0.8
         for: 5m
         labels:
           severity: warning
         annotations:
-          summary: Host conntrack limit (instance {{ $labels.instance }})
+          summary: Host conntrack limit at {{ $labels.instance }}.
           description: |-
             The number of conntrack is approaching limit.
 
-              VALUE = {{ $value }}
-              LABELS = {{ $labels }}
+            VALUE = {{ $value }}
+            LABELS = {{ $labels }}
 
       - alert: HostClockSkew
         expr: >-
-          (node_timex_offset_seconds > 0.05 and deriv(node_timex_offset_seconds[5m]) >= 0)
+          (
+            node_timex_offset_seconds > 0.05
+            and
+            deriv(node_timex_offset_seconds[5m]) >= 0
+          )
           or
-          (node_timex_offset_seconds < -0.05 and deriv(node_timex_offset_seconds[5m]) <= 0)
+          (
+            node_timex_offset_seconds < -0.05
+            and
+            deriv(node_timex_offset_seconds[5m]) <= 0
+          )
         for: 2m
         labels:
           severity: warning
         annotations:
-          summary: Host clock skew (instance {{ $labels.instance }})
+          summary: Host clock skew at {{ $labels.instance }}.
           description: |-
-            Clock skew detected. Clock is out of sync. Ensure NTP is configured
-            correctly on this host.
+            Clock skew is detected and the clock is out of sync. Ensure that NTP is configured correctly on this host.
 
-              VALUE = {{ $value }}
-              LABELS = {{ $labels }}
+            VALUE = {{ $value }}
+            LABELS = {{ $labels }}
 
       - alert: HostClockNotSynchronising
         expr: >-
@@ -530,12 +532,12 @@ groups:
         labels:
           severity: warning
         annotations:
-          summary: Host clock not synchronising (instance {{ $labels.instance }})
+          summary: Host clock not synchronising at {{ $labels.instance }}.
           description: |-
-            Clock not synchronising. Ensure NTP is configured on this host.
+            Clock is not synchronising. Ensure that NTP is configured correctly on this host.
 
-              VALUE = {{ $value }}
-              LABELS = {{ $labels }}
+            VALUE = {{ $value }}
+            LABELS = {{ $labels }}
 
       - alert: HostRequiresReboot
         expr: >-
@@ -544,9 +546,9 @@ groups:
         labels:
           severity: info
         annotations:
-          summary: Host requires reboot (instance {{ $labels.instance }})
+          summary: Host requires reboot at {{ $labels.instance }}.
           description: |-
-            Instance {{ $labels.instance }} requires a reboot.
+            Instance requires a reboot.
 
-              VALUE = {{ $value }}
-              LABELS = {{ $labels }}
+            VALUE = {{ $value }}
+            LABELS = {{ $labels }}
diff --git a/modules/nixos/monitoring/rules/postgres.yaml b/modules/nixos/monitoring/rules/postgres.yaml
index 5d360fa..6aee560 100644
--- a/modules/nixos/monitoring/rules/postgres.yaml
+++ b/modules/nixos/monitoring/rules/postgres.yaml
@@ -10,12 +10,12 @@ groups:
         labels:
           severity: critical
         annotations:
-          summary: PostgreSQL is down (instance {{ $labels.instance }})
+          summary: PostgreSQL is down at {{ $labels.instance }}.
           description: |-
-            Postgresql instance is down.
+            PostgreSQL instance is down.
 
-              VALUE = {{ $value }}
-              LABELS = {{ $labels }}
+            VALUE = {{ $value }}
+            LABELS = {{ $labels }}
 
       - alert: PostgresqlRestarted
         expr: >-
@@ -24,12 +24,12 @@ groups:
         labels:
           severity: critical
         annotations:
-          summary: PostgreSQL restarted (instance {{ $labels.instance }})
+          summary: PostgreSQL restarted at {{ $labels.instance }}.
           description: |-
             PostgreSQL restarted.
 
-              VALUE = {{ $value }}
-              LABELS = {{ $labels }}
+            VALUE = {{ $value }}
+            LABELS = {{ $labels }}
 
       - alert: PostgresqlExporterError
         expr: >-
@@ -38,12 +38,12 @@ groups:
         labels:
           severity: critical
         annotations:
-          summary: PostgreSQL exporter error (instance {{ $labels.instance }})
+          summary: PostgreSQL exporter errors at {{ $labels.instance }}.
           description: |-
-            PostgreSQL exporter is showing errors. A query may be buggy in query.yaml.
+            PostgreSQL exporter is showing errors.
 
-              VALUE = {{ $value }}
-              LABELS = {{ $labels }}
+            VALUE = {{ $value }}
+            LABELS = {{ $labels }}
 
       - alert: PostgresqlTableNotAutoVacuumed
         expr: >-
@@ -55,12 +55,12 @@ groups:
         labels:
           severity: warning
         annotations:
-          summary: PostgreSQL table not auto vacuumed (instance {{ $labels.instance }})
+          summary: PostgreSQL table not auto vacuumed at {{ $labels.instance }}.
           description: |-
             Table {{ $labels.relname }} has not been auto vacuumed for 10 days.
 
-              VALUE = {{ $value }}
-              LABELS = {{ $labels }}
+            VALUE = {{ $value }}
+            LABELS = {{ $labels }}
 
       - alert: PostgresqlTableNotAutoAnalyzed
         expr: >-
@@ -72,57 +72,60 @@ groups:
         labels:
           severity: warning
         annotations:
-          summary: PostgreSQL table not auto analyzed (instance {{ $labels.instance }})
+          summary: PostgreSQL table not auto analyzed at {{ $labels.instance }}.
           description: |-
             Table {{ $labels.relname }} has not been auto analyzed for 10 days.
 
-              VALUE = {{ $value }}
-              LABELS = {{ $labels }}"
+            VALUE = {{ $value }}
+            LABELS = {{ $labels }}
 
       - alert: PostgresqlTooManyConnections
         expr: >-
-          sum by (datname) (pg_stat_activity_count{datname!~"template.*|postgres"})
-          > pg_settings_max_connections * 0.8
+          sum by (datname) (
+            pg_stat_activity_count{datname!~"template.*|postgres"}
+          ) > pg_settings_max_connections * 0.8
         for: 2m
         labels:
           severity: warning
         annotations:
-          summary: Postgresql too many connections (instance {{ $labels.instance }})
+          summary: PostgreSQL with too many connections at {{ $labels.instance }}.
           description: |-
-            PostgreSQL instance has too many connections (> 80%).
+            PostgreSQL instance {{ $labels.instance }} has too many connections.
 
-              VALUE = {{ $value }}
-              LABELS = {{ $labels }}
+            VALUE = {{ $value }}
+            LABELS = {{ $labels }}
 
       - alert: PostgresqlNotEnoughConnections
         expr: >-
-          sum by (datname) (pg_stat_activity_count{datname!~"template.*|postgres"})
-          < 1
+          sum by (datname) (
+            pg_stat_activity_count{datname!~"template.*|postgres"}
+          ) < 1
         for: 2m
         labels:
           severity: warning
         annotations:
-          summary: Postgresql not enough connections (instance {{ $labels.instance }})
+          summary: PostgreSQL with not enough connections at {{ $labels.instance }}.
           description: |-
-            PostgreSQL instance should have more connections (> 1).
+            PostgreSQL instance {{ $labels.instance }} should have more connections.
 
-              VALUE = {{ $value }}
-              LABELS = {{ $labels }}
+            VALUE = {{ $value }}
+            LABELS = {{ $labels }}
 
       - alert: PostgresqlDeadLocks
         expr: >-
-          increase(pg_stat_database_deadlocks{datname!~"template.*|postgres"}[1m])
-          > 5
+          increase(
+            pg_stat_database_deadlocks{datname!~"template.*|postgres"}[1m]
+          ) > 5
         for: 0m
         labels:
           severity: warning
         annotations:
-          summary: Postgresql dead locks (instance {{ $labels.instance }})
+          summary: PostgreSQL dead-locks at instance {{ $labels.instance }}.
           description: |-
-            PostgreSQL has dead-locks.
+            PostgreSQL shows dead-locks.
 
-              VALUE = {{ $value }}
-              LABELS = {{ $labels }}
+            VALUE = {{ $value }}
+            LABELS = {{ $labels }}
 
       - alert: PostgresqlHighRollbackRate
         expr: >-
@@ -136,17 +139,17 @@ groups:
                 (rate(pg_stat_database_xact_commit{datname!~"template.*|postgres",datid!="0"}[3m]))
               )
             )
-          > 0.02
+          > 0.10
         for: 0m
         labels:
           severity: warning
         annotations:
-          summary: PostgreSQL is at a high rollback rate (instance {{ $labels.instance }})
+          summary: PostgreSQL at a high rollback rate at {{ $labels.instance }}.
           description: |-
-            Ratio of transactions being aborted compared to committed is > 2%.
+            Ratio of transactions being aborted compared to committed is too big.
 
-              VALUE = {{ $value }}
-              LABELS = {{ $labels }}
+            VALUE = {{ $value }}
+            LABELS = {{ $labels }}
 
       - alert: PostgresqlCommitRateLow
         expr: >-
@@ -156,12 +159,12 @@ groups:
         labels:
           severity: critical
         annotations:
-          summary: PostgreSQL commit rate low (instance {{ $labels.instance }})
+          summary: PostgreSQL commit rate low at instance {{ $labels.instance }}.
           description: |-
             PostgreSQL seems to be processing very few transactions.
 
-              VALUE = {{ $value }}
-              LABELS = {{ $labels }}
+            VALUE = {{ $value }}
+            LABELS = {{ $labels }}
 
       - alert: PostgresqlLowXidConsumption
         expr: >-
@@ -171,12 +174,12 @@ groups:
         labels:
           severity: warning
         annotations:
-          summary: PostgreSQL low XID consumption (instance {{ $labels.instance }})
+          summary: PostgreSQL low XID consumption at instance {{ $labels.instance }}.
           description: |-
             PostgreSQL seems to be consuming transaction IDs very slowly.
 
-              VALUE = {{ $value }}
-              LABELS = {{ $labels }}
+            VALUE = {{ $value }}
+            LABELS = {{ $labels }}
 
       - alert: PostgresqlHighRateStatementTimeout
         expr: >-
@@ -190,8 +193,8 @@ groups:
           description: |-
             PostgreSQL transactions showing high rate of statement timeouts.
 
-              VALUE = {{ $value }}
-              LABELS = {{ $labels }}
+            VALUE = {{ $value }}
+            LABELS = {{ $labels }}
 
       - alert: PostgresqlHighRateDeadlock
         expr: >-
@@ -201,12 +204,12 @@ groups:
         labels:
           severity: critical
         annotations:
-          summary: PostgreSQL high rate deadlock (instance {{ $labels.instance }})
+          summary: PostgreSQL high rate dead-lock at {{ $labels.instance }}.
           description: |-
-            PostgreSQL detected deadlocks.
+            PostgreSQL has detected dead-locks.
 
-              VALUE = {{ $value }}
-              LABELS = {{ $labels }}
+            VALUE = {{ $value }}
+            LABELS = {{ $labels }}
 
       - alert: PostgresqlUnusedReplicationSlot
         expr: >-
@@ -215,12 +218,12 @@ groups:
         labels:
           severity: warning
         annotations:
-          summary: PostgreSQL unused replication slot (instance {{ $labels.instance }})
+          summary: PostgreSQL unused replication slot at {{ $labels.instance }}.
           description: |-
-            Unused Replication Slots.
+            Unused replication slots.
 
-              VALUE = {{ $value }}
-              LABELS = {{ $labels }}
+            VALUE = {{ $value }}
+            LABELS = {{ $labels }}
 
       - alert: PostgresqlTooManyDeadTuples
         expr: >-
@@ -234,12 +237,12 @@ groups:
         labels:
           severity: warning
         annotations:
-          summary: PostgreSQL too many dead tuples (instance {{ $labels.instance }})
+          summary: PostgreSQL too many dead tuples at {{ $labels.instance }}.
           description: |-
             PostgreSQL number of dead tuples is too large.
 
-              VALUE = {{ $value }}
-              LABELS = {{ $labels }}
+            VALUE = {{ $value }}
+            LABELS = {{ $labels }}
 
       - alert: PostgresqlSslCompressionActive
         expr: >-
@@ -248,13 +251,12 @@ groups:
         labels:
           severity: critical
         annotations:
-          summary: Postgresql SSL compression active (instance {{ $labels.instance }})
+          summary: PostgreSQL SSL compression active at {{ $labels.instance }}.
           description: |-
-            Database connections with SSL compression is enabled. This may add a
-            significant jitter in the replication delay.
+            Database connections with SSL compression are enabled. This may add significant jitter to the replication delay.
 
-              VALUE = {{ $value }}
-              LABELS = {{ $labels }}
+            VALUE = {{ $value }}
+            LABELS = {{ $labels }}
 
       - alert: PostgresqlTooManyLocksAcquired
         expr: >-
@@ -268,12 +270,12 @@ groups:
         labels:
           severity: critical
         annotations:
-          summary: PostgreSQL too many locks acquired (instance {{ $labels.instance }})
+          summary: PostgreSQL too many locks acquired at {{ $labels.instance }}.
           description: |-
             Too many locks acquired on the database.
 
-              VALUE = {{ $value }}
-              LABELS = {{ $labels }}
+            VALUE = {{ $value }}
+            LABELS = {{ $labels }}
 
       - alert: PostgresqlBloatIndexHigh
         expr: >-
@@ -284,13 +286,12 @@ groups:
         labels:
           severity: warning
         annotations:
-          summary: PostgreSQL bloat index high (> 80%) (instance {{ $labels.instance }})
+          summary: PostgreSQL index bloat high at {{ $labels.instance }}.
           description: |-
-            The index {{ $labels.idxname }} is bloated. You should execute
-            `REINDEX INDEX CONCURRENTLY {{ $labels.idxname }};`
+            The index {{ $labels.idxname }} is bloated. You should execute `REINDEX INDEX CONCURRENTLY {{ $labels.idxname }};`.
 
-              VALUE = {{ $value }}
-              LABELS = {{ $labels }}
+            VALUE = {{ $value }}
+            LABELS = {{ $labels }}
 
       - alert: PostgresqlBloatTableHigh
         expr: >-
@@ -301,10 +302,9 @@ groups:
         labels:
           severity: warning
         annotations:
-          summary: PostgreSQL bloat table high (> 80%) (instance {{ $labels.instance }})
+          summary: PostgreSQL table bloat high at instance {{ $labels.instance }}.
           description: |-
-            The table {{ $labels.relname }} is bloated. You should execute
-            `VACUUM {{ $labels.relname }};`
+            The table {{ $labels.relname }} is bloated. You should execute `VACUUM {{ $labels.relname }};`.
 
-              VALUE = {{ $value }}
-              LABELS = {{ $labels }}
+            VALUE = {{ $value }}
+            LABELS = {{ $labels }}
diff --git a/modules/nixos/monitoring/rules/redis.yaml b/modules/nixos/monitoring/rules/redis.yaml
index f6d1fe1..c07c819 100644
--- a/modules/nixos/monitoring/rules/redis.yaml
+++ b/modules/nixos/monitoring/rules/redis.yaml
@@ -10,12 +10,12 @@ groups:
         labels:
           severity: critical
         annotations:
-          summary: Redis down (instance {{ $labels.instance }})
+          summary: Redis down at {{ $labels.instance }}.
           description: |-
             Redis instance is down.
 
-              VALUE = {{ $value }}
-              LABELS = {{ $labels }}
+            VALUE = {{ $value }}
+            LABELS = {{ $labels }}
 
       - alert: RedisMissingMaster
         expr: >-
@@ -25,12 +25,12 @@ groups:
         labels:
           severity: critical
         annotations:
-          summary: Redis missing master (instance {{ $labels.instance }})
+          summary: Redis missing master at {{ $labels.instance }}.
           description: |-
-            Redis cluster has no node marked as master.
+            Redis cluster has no node marked as a master.
 
-              VALUE = {{ $value }}
-              LABELS = {{ $labels }}
+            VALUE = {{ $value }}
+            LABELS = {{ $labels }}
 
       - alert: RedisTooManyMasters
         expr: >-
@@ -39,12 +39,12 @@ groups:
         labels:
           severity: critical
         annotations:
-          summary: Redis too many masters (instance {{ $labels.instance }})
+          summary: Redis too many masters at {{ $labels.instance }}.
           description: |-
-            Redis cluster has too many nodes marked as master.
+            Redis cluster has too many nodes marked as a master.
 
-              VALUE = {{ $value }}
-              LABELS = {{ $labels }}
+            VALUE = {{ $value }}
+            LABELS = {{ $labels }}
 
       - alert: RedisDisconnectedSlaves
         expr: >-
@@ -56,12 +56,12 @@ groups:
         labels:
           severity: critical
         annotations:
-          summary: Redis disconnected slaves (instance {{ $labels.instance }})
+          summary: Redis disconnected slaves at {{ $labels.instance }}.
           description: |-
             Redis is not replicating for all slaves.
 
-              VALUE = {{ $value }}
-              LABELS = {{ $labels }}
+            VALUE = {{ $value }}
+            LABELS = {{ $labels }}
 
       - alert: RedisReplicationBroken
         expr: >-
@@ -70,12 +70,12 @@ groups:
         labels:
           severity: critical
         annotations:
-          summary: Redis replication broken (instance {{ $labels.instance }})
+          summary: Redis replication broken at {{ $labels.instance }}.
           description: |-
             Redis instance lost a slave.
 
-              VALUE = {{ $value }}
-              LABELS = {{ $labels }}
+            VALUE = {{ $value }}
+            LABELS = {{ $labels }}
 
       - alert: RedisClusterFlapping
         expr: >-
@@ -84,14 +84,12 @@ groups:
         labels:
           severity: critical
         annotations:
-          summary: Redis cluster flapping (instance {{ $labels.instance }})
+          summary: Redis cluster flapping at {{ $labels.instance }}.
           description: |-
-            Changes have been detected in the Redis replica connection. This can
-            occur when replica nodes lose connection to the master and reconnect
-            (a.k.a flapping).
+            Changes have been detected in the Redis replica connection. This can occur when replica nodes lose connection to the master and reconnect (a.k.a. flapping).
 
-              VALUE = {{ $value }}
-              LABELS = {{ $labels }}
+            VALUE = {{ $value }}
+            LABELS = {{ $labels }}
 
       - alert: RedisMissingBackup
         expr: >-
@@ -101,12 +99,12 @@ groups:
         labels:
           severity: critical
         annotations:
-          summary: Redis missing backup (instance {{ $labels.instance }})
+          summary: Redis missing backup at {{ $labels.instance }}.
           description: |-
             Redis has not been backed up for 24 hours.
 
-              VALUE = {{ $value }}
-              LABELS = {{ $labels }}
+            VALUE = {{ $value }}
+            LABELS = {{ $labels }}
 
       - alert: RedisOutOfSystemMemory
         expr: >-
@@ -118,12 +116,12 @@ groups:
         labels:
           severity: warning
         annotations:
-          summary: Redis out of system memory (instance {{ $labels.instance }})
+          summary: Redis out of system memory at {{ $labels.instance }}.
           description: |-
-            Redis is running out of system memory (> 90%).
+            Redis is running out of system memory.
 
-              VALUE = {{ $value }}
-              LABELS = {{ $labels }}
+            VALUE = {{ $value }}
+            LABELS = {{ $labels }}
 
       - alert: RedisOutOfConfiguredMaxmemory
         expr: >-
@@ -139,12 +137,12 @@ groups:
         labels:
           severity: warning
         annotations:
-          summary: Redis out of configured maxmemory (instance {{ $labels.instance }})
+          summary: Redis out of configured maxmemory at {{ $labels.instance }}.
           description: |-
-            Redis is running out of configured maxmemory (> 90%).
+            Redis is running out of configured maxmemory.
 
-              VALUE = {{ $value }}
-              LABELS = {{ $labels }}
+            VALUE = {{ $value }}
+            LABELS = {{ $labels }}
 
       - alert: RedisTooManyConnections
         expr: >-
@@ -153,12 +151,12 @@ groups:
         labels:
           severity: warning
         annotations:
-          summary: Redis too many connections (instance {{ $labels.instance }})
+          summary: Redis too many connections at {{ $labels.instance }}.
           description: |-
             Redis instance has too many connections.
 
-              VALUE = {{ $value }}
-              LABELS = {{ $labels }}
+            VALUE = {{ $value }}
+            LABELS = {{ $labels }}
 
       - alert: RedisNotEnoughConnections
         expr: >-
@@ -167,12 +165,12 @@ groups:
         labels:
           severity: warning
         annotations:
-          summary: Redis not enough connections (instance {{ $labels.instance }})
+          summary: Redis not enough connections at {{ $labels.instance }}.
           description: |-
-            Redis instance should have more connections (> 1).
+            Redis instance should have more connections.
 
-              VALUE = {{ $value }}
-              LABELS = {{ $labels }}
+            VALUE = {{ $value }}
+            LABELS = {{ $labels }}
 
       - alert: RedisRejectedConnections
         expr: >-
@@ -181,9 +179,9 @@ groups:
         labels:
           severity: critical
         annotations:
-          summary: Redis rejected connections (instance {{ $labels.instance }})
+          summary: Redis rejected connections at {{ $labels.instance }}.
           description: |-
             Some connections to Redis have been rejected.
 
-              VALUE = {{ $value }}
-              LABELS = {{ $labels }}
+            VALUE = {{ $value }}
+            LABELS = {{ $labels }}
author	Azat Bahawi <azat@bahawi.net>	2023-04-12 04:01:46 +0300
committer	Azat Bahawi <azat@bahawi.net>	2023-04-12 04:01:46 +0300
commit	d6368c86bc949371e904eed3d0a6583ebd53b055 (patch)
tree	042db513412ba7f1577b1ac690d4e0e0fac22cbf /modules/nixos/monitoring/rules
parent	2023-04-07 (diff)