summary refs log tree commit diff
path: root/modules
diff options
context:
space:
mode:
author	Azat Bahawi <azat@bahawi.net>	2023-04-12 04:01:46 +0300
committer	Azat Bahawi <azat@bahawi.net>	2023-04-12 04:01:46 +0300
commit	d6368c86bc949371e904eed3d0a6583ebd53b055 (patch)
tree	042db513412ba7f1577b1ac690d4e0e0fac22cbf /modules
parent	dae3149a93cab4d1140526e15eb928d275f56128 (diff)
2023-04-12
Diffstat (limited to 'modules')
-rw-r--r--	modules/nixos/firefox/default.nix	24
-rw-r--r--	modules/nixos/ipfs.nix	161
-rw-r--r--	modules/nixos/matrix/dendrite.nix	2
-rw-r--r--	modules/nixos/monitoring/default.nix	195
-rw-r--r--	modules/nixos/monitoring/rules/nginx.yaml	40
-rw-r--r--	modules/nixos/monitoring/rules/node.yaml	304
-rw-r--r--	modules/nixos/monitoring/rules/postgres.yaml	154
-rw-r--r--	modules/nixos/monitoring/rules/redis.yaml	86
8 files changed, 448 insertions, 518 deletions
diff --git a/modules/nixos/firefox/default.nix b/modules/nixos/firefox/default.nix
index da78a1c..30927b3 100644
--- a/modules/nixos/firefox/default.nix
+++ b/modules/nixos/firefox/default.nix
@@ -67,16 +67,20 @@ in {
userContent = mkCssWithRoot ./userContent.css;
- extensions = with pkgs.nur.repos.rycee.firefox-addons; [
- bitwarden
- consent-o-matic
- darkreader
- localcdn
- noscript
- privacy-redirect
- ublock-origin
- violentmonkey
- ];
+ extensions = with pkgs.nur.repos.rycee.firefox-addons;
+ [
+ bitwarden
+ consent-o-matic
+ darkreader
+ furiganaize
+ localcdn
+ no-pdf-download
+ noscript
+ privacy-redirect
+ ublock-origin
+ violentmonkey
+ ]
+ ++ optional config.nixfiles.modules.ipfs.enable ipfs-companion;
# https://github.com/arkenfox/user.js/blob/master/user.js
arkenfox = {
diff --git a/modules/nixos/ipfs.nix b/modules/nixos/ipfs.nix
index 0ec64e5..68075ff 100644
--- a/modules/nixos/ipfs.nix
+++ b/modules/nixos/ipfs.nix
@@ -7,9 +7,9 @@
with lib; let
cfg = config.nixfiles.modules.ipfs;
- swarmDefaultPort = 4001;
- apiDefaultPort = 5001;
gatewayDefaultPort = 6001;
+ apiDefaultPort = 5001;
+ swarmDefaultPort = 4001;
in {
options.nixfiles.modules.ipfs = {
enable = mkEnableOption "IPFS daemon";
@@ -20,13 +20,13 @@ in {
default = "ipfs.${config.networking.fqdn}";
};
- swarmPort = mkOption {
- description = "Swarm port.";
+ gatewayPort = mkOption {
+ description = "Gateway port.";
type = with types; port;
default =
if this.isHeadless
- then swarmDefaultPort + 990
- else swarmDefaultPort;
+ then gatewayDefaultPort + 990
+ else gatewayDefaultPort;
};
apiPort = mkOption {
@@ -38,19 +38,16 @@ in {
else apiDefaultPort;
};
- gatewayPort = mkOption {
- description = "Gateway port.";
+ swarmPort = mkOption {
+ description = "Swarm port.";
type = with types; port;
- default =
- if this.isHeadless
- then gatewayDefaultPort + 990
- else gatewayDefaultPort;
+ default = swarmDefaultPort;
};
};
config = mkIf cfg.enable (mkMerge [
{
- services.ipfs = {
+ services.kubo = {
enable = true;
user = my.username;
@@ -58,70 +55,68 @@ in {
dataDir = "${config.dirs.data}/ipfs";
- swarmAddress = let
- port = toString cfg.swarmPort;
- in
- if this.isHeadless
- then [
- "/ip4/127.0.0.1/tcp/${port}"
- "/ip4/127.0.0.1/udp/${port}/quic"
- ]
- else [
- "/ip4/0.0.0.0/tcp/${port}"
- "/ip6/::/tcp/${port}"
- "/ip4/0.0.0.0/udp/${port}/quic"
- "/ip6/::/udp/${port}/quic"
- ];
- apiAddress = "/ip4/127.0.0.1/tcp/${toString cfg.apiPort}";
- gatewayAddress = "/ip4/127.0.0.1/tcp/${toString cfg.gatewayPort}";
-
autoMigrate = true;
autoMount = true;
emptyRepo = true;
enableGC = true;
- extraConfig = mkMerge [
- (let
- filterAddresses =
- [
- "/ip4/100.64.0.0/ipcidr/10"
- "/ip4/169.254.0.0/ipcidr/16"
- "/ip4/172.16.0.0/ipcidr/12"
- "/ip4/192.0.0.0/ipcidr/24"
- "/ip4/192.0.2.0/ipcidr/24"
- "/ip4/192.168.0.0/ipcidr/16"
- "/ip4/198.18.0.0/ipcidr/15"
- "/ip4/198.51.100.0/ipcidr/24"
- "/ip4/203.0.113.0/ipcidr/24"
- "/ip4/240.0.0.0/ipcidr/4"
- "/ip6/100::/ipcidr/64"
- "/ip6/2001:2::/ipcidr/48"
- "/ip6/2001:db8::/ipcidr/32"
- "/ip6/fe80::/ipcidr/10"
- ]
- ++ optionals (!hasAttr "wireguard" this) [
- "/ip4/10.0.0.0/ipcidr/8"
- "/ip6/fc00::/ipcidr/7"
- ];
- in {
- Addresses = with config.services.ipfs; {
- # https://github.com/NixOS/nixpkgs/pull/165259
- # I think this shit broke inheritance... Gotta test more and make
- # a PR I guess.
- API = apiAddress;
- Gateway = gatewayAddress;
- Swarm = swarmAddress;
+ settings = mkMerge [
+ (
+ let
+ filterAddresses =
+ [
+ "/ip4/100.64.0.0/ipcidr/10"
+ "/ip4/169.254.0.0/ipcidr/16"
+ "/ip4/172.16.0.0/ipcidr/12"
+ "/ip4/192.0.0.0/ipcidr/24"
+ "/ip4/192.0.2.0/ipcidr/24"
+ "/ip4/192.168.0.0/ipcidr/16"
+ "/ip4/198.18.0.0/ipcidr/15"
+ "/ip4/198.51.100.0/ipcidr/24"
+ "/ip4/203.0.113.0/ipcidr/24"
+ "/ip4/240.0.0.0/ipcidr/4"
+ "/ip6/100::/ipcidr/64"
+ "/ip6/2001:2::/ipcidr/48"
+ "/ip6/2001:db8::/ipcidr/32"
+ "/ip6/fe80::/ipcidr/10"
+ ]
+ ++ optionals (!hasAttr "wireguard" this) [
+ "/ip4/10.0.0.0/ipcidr/8"
+ "/ip6/fc00::/ipcidr/7"
+ ];
+ in {
+ Addresses = with config.services.ipfs; {
+ API = "/ip4/127.0.0.1/tcp/${toString cfg.apiPort}";
+ Gateway = "/ip4/127.0.0.1/tcp/${toString cfg.gatewayPort}";
+ Swarm = let
+ port = toString cfg.swarmPort;
+ in [
+ "/ip4/0.0.0.0/tcp/${port}"
+ "/ip6/::/tcp/${port}"
+ "/ip4/0.0.0.0/udp/${port}/quic"
+ "/ip6/::/udp/${port}/quic"
+ ];
- NoAnnounce = filterAddresses;
- };
- Swarm.AddrFilters = filterAddresses;
- API.HTTPHeaders.Access-Control-Allow-Methods = ["GET" "POST" "PUT"];
- })
+ NoAnnounce = filterAddresses;
+ };
+ Swarm.AddrFilters = filterAddresses;
+ API.HTTPHeaders.Access-Control-Allow-Methods = [
+ "GET"
+ "POST"
+ "PUT"
+ ];
+ }
+ )
(mkIf this.isHeadful {
API.HTTPHeaders.Access-Control-Allow-Origin = ["*"];
})
(mkIf this.isHeadless {
- API.HTTPHeaders.Access-Control-Allow-Origin = ["https://${cfg.domain}" "https://api.${cfg.domain}"];
+ API.HTTPHeaders.Access-Control-Allow-Origin = map (v: "http${
+ optionalString config.nixfiles.modules.acme.enable "s"
+ }://${v}") (with cfg; [
+ domain
+ "api.${domain}"
+ ]);
})
];
};
@@ -134,30 +129,22 @@ in {
(mkIf this.isHeadless {
nixfiles.modules.nginx = {
enable = true;
- upstreams = {
- ipfs_gateway.servers."127.0.0.1:${toString cfg.gatewayPort}" = {};
- ipfs_swarm.servers."127.0.0.1:${toString cfg.swarmPort}" = {};
- ipfs_api.servers."127.0.0.1:${toString cfg.apiPort}" = {};
+ upstreams = with cfg; {
+ kubo_gateway.servers."127.0.0.1:${toString gatewayPort}" = {};
+ kubo_api.servers."127.0.0.1:${toString apiPort}" = {};
};
virtualHosts = {
- ${cfg.domain}.locations."/".proxyPass = "http://ipfs_gateway";
- "swarm.${cfg.domain}" = {
- serverName = cfg.domain;
- listen = [
- {
- addr = "0.0.0.0";
- port = swarmDefaultPort;
- }
- {
- addr = "[::0]";
- port = swarmDefaultPort;
- }
- ];
- locations."/".proxyPass = "http://ipfs_swarm";
+ ${cfg.domain} = {
+ locations."/".proxyPass = "http://kubo_gateway";
+ extraConfig = nginxInternalOnly;
};
"api.${cfg.domain}" = {
- # TODO Redirect "/" to "/webui" but keep other endpoints.
- locations."/".proxyPass = "http://ipfs_api";
+ locations = {
+ "/".proxyPass = "http://kubo_api";
+ "~ ^/$".return = "301 http${
+ optionalString config.nixfiles.modules.acme.enable "s"
+ }://api.${cfg.domain}/webui";
+ };
extraConfig = nginxInternalOnly;
};
};
diff --git a/modules/nixos/matrix/dendrite.nix b/modules/nixos/matrix/dendrite.nix
index 6b662b2..35647cb 100644
--- a/modules/nixos/matrix/dendrite.nix
+++ b/modules/nixos/matrix/dendrite.nix
@@ -160,7 +160,7 @@ in {
"-o /run/dendrite/dendrite.yaml"
];
ExecStart = concatStringsSep " " [
- "${pkgs.dendrite}/bin/dendrite-monolith-server"
+ "${pkgs.dendrite}/bin/dendrite"
"--config /run/dendrite/dendrite.yaml"
"--http-bind-address 127.0.0.1:${toString cfg.port}"
];
diff --git a/modules/nixos/monitoring/default.nix b/modules/nixos/monitoring/default.nix
index a492a47..37e34d9 100644
--- a/modules/nixos/monitoring/default.nix
+++ b/modules/nixos/monitoring/default.nix
@@ -124,146 +124,77 @@ in {
prometheus = {
# It would be nice if these could be generated dynamically. That would
# require a complete rework of how configurations are defined, though.
- scrapeConfigs = let
- mkTargets = hosts: port: map (host: "${host.hostname}:${toString port}") hosts;
- in
- with my.configurations;
- with config.services.prometheus.exporters; [
- {
- job_name = "promtail";
- static_configs = [
- {
- targets =
- mkTargets
- [
- manwe
- varda
- yavanna
- ]
- config.nixfiles.modules.promtail.port;
- }
- ];
- }
- {
- job_name = "ntfy";
- static_configs = [
- {
- targets =
- mkTargets
- [
- manwe
- ]
- config.nixfiles.modules.ntfy.prometheus.port;
- }
- ];
- }
- {
- job_name = "soju";
- static_configs = [
- {
- targets = [
- "127.0.0.1:${toString config.nixfiles.modules.soju.prometheus.port}"
- ];
- }
- ];
- }
- {
- job_name = "endlessh-go";
- static_configs = [
- {
- targets =
- mkTargets
- [
- manwe
- varda
- yavanna
- ]
- config.services.endlessh-go.prometheus.port;
- }
- ];
- }
- {
- job_name = "nginx";
- static_configs = [
- {
- targets =
- mkTargets
- [
- manwe
- yavanna
- ]
- nginx.port;
- }
- ];
- }
- {
- job_name = "node";
- static_configs = [
- {
- targets =
- mkTargets
- [
- manwe
- varda
- yavanna
- ]
- node.port;
- }
- ];
- }
- {
- job_name = "postgres";
+ scrapeConfigs = with my.configurations;
+ mapAttrsToList
+ (
+ name: value: {
+ job_name = name;
static_configs = [
{
- targets =
- mkTargets
- [
- manwe
- ]
- postgres.port;
+ targets = with value;
+ map (host:
+ concatStringsSep ":" [
+ (
+ if isAttrs host
+ then host.hostname
+ else host
+ )
+ (toString port)
+ ])
+ hosts;
}
];
- }
- {
- job_name = "redis";
- static_configs = [
+ relabel_configs = [
{
- targets =
- mkTargets
- [
- manwe
- ]
- redis.port;
+ source_labels = ["__address__"];
+ regex = "([^:]+):\\d+";
+ target_label = "instance";
}
];
}
- {
- job_name = "unbound";
- static_configs = [
- {
- targets =
- mkTargets
- [
- manwe
- ]
- unbound.port;
- }
- ];
- }
- {
- job_name = "wireguard";
- static_configs = [
- {
- targets =
- mkTargets
- [
- manwe
- ]
- wireguard.port;
- }
- ];
- }
- ];
+ )
+ {
+ promtail = {
+ hosts = [manwe varda yavanna];
+ inherit (config.nixfiles.modules.promtail) port;
+ };
+ ntfy = {
+ hosts = [manwe];
+ inherit (config.nixfiles.modules.ntfy.prometheus) port;
+ };
+ soju = {
+ hosts = ["127.0.0.1"];
+ inherit (config.nixfiles.modules.soju.prometheus) port;
+ };
+ endlessh-go = {
+ hosts = [manwe varda yavanna];
+ inherit (config.services.endlessh-go.prometheus) port;
+ };
+ nginx = {
+ hosts = [manwe yavanna];
+ inherit (config.services.prometheus.exporters.nginx) port;
+ };
+ node = {
+ hosts = [manwe varda yavanna];
+ inherit (config.services.prometheus.exporters.node) port;
+ };
+ postgres = {
+ hosts = [manwe];
+ inherit (config.services.prometheus.exporters.postgres) port;
+ };
+ redis = {
+ hosts = [manwe];
+ inherit (config.services.prometheus.exporters.redis) port;
+ };
+ unbound = {
+ hosts = [manwe];
+ inherit (config.services.prometheus.exporters.unbound) port;
+ };
+ wireguard = {
+ hosts = [manwe];
+ inherit (config.services.prometheus.exporters.wireguard) port;
+ };
+ };
ruleFiles = [
./rules/nginx.yaml
diff --git a/modules/nixos/monitoring/rules/nginx.yaml b/modules/nixos/monitoring/rules/nginx.yaml
index 59229a8..f00d372 100644
--- a/modules/nixos/monitoring/rules/nginx.yaml
+++ b/modules/nixos/monitoring/rules/nginx.yaml
@@ -6,47 +6,55 @@ groups:
- alert: NginxHighHttp4xxErrorRate
expr: >-
sum(rate(nginx_http_requests_total{status=~"^4.."}[1m]))
- / sum(rate(nginx_http_requests_total[1m])) * 100
+ /
+ sum(rate(nginx_http_requests_total[1m])) * 100
> 5
for: 1m
labels:
severity: critical
annotations:
- summary: NGINX high HTTP 4xx error rate (instance {{ $labels.instance }})
+ summary: NGINX high HTTP 4xx error rate at {{ $labels.instance }}.
description: |-
- Too many HTTP requests with status 4xx (> 5%).
+ Too many HTTP requests with a 4xx status code.
- VALUE = {{ $value }}
- LABELS = {{ $labels }}
+ VALUE = {{ $value }}
+ LABELS = {{ $labels }}
- alert: NginxHighHttp5xxErrorRate
expr: >-
sum(rate(nginx_http_requests_total{status=~"^5.."}[1m]))
- / sum(rate(nginx_http_requests_total[1m])) * 100
+ /
+ sum(rate(nginx_http_requests_total[1m])) * 100
> 5
for: 1m
labels:
severity: critical
annotations:
- summary: NGINX high HTTP 5xx error rate (instance {{ $labels.instance }})
+ summary: NGINX high HTTP 5xx error rate at {{ $labels.instance }}.
description: |-
- Too many HTTP requests with status 5xx (> 5%).
+ Too many HTTP requests with a 5xx status code.
- VALUE = {{ $value }}
- LABELS = {{ $labels }}
+ VALUE = {{ $value }}
+ LABELS = {{ $labels }}
- alert: NginxLatencyHigh
expr: >-
- histogram_quantile(0.99,
- sum(rate(nginx_http_request_duration_seconds_bucket[2m])) by (host, node))
+ histogram_quantile(
+ 0.99,
+ sum(
+ rate(
+ nginx_http_request_duration_seconds_bucket[2m]
+ )
+ ) by (host, node)
+ )
> 3
for: 2m
labels:
severity: warning
annotations:
- summary: NGINX high latency (instance {{ $labels.instance }})
+ summary: NGINX high latency at {{ $labels.instance }}.
description: |-
- NGINX 99% latency is higher than 3 seconds.
+ NGINX 99% of latency spikes is higher than 3 seconds.
- VALUE = {{ $value }}
- LABELS = {{ $labels }}
+ VALUE = {{ $value }}
+ LABELS = {{ $labels }}
diff --git a/modules/nixos/monitoring/rules/node.yaml b/modules/nixos/monitoring/rules/node.yaml
index 81d7810..98217b3 100644
--- a/modules/nixos/monitoring/rules/node.yaml
+++ b/modules/nixos/monitoring/rules/node.yaml
@@ -13,12 +13,12 @@ groups:
labels:
severity: warning
annotations:
- summary: Host out of memory (instance {{ $labels.instance }})
+ summary: Host out of memory at {{ $labels.instance }}.
description: |-
- Node memory is filling up (< 10% left).
+ Node memory is filling up.
- VALUE = {{ $value }}
- LABELS = {{ $labels }}
+ VALUE = {{ $value }}
+ LABELS = {{ $labels }}
- alert: HostMemoryUnderMemoryPressure
expr: >-
@@ -27,13 +27,12 @@ groups:
labels:
severity: warning
annotations:
- summary: Host memory under memory pressure (instance {{ $labels.instance }})
+ summary: Host memory under memory pressure at {{ $labels.instance }}.
description: |-
- The node is under heavy memory pressure. High rate of major page
- faults.
+ The node is under heavy memory pressure. High rate of major page faults.
- VALUE = {{ $value }}
- LABELS = {{ $labels }}
+ VALUE = {{ $value }}
+ LABELS = {{ $labels }}
- alert: HostUnusualNetworkThroughputIn
expr: >-
@@ -47,10 +46,10 @@ groups:
annotations:
summary: Host unusual network throughput in (instance {{ $labels.instance }})
description: |-
- Host network interfaces are probably receiving too much data (> 100 MB/s).
+ Host network interfaces are probably receiving too much data.
- VALUE = {{ $value }}
- LABELS = {{ $labels }}
+ VALUE = {{ $value }}
+ LABELS = {{ $labels }}
- alert: HostUnusualNetworkThroughputOut
expr: >-
@@ -62,12 +61,12 @@ groups:
labels:
severity: warning
annotations:
- summary: Host unusual network throughput out (instance {{ $labels.instance }})
+ summary: Host unusual network throughput out at {{ $labels.instance }}.
description: |-
- Host network interfaces are probably sending too much data (> 100 MB/s).
+ Host network interfaces are probably sending too much data.
- VALUE = {{ $value }}
- LABELS = {{ $labels }}
+ VALUE = {{ $value }}
+ LABELS = {{ $labels }}
- alert: HostUnusualDiskReadRate
expr: >-
@@ -79,12 +78,12 @@ groups:
labels:
severity: warning
annotations:
- summary: Host unusual disk read rate (instance {{ $labels.instance }})
+ summary: Host unusual disk read rate at {{ $labels.instance }}.
description: |-
- Disk is probably reading too much data (> 50 MB/s).
+ Disk is probably reading too much data.
- VALUE = {{ $value }}
- LABELS = {{ $labels }}
+ VALUE = {{ $value }}
+ LABELS = {{ $labels }}
- alert: HostUnusualDiskWriteRate
expr: >-
@@ -96,12 +95,12 @@ groups:
labels:
severity: warning
annotations:
- summary: Host unusual disk write rate (instance {{ $labels.instance }})
+ summary: Host unusual disk write rate at {{ $labels.instance }}.
description: |-
- Disk is probably writing too much data (> 50 MB/s).
+ Disk is probably writing too much data.
- VALUE = {{ $value }}
- LABELS = {{ $labels }}
+ VALUE = {{ $value }}
+ LABELS = {{ $labels }}
- alert: HostOutOfDiskSpace
expr: >-
@@ -109,18 +108,18 @@ groups:
/ node_filesystem_size_bytes
< 10
and
- ON (instance, device, mountpoint) node_filesystem_readonly
+ on (instance, device, mountpoint) node_filesystem_readonly
== 0
for: 2m
labels:
severity: warning
annotations:
- summary: Host out of disk space (instance {{ $labels.instance }})
+ summary: Host out of disk space at {{ $labels.instance }}.
description: |-
- Disk is almost full (< 10% left).
+ Disk is almost full.
- VALUE = {{ $value }}
- LABELS = {{ $labels }}
+ VALUE = {{ $value }}
+ LABELS = {{ $labels }}
- alert: HostDiskWillFillIn24Hours
expr: >-
@@ -136,13 +135,12 @@ groups:
labels:
severity: warning
annotations:
- summary: Host disk will fill in 24 hours (instance {{ $labels.instance }})
+ summary: Host disk will fill in 24 hours at {{ $labels.instance }}.
description: |-
- Filesystem is predicted to run out of space within the next 24 hours
- at current write rate.
+ Filesystem is predicted to run out of space within the next 24 hours at current write rate.
- VALUE = {{ $value }}
- LABELS = {{ $labels }}
+ VALUE = {{ $value }}
+ LABELS = {{ $labels }}
- alert: HostOutOfInodes
expr: >-
@@ -156,12 +154,12 @@ groups:
labels:
severity: warning
annotations:
- summary: Host out of inodes (instance {{ $labels.instance }})
+ summary: Host out of inodes at {{ $labels.instance }}.
description: |-
- Disk is almost running out of available inodes (< 10% left).
+ Disk is almost running out of available inodes.
- VALUE = {{ $value }}
- LABELS = {{ $labels }}
+ VALUE = {{ $value }}
+ LABELS = {{ $labels }}
- alert: HostInodesWillFillIn24Hours
expr: >-
@@ -178,13 +176,12 @@ groups:
labels:
severity: warning
annotations:
- summary: Host inodes will fill in 24 hours (instance {{ $labels.instance }})
+ summary: Host inodes will fill in 24 hours at {{ $labels.instance }}.
description: |-
- Filesystem is predicted to run out of inodes within the next 24
- hours at current write rate.
+ Filesystem is predicted to run out of inodes within the next 24 hours at current write rate.
- VALUE = {{ $value }}
- LABELS = {{ $labels }}
+ VALUE = {{ $value }}
+ LABELS = {{ $labels }}
- alert: HostUnusualDiskReadLatency
expr: >-
@@ -198,12 +195,12 @@ groups:
labels:
severity: warning
annotations:
- summary: Host unusual disk read latency (instance {{ $labels.instance }})
+ summary: Host unusual disk read latency at {{ $labels.instance }}.
description: |-
- Disk latency is growing (read operations > 100ms).
+ Disk latency is growing.
- VALUE = {{ $value }}
- LABELS = {{ $labels }}
+ VALUE = {{ $value }}
+ LABELS = {{ $labels }}
- alert: HostUnusualDiskWriteLatency
expr: >-
@@ -217,12 +214,12 @@ groups:
labels:
severity: warning
annotations:
- summary: Host unusual disk write latency (instance {{ $labels.instance }})
+ summary: Host unusual disk write latency at {{ $labels.instance }}.
description: |-
- Disk latency is growing (write operations > 100ms).
+ Disk latency is growing.
- VALUE = {{ $value }}
- LABELS = {{ $labels }}
+ VALUE = {{ $value }}
+ LABELS = {{ $labels }}
- alert: HostHighCpuLoad
expr: >-
@@ -234,23 +231,23 @@ groups:
labels:
severity: warning
annotations:
- summary: Host high CPU load (instance {{ $labels.instance }})
+ summary: Host high CPU load at {{ $labels.instance }}.
description: |-
- CPU load is > 80%.
+ CPU load is high.
- VALUE = {{ $value }}
- LABELS = {{ $labels }}
+ VALUE = {{ $value }}
+ LABELS = {{ $labels }}
- alert: HostCpuStealNoisyNeighbor
expr: >-
avg by(instance) (rate(node_cpu_seconds_total{mode="steal"}[5m]))
* 100
- > 10
+ > 15
for: 0m
labels:
severity: warning
annotations:
- summary: Host CPU steal noisy neighbor (instance {{ $labels.instance }})
+ summary: Host CPU steal noisy neighbor at {{ $labels.instance }}.
description: |-
CPU steal is > 10%. A noisy neighbor is killing VM performances or a
spot instance may be out of credit.
@@ -262,18 +259,17 @@ groups:
expr: |-
avg by (instance) (rate(node_cpu_seconds_total{mode="iowait"}[5m]))
* 100
- > 10
+ > 15
for: 0m
labels:
severity: warning
annotations:
- summary: Host CPU high iowait (instance {{ $labels.instance }})
+ summary: Host CPU high I/O wait at {{ $labels.instance }}.
description: |-
- CPU iowait > 10%. A high iowait means that you are disk or network
- bound.
+ CPU I/O wait is high. A high I/O wait means that you are disk or network bound.
- VALUE = {{ $value }}
- LABELS = {{ $labels }}
+ VALUE = {{ $value }}
+ LABELS = {{ $labels }}
- alert: HostUnusualDiskIo
expr: >-
@@ -282,12 +278,12 @@ groups:
labels:
severity: warning
annotations:
- summary: Host unusual disk IO (instance {{ $labels.instance }})
+ summary: Host unusual disk I/O at {{ $labels.instance }}.
description: |-
- Time spent in IO is too high on {{ $labels.instance }}. Check storage for issues.
+ Time spent over I/O is too high. Check storage for issues.
- VALUE = {{ $value }}
- LABELS = {{ $labels }}
+ VALUE = {{ $value }}
+ LABELS = {{ $labels }}
- alert: HostSwapIsFillingUp
expr: >-
@@ -298,12 +294,12 @@ groups:
labels:
severity: warning
annotations:
- summary: Host swap is filling up (instance {{ $labels.instance }})
+ summary: Host swap is filling up at {{ $labels.instance }}.
description: |-
- Swap is filling up (> 80%).
+ Swap is filling up.
- VALUE = {{ $value }}
- LABELS = {{ $labels }}
+ VALUE = {{ $value }}
+ LABELS = {{ $labels }}
- alert: HostSystemdServiceCrashed
expr: >-
@@ -312,12 +308,12 @@ groups:
labels:
severity: warning
annotations:
- summary: Host systemd service crashed (instance {{ $labels.instance }})
+ summary: Host systemd service crashed at {{ $labels.instance }}.
description: |-
Systemd service crashed.
- VALUE = {{ $value }}
- LABELS = {{ $labels }}
+ VALUE = {{ $value }}
+ LABELS = {{ $labels }}
- alert: HostPhysicalComponentTooHot
expr: >-
@@ -326,12 +322,12 @@ groups:
labels:
severity: warning
annotations:
- summary: Host physical component too hot (instance {{ $labels.instance }})
+ summary: Host physical component too hot at {{ $labels.instance }}.
description: |-
Physical hardware component too hot.
- VALUE = {{ $value }}
- LABELS = {{ $labels }}
+ VALUE = {{ $value }}
+ LABELS = {{ $labels }}
- alert: HostNodeOvertemperatureAlarm
expr: >-
@@ -340,12 +336,12 @@ groups:
labels:
severity: critical
annotations:
- summary: Host node overtemperature alarm (instance {{ $labels.instance }})
+ summary: Host node overtemperature alarm at {{ $labels.instance }}.
description: |-
Physical node temperature alarm triggered.
- VALUE = {{ $value }}
- LABELS = {{ $labels }}
+ VALUE = {{ $value }}
+ LABELS = {{ $labels }}
- alert: HostRaidArrayGotInactive
expr: >-
@@ -354,14 +350,12 @@ groups:
labels:
severity: critical
annotations:
- summary: Host RAID array got inactive (instance {{ $labels.instance }})
+ summary: Host RAID array got inactive at {{ $labels.instance }}.
description: |-
- RAID array {{ $labels.device }} is in degraded state due to one or
- more disks failures. Number of spare drives is insufficient to fix
- issue automatically.
+ RAID array is in a degraded state due to one or more disks failures. Number of spare drives is insufficient to fix the issue automatically.
- VALUE = {{ $value }}
- LABELS = {{ $labels }}
+ VALUE = {{ $value }}
+ LABELS = {{ $labels }}
- alert: HostRaidDiskFailure
expr: >-
@@ -370,14 +364,12 @@ groups:
labels:
severity: warning
annotations:
- summary: Host RAID disk failure (instance {{ $labels.instance }})
+ summary: Host RAID disk failure at {{ $labels.instance }}.
description: |-
- At least one device in RAID array on {{ $labels.instance }} failed.
- Array {{ $labels.md_device }} needs attention and possibly a disk
- swap.
+ At least one device in RAID array is failed. Possibly, a disk swap is required.
- VALUE = {{ $value }}
- LABELS = {{ $labels }}
+ VALUE = {{ $value }}
+ LABELS = {{ $labels }}
- alert: HostOomKillDetected
expr: >-
@@ -386,12 +378,12 @@ groups:
labels:
severity: warning
annotations:
- summary: Host OOM kill detected (instance {{ $labels.instance }})
+ summary: Host OOM kill detected at {{ $labels.instance }}.
description: |-
OOM kill detected.
- VALUE = {{ $value }}
- LABELS = {{ $labels }}
+ VALUE = {{ $value }}
+ LABELS = {{ $labels }}
- alert: HostEdacCorrectableErrorsDetected
expr: >-
@@ -400,13 +392,12 @@ groups:
labels:
severity: info
annotations:
- summary: Host EDAC Correctable Errors detected (instance {{ $labels.instance }})
+ summary: Host EDAC correctable errors detected at {{ $labels.instance }}.
description: |-
- Host {{ $labels.instance }} has had {{ printf "%.0f" $value }}
- correctable memory errors reported by EDAC in the last 5 minutes.
+ Host has had {{ printf "%.0f" $value }} correctable memory errors reported by EDAC in the last 5 minutes.
- VALUE = {{ $value }}
- LABELS = {{ $labels }}
+ VALUE = {{ $value }}
+ LABELS = {{ $labels }}
- alert: HostEdacUncorrectableErrorsDetected
expr: >-
@@ -415,66 +406,67 @@ groups:
labels:
severity: warning
annotations:
- summary: Host EDAC Uncorrectable Errors detected (instance {{ $labels.instance }})
+ summary: Host EDAC uncorrectable errors detected at {{ $labels.instance }}.
description: |-
- Host {{ $labels.instance }} has had {{ printf "%.0f" $value }}
- uncorrectable memory errors reported by EDAC in the last 5
- minutes.
+ Host has had {{ printf "%.0f" $value }} uncorrectable memory errors reported by EDAC in the last 5 minutes.
- VALUE = {{ $value }}
- LABELS = {{ $labels }}
+ VALUE = {{ $value }}
+ LABELS = {{ $labels }}
- alert: HostNetworkReceiveErrors
- expr: "rate(node_network_receive_errs_total[2m]) / rate(node_network_receive_packets_total[2m]) > 0.01"
+ expr: >-
+ rate(node_network_receive_errs_total{device!~"^wg.*"}[2m])
+ /
+ rate(node_network_receive_packets_total{device!~"^wg.*"}[2m])
+ > 0.01
for: 2m
labels:
severity: warning
annotations:
- summary: Host Network Receive Errors (instance {{ $labels.instance }})
+ summary: Host Network Receive Errors at {{ $labels.instance }}.
description: |-
- Host {{ $labels.instance }} interface {{ $labels.device }} has
- encountered {{ printf "%.0f" $value }} receive errors in the last
- two minutes.
+ Host interface {{ $labels.device }} has encountered {{ printf "%.0f" $value }} receive errors in the last two minutes.
- VALUE = {{ $value }}
- LABELS = {{ $labels }}
+ VALUE = {{ $value }}
+ LABELS = {{ $labels }}
- alert: HostNetworkTransmitErrors
- expr: "rate(node_network_transmit_errs_total[2m]) / rate(node_network_transmit_packets_total[2m]) > 0.01"
+ expr: >-
+ rate(node_network_transmit_errs_total{device!~"^wg.*"}[2m])
+ /
+ rate(node_network_transmit_packets_total{device!~"^wg.*"}[2m])
+ > 0.1
for: 2m
labels:
severity: warning
annotations:
- summary: Host Network Transmit Errors (instance {{ $labels.instance }})
+ summary: Host network transmit errors at {{ $labels.instance }}.
description: |-
- Host {{ $labels.instance }} interface {{ $labels.device }} has
- encountered {{ printf "%.0f" $value }} transmit errors in the last
- two minutes.
+ Host interface {{ $labels.device }} has encountered {{ printf "%.0f" $value }} transmit errors in the last two minutes.
- VALUE = {{ $value }}
- LABELS = {{ $labels }}
+ VALUE = {{ $value }}
+ LABELS = {{ $labels }}
- alert: HostNetworkInterfaceSaturated
expr: >-
(
- rate(node_network_receive_bytes_total{device!~"^tap.*|^vnet.*|^veth.*|^tun.*"}[1m])
+ rate(node_network_receive_bytes_total{device!~"^wg.*"}[1m])
+
- rate(node_network_transmit_bytes_total{device!~"^tap.*|^vnet.*|^veth.*|^tun.*"}[1m])
+ rate(node_network_transmit_bytes_total{device!~"^wg.*"}[1m])
)
- / node_network_speed_bytes{device!~"^tap.*|^vnet.*|^veth.*|^tun.*"}
+ / node_network_speed_bytes{device!~"^wg.*"}
> 0.8
< 10000
for: 1m
labels:
severity: warning
annotations:
- summary: Host Network Interface Saturated (instance {{ $labels.instance }})
+ summary: Host network interface saturated at {{ $labels.instance }}.
description: |-
- The network interface "{{ $labels.device }}" on "{{ $labels.instance }}"
- is getting overloaded.
+ The network interface {{ $labels.device }} is getting overloaded.
- VALUE = {{ $value }}
- LABELS = {{ $labels }}
+ VALUE = {{ $value }}
+ LABELS = {{ $labels }}
- alert: HostNetworkBondDegraded
expr: >-
@@ -483,43 +475,53 @@ groups:
labels:
severity: warning
annotations:
- summary: Host Network Bond Degraded (instance {{ $labels.instance }})
+ summary: Host network bond degraded at {{ $labels.instance }}.
description: |-
- Bond "{{ $labels.device }}" degraded on "{{ $labels.instance }}".
+ Bond {{ $labels.device }} degraded.
- VALUE = {{ $value }}
- LABELS = {{ $labels }}
+ VALUE = {{ $value }}
+ LABELS = {{ $labels }}
- alert: HostConntrackLimit
expr: >-
- node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 0.8
+ node_nf_conntrack_entries
+ /
+ node_nf_conntrack_entries_limit
+ > 0.8
for: 5m
labels:
severity: warning
annotations:
- summary: Host conntrack limit (instance {{ $labels.instance }})
+ summary: Host conntrack limit at {{ $labels.instance }}.
description: |-
The number of conntrack is approaching limit.
- VALUE = {{ $value }}
- LABELS = {{ $labels }}
+ VALUE = {{ $value }}
+ LABELS = {{ $labels }}
- alert: HostClockSkew
expr: >-
- (node_timex_offset_seconds > 0.05 and deriv(node_timex_offset_seconds[5m]) >= 0)
+ (
+ node_timex_offset_seconds > 0.05
+ and
+ deriv(node_timex_offset_seconds[5m]) >= 0
+ )
or
- (node_timex_offset_seconds < -0.05 and deriv(node_timex_offset_seconds[5m]) <= 0)
+ (
+ node_timex_offset_seconds < -0.05
+ and
+ deriv(node_timex_offset_seconds[5m]) <= 0
+ )
for: 2m
labels:
severity: warning
annotations:
- summary: Host clock skew (instance {{ $labels.instance }})
+ summary: Host clock skew at {{ $labels.instance }}.
description: |-
- Clock skew detected. Clock is out of sync. Ensure NTP is configured
- correctly on this host.
+ Clock skew is detected and the clock is out of sync. Ensure that NTP is configured correctly on this host.
- VALUE = {{ $value }}
- LABELS = {{ $labels }}
+ VALUE = {{ $value }}
+ LABELS = {{ $labels }}
- alert: HostClockNotSynchronising
expr: >-
@@ -530,12 +532,12 @@ groups:
labels:
severity: warning
annotations:
- summary: Host clock not synchronising (instance {{ $labels.instance }})
+ summary: Host clock not synchronising at {{ $labels.instance }}.
description: |-
- Clock not synchronising. Ensure NTP is configured on this host.
+ Clock is not synchronising. Ensure that NTP is configured correctly on this host.
- VALUE = {{ $value }}
- LABELS = {{ $labels }}
+ VALUE = {{ $value }}
+ LABELS = {{ $labels }}
- alert: HostRequiresReboot
expr: >-
@@ -544,9 +546,9 @@ groups:
labels:
severity: info
annotations:
- summary: Host requires reboot (instance {{ $labels.instance }})
+ summary: Host requires reboot at {{ $labels.instance }}.
description: |-
- Instance {{ $labels.instance }} requires a reboot.
+ Instance requires a reboot.
- VALUE = {{ $value }}
- LABELS = {{ $labels }}
+ VALUE = {{ $value }}
+ LABELS = {{ $labels }}
diff --git a/modules/nixos/monitoring/rules/postgres.yaml b/modules/nixos/monitoring/rules/postgres.yaml
index 5d360fa..6aee560 100644
--- a/modules/nixos/monitoring/rules/postgres.yaml
+++ b/modules/nixos/monitoring/rules/postgres.yaml
@@ -10,12 +10,12 @@ groups:
labels:
severity: critical
annotations:
- summary: PostgreSQL is down (instance {{ $labels.instance }})
+ summary: PostgreSQL is down at {{ $labels.instance }}.
description: |-
- Postgresql instance is down.
+ PostgreSQL instance is down.
- VALUE = {{ $value }}
- LABELS = {{ $labels }}
+ VALUE = {{ $value }}
+ LABELS = {{ $labels }}
- alert: PostgresqlRestarted
expr: >-
@@ -24,12 +24,12 @@ groups:
labels:
severity: critical
annotations:
- summary: PostgreSQL restarted (instance {{ $labels.instance }})
+ summary: PostgreSQL restarted at {{ $labels.instance }}.
description: |-
PostgreSQL restarted.
- VALUE = {{ $value }}
- LABELS = {{ $labels }}
+ VALUE = {{ $value }}
+ LABELS = {{ $labels }}
- alert: PostgresqlExporterError
expr: >-
@@ -38,12 +38,12 @@ groups:
labels:
severity: critical
annotations:
- summary: PostgreSQL exporter error (instance {{ $labels.instance }})
+ summary: PostgreSQL exporter errors at {{ $labels.instance }}.
description: |-
- PostgreSQL exporter is showing errors. A query may be buggy in query.yaml.
+ PostgreSQL exporter is showing errors.
- VALUE = {{ $value }}
- LABELS = {{ $labels }}
+ VALUE = {{ $value }}
+ LABELS = {{ $labels }}
- alert: PostgresqlTableNotAutoVacuumed
expr: >-
@@ -55,12 +55,12 @@ groups:
labels:
severity: warning
annotations:
- summary: PostgreSQL table not auto vacuumed (instance {{ $labels.instance }})
+ summary: PostgreSQL table not auto vacuumed at {{ $labels.instance }}.
description: |-
Table {{ $labels.relname }} has not been auto vacuumed for 10 days.
- VALUE = {{ $value }}
- LABELS = {{ $labels }}
+ VALUE = {{ $value }}
+ LABELS = {{ $labels }}
- alert: PostgresqlTableNotAutoAnalyzed
expr: >-
@@ -72,57 +72,60 @@ groups:
labels:
severity: warning
annotations:
- summary: PostgreSQL table not auto analyzed (instance {{ $labels.instance }})
+ summary: PostgreSQL table not auto analyzed at {{ $labels.instance }}.
description: |-
Table {{ $labels.relname }} has not been auto analyzed for 10 days.
- VALUE = {{ $value }}
- LABELS = {{ $labels }}"
+ VALUE = {{ $value }}
+            LABELS = {{ $labels }}
- alert: PostgresqlTooManyConnections
expr: >-
- sum by (datname) (pg_stat_activity_count{datname!~"template.*|postgres"})
- > pg_settings_max_connections * 0.8
+ sum by (datname) (
+ pg_stat_activity_count{datname!~"template.*|postgres"}
+ ) > pg_settings_max_connections * 0.8
for: 2m
labels:
severity: warning
annotations:
- summary: Postgresql too many connections (instance {{ $labels.instance }})
+ summary: PostgreSQL with too many connections at {{ $labels.instance }}.
description: |-
- PostgreSQL instance has too many connections (> 80%).
+ PostgreSQL instance {{ $labels.instance }} has too many connections.
- VALUE = {{ $value }}
- LABELS = {{ $labels }}
+ VALUE = {{ $value }}
+ LABELS = {{ $labels }}
- alert: PostgresqlNotEnoughConnections
expr: >-
- sum by (datname) (pg_stat_activity_count{datname!~"template.*|postgres"})
- < 1
+ sum by (datname) (
+ pg_stat_activity_count{datname!~"template.*|postgres"}
+ ) < 1
for: 2m
labels:
severity: warning
annotations:
- summary: Postgresql not enough connections (instance {{ $labels.instance }})
+ summary: PostgreSQL with not enough connections at {{ $labels.instance }}.
description: |-
- PostgreSQL instance should have more connections (> 1).
+ PostgreSQL instance {{ $labels.instance }} should have more connections.
- VALUE = {{ $value }}
- LABELS = {{ $labels }}
+ VALUE = {{ $value }}
+ LABELS = {{ $labels }}
- alert: PostgresqlDeadLocks
expr: >-
- increase(pg_stat_database_deadlocks{datname!~"template.*|postgres"}[1m])
- > 5
+ increase(
+ pg_stat_database_deadlocks{datname!~"template.*|postgres"}[1m]
+ ) > 5
for: 0m
labels:
severity: warning
annotations:
- summary: Postgresql dead locks (instance {{ $labels.instance }})
+ summary: PostgreSQL dead-locks at instance {{ $labels.instance }}.
description: |-
- PostgreSQL has dead-locks.
+ PostgreSQL shows dead-locks.
- VALUE = {{ $value }}
- LABELS = {{ $labels }}
+ VALUE = {{ $value }}
+ LABELS = {{ $labels }}
- alert: PostgresqlHighRollbackRate
expr: >-
@@ -136,17 +139,17 @@ groups:
(rate(pg_stat_database_xact_commit{datname!~"template.*|postgres",datid!="0"}[3m]))
)
)
- > 0.02
+ > 0.10
for: 0m
labels:
severity: warning
annotations:
- summary: PostgreSQL is at a high rollback rate (instance {{ $labels.instance }})
+ summary: PostgreSQL at a high rollback rate at {{ $labels.instance }}.
description: |-
- Ratio of transactions being aborted compared to committed is > 2%.
+ Ratio of transactions being aborted compared to committed is too big.
- VALUE = {{ $value }}
- LABELS = {{ $labels }}
+ VALUE = {{ $value }}
+ LABELS = {{ $labels }}
- alert: PostgresqlCommitRateLow
expr: >-
@@ -156,12 +159,12 @@ groups:
labels:
severity: critical
annotations:
- summary: PostgreSQL commit rate low (instance {{ $labels.instance }})
+ summary: PostgreSQL commit rate low at instance {{ $labels.instance }}.
description: |-
PostgreSQL seems to be processing very few transactions.
- VALUE = {{ $value }}
- LABELS = {{ $labels }}
+ VALUE = {{ $value }}
+ LABELS = {{ $labels }}
- alert: PostgresqlLowXidConsumption
expr: >-
@@ -171,12 +174,12 @@ groups:
labels:
severity: warning
annotations:
- summary: PostgreSQL low XID consumption (instance {{ $labels.instance }})
+ summary: PostgreSQL low XID consumption at instance {{ $labels.instance }}.
description: |-
PostgreSQL seems to be consuming transaction IDs very slowly.
- VALUE = {{ $value }}
- LABELS = {{ $labels }}
+ VALUE = {{ $value }}
+ LABELS = {{ $labels }}
- alert: PostgresqlHighRateStatementTimeout
expr: >-
@@ -190,8 +193,8 @@ groups:
description: |-
PostgreSQL transactions showing high rate of statement timeouts.
- VALUE = {{ $value }}
- LABELS = {{ $labels }}
+ VALUE = {{ $value }}
+ LABELS = {{ $labels }}
- alert: PostgresqlHighRateDeadlock
expr: >-
@@ -201,12 +204,12 @@ groups:
labels:
severity: critical
annotations:
- summary: PostgreSQL high rate deadlock (instance {{ $labels.instance }})
+ summary: PostgreSQL high rate dead-lock at {{ $labels.instance }}.
description: |-
- PostgreSQL detected deadlocks.
+ PostgreSQL has detected dead-locks.
- VALUE = {{ $value }}
- LABELS = {{ $labels }}
+ VALUE = {{ $value }}
+ LABELS = {{ $labels }}
- alert: PostgresqlUnusedReplicationSlot
expr: >-
@@ -215,12 +218,12 @@ groups:
labels:
severity: warning
annotations:
- summary: PostgreSQL unused replication slot (instance {{ $labels.instance }})
+ summary: PostgreSQL unused replication slot at {{ $labels.instance }}.
description: |-
- Unused Replication Slots.
+ Unused replication slots.
- VALUE = {{ $value }}
- LABELS = {{ $labels }}
+ VALUE = {{ $value }}
+ LABELS = {{ $labels }}
- alert: PostgresqlTooManyDeadTuples
expr: >-
@@ -234,12 +237,12 @@ groups:
labels:
severity: warning
annotations:
- summary: PostgreSQL too many dead tuples (instance {{ $labels.instance }})
+ summary: PostgreSQL too many dead tuples at {{ $labels.instance }}.
description: |-
PostgreSQL number of dead tuples is too large.
- VALUE = {{ $value }}
- LABELS = {{ $labels }}
+ VALUE = {{ $value }}
+ LABELS = {{ $labels }}
- alert: PostgresqlSslCompressionActive
expr: >-
@@ -248,13 +251,12 @@ groups:
labels:
severity: critical
annotations:
- summary: Postgresql SSL compression active (instance {{ $labels.instance }})
+ summary: PostgreSQL SSL compression active at {{ $labels.instance }}.
description: |-
- Database connections with SSL compression is enabled. This may add a
- significant jitter in the replication delay.
+ Database connections with an SSL compression is enabled. This may add a significant jitter in the replication delay.
- VALUE = {{ $value }}
- LABELS = {{ $labels }}
+ VALUE = {{ $value }}
+ LABELS = {{ $labels }}
- alert: PostgresqlTooManyLocksAcquired
expr: >-
@@ -268,12 +270,12 @@ groups:
labels:
severity: critical
annotations:
- summary: PostgreSQL too many locks acquired (instance {{ $labels.instance }})
+ summary: PostgreSQL too many locks acquired at {{ $labels.instance }}.
description: |-
Too many locks acquired on the database.
- VALUE = {{ $value }}
- LABELS = {{ $labels }}
+ VALUE = {{ $value }}
+ LABELS = {{ $labels }}
- alert: PostgresqlBloatIndexHigh
expr: >-
@@ -284,13 +286,12 @@ groups:
labels:
severity: warning
annotations:
- summary: PostgreSQL bloat index high (> 80%) (instance {{ $labels.instance }})
+ summary: PostgreSQL index bloat high at {{ $labels.instance }}.
description: |-
- The index {{ $labels.idxname }} is bloated. You should execute
- `REINDEX INDEX CONCURRENTLY {{ $labels.idxname }};`
+ The index {{ $labels.idxname }} is bloated. You should execute `REINDEX INDEX CONCURRENTLY {{ $labels.idxname }};`.
- VALUE = {{ $value }}
- LABELS = {{ $labels }}
+ VALUE = {{ $value }}
+ LABELS = {{ $labels }}
- alert: PostgresqlBloatTableHigh
expr: >-
@@ -301,10 +302,9 @@ groups:
labels:
severity: warning
annotations:
- summary: PostgreSQL bloat table high (> 80%) (instance {{ $labels.instance }})
+ summary: PostgreSQL table bloat high at instance {{ $labels.instance }}.
description: |-
- The table {{ $labels.relname }} is bloated. You should execute
- `VACUUM {{ $labels.relname }};`
+ The table {{ $labels.relname }} is bloated. You should execute `VACUUM {{ $labels.relname }};`.
- VALUE = {{ $value }}
- LABELS = {{ $labels }}
+ VALUE = {{ $value }}
+ LABELS = {{ $labels }}
diff --git a/modules/nixos/monitoring/rules/redis.yaml b/modules/nixos/monitoring/rules/redis.yaml
index f6d1fe1..c07c819 100644
--- a/modules/nixos/monitoring/rules/redis.yaml
+++ b/modules/nixos/monitoring/rules/redis.yaml
@@ -10,12 +10,12 @@ groups:
labels:
severity: critical
annotations:
- summary: Redis down (instance {{ $labels.instance }})
+ summary: Redis down at {{ $labels.instance }}.
description: |-
Redis instance is down.
- VALUE = {{ $value }}
- LABELS = {{ $labels }}
+ VALUE = {{ $value }}
+ LABELS = {{ $labels }}
- alert: RedisMissingMaster
expr: >-
@@ -25,12 +25,12 @@ groups:
labels:
severity: critical
annotations:
- summary: Redis missing master (instance {{ $labels.instance }})
+        summary: Redis missing master at {{ $labels.instance }}.
description: |-
- Redis cluster has no node marked as master.
+ Redis cluster has no node marked as a master.
- VALUE = {{ $value }}
- LABELS = {{ $labels }}
+ VALUE = {{ $value }}
+ LABELS = {{ $labels }}
- alert: RedisTooManyMasters
expr: >-
@@ -39,12 +39,12 @@ groups:
labels:
severity: critical
annotations:
- summary: Redis too many masters (instance {{ $labels.instance }})
+ summary: Redis too many masters at {{ $labels.instance }}.
description: |-
- Redis cluster has too many nodes marked as master.
+ Redis cluster has too many nodes marked as a master.
- VALUE = {{ $value }}
- LABELS = {{ $labels }}
+ VALUE = {{ $value }}
+ LABELS = {{ $labels }}
- alert: RedisDisconnectedSlaves
expr: >-
@@ -56,12 +56,12 @@ groups:
labels:
severity: critical
annotations:
- summary: Redis disconnected slaves (instance {{ $labels.instance }})
+ summary: Redis disconnected slaves at {{ $labels.instance }}.
description: |-
Redis is not replicating for all slaves.
- VALUE = {{ $value }}
- LABELS = {{ $labels }}
+ VALUE = {{ $value }}
+ LABELS = {{ $labels }}
- alert: RedisReplicationBroken
expr: >-
@@ -70,12 +70,12 @@ groups:
labels:
severity: critical
annotations:
- summary: Redis replication broken (instance {{ $labels.instance }})
+ summary: Redis replication broken at {{ $labels.instance }}.
description: |-
Redis instance lost a slave.
- VALUE = {{ $value }}
- LABELS = {{ $labels }}
+ VALUE = {{ $value }}
+ LABELS = {{ $labels }}
- alert: RedisClusterFlapping
expr: >-
@@ -84,14 +84,12 @@ groups:
labels:
severity: critical
annotations:
- summary: Redis cluster flapping (instance {{ $labels.instance }})
+ summary: Redis cluster flapping at {{ $labels.instance }}.
description: |-
- Changes have been detected in the Redis replica connection. This can
- occur when replica nodes lose connection to the master and reconnect
- (a.k.a flapping).
+          Changes have been detected in the Redis replica connection. This can occur when replica nodes lose connection to the master and reconnect (a.k.a. flapping).
- VALUE = {{ $value }}
- LABELS = {{ $labels }}
+ VALUE = {{ $value }}
+ LABELS = {{ $labels }}
- alert: RedisMissingBackup
expr: >-
@@ -101,12 +99,12 @@ groups:
labels:
severity: critical
annotations:
- summary: Redis missing backup (instance {{ $labels.instance }})
+ summary: Redis missing backup at {{ $labels.instance }}.
description: |-
Redis has not been backed up for 24 hours.
- VALUE = {{ $value }}
- LABELS = {{ $labels }}
+ VALUE = {{ $value }}
+ LABELS = {{ $labels }}
- alert: RedisOutOfSystemMemory
expr: >-
@@ -118,12 +116,12 @@ groups:
labels:
severity: warning
annotations:
- summary: Redis out of system memory (instance {{ $labels.instance }})
+ summary: Redis out of system memory at {{ $labels.instance }}.
description: |-
- Redis is running out of system memory (> 90%).
+ Redis is running out of system memory.
- VALUE = {{ $value }}
- LABELS = {{ $labels }}
+ VALUE = {{ $value }}
+ LABELS = {{ $labels }}
- alert: RedisOutOfConfiguredMaxmemory
expr: >-
@@ -139,12 +137,12 @@ groups:
labels:
severity: warning
annotations:
- summary: Redis out of configured maxmemory (instance {{ $labels.instance }})
+ summary: Redis out of configured maxmemory at {{ $labels.instance }}.
description: |-
- Redis is running out of configured maxmemory (> 90%).
+ Redis is running out of configured maxmemory.
- VALUE = {{ $value }}
- LABELS = {{ $labels }}
+ VALUE = {{ $value }}
+ LABELS = {{ $labels }}
- alert: RedisTooManyConnections
expr: >-
@@ -153,12 +151,12 @@ groups:
labels:
severity: warning
annotations:
- summary: Redis too many connections (instance {{ $labels.instance }})
+ summary: Redis too many connections at {{ $labels.instance }}.
description: |-
Redis instance has too many connections.
- VALUE = {{ $value }}
- LABELS = {{ $labels }}
+ VALUE = {{ $value }}
+ LABELS = {{ $labels }}
- alert: RedisNotEnoughConnections
expr: >-
@@ -167,12 +165,12 @@ groups:
labels:
severity: warning
annotations:
- summary: Redis not enough connections (instance {{ $labels.instance }})
+ summary: Redis not enough connections at {{ $labels.instance }}.
description: |-
- Redis instance should have more connections (> 1).
+ Redis instance should have more connections.
- VALUE = {{ $value }}
- LABELS = {{ $labels }}
+ VALUE = {{ $value }}
+ LABELS = {{ $labels }}
- alert: RedisRejectedConnections
expr: >-
@@ -181,9 +179,9 @@ groups:
labels:
severity: critical
annotations:
- summary: Redis rejected connections (instance {{ $labels.instance }})
+ summary: Redis rejected connections at {{ $labels.instance }}.
description: |-
Some connections to Redis have been rejected.
- VALUE = {{ $value }}
- LABELS = {{ $labels }}
+ VALUE = {{ $value }}
+ LABELS = {{ $labels }}