diff options
Diffstat (limited to 'modules/nixos/monitoring')
-rw-r--r-- | modules/nixos/monitoring/rules/node.yaml | 19 | ||||
-rw-r--r-- | modules/nixos/monitoring/rules/redis.yaml | 89 |
2 files changed, 1 insertion, 107 deletions
diff --git a/modules/nixos/monitoring/rules/node.yaml b/modules/nixos/monitoring/rules/node.yaml index 98217b3..eee5939 100644 --- a/modules/nixos/monitoring/rules/node.yaml +++ b/modules/nixos/monitoring/rules/node.yaml @@ -238,28 +238,11 @@ groups: VALUE = {{ $value }} LABELS = {{ $labels }} - - alert: HostCpuStealNoisyNeighbor - expr: >- - avg by(instance) (rate(node_cpu_seconds_total{mode="steal"}[5m])) - * 100 - > 15 - for: 0m - labels: - severity: warning - annotations: - summary: Host CPU steal noisy neighbor at {{ $labels.instance }}. - description: |- - CPU steal is > 10%. A noisy neighbor is killing VM performances or a - spot instance may be out of credit. - - VALUE = {{ $value }} - LABELS = {{ $labels }} - - alert: HostCpuHighIowait expr: |- avg by (instance) (rate(node_cpu_seconds_total{mode="iowait"}[5m])) * 100 - > 15 + > 50 for: 0m labels: severity: warning diff --git a/modules/nixos/monitoring/rules/redis.yaml b/modules/nixos/monitoring/rules/redis.yaml index c07c819..b47c313 100644 --- a/modules/nixos/monitoring/rules/redis.yaml +++ b/modules/nixos/monitoring/rules/redis.yaml @@ -17,95 +17,6 @@ groups: VALUE = {{ $value }} LABELS = {{ $labels }} - - alert: RedisMissingMaster - expr: >- - (count(redis_instance_info{role="master"}) or vector(0)) - < 1 - for: 0m - labels: - severity: critical - annotations: - summary: Redis missing master at {{ $labels.instance }}). - description: |- - Redis cluster has no node marked as a master. - - VALUE = {{ $value }} - LABELS = {{ $labels }} - - - alert: RedisTooManyMasters - expr: >- - count(redis_instance_info{role="master"}) > 1 - for: 0m - labels: - severity: critical - annotations: - summary: Redis too many masters at {{ $labels.instance }}. - description: |- - Redis cluster has too many nodes marked as a master. 
- - VALUE = {{ $value }} - LABELS = {{ $labels }} - - - alert: RedisDisconnectedSlaves - expr: >- - count without (instance, job) (redis_connected_slaves) - - sum without (instance, job) (redis_connected_slaves) - - 1 - > 1 - for: 0m - labels: - severity: critical - annotations: - summary: Redis disconnected slaves at {{ $labels.instance }}. - description: |- - Redis is not replicating for all slaves. - - VALUE = {{ $value }} - LABELS = {{ $labels }} - - - alert: RedisReplicationBroken - expr: >- - delta(redis_connected_slaves[1m]) < 0 - for: 0m - labels: - severity: critical - annotations: - summary: Redis replication broken at {{ $labels.instance }}. - description: |- - Redis instance lost a slave. - - VALUE = {{ $value }} - LABELS = {{ $labels }} - - - alert: RedisClusterFlapping - expr: >- - changes(redis_connected_slaves[1m]) > 1 - for: 2m - labels: - severity: critical - annotations: - summary: Redis cluster flapping at {{ $labels.instance }}. - description: |- - Changes have been detected in the Redis replica connection. This can occur when replica nodes lose connection to the master and reconnect (a.k.a flapping). - - VALUE = {{ $value }} - LABELS = {{ $labels }} - - - alert: RedisMissingBackup - expr: >- - time() - redis_rdb_last_save_timestamp_seconds - > 60 * 60 * 24 - for: 0m - labels: - severity: critical - annotations: - summary: Redis missing backup at {{ $labels.instance }}. - description: |- - Redis has not been backed up for 24 hours. - - VALUE = {{ $value }} - LABELS = {{ $labels }} - - alert: RedisOutOfSystemMemory expr: >- redis_memory_used_bytes |