summary | refs | log | tree | commit | diff
path: root/modules/nixos/monitoring
diff options
context:
space:
mode:
Diffstat (limited to 'modules/nixos/monitoring')
-rw-r--r-- modules/nixos/monitoring/rules/node.yaml | 19
-rw-r--r-- modules/nixos/monitoring/rules/redis.yaml | 89
2 files changed, 1 insertions, 107 deletions
diff --git a/modules/nixos/monitoring/rules/node.yaml b/modules/nixos/monitoring/rules/node.yaml
index 98217b3..eee5939 100644
--- a/modules/nixos/monitoring/rules/node.yaml
+++ b/modules/nixos/monitoring/rules/node.yaml
@@ -238,28 +238,11 @@ groups:
VALUE = {{ $value }}
LABELS = {{ $labels }}
- - alert: HostCpuStealNoisyNeighbor
- expr: >-
- avg by(instance) (rate(node_cpu_seconds_total{mode="steal"}[5m]))
- * 100
- > 15
- for: 0m
- labels:
- severity: warning
- annotations:
- summary: Host CPU steal noisy neighbor at {{ $labels.instance }}.
- description: |-
- CPU steal is > 10%. A noisy neighbor is killing VM performances or a
- spot instance may be out of credit.
-
- VALUE = {{ $value }}
- LABELS = {{ $labels }}
-
- alert: HostCpuHighIowait
expr: |-
avg by (instance) (rate(node_cpu_seconds_total{mode="iowait"}[5m]))
* 100
- > 15
+ > 50
for: 0m
labels:
severity: warning
diff --git a/modules/nixos/monitoring/rules/redis.yaml b/modules/nixos/monitoring/rules/redis.yaml
index c07c819..b47c313 100644
--- a/modules/nixos/monitoring/rules/redis.yaml
+++ b/modules/nixos/monitoring/rules/redis.yaml
@@ -17,95 +17,6 @@ groups:
VALUE = {{ $value }}
LABELS = {{ $labels }}
- - alert: RedisMissingMaster
- expr: >-
- (count(redis_instance_info{role="master"}) or vector(0))
- < 1
- for: 0m
- labels:
- severity: critical
- annotations:
- summary: Redis missing master at {{ $labels.instance }}).
- description: |-
- Redis cluster has no node marked as a master.
-
- VALUE = {{ $value }}
- LABELS = {{ $labels }}
-
- - alert: RedisTooManyMasters
- expr: >-
- count(redis_instance_info{role="master"}) > 1
- for: 0m
- labels:
- severity: critical
- annotations:
- summary: Redis too many masters at {{ $labels.instance }}.
- description: |-
- Redis cluster has too many nodes marked as a master.
-
- VALUE = {{ $value }}
- LABELS = {{ $labels }}
-
- - alert: RedisDisconnectedSlaves
- expr: >-
- count without (instance, job) (redis_connected_slaves)
- - sum without (instance, job) (redis_connected_slaves)
- - 1
- > 1
- for: 0m
- labels:
- severity: critical
- annotations:
- summary: Redis disconnected slaves at {{ $labels.instance }}.
- description: |-
- Redis is not replicating for all slaves.
-
- VALUE = {{ $value }}
- LABELS = {{ $labels }}
-
- - alert: RedisReplicationBroken
- expr: >-
- delta(redis_connected_slaves[1m]) < 0
- for: 0m
- labels:
- severity: critical
- annotations:
- summary: Redis replication broken at {{ $labels.instance }}.
- description: |-
- Redis instance lost a slave.
-
- VALUE = {{ $value }}
- LABELS = {{ $labels }}
-
- - alert: RedisClusterFlapping
- expr: >-
- changes(redis_connected_slaves[1m]) > 1
- for: 2m
- labels:
- severity: critical
- annotations:
- summary: Redis cluster flapping at {{ $labels.instance }}.
- description: |-
- Changes have been detected in the Redis replica connection. This can occur when replica nodes lose connection to the master and reconnect (a.k.a flapping).
-
- VALUE = {{ $value }}
- LABELS = {{ $labels }}
-
- - alert: RedisMissingBackup
- expr: >-
- time() - redis_rdb_last_save_timestamp_seconds
- > 60 * 60 * 24
- for: 0m
- labels:
- severity: critical
- annotations:
- summary: Redis missing backup at {{ $labels.instance }}.
- description: |-
- Redis has not been backed up for 24 hours.
-
- VALUE = {{ $value }}
- LABELS = {{ $labels }}
-
- alert: RedisOutOfSystemMemory
expr: >-
redis_memory_used_bytes