diff options
author | Azat Bahawi <azat@bahawi.net> | 2024-04-21 02:15:42 +0300 |
---|---|---|
committer | Azat Bahawi <azat@bahawi.net> | 2024-04-21 02:15:42 +0300 |
commit | e6ed60548397627bf10f561f9438201dbba0a36e (patch) | |
tree | f9a84c5957d2cc4fcd148065ee9365a0c851ae1c /modules/nixos/monitoring/rules | |
parent | 2024-04-18 (diff) |
2024-04-21
Diffstat (limited to 'modules/nixos/monitoring/rules')
-rw-r--r-- | modules/nixos/monitoring/rules/nginx.yaml | 60 | ||||
-rw-r--r-- | modules/nixos/monitoring/rules/node.yaml | 537 | ||||
-rw-r--r-- | modules/nixos/monitoring/rules/postgres.yaml | 310 | ||||
-rw-r--r-- | modules/nixos/monitoring/rules/redis.yaml | 98 |
4 files changed, 0 insertions, 1005 deletions
diff --git a/modules/nixos/monitoring/rules/nginx.yaml b/modules/nixos/monitoring/rules/nginx.yaml deleted file mode 100644 index f00d372..0000000 --- a/modules/nixos/monitoring/rules/nginx.yaml +++ /dev/null @@ -1,60 +0,0 @@ ---- -groups: - - name: Nginx - - rules: - - alert: NginxHighHttp4xxErrorRate - expr: >- - sum(rate(nginx_http_requests_total{status=~"^4.."}[1m])) - / - sum(rate(nginx_http_requests_total[1m])) * 100 - > 5 - for: 1m - labels: - severity: critical - annotations: - summary: NGINX high HTTP 4xx error rate at {{ $labels.instance }}. - description: |- - Too many HTTP requests with a 4xx status code. - - VALUE = {{ $value }} - LABELS = {{ $labels }} - - - alert: NginxHighHttp5xxErrorRate - expr: >- - sum(rate(nginx_http_requests_total{status=~"^5.."}[1m])) - / - sum(rate(nginx_http_requests_total[1m])) * 100 - > 5 - for: 1m - labels: - severity: critical - annotations: - summary: NGINX high HTTP 5xx error rate at {{ $labels.instance }}. - description: |- - Too many HTTP requests with a 5xx status code. - - VALUE = {{ $value }} - LABELS = {{ $labels }} - - - alert: NginxLatencyHigh - expr: >- - histogram_quantile( - 0.99, - sum( - rate( - nginx_http_request_duration_seconds_bucket[2m] - ) - ) by (host, node) - ) - > 3 - for: 2m - labels: - severity: warning - annotations: - summary: NGINX high latency at {{ $labels.instance }}. - description: |- - NGINX 99% of latency spikes is higher than 3 seconds. - - VALUE = {{ $value }} - LABELS = {{ $labels }} diff --git a/modules/nixos/monitoring/rules/node.yaml b/modules/nixos/monitoring/rules/node.yaml deleted file mode 100644 index a9b6b79..0000000 --- a/modules/nixos/monitoring/rules/node.yaml +++ /dev/null @@ -1,537 +0,0 @@ ---- -groups: - - name: Node - - rules: - - alert: HostOutOfMemory - expr: >- - node_memory_MemAvailable_bytes - / - node_memory_MemTotal_bytes * 100 - < 10 - for: 2m - labels: - severity: warning - annotations: - summary: Host out of memory at {{ $labels.instance }}. - description: |- - Node memory is filling up. - - VALUE = {{ $value }} - LABELS = {{ $labels }} - - - alert: HostMemoryUnderMemoryPressure - expr: >- - rate(node_vmstat_pgmajfault[1m]) > 1000 - for: 2m - labels: - severity: warning - annotations: - summary: Host memory under memory pressure at {{ $labels.instance }}. - description: |- - The node is under heavy memory pressure. High rate of major page faults. - - VALUE = {{ $value }} - LABELS = {{ $labels }} - - - alert: HostUnusualNetworkThroughputIn - expr: >- - sum by (instance) (rate(node_network_receive_bytes_total[2m])) - / 1024 - / 1024 - > 100 - for: 5m - labels: - severity: warning - annotations: - summary: Host unusual network throughput in (instance {{ $labels.instance }}) - description: |- - Host network interfaces are probably receiving too much data. - - VALUE = {{ $value }} - LABELS = {{ $labels }} - - - alert: HostUnusualNetworkThroughputOut - expr: >- - sum by (instance) (rate(node_network_transmit_bytes_total[2m])) - / 1024 - / 1024 - > 100 - for: 5m - labels: - severity: warning - annotations: - summary: Host unusual network throughput out at {{ $labels.instance }}. - description: |- - Host network interfaces are probably sending too much data. - - VALUE = {{ $value }} - LABELS = {{ $labels }} - - - alert: HostUnusualDiskReadRate - expr: >- - sum by (instance) (rate(node_disk_read_bytes_total[2m])) - / 1024 - / 1024 - > 50 - for: 5m - labels: - severity: warning - annotations: - summary: Host unusual disk read rate at {{ $labels.instance }}. - description: |- - Disk is probably reading too much data. - - VALUE = {{ $value }} - LABELS = {{ $labels }} - - - alert: HostUnusualDiskWriteRate - expr: >- - sum by (instance) (rate(node_disk_written_bytes_total[2m])) - / 1024 - / 1024 - > 50 - for: 2m - labels: - severity: warning - annotations: - summary: Host unusual disk write rate at {{ $labels.instance }}. - description: |- - Disk is probably writing too much data. - - VALUE = {{ $value }} - LABELS = {{ $labels }} - - - alert: HostOutOfDiskSpace - expr: >- - (node_filesystem_avail_bytes * 100) - / node_filesystem_size_bytes - < 10 - and - on (instance, device, mountpoint) node_filesystem_readonly - == 0 - for: 2m - labels: - severity: warning - annotations: - summary: Host out of disk space at {{ $labels.instance }}. - description: |- - Disk is almost full. - - VALUE = {{ $value }} - LABELS = {{ $labels }} - - - alert: HostDiskWillFillIn24Hours - expr: >- - (node_filesystem_avail_bytes * 100) - / node_filesystem_size_bytes - < 10 - and ON (instance, device, mountpoint) - predict_linear(node_filesystem_avail_bytes{fstype!~"tmpfs"}[1h], 24 * 3600) - < 0 - and ON (instance, device, mountpoint) - node_filesystem_readonly == 0 - for: 2m - labels: - severity: warning - annotations: - summary: Host disk will fill in 24 hours at {{ $labels.instance }}. - description: |- - Filesystem is predicted to run out of space within the next 24 hours at current write rate. - - VALUE = {{ $value }} - LABELS = {{ $labels }} - - - alert: HostOutOfInodes - expr: >- - node_filesystem_files_free - / node_filesystem_files * 100 - < 10 - and - ON (instance, device, mountpoint) node_filesystem_readonly - == 0 - for: 2m - labels: - severity: warning - annotations: - summary: Host out of inodes at {{ $labels.instance }}. - description: |- - Disk is almost running out of available inodes. - - VALUE = {{ $value }} - LABELS = {{ $labels }} - - - alert: HostInodesWillFillIn24Hours - expr: >- - node_filesystem_files_free - / node_filesystem_files - * 100 - < 10 - and - predict_linear(node_filesystem_files_free[1h], 24 * 3600) - < 0 - and ON (instance, device, mountpoint) node_filesystem_readonly - == 0 - for: 2m - labels: - severity: warning - annotations: - summary: Host inodes will fill in 24 hours at {{ $labels.instance }}. - description: |- - Filesystem is predicted to run out of inodes within the next 24 hours at current write rate. - - VALUE = {{ $value }} - LABELS = {{ $labels }} - - - alert: HostUnusualDiskReadLatency - expr: >- - rate(node_disk_read_time_seconds_total[1m]) - / rate(node_disk_reads_completed_total[1m]) - > 0.1 - and - rate(node_disk_reads_completed_total[1m]) - > 0 - for: 2m - labels: - severity: warning - annotations: - summary: Host unusual disk read latency at {{ $labels.instance }}. - description: |- - Disk latency is growing. - - VALUE = {{ $value }} - LABELS = {{ $labels }} - - - alert: HostUnusualDiskWriteLatency - expr: >- - rate(node_disk_write_time_seconds_total[1m]) - / rate(node_disk_writes_completed_total[1m]) - > 0.1 - and - rate(node_disk_writes_completed_total[1m]) - > 0 - for: 2m - labels: - severity: warning - annotations: - summary: Host unusual disk write latency at {{ $labels.instance }}. - description: |- - Disk latency is growing. - - VALUE = {{ $value }} - LABELS = {{ $labels }} - - - alert: HostHighCpuLoad - expr: >- - sum by (instance) - (avg by (mode, instance) - (rate(node_cpu_seconds_total{mode!="idle"}[2m]))) - > 2.0 - for: 0m - labels: - severity: warning - annotations: - summary: Host high CPU load at {{ $labels.instance }}. - description: |- - CPU load is high. - - VALUE = {{ $value }} - LABELS = {{ $labels }} - - - alert: HostCpuHighIowait - expr: |- - avg by (instance) (rate(node_cpu_seconds_total{mode="iowait"}[5m])) - * 100 - > 50 - for: 0m - labels: - severity: warning - annotations: - summary: Host CPU high I/O wait at {{ $labels.instance }}. - description: |- - CPU I/O wait is high. A high I/O wait means that you are disk or network bound. - - VALUE = {{ $value }} - LABELS = {{ $labels }} - - - alert: HostUnusualDiskIo - expr: >- - rate(node_disk_io_time_seconds_total[1m]) > 0.5 - for: 5m - labels: - severity: warning - annotations: - summary: Host unusual disk I/O at {{ $labels.instance }}. - description: |- - Time spent over I/O is too high. Check storage for issues. - - VALUE = {{ $value }} - LABELS = {{ $labels }} - - - alert: HostSwapIsFillingUp - expr: >- - (1 - (node_memory_SwapFree_bytes / node_memory_SwapTotal_bytes)) - * 100 - > 80 - for: 2m - labels: - severity: warning - annotations: - summary: Host swap is filling up at {{ $labels.instance }}. - description: |- - Swap is filling up. - - VALUE = {{ $value }} - LABELS = {{ $labels }} - - - alert: HostSystemdServiceCrashed - expr: >- - node_systemd_unit_state{state="failed"} == 1 - for: 0m - labels: - severity: warning - annotations: - summary: Host systemd service crashed at {{ $labels.instance }}. - description: |- - Systemd service crashed. - - VALUE = {{ $value }} - LABELS = {{ $labels }} - - - alert: HostPhysicalComponentTooHot - expr: >- - node_hwmon_temp_celsius > 75 - for: 5m - labels: - severity: warning - annotations: - summary: Host physical component too hot at {{ $labels.instance }}. - description: |- - Physical hardware component too hot. - - VALUE = {{ $value }} - LABELS = {{ $labels }} - - - alert: HostNodeOvertemperatureAlarm - expr: >- - node_hwmon_temp_crit_alarm_celsius == 1 - for: 0m - labels: - severity: critical - annotations: - summary: Host node overtemperature alarm at {{ $labels.instance }}. - description: |- - Physical node temperature alarm triggered. - - VALUE = {{ $value }} - LABELS = {{ $labels }} - - - alert: HostRaidArrayGotInactive - expr: >- - node_md_state{state="inactive"} > 0 - for: 0m - labels: - severity: critical - annotations: - summary: Host RAID array got inactive at {{ $labels.instance }}. - description: |- - RAID array is in a degraded state due to one or more disks failures. Number of spare drives is insufficient to fix the issue automatically. - - VALUE = {{ $value }} - LABELS = {{ $labels }} - - - alert: HostRaidDiskFailure - expr: >- - node_md_disks{state="failed"} > 0 - for: 2m - labels: - severity: warning - annotations: - summary: Host RAID disk failure at {{ $labels.instance }}. - description: |- - At least one device in RAID array is failed. Possibly, a disk swap is required. - - VALUE = {{ $value }} - LABELS = {{ $labels }} - - - alert: HostOomKillDetected - expr: >- - increase(node_vmstat_oom_kill[1m]) > 0 - for: 0m - labels: - severity: warning - annotations: - summary: Host OOM kill detected at {{ $labels.instance }}. - description: |- - OOM kill detected. - - VALUE = {{ $value }} - LABELS = {{ $labels }} - - - alert: HostEdacCorrectableErrorsDetected - expr: >- - increase(node_edac_correctable_errors_total[1m]) > 0 - for: 0m - labels: - severity: info - annotations: - summary: Host EDAC correctable errors detected at {{ $labels.instance }}. - description: |- - Host has had {{ printf "%.0f" $value }} correctable memory errors reported by EDAC in the last 5 minutes. - - VALUE = {{ $value }} - LABELS = {{ $labels }} - - - alert: HostEdacUncorrectableErrorsDetected - expr: >- - node_edac_uncorrectable_errors_total > 0 - for: 0m - labels: - severity: warning - annotations: - summary: Host EDAC uncorrectable errors detected at {{ $labels.instance }}. - description: |- - Host has had {{ printf "%.0f" $value }} uncorrectable memory errors reported by EDAC in the last 5 minutes. - - VALUE = {{ $value }} - LABELS = {{ $labels }} - - - alert: HostNetworkReceiveErrors - expr: >- - rate(node_network_receive_errs_total{device!~"^wg.*"}[2m]) - / - rate(node_network_receive_packets_total{device!~"^wg.*"}[2m]) - > 0.01 - for: 2m - labels: - severity: warning - annotations: - summary: Host Network Receive Errors at {{ $labels.instance }}. - description: |- - Host interface {{ $labels.device }} has encountered {{ printf "%.0f" $value }} receive errors in the last two minutes. - - VALUE = {{ $value }} - LABELS = {{ $labels }} - - - alert: HostNetworkTransmitErrors - expr: >- - rate(node_network_transmit_errs_total{device!~"^wg.*"}[2m]) - / - rate(node_network_transmit_packets_total{device!~"^wg.*"}[2m]) - > 0.1 - for: 2m - labels: - severity: warning - annotations: - summary: Host network transmit errors at {{ $labels.instance }}. - description: |- - Host interface {{ $labels.device }} has encountered {{ printf "%.0f" $value }} transmit errors in the last two minutes. - - VALUE = {{ $value }} - LABELS = {{ $labels }} - - - alert: HostNetworkInterfaceSaturated - expr: >- - ( - rate(node_network_receive_bytes_total{device!~"^wg.*"}[1m]) - + - rate(node_network_transmit_bytes_total{device!~"^wg.*"}[1m]) - ) - / node_network_speed_bytes{device!~"^wg.*"} - > 0.8 - < 10000 - for: 1m - labels: - severity: warning - annotations: - summary: Host network interface saturated at {{ $labels.instance }}. - description: |- - The network interface {{ $labels.device }} is getting overloaded. - - VALUE = {{ $value }} - LABELS = {{ $labels }} - - - alert: HostNetworkBondDegraded - expr: >- - (node_bonding_active - node_bonding_slaves) != 0 - for: 2m - labels: - severity: warning - annotations: - summary: Host network bond degraded at {{ $labels.instance }}. - description: |- - Bond {{ $labels.device }} degraded. - - VALUE = {{ $value }} - LABELS = {{ $labels }} - - - alert: HostConntrackLimit - expr: >- - node_nf_conntrack_entries - / - node_nf_conntrack_entries_limit - > 0.8 - for: 5m - labels: - severity: warning - annotations: - summary: Host conntrack limit at {{ $labels.instance }}. - description: |- - The number of conntrack is approaching limit. - - VALUE = {{ $value }} - LABELS = {{ $labels }} - - - alert: HostClockSkew - expr: >- - ( - node_timex_offset_seconds > 0.05 - and - deriv(node_timex_offset_seconds[5m]) >= 0 - ) - or - ( - node_timex_offset_seconds < -0.05 - and - deriv(node_timex_offset_seconds[5m]) <= 0 - ) - for: 2m - labels: - severity: warning - annotations: - summary: Host clock skew at {{ $labels.instance }}. - description: |- - Clock skew is detected and the clock is out of sync. Ensure that NTP is configured correctly on this host. - - VALUE = {{ $value }} - LABELS = {{ $labels }} - - - alert: HostClockNotSynchronising - expr: >- - min_over_time(node_timex_sync_status[1m]) == 0 - and - node_timex_maxerror_seconds >= 16 - for: 2m - labels: - severity: warning - annotations: - summary: Host clock not synchronising at {{ $labels.instance }}. - description: |- - Clock is not synchronising. Ensure that NTP is configured correctly on this host. - - VALUE = {{ $value }} - LABELS = {{ $labels }} - - - alert: HostRequiresReboot - expr: >- - node_reboot_required > 0 - for: 4h - labels: - severity: info - annotations: - summary: Host requires reboot at {{ $labels.instance }}. - description: |- - Instance requires a reboot. - - VALUE = {{ $value }} - LABELS = {{ $labels }} diff --git a/modules/nixos/monitoring/rules/postgres.yaml b/modules/nixos/monitoring/rules/postgres.yaml deleted file mode 100644 index 6a98c92..0000000 --- a/modules/nixos/monitoring/rules/postgres.yaml +++ /dev/null @@ -1,310 +0,0 @@ ---- -groups: - - name: Postgres - - rules: - - alert: PostgresqlDown - expr: >- - pg_up == 0 - for: 0m - labels: - severity: critical - annotations: - summary: PostgreSQL is down at {{ $labels.instance }}. - description: |- - PostgreSQL instance is down. - - VALUE = {{ $value }} - LABELS = {{ $labels }} - - - alert: PostgresqlRestarted - expr: >- - time() - pg_postmaster_start_time_seconds < 60 - for: 0m - labels: - severity: critical - annotations: - summary: PostgreSQL restarted at {{ $labels.instance }}. - description: |- - PostgreSQL restarted. - - VALUE = {{ $value }} - LABELS = {{ $labels }} - - - alert: PostgresqlExporterError - expr: >- - pg_exporter_last_scrape_error > 0 - for: 0m - labels: - severity: critical - annotations: - summary: PostgreSQL exporter errors at {{ $labels.instance }}. - description: |- - PostgreSQL exporter is showing errors. - - VALUE = {{ $value }} - LABELS = {{ $labels }} - - - alert: PostgresqlTableNotAutoVacuumed - expr: >- - (pg_stat_user_tables_last_autovacuum > 0) - and - (time() - pg_stat_user_tables_last_autovacuum) - > 60 * 60 * 24 * 10 - for: 0m - labels: - severity: warning - annotations: - summary: PostgreSQL table not auto vacuumed at {{ $labels.instance }}. - description: |- - Table {{ $labels.relname }} has not been auto vacuumed for 10 days. - - VALUE = {{ $value }} - LABELS = {{ $labels }} - - - alert: PostgresqlTableNotAutoAnalyzed - expr: >- - (pg_stat_user_tables_last_autoanalyze > 0) - and - (time() - pg_stat_user_tables_last_autoanalyze) - > 24 * 60 * 60 * 10 - for: 0m - labels: - severity: warning - annotations: - summary: PostgreSQL table not auto analyzed at {{ $labels.instance }}. - description: |- - Table {{ $labels.relname }} has not been auto analyzed for 10 days. - - VALUE = {{ $value }} - LABELS = {{ $labels }}" - - - alert: PostgresqlTooManyConnections - expr: >- - sum by (datname) ( - pg_stat_activity_count{datname!~"template.*|postgres"} - ) > pg_settings_max_connections * 0.8 - for: 2m - labels: - severity: warning - annotations: - summary: PostgreSQL with too many connections at {{ $labels.instance }}. - description: |- - PostgreSQL instance {{ $labels.instance }} has too many connections. - - VALUE = {{ $value }} - LABELS = {{ $labels }} - - - alert: PostgresqlNotEnoughConnections - expr: >- - sum by (datname) ( - pg_stat_activity_count{datname!~"template.*|postgres"} - ) < 1 - for: 2m - labels: - severity: warning - annotations: - summary: PostgreSQL with not enough connections at {{ $labels.instance }}. - description: |- - PostgreSQL instance {{ $labels.instance }} should have more connections. - - VALUE = {{ $value }} - LABELS = {{ $labels }} - - - alert: PostgresqlDeadLocks - expr: >- - increase( - pg_stat_database_deadlocks{datname!~"template.*|postgres"}[1m] - ) > 5 - for: 0m - labels: - severity: warning - annotations: - summary: PostgreSQL dead-locks at instance {{ $labels.instance }}. - description: |- - PostgreSQL shows dead-locks. - - VALUE = {{ $value }} - LABELS = {{ $labels }} - - - alert: PostgresqlHighRollbackRate - expr: >- - sum by (namespace,datname) - ( - (rate(pg_stat_database_xact_rollback{datname!~"template.*|postgres|dendrite",datid!="0"}[3m])) - / - ( - (rate(pg_stat_database_xact_rollback{datname!~"template.*|postgres|dendrite",datid!="0"}[3m])) - + - (rate(pg_stat_database_xact_commit{datname!~"template.*|postgres|dendrite",datid!="0"}[3m])) - ) - ) - > 0.10 - for: 0m - labels: - severity: warning - annotations: - summary: PostgreSQL at a high rollback rate at {{ $labels.instance }}. - description: |- - Ratio of transactions being aborted compared to committed is too big. - - VALUE = {{ $value }} - LABELS = {{ $labels }} - - - alert: PostgresqlCommitRateLow - expr: >- - rate(pg_stat_database_xact_commit[1m]) - < 10 - for: 2m - labels: - severity: critical - annotations: - summary: PostgreSQL commit rate low at instance {{ $labels.instance }}. - description: |- - PostgreSQL seems to be processing very few transactions. - - VALUE = {{ $value }} - LABELS = {{ $labels }} - - - alert: PostgresqlLowXidConsumption - expr: >- - rate(pg_txid_current[1m]) - < 5 - for: 2m - labels: - severity: warning - annotations: - summary: PostgreSQL low XID consumption at instance {{ $labels.instance }}. - description: |- - PostgreSQL seems to be consuming transaction IDs very slowly. - - VALUE = {{ $value }} - LABELS = {{ $labels }} - - - alert: PostgresqlHighRateStatementTimeout - expr: >- - rate(postgresql_errors_total{type="statement_timeout"}[1m]) - > 3 - for: 0m - labels: - severity: critical - annotations: - summary: PostgreSQL high rate statement timeout (instance {{ $labels.instance }}) - description: |- - PostgreSQL transactions showing high rate of statement timeouts. - - VALUE = {{ $value }} - LABELS = {{ $labels }} - - - alert: PostgresqlHighRateDeadlock - expr: >- - increase(postgresql_errors_total{type="deadlock_detected"}[1m]) - > 1 - for: 0m - labels: - severity: critical - annotations: - summary: PostgreSQL high rate dead-lock at {{ $labels.instance }}. - description: |- - PostgreSQL has detected dead-locks. - - VALUE = {{ $value }} - LABELS = {{ $labels }} - - - alert: PostgresqlUnusedReplicationSlot - expr: >- - pg_replication_slots_active == 0 - for: 1m - labels: - severity: warning - annotations: - summary: PostgreSQL unused replication slot at {{ $labels.instance }}. - description: |- - Unused replication slots. - - VALUE = {{ $value }} - LABELS = {{ $labels }} - - - alert: PostgresqlTooManyDeadTuples - expr: >- - ( - (pg_stat_user_tables_n_dead_tup > 10000) - / - (pg_stat_user_tables_n_live_tup + pg_stat_user_tables_n_dead_tup) - ) - >= 0.1 - for: 2m - labels: - severity: warning - annotations: - summary: PostgreSQL too many dead tuples at {{ $labels.instance }}. - description: |- - PostgreSQL number of dead tuples is too large. - - VALUE = {{ $value }} - LABELS = {{ $labels }} - - - alert: PostgresqlSslCompressionActive - expr: >- - sum(pg_stat_ssl_compression) > 0 - for: 0m - labels: - severity: critical - annotations: - summary: PostgreSQL SSL compression active at {{ $labels.instance }}. - description: |- - Database connections with an SSL compression is enabled. This may add a significant jitter in the replication delay. - - VALUE = {{ $value }} - LABELS = {{ $labels }} - - - alert: PostgresqlTooManyLocksAcquired - expr: >- - ( - (sum (pg_locks_count)) - / - (pg_settings_max_locks_per_transaction * pg_settings_max_connections) - ) - > 0.20 - for: 2m - labels: - severity: critical - annotations: - summary: PostgreSQL too many locks acquired at {{ $labels.instance }}. - description: |- - Too many locks acquired on the database. - - VALUE = {{ $value }} - LABELS = {{ $labels }} - - - alert: PostgresqlBloatIndexHigh - expr: >- - pg_bloat_btree_bloat_pct > 80 - and - on (idxname) (pg_bloat_btree_real_size > 100000000) - for: 1h - labels: - severity: warning - annotations: - summary: PostgreSQL index bloat high at {{ $labels.instance }}. - description: |- - The index {{ $labels.idxname }} is bloated. You should execute `REINDEX INDEX CONCURRENTLY {{ $labels.idxname }};`. - - VALUE = {{ $value }} - LABELS = {{ $labels }} - - - alert: PostgresqlBloatTableHigh - expr: >- - pg_bloat_table_bloat_pct > 80 - and - on (relname) (pg_bloat_table_real_size > 200000000) - for: 1h - labels: - severity: warning - annotations: - summary: PostgreSQL table bloat high at instance {{ $labels.instance }}. - description: |- - The table {{ $labels.relname }} is bloated. You should execute `VACUUM {{ $labels.relname }};`. - - VALUE = {{ $value }} - LABELS = {{ $labels }} diff --git a/modules/nixos/monitoring/rules/redis.yaml b/modules/nixos/monitoring/rules/redis.yaml deleted file mode 100644 index b47c313..0000000 --- a/modules/nixos/monitoring/rules/redis.yaml +++ /dev/null @@ -1,98 +0,0 @@ ---- -groups: - - name: Redis - - rules: - - alert: RedisDown - expr: >- - redis_up == 0 - for: 0m - labels: - severity: critical - annotations: - summary: Redis down at {{ $labels.instance }}. - description: |- - Redis instance is down. - - VALUE = {{ $value }} - LABELS = {{ $labels }} - - - alert: RedisOutOfSystemMemory - expr: >- - redis_memory_used_bytes - / - redis_total_system_memory_bytes * 100 - > 90 - for: 2m - labels: - severity: warning - annotations: - summary: Redis out of system memory at {{ $labels.instance }}. - description: |- - Redis is running out of system memory. - - VALUE = {{ $value }} - LABELS = {{ $labels }} - - - alert: RedisOutOfConfiguredMaxmemory - expr: >- - redis_memory_max_bytes != 0 - and - ( - redis_memory_used_bytes - / - redis_memory_max_bytes * 100 - > 90 - ) - for: 2m - labels: - severity: warning - annotations: - summary: Redis out of configured maxmemory at {{ $labels.instance }}. - description: |- - Redis is running out of configured maxmemory. - - VALUE = {{ $value }} - LABELS = {{ $labels }} - - - alert: RedisTooManyConnections - expr: >- - redis_connected_clients > 100 - for: 2m - labels: - severity: warning - annotations: - summary: Redis too many connections at {{ $labels.instance }}. - description: |- - Redis instance has too many connections. - - VALUE = {{ $value }} - LABELS = {{ $labels }} - - - alert: RedisNotEnoughConnections - expr: >- - redis_connected_clients < 1 - for: 2m - labels: - severity: warning - annotations: - summary: Redis not enough connections at {{ $labels.instance }}. - description: |- - Redis instance should have more connections. - - VALUE = {{ $value }} - LABELS = {{ $labels }} - - - alert: RedisRejectedConnections - expr: >- - increase(redis_rejected_connections_total[1m]) > 0 - for: 0m - labels: - severity: critical - annotations: - summary: Redis rejected connections at {{ $labels.instance }}. - description: |- - Some connections to Redis have been rejected. - - VALUE = {{ $value }} - LABELS = {{ $labels }} |