---
# Prometheus alerting rules built on node_exporter host metrics.
#
# Provenance: reconstructed from the removal patch of
# modules/nixos/monitoring/rules/node.yaml
# (commit e6ed60548397627bf10f561f9438201dbba0a36e, Azat Bahawi,
# Sun, 21 Apr 2024 02:15:42 +0300, subject "2024-04-21",
# blob a9b6b79, 537 deletions).
#
# Every rule follows the same shape:
#   expr        - PromQL condition; a non-empty result means "firing".
#   for         - how long the condition must hold before the alert fires.
#   labels      - static labels attached to the alert (severity).
#   annotations - templated text; VALUE/LABELS lines dump the raw sample.
groups:
  - name: Node

    rules:
      # Available memory is below 10% of total memory.
      - alert: HostOutOfMemory
        expr: >-
          node_memory_MemAvailable_bytes
          /
          node_memory_MemTotal_bytes * 100
          < 10
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: Host out of memory at {{ $labels.instance }}.
          description: |-
            Node memory is filling up.

            VALUE = {{ $value }}
            LABELS = {{ $labels }}

      # More than 1000 major page faults per second over the last minute.
      - alert: HostMemoryUnderMemoryPressure
        expr: >-
          rate(node_vmstat_pgmajfault[1m]) > 1000
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: Host memory under memory pressure at {{ $labels.instance }}.
          description: |-
            The node is under heavy memory pressure. High rate of major page faults.

            VALUE = {{ $value }}
            LABELS = {{ $labels }}

      # Receive rate summed over all interfaces of an instance, divided by
      # 1024 twice (bytes -> MiB), exceeds 100 per second.
      - alert: HostUnusualNetworkThroughputIn
        expr: >-
          sum by (instance) (rate(node_network_receive_bytes_total[2m]))
          / 1024
          / 1024
          > 100
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: Host unusual network throughput in (instance {{ $labels.instance }})
          description: |-
            Host network interfaces are probably receiving too much data.

            VALUE = {{ $value }}
            LABELS = {{ $labels }}

      # Transmit-side counterpart of HostUnusualNetworkThroughputIn.
      - alert: HostUnusualNetworkThroughputOut
        expr: >-
          sum by (instance) (rate(node_network_transmit_bytes_total[2m]))
          / 1024
          / 1024
          > 100
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: Host unusual network throughput out at {{ $labels.instance }}.
          description: |-
            Host network interfaces are probably sending too much data.

            VALUE = {{ $value }}
            LABELS = {{ $labels }}

      # Disk read rate summed per instance, divided by 1024 twice
      # (bytes -> MiB), exceeds 50 per second.
      - alert: HostUnusualDiskReadRate
        expr: >-
          sum by (instance) (rate(node_disk_read_bytes_total[2m]))
          / 1024
          / 1024
          > 50
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: Host unusual disk read rate at {{ $labels.instance }}.
          description: |-
            Disk is probably reading too much data.

            VALUE = {{ $value }}
            LABELS = {{ $labels }}

      # Write-side counterpart of HostUnusualDiskReadRate (shorter `for`).
      - alert: HostUnusualDiskWriteRate
        expr: >-
          sum by (instance) (rate(node_disk_written_bytes_total[2m]))
          / 1024
          / 1024
          > 50
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: Host unusual disk write rate at {{ $labels.instance }}.
          description: |-
            Disk is probably writing too much data.

            VALUE = {{ $value }}
            LABELS = {{ $labels }}

      # Less than 10% of a filesystem is available; read-only filesystems
      # are excluded by the join on node_filesystem_readonly == 0.
      - alert: HostOutOfDiskSpace
        expr: >-
          (node_filesystem_avail_bytes * 100)
          / node_filesystem_size_bytes
          < 10
          and
          on (instance, device, mountpoint) node_filesystem_readonly
          == 0
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: Host out of disk space at {{ $labels.instance }}.
          description: |-
            Disk is almost full.

            VALUE = {{ $value }}
            LABELS = {{ $labels }}

      # Filesystem is below 10% available AND a linear extrapolation of the
      # last hour reaches zero within 24 * 3600 seconds. tmpfs is excluded
      # from the prediction; read-only filesystems are excluded entirely.
      - alert: HostDiskWillFillIn24Hours
        expr: >-
          (node_filesystem_avail_bytes * 100)
          / node_filesystem_size_bytes
          < 10
          and ON (instance, device, mountpoint)
          predict_linear(node_filesystem_avail_bytes{fstype!~"tmpfs"}[1h], 24 * 3600)
          < 0
          and ON (instance, device, mountpoint)
          node_filesystem_readonly == 0
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: Host disk will fill in 24 hours at {{ $labels.instance }}.
          description: |-
            Filesystem is predicted to run out of space within the next 24 hours at current write rate.

            VALUE = {{ $value }}
            LABELS = {{ $labels }}

      # Less than 10% of inodes are free on a writable filesystem.
      - alert: HostOutOfInodes
        expr: >-
          node_filesystem_files_free
          / node_filesystem_files * 100
          < 10
          and
          ON (instance, device, mountpoint) node_filesystem_readonly
          == 0
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: Host out of inodes at {{ $labels.instance }}.
          description: |-
            Disk is almost running out of available inodes.

            VALUE = {{ $value }}
            LABELS = {{ $labels }}

      # Inode counterpart of HostDiskWillFillIn24Hours.
      - alert: HostInodesWillFillIn24Hours
        expr: >-
          node_filesystem_files_free
          / node_filesystem_files
          * 100
          < 10
          and
          predict_linear(node_filesystem_files_free[1h], 24 * 3600)
          < 0
          and ON (instance, device, mountpoint) node_filesystem_readonly
          == 0
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: Host inodes will fill in 24 hours at {{ $labels.instance }}.
          description: |-
            Filesystem is predicted to run out of inodes within the next 24 hours at current write rate.

            VALUE = {{ $value }}
            LABELS = {{ $labels }}

      # Read time divided by completed reads exceeds 0.1 (seconds per read,
      # going by the metric name). The second clause drops idle disks so the
      # ratio is only evaluated when reads actually completed.
      - alert: HostUnusualDiskReadLatency
        expr: >-
          rate(node_disk_read_time_seconds_total[1m])
          / rate(node_disk_reads_completed_total[1m])
          > 0.1
          and
          rate(node_disk_reads_completed_total[1m])
          > 0
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: Host unusual disk read latency at {{ $labels.instance }}.
          description: |-
            Disk latency is growing.

            VALUE = {{ $value }}
            LABELS = {{ $labels }}

      # Write-side counterpart of HostUnusualDiskReadLatency.
      - alert: HostUnusualDiskWriteLatency
        expr: >-
          rate(node_disk_write_time_seconds_total[1m])
          / rate(node_disk_writes_completed_total[1m])
          > 0.1
          and
          rate(node_disk_writes_completed_total[1m])
          > 0
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: Host unusual disk write latency at {{ $labels.instance }}.
          description: |-
            Disk latency is growing.

            VALUE = {{ $value }}
            LABELS = {{ $labels }}

      # Non-idle CPU time as a fraction of total CPU time, per instance.
      # The inner avg is the per-mode share averaged across CPUs; summing
      # the non-idle modes therefore yields a value of at most about 1.0.
      # The previous threshold of 2.0 could never be reached, so the alert
      # was dead; 0.8 means "more than 80% busy".
      # NOTE(review): 0.8 is a suggested value - confirm it suits the fleet.
      - alert: HostHighCpuLoad
        expr: >-
          sum by (instance)
          (avg by (mode, instance)
          (rate(node_cpu_seconds_total{mode!="idle"}[2m])))
          > 0.8
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: Host high CPU load at {{ $labels.instance }}.
          description: |-
            CPU load is high.

            VALUE = {{ $value }}
            LABELS = {{ $labels }}

      # Average share of CPU time in iowait mode exceeds 50%.
      - alert: HostCpuHighIowait
        expr: |-
          avg by (instance) (rate(node_cpu_seconds_total{mode="iowait"}[5m]))
          * 100
          > 50
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: Host CPU high I/O wait at {{ $labels.instance }}.
          description: |-
            CPU I/O wait is high. A high I/O wait means that you are disk or network bound.

            VALUE = {{ $value }}
            LABELS = {{ $labels }}

      # Rate of the disk I/O time counter over the last minute exceeds 0.5.
      - alert: HostUnusualDiskIo
        expr: >-
          rate(node_disk_io_time_seconds_total[1m]) > 0.5
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: Host unusual disk I/O at {{ $labels.instance }}.
          description: |-
            Time spent over I/O is too high. Check storage for issues.

            VALUE = {{ $value }}
            LABELS = {{ $labels }}

      # More than 80% of swap is in use.
      - alert: HostSwapIsFillingUp
        expr: >-
          (1 - (node_memory_SwapFree_bytes / node_memory_SwapTotal_bytes))
          * 100
          > 80
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: Host swap is filling up at {{ $labels.instance }}.
          description: |-
            Swap is filling up.

            VALUE = {{ $value }}
            LABELS = {{ $labels }}

      # A systemd unit reports state="failed".
      - alert: HostSystemdServiceCrashed
        expr: >-
          node_systemd_unit_state{state="failed"} == 1
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: Host systemd service crashed at {{ $labels.instance }}.
          description: |-
            Systemd service crashed.

            VALUE = {{ $value }}
            LABELS = {{ $labels }}

      # A hwmon temperature sensor reads above 75.
      - alert: HostPhysicalComponentTooHot
        expr: >-
          node_hwmon_temp_celsius > 75
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: Host physical component too hot at {{ $labels.instance }}.
          description: |-
            Physical hardware component too hot.

            VALUE = {{ $value }}
            LABELS = {{ $labels }}

      # A hwmon critical-temperature alarm flag is set.
      - alert: HostNodeOvertemperatureAlarm
        expr: >-
          node_hwmon_temp_crit_alarm_celsius == 1
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: Host node overtemperature alarm at {{ $labels.instance }}.
          description: |-
            Physical node temperature alarm triggered.

            VALUE = {{ $value }}
            LABELS = {{ $labels }}

      # An md array reports state="inactive".
      - alert: HostRaidArrayGotInactive
        expr: >-
          node_md_state{state="inactive"} > 0
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: Host RAID array got inactive at {{ $labels.instance }}.
          description: |-
            RAID array is in a degraded state due to one or more disks failures. Number of spare drives is insufficient to fix the issue automatically.

            VALUE = {{ $value }}
            LABELS = {{ $labels }}

      # An md array has at least one disk in state="failed".
      - alert: HostRaidDiskFailure
        expr: >-
          node_md_disks{state="failed"} > 0
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: Host RAID disk failure at {{ $labels.instance }}.
          description: |-
            At least one device in RAID array is failed. Possibly, a disk swap is required.

            VALUE = {{ $value }}
            LABELS = {{ $labels }}

      # The OOM-kill counter increased during the last minute.
      - alert: HostOomKillDetected
        expr: >-
          increase(node_vmstat_oom_kill[1m]) > 0
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: Host OOM kill detected at {{ $labels.instance }}.
          description: |-
            OOM kill detected.

            VALUE = {{ $value }}
            LABELS = {{ $labels }}

      # The correctable-error counter increased during the last minute.
      # The description names the same 1m window the expression uses.
      - alert: HostEdacCorrectableErrorsDetected
        expr: >-
          increase(node_edac_correctable_errors_total[1m]) > 0
        for: 0m
        labels:
          severity: info
        annotations:
          summary: Host EDAC correctable errors detected at {{ $labels.instance }}.
          description: |-
            Host has had {{ printf "%.0f" $value }} correctable memory errors reported by EDAC in the last minute.

            VALUE = {{ $value }}
            LABELS = {{ $labels }}

      # The uncorrectable-error counter is non-zero. The expression reads
      # the raw counter (no rate/increase), so $value is a cumulative total
      # and the alert stays active for as long as the counter is above zero.
      - alert: HostEdacUncorrectableErrorsDetected
        expr: >-
          node_edac_uncorrectable_errors_total > 0
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: Host EDAC uncorrectable errors detected at {{ $labels.instance }}.
          description: |-
            Host has had {{ printf "%.0f" $value }} uncorrectable memory errors reported by EDAC in total (cumulative counter value).

            VALUE = {{ $value }}
            LABELS = {{ $labels }}

      # Receive errors exceed 1% of received packets. Devices matching
      # ^wg.* are excluded. $value is a ratio, not an error count, so the
      # description renders it as a percentage.
      - alert: HostNetworkReceiveErrors
        expr: >-
          rate(node_network_receive_errs_total{device!~"^wg.*"}[2m])
          /
          rate(node_network_receive_packets_total{device!~"^wg.*"}[2m])
          > 0.01
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: Host Network Receive Errors at {{ $labels.instance }}.
          description: |-
            Host interface {{ $labels.device }} has a receive error ratio of {{ $value | humanizePercentage }} over the last two minutes.

            VALUE = {{ $value }}
            LABELS = {{ $labels }}

      # Transmit errors exceed 10% of transmitted packets. Devices matching
      # ^wg.* are excluded. $value is a ratio, not an error count.
      # NOTE(review): the receive rule uses 0.01; confirm 0.1 is intended.
      - alert: HostNetworkTransmitErrors
        expr: >-
          rate(node_network_transmit_errs_total{device!~"^wg.*"}[2m])
          /
          rate(node_network_transmit_packets_total{device!~"^wg.*"}[2m])
          > 0.1
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: Host network transmit errors at {{ $labels.instance }}.
          description: |-
            Host interface {{ $labels.device }} has a transmit error ratio of {{ $value | humanizePercentage }} over the last two minutes.

            VALUE = {{ $value }}
            LABELS = {{ $labels }}

      # Combined receive + transmit rate exceeds 80% of the reported link
      # speed. The `< 10000` upper bound filters out implausibly large
      # ratios (presumably from bogus speed readings - TODO confirm).
      - alert: HostNetworkInterfaceSaturated
        expr: >-
          (
          rate(node_network_receive_bytes_total{device!~"^wg.*"}[1m])
          +
          rate(node_network_transmit_bytes_total{device!~"^wg.*"}[1m])
          )
          / node_network_speed_bytes{device!~"^wg.*"}
          > 0.8
          < 10000
        for: 1m
        labels:
          severity: warning
        annotations:
          summary: Host network interface saturated at {{ $labels.instance }}.
          description: |-
            The network interface {{ $labels.device }} is getting overloaded.

            VALUE = {{ $value }}
            LABELS = {{ $labels }}

      # Number of active bond members differs from the configured members.
      # NOTE(review): verify that the bonding metrics carry a `device`
      # label; if not, the description renders an empty bond name.
      - alert: HostNetworkBondDegraded
        expr: >-
          (node_bonding_active - node_bonding_slaves) != 0
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: Host network bond degraded at {{ $labels.instance }}.
          description: |-
            Bond {{ $labels.device }} degraded.

            VALUE = {{ $value }}
            LABELS = {{ $labels }}

      # Conntrack table is more than 80% full.
      - alert: HostConntrackLimit
        expr: >-
          node_nf_conntrack_entries
          /
          node_nf_conntrack_entries_limit
          > 0.8
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: Host conntrack limit at {{ $labels.instance }}.
          description: |-
            The number of conntrack is approaching limit.

            VALUE = {{ $value }}
            LABELS = {{ $labels }}

      # Clock offset is beyond +/-0.05 and not moving back towards zero:
      # a positive offset that is not shrinking, or a negative offset that
      # is not growing.
      - alert: HostClockSkew
        expr: >-
          (
          node_timex_offset_seconds > 0.05
          and
          deriv(node_timex_offset_seconds[5m]) >= 0
          )
          or
          (
          node_timex_offset_seconds < -0.05
          and
          deriv(node_timex_offset_seconds[5m]) <= 0
          )
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: Host clock skew at {{ $labels.instance }}.
          description: |-
            Clock skew is detected and the clock is out of sync. Ensure that NTP is configured correctly on this host.

            VALUE = {{ $value }}
            LABELS = {{ $labels }}

      # Sync status was 0 at some point in the last minute AND the maximum
      # error estimate is at least 16.
      - alert: HostClockNotSynchronising
        expr: >-
          min_over_time(node_timex_sync_status[1m]) == 0
          and
          node_timex_maxerror_seconds >= 16
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: Host clock not synchronising at {{ $labels.instance }}.
          description: |-
            Clock is not synchronising. Ensure that NTP is configured correctly on this host.

            VALUE = {{ $value }}
            LABELS = {{ $labels }}

      # node_reboot_required has been non-zero for four hours.
      - alert: HostRequiresReboot
        expr: >-
          node_reboot_required > 0
        for: 4h
        labels:
          severity: info
        annotations:
          summary: Host requires reboot at {{ $labels.instance }}.
          description: |-
            Instance requires a reboot.

            VALUE = {{ $value }}
            LABELS = {{ $labels }}