---
# Prometheus alerting rules for PostgreSQL.
#
# Recovered from the patch that created modules/monitoring/rules/postgres.yaml
# (commit e6ed60548397627bf10f561f9438201dbba0a36e, 2024-04-21).
#
# The rules read metrics named pg_* and postgresql_errors_total; presumably
# these come from postgres_exporter and a log-based error counter - confirm
# against the scrape configuration.
#
# Every summary/description below renders {{ $labels.instance }}, so any
# aggregation used in an expression has to keep the `instance` label.
groups:
  - name: Postgres

    rules:
      - alert: PostgresqlDown
        expr: >-
          pg_up == 0
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: PostgreSQL is down at {{ $labels.instance }}.
          description: |-
            PostgreSQL instance is down.

            VALUE = {{ $value }}
            LABELS = {{ $labels }}

      # Fires while the postmaster start time is less than 60 seconds old.
      - alert: PostgresqlRestarted
        expr: >-
          time() - pg_postmaster_start_time_seconds < 60
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: PostgreSQL restarted at {{ $labels.instance }}.
          description: |-
            PostgreSQL restarted.

            VALUE = {{ $value }}
            LABELS = {{ $labels }}

      - alert: PostgresqlExporterError
        expr: >-
          pg_exporter_last_scrape_error > 0
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: PostgreSQL exporter errors at {{ $labels.instance }}.
          description: |-
            PostgreSQL exporter is showing errors.

            VALUE = {{ $value }}
            LABELS = {{ $labels }}

      # The `> 0` guard skips tables whose last-autovacuum timestamp is 0;
      # the threshold 60 * 60 * 24 * 10 is ten days in seconds.
      - alert: PostgresqlTableNotAutoVacuumed
        expr: >-
          (pg_stat_user_tables_last_autovacuum > 0)
          and
          (time() - pg_stat_user_tables_last_autovacuum)
            > 60 * 60 * 24 * 10
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: PostgreSQL table not auto vacuumed at {{ $labels.instance }}.
          description: |-
            Table {{ $labels.relname }} has not been auto vacuumed for 10 days.

            VALUE = {{ $value }}
            LABELS = {{ $labels }}

      # Same shape as the autovacuum rule above, for autoanalyze.
      - alert: PostgresqlTableNotAutoAnalyzed
        expr: >-
          (pg_stat_user_tables_last_autoanalyze > 0)
          and
          (time() - pg_stat_user_tables_last_autoanalyze)
            > 24 * 60 * 60 * 10
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: PostgreSQL table not auto analyzed at {{ $labels.instance }}.
          description: |-
            Table {{ $labels.relname }} has not been auto analyzed for 10 days.

            VALUE = {{ $value }}
            LABELS = {{ $labels }}

      # Binary operators only pair series whose label sets match. The
      # aggregation therefore keeps `instance`, and the comparison is matched
      # `on (instance)`; `group_left ()` allows several per-database series on
      # the left to share one max_connections series on the right.
      # NOTE(review): assumes exactly one pg_settings_max_connections series
      # per instance - confirm.
      - alert: PostgresqlTooManyConnections
        expr: >-
          sum by (instance, datname) (
            pg_stat_activity_count{datname!~"template.*|postgres"}
          )
          > on (instance) group_left ()
          (pg_settings_max_connections * 0.8)
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: PostgreSQL with too many connections at {{ $labels.instance }}.
          description: |-
            PostgreSQL instance {{ $labels.instance }} has too many connections.

            VALUE = {{ $value }}
            LABELS = {{ $labels }}

      # `instance` is kept in the aggregation so the annotations can name it.
      - alert: PostgresqlNotEnoughConnections
        expr: >-
          sum by (instance, datname) (
            pg_stat_activity_count{datname!~"template.*|postgres"}
          ) < 1
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: PostgreSQL with not enough connections at {{ $labels.instance }}.
          description: |-
            PostgreSQL instance {{ $labels.instance }} should have more connections.

            VALUE = {{ $value }}
            LABELS = {{ $labels }}

      - alert: PostgresqlDeadLocks
        expr: >-
          increase(
            pg_stat_database_deadlocks{datname!~"template.*|postgres"}[1m]
          ) > 5
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: PostgreSQL dead-locks at instance {{ $labels.instance }}.
          description: |-
            PostgreSQL shows dead-locks.

            VALUE = {{ $value }}
            LABELS = {{ $labels }}

      # Rollbacks as a fraction of all finished transactions over 3 minutes.
      # `instance` is kept in the aggregation so the annotations can name it.
      - alert: PostgresqlHighRollbackRate
        expr: >-
          sum by (namespace, instance, datname)
          (
            (rate(pg_stat_database_xact_rollback{datname!~"template.*|postgres|dendrite",datid!="0"}[3m]))
            /
            (
              (rate(pg_stat_database_xact_rollback{datname!~"template.*|postgres|dendrite",datid!="0"}[3m]))
              +
              (rate(pg_stat_database_xact_commit{datname!~"template.*|postgres|dendrite",datid!="0"}[3m]))
            )
          )
          > 0.10
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: PostgreSQL at a high rollback rate at {{ $labels.instance }}.
          description: |-
            Ratio of transactions being aborted compared to committed is too big.

            VALUE = {{ $value }}
            LABELS = {{ $labels }}

      - alert: PostgresqlCommitRateLow
        expr: >-
          rate(pg_stat_database_xact_commit[1m])
            < 10
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: PostgreSQL commit rate low at instance {{ $labels.instance }}.
          description: |-
            PostgreSQL seems to be processing very few transactions.

            VALUE = {{ $value }}
            LABELS = {{ $labels }}

      - alert: PostgresqlLowXidConsumption
        expr: >-
          rate(pg_txid_current[1m])
            < 5
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: PostgreSQL low XID consumption at instance {{ $labels.instance }}.
          description: |-
            PostgreSQL seems to be consuming transaction IDs very slowly.

            VALUE = {{ $value }}
            LABELS = {{ $labels }}

      - alert: PostgresqlHighRateStatementTimeout
        expr: >-
          rate(postgresql_errors_total{type="statement_timeout"}[1m])
            > 3
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: PostgreSQL high rate statement timeout (instance {{ $labels.instance }})
          description: |-
            PostgreSQL transactions showing high rate of statement timeouts.

            VALUE = {{ $value }}
            LABELS = {{ $labels }}

      - alert: PostgresqlHighRateDeadlock
        expr: >-
          increase(postgresql_errors_total{type="deadlock_detected"}[1m])
            > 1
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: PostgreSQL high rate dead-lock at {{ $labels.instance }}.
          description: |-
            PostgreSQL has detected dead-locks.

            VALUE = {{ $value }}
            LABELS = {{ $labels }}

      - alert: PostgresqlUnusedReplicationSlot
        expr: >-
          pg_replication_slots_active == 0
        for: 1m
        labels:
          severity: warning
        annotations:
          summary: PostgreSQL unused replication slot at {{ $labels.instance }}.
          description: |-
            Unused replication slots.

            VALUE = {{ $value }}
            LABELS = {{ $labels }}

      # Only tables with more than 10000 dead tuples are considered; of those,
      # the alert fires when dead tuples are at least 10% of all tuples.
      - alert: PostgresqlTooManyDeadTuples
        expr: >-
          (
            (pg_stat_user_tables_n_dead_tup > 10000)
            /
            (pg_stat_user_tables_n_live_tup + pg_stat_user_tables_n_dead_tup)
          )
          >= 0.1
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: PostgreSQL too many dead tuples at {{ $labels.instance }}.
          description: |-
            PostgreSQL number of dead tuples is too large.

            VALUE = {{ $value }}
            LABELS = {{ $labels }}

      # `instance` is kept in the aggregation so the annotations can name it.
      - alert: PostgresqlSslCompressionActive
        expr: >-
          sum by (instance) (pg_stat_ssl_compression) > 0
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: PostgreSQL SSL compression active at {{ $labels.instance }}.
          description: |-
            Database connections with an SSL compression is enabled. This may add a significant jitter in the replication delay.

            VALUE = {{ $value }}
            LABELS = {{ $labels }}

      # Locks held as a fraction of
      # max_locks_per_transaction * max_connections. The lock count is summed
      # per instance and divided `on (instance)`; an unlabelled sum would not
      # match the labelled pg_settings_* series and the rule could never fire.
      # NOTE(review): assumes one pg_settings_* series per instance - confirm.
      - alert: PostgresqlTooManyLocksAcquired
        expr: >-
          (
            sum by (instance) (pg_locks_count)
            / on (instance)
            (pg_settings_max_locks_per_transaction * pg_settings_max_connections)
          )
          > 0.20
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: PostgreSQL too many locks acquired at {{ $labels.instance }}.
          description: |-
            Too many locks acquired on the database.

            VALUE = {{ $value }}
            LABELS = {{ $labels }}

      # NOTE(review): the size condition is matched on `idxname` alone, so
      # equally named indexes in different databases or instances are not
      # told apart - confirm this is acceptable.
      - alert: PostgresqlBloatIndexHigh
        expr: >-
          pg_bloat_btree_bloat_pct > 80
          and
          on (idxname) (pg_bloat_btree_real_size > 100000000)
        for: 1h
        labels:
          severity: warning
        annotations:
          summary: PostgreSQL index bloat high at {{ $labels.instance }}.
          description: |-
            The index {{ $labels.idxname }} is bloated. You should execute `REINDEX INDEX CONCURRENTLY {{ $labels.idxname }};`.

            VALUE = {{ $value }}
            LABELS = {{ $labels }}

      # NOTE(review): as with the index rule, matching is on `relname` alone.
      - alert: PostgresqlBloatTableHigh
        expr: >-
          pg_bloat_table_bloat_pct > 80
          and
          on (relname) (pg_bloat_table_real_size > 200000000)
        for: 1h
        labels:
          severity: warning
        annotations:
          summary: PostgreSQL table bloat high at instance {{ $labels.instance }}.
          description: |-
            The table {{ $labels.relname }} is bloated. You should execute `VACUUM {{ $labels.relname }};`.

            VALUE = {{ $value }}
            LABELS = {{ $labels }}