diff options
Diffstat (limited to 'modules/nixos/monitoring/rules/postgres.yaml')
-rw-r--r-- | modules/nixos/monitoring/rules/postgres.yaml | 310 |
1 files changed, 310 insertions, 0 deletions
---
# Prometheus alerting rules for PostgreSQL.
# File path (from the diff header): modules/nixos/monitoring/rules/postgres.yaml
#
# Layout conventions shared by every rule in this group:
#   - `expr` uses a folded scalar (`>-`); the line breaks inside it are layout
#     only, PromQL does not care about the whitespace.
#   - `description` uses a literal scalar (`|-`) so the VALUE / LABELS lines
#     are rendered on their own lines in the notification.
#   - `for: 0m` means the alert fires on the first evaluation that matches.
groups:
  - name: Postgres

    rules:
      # The exporter reports that it cannot reach the database.
      - alert: PostgresqlDown
        expr: >-
          pg_up == 0
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: PostgreSQL is down (instance {{ $labels.instance }})
          description: |-
            Postgresql instance is down.

            VALUE = {{ $value }}
            LABELS = {{ $labels }}

      # The postmaster start time is less than 60 seconds in the past.
      - alert: PostgresqlRestarted
        expr: >-
          time() - pg_postmaster_start_time_seconds < 60
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: PostgreSQL restarted (instance {{ $labels.instance }})
          description: |-
            PostgreSQL restarted.

            VALUE = {{ $value }}
            LABELS = {{ $labels }}

      # The last exporter scrape reported an error.
      - alert: PostgresqlExporterError
        expr: >-
          pg_exporter_last_scrape_error > 0
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: PostgreSQL exporter error (instance {{ $labels.instance }})
          description: |-
            PostgreSQL exporter is showing errors. A query may be buggy in query.yaml.

            VALUE = {{ $value }}
            LABELS = {{ $labels }}

      # Only tables that have been auto-vacuumed at least once (timestamp > 0)
      # are considered; the threshold is 10 days expressed in seconds.
      - alert: PostgresqlTableNotAutoVacuumed
        expr: >-
          (pg_stat_user_tables_last_autovacuum > 0)
          and
          (time() - pg_stat_user_tables_last_autovacuum)
          > 60 * 60 * 24 * 10
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: PostgreSQL table not auto vacuumed (instance {{ $labels.instance }})
          description: |-
            Table {{ $labels.relname }} has not been auto vacuumed for 10 days.

            VALUE = {{ $value }}
            LABELS = {{ $labels }}

      # Same shape as the auto-vacuum rule, for auto-analyze.
      # Fix: a stray trailing double quote after the LABELS line was removed;
      # it was emitted verbatim in the notification text.
      - alert: PostgresqlTableNotAutoAnalyzed
        expr: >-
          (pg_stat_user_tables_last_autoanalyze > 0)
          and
          (time() - pg_stat_user_tables_last_autoanalyze)
          > 24 * 60 * 60 * 10
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: PostgreSQL table not auto analyzed (instance {{ $labels.instance }})
          description: |-
            Table {{ $labels.relname }} has not been auto analyzed for 10 days.

            VALUE = {{ $value }}
            LABELS = {{ $labels }}

      # Fix: both sides of the comparison are aggregated to the same label set
      # (instance, job, server). A binary comparison only pairs series whose
      # labels match, so the previous `sum by (datname) (...)` on the left could
      # never pair with the raw pg_settings_max_connections series on the right
      # and the alert could not fire.
      # All databases are counted because max_connections is a server-wide
      # limit. Keeping `instance` also makes the summary template resolvable.
      - alert: PostgresqlTooManyConnections
        expr: >-
          sum by (instance, job, server) (pg_stat_activity_count)
          > min by (instance, job, server) (pg_settings_max_connections * 0.8)
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: Postgresql too many connections (instance {{ $labels.instance }})
          description: |-
            PostgreSQL instance has too many connections (> 80%).

            VALUE = {{ $value }}
            LABELS = {{ $labels }}

      # Fires when a non-template, non-"postgres" database has no connections.
      # NOTE(review): `sum by (datname)` drops the `instance` label, so
      # `{{ $labels.instance }}` in the summary renders empty - confirm whether
      # `instance` should be added to the `by` clause.
      - alert: PostgresqlNotEnoughConnections
        expr: >-
          sum by (datname) (pg_stat_activity_count{datname!~"template.*|postgres"})
          < 1
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: Postgresql not enough connections (instance {{ $labels.instance }})
          description: |-
            PostgreSQL instance should have more connections (> 1).

            VALUE = {{ $value }}
            LABELS = {{ $labels }}

      # More than 5 deadlocks counted by the database statistics in one minute.
      - alert: PostgresqlDeadLocks
        expr: >-
          increase(pg_stat_database_deadlocks{datname!~"template.*|postgres"}[1m])
          > 5
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: Postgresql dead locks (instance {{ $labels.instance }})
          description: |-
            PostgreSQL has dead-locks.

            VALUE = {{ $value }}
            LABELS = {{ $labels }}

      # rollback / (rollback + commit) over 3 minutes, per (namespace, datname).
      # NOTE(review): the aggregation keeps only `namespace` and `datname`, so
      # `{{ $labels.instance }}` in the summary renders empty - confirm.
      - alert: PostgresqlHighRollbackRate
        expr: >-
          sum by (namespace,datname)
          (
            (rate(pg_stat_database_xact_rollback{datname!~"template.*|postgres",datid!="0"}[3m]))
            /
            (
              (rate(pg_stat_database_xact_rollback{datname!~"template.*|postgres",datid!="0"}[3m]))
              +
              (rate(pg_stat_database_xact_commit{datname!~"template.*|postgres",datid!="0"}[3m]))
            )
          )
          > 0.02
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: PostgreSQL is at a high rollback rate (instance {{ $labels.instance }})
          description: |-
            Ratio of transactions being aborted compared to committed is > 2%.

            VALUE = {{ $value }}
            LABELS = {{ $labels }}

      # Fewer than 10 commits per second, sustained for 2 minutes.
      - alert: PostgresqlCommitRateLow
        expr: >-
          rate(pg_stat_database_xact_commit[1m])
          < 10
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: PostgreSQL commit rate low (instance {{ $labels.instance }})
          description: |-
            PostgreSQL seems to be processing very few transactions.

            VALUE = {{ $value }}
            LABELS = {{ $labels }}

      # Transaction IDs advance by fewer than 5 per second for 2 minutes.
      - alert: PostgresqlLowXidConsumption
        expr: >-
          rate(pg_txid_current[1m])
          < 5
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: PostgreSQL low XID consumption (instance {{ $labels.instance }})
          description: |-
            PostgreSQL seems to be consuming transaction IDs very slowly.

            VALUE = {{ $value }}
            LABELS = {{ $labels }}

      # More than 3 statement_timeout errors per second.
      - alert: PostgresqlHighRateStatementTimeout
        expr: >-
          rate(postgresql_errors_total{type="statement_timeout"}[1m])
          > 3
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: PostgreSQL high rate statement timeout (instance {{ $labels.instance }})
          description: |-
            PostgreSQL transactions showing high rate of statement timeouts.

            VALUE = {{ $value }}
            LABELS = {{ $labels }}

      # More than 1 deadlock_detected error within one minute.
      - alert: PostgresqlHighRateDeadlock
        expr: >-
          increase(postgresql_errors_total{type="deadlock_detected"}[1m])
          > 1
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: PostgreSQL high rate deadlock (instance {{ $labels.instance }})
          description: |-
            PostgreSQL detected deadlocks.

            VALUE = {{ $value }}
            LABELS = {{ $labels }}

      # A replication slot has been reported inactive for 1 minute.
      - alert: PostgresqlUnusedReplicationSlot
        expr: >-
          pg_replication_slots_active == 0
        for: 1m
        labels:
          severity: warning
        annotations:
          summary: PostgreSQL unused replication slot (instance {{ $labels.instance }})
          description: |-
            Unused Replication Slots.

            VALUE = {{ $value }}
            LABELS = {{ $labels }}

      # Only tables with more than 10000 dead tuples are considered; of those,
      # fire when dead tuples are at least 10% of all tuples.
      - alert: PostgresqlTooManyDeadTuples
        expr: >-
          (
            (pg_stat_user_tables_n_dead_tup > 10000)
            /
            (pg_stat_user_tables_n_live_tup + pg_stat_user_tables_n_dead_tup)
          )
          >= 0.1
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: PostgreSQL too many dead tuples (instance {{ $labels.instance }})
          description: |-
            PostgreSQL number of dead tuples is too large.

            VALUE = {{ $value }}
            LABELS = {{ $labels }}

      # NOTE(review): `sum(...)` without a `by` clause drops every label, so
      # `{{ $labels.instance }}` in the summary renders empty - confirm.
      - alert: PostgresqlSslCompressionActive
        expr: >-
          sum(pg_stat_ssl_compression) > 0
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: Postgresql SSL compression active (instance {{ $labels.instance }})
          description: |-
            Database connections with SSL compression is enabled. This may add a
            significant jitter in the replication delay.

            VALUE = {{ $value }}
            LABELS = {{ $labels }}

      # Locks held as a fraction of
      # max_locks_per_transaction * max_connections; threshold is 20%.
      # Fix: numerator and denominator are aggregated to the same label set
      # (instance, job, server). The previous label-less `sum (pg_locks_count)`
      # could not be paired with the labelled settings series, so the division
      # returned no samples and the alert could not fire.
      - alert: PostgresqlTooManyLocksAcquired
        expr: >-
          (
            sum by (instance, job, server) (pg_locks_count)
            /
            min by (instance, job, server) (
              pg_settings_max_locks_per_transaction * pg_settings_max_connections
            )
          )
          > 0.20
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: PostgreSQL too many locks acquired (instance {{ $labels.instance }})
          description: |-
            Too many locks acquired on the database.

            VALUE = {{ $value }}
            LABELS = {{ $labels }}

      # Index bloat above 80%, restricted (matching on idxname) to indexes whose
      # reported real size exceeds 100000000.
      - alert: PostgresqlBloatIndexHigh
        expr: >-
          pg_bloat_btree_bloat_pct > 80
          and
          on (idxname) (pg_bloat_btree_real_size > 100000000)
        for: 1h
        labels:
          severity: warning
        annotations:
          summary: PostgreSQL bloat index high (> 80%) (instance {{ $labels.instance }})
          description: |-
            The index {{ $labels.idxname }} is bloated. You should execute
            `REINDEX INDEX CONCURRENTLY {{ $labels.idxname }};`

            VALUE = {{ $value }}
            LABELS = {{ $labels }}

      # Table bloat above 80%, restricted (matching on relname) to tables whose
      # reported real size exceeds 200000000.
      - alert: PostgresqlBloatTableHigh
        expr: >-
          pg_bloat_table_bloat_pct > 80
          and
          on (relname) (pg_bloat_table_real_size > 200000000)
        for: 1h
        labels:
          severity: warning
        annotations:
          summary: PostgreSQL bloat table high (> 80%) (instance {{ $labels.instance }})
          description: |-
            The table {{ $labels.relname }} is bloated. You should execute
            `VACUUM {{ $labels.relname }};`

            VALUE = {{ $value }}
            LABELS = {{ $labels }}