about summary refs log tree commit diff
path: root/modules/nixos/monitoring/rules/postgres.yaml
diff options
context:
space:
mode:
Diffstat (limited to 'modules/nixos/monitoring/rules/postgres.yaml')
-rw-r--r--modules/nixos/monitoring/rules/postgres.yaml310
1 files changed, 310 insertions, 0 deletions
diff --git a/modules/nixos/monitoring/rules/postgres.yaml b/modules/nixos/monitoring/rules/postgres.yaml
new file mode 100644
index 0000000..5d360fa
--- /dev/null
+++ b/modules/nixos/monitoring/rules/postgres.yaml
@@ -0,0 +1,310 @@
+---
+# Alerting rules (alert / expr / for / labels / annotations) evaluated against
+# PostgreSQL metrics. NOTE(review): the pg_* metric names look like the ones
+# exposed by postgres_exporter - confirm against the deployed exporter.
+groups:
+  # Single rule group; every rule below is listed under its `rules` key.
+  - name: Postgres
+
+    rules:
+      # Fires immediately (for: 0m) whenever pg_up reports 0.
+      - alert: PostgresqlDown
+        expr: pg_up == 0
+        for: 0m
+        labels:
+          severity: critical
+        annotations:
+          summary: 'PostgreSQL is down (instance {{ $labels.instance }})'
+          description: |-
+            Postgresql instance is down.
+
+              VALUE = {{ $value }}
+              LABELS = {{ $labels }}
+
+      # Fires while the reported postmaster start time is less than 60 seconds
+      # before the evaluation time.
+      - alert: PostgresqlRestarted
+        expr: time() - pg_postmaster_start_time_seconds < 60
+        for: 0m
+        labels:
+          severity: critical
+        annotations:
+          summary: 'PostgreSQL restarted (instance {{ $labels.instance }})'
+          description: |-
+            PostgreSQL restarted.
+
+              VALUE = {{ $value }}
+              LABELS = {{ $labels }}
+
+      # Fires as soon as pg_exporter_last_scrape_error is above 0.
+      - alert: PostgresqlExporterError
+        expr: pg_exporter_last_scrape_error > 0
+        for: 0m
+        labels:
+          severity: critical
+        annotations:
+          summary: 'PostgreSQL exporter error (instance {{ $labels.instance }})'
+          description: |-
+            PostgreSQL exporter is showing errors. A query may be buggy in query.yaml.
+
+              VALUE = {{ $value }}
+              LABELS = {{ $labels }}
+
+      # Tables with a non-zero last-autovacuum timestamp that is more than
+      # 60 * 60 * 24 * 10 seconds (10 days) old. Tables whose timestamp is 0
+      # are excluded by the first operand.
+      - alert: PostgresqlTableNotAutoVacuumed
+        expr: >-
+          (pg_stat_user_tables_last_autovacuum > 0)
+          and (time() - pg_stat_user_tables_last_autovacuum)
+          > 60 * 60 * 24 * 10
+        for: 0m
+        labels:
+          severity: warning
+        annotations:
+          summary: 'PostgreSQL table not auto vacuumed (instance {{ $labels.instance }})'
+          description: |-
+            Table {{ $labels.relname }} has not been auto vacuumed for 10 days.
+
+              VALUE = {{ $value }}
+              LABELS = {{ $labels }}
+
+      # Tables with a non-zero last-autoanalyze timestamp that is more than
+      # 24 * 60 * 60 * 10 seconds (10 days) old. Tables whose timestamp is 0
+      # are excluded by the first operand.
+      - alert: PostgresqlTableNotAutoAnalyzed
+        expr: >-
+          (pg_stat_user_tables_last_autoanalyze > 0)
+          and
+          (time() - pg_stat_user_tables_last_autoanalyze)
+          > 24 * 60 * 60 * 10
+        for: 0m
+        labels:
+          severity: warning
+        annotations:
+          summary: PostgreSQL table not auto analyzed (instance {{ $labels.instance }})
+          description: |-
+            Table {{ $labels.relname }} has not been auto analyzed for 10 days.
+
+              VALUE = {{ $value }}
+              LABELS = {{ $labels }}
+
+      # Connection usage above 80% of max_connections for 2 minutes.
+      # max_connections is a server-wide limit, so connections are totalled per
+      # instance across every database. Both operands are aggregated down to
+      # the `instance` label because PromQL only compares series whose label
+      # sets match; a sum grouped by datname alone has nothing to match
+      # against. NOTE(review): assumes pg_settings_max_connections is exported
+      # with an `instance` label - confirm against the exporter.
+      - alert: PostgresqlTooManyConnections
+        expr: >-
+          sum by (instance) (pg_stat_activity_count)
+          > min by (instance) (pg_settings_max_connections) * 0.8
+        for: 2m
+        labels:
+          severity: warning
+        annotations:
+          summary: Postgresql too many connections (instance {{ $labels.instance }})
+          description: |-
+            PostgreSQL instance has too many connections (> 80%).
+
+              VALUE = {{ $value }}
+              LABELS = {{ $labels }}
+
+      # A non-template, non-`postgres` database with fewer than 1 connection
+      # for 2 minutes. The sum is grouped by instance as well as datname so the
+      # alert keeps the instance label printed in the summary and so
+      # same-named databases on different servers are evaluated separately.
+      - alert: PostgresqlNotEnoughConnections
+        expr: >-
+          sum by (instance, datname) (pg_stat_activity_count{datname!~"template.*|postgres"})
+          < 1
+        for: 2m
+        labels:
+          severity: warning
+        annotations:
+          summary: Postgresql not enough connections (instance {{ $labels.instance }})
+          description: |-
+            PostgreSQL instance should have more connections (> 1).
+
+              VALUE = {{ $value }}
+              LABELS = {{ $labels }}
+
+      # More than 5 deadlocks counted within a 1-minute window on any
+      # non-template, non-`postgres` database.
+      - alert: PostgresqlDeadLocks
+        expr: >-
+          increase(pg_stat_database_deadlocks{datname!~"template.*|postgres"}[1m]) > 5
+        for: 0m
+        labels:
+          severity: warning
+        annotations:
+          summary: 'Postgresql dead locks (instance {{ $labels.instance }})'
+          description: |-
+            PostgreSQL has dead-locks.
+
+              VALUE = {{ $value }}
+              LABELS = {{ $labels }}
+
+      # Rollbacks as a share of all finished transactions (rollback + commit)
+      # over a 3-minute window, alerting above 2%. `instance` is part of the
+      # grouping so that ratios belonging to different servers are never added
+      # together and the summary's instance label resolves to a value.
+      - alert: PostgresqlHighRollbackRate
+        expr: >-
+          sum by (namespace, datname, instance)
+            (
+              (rate(pg_stat_database_xact_rollback{datname!~"template.*|postgres",datid!="0"}[3m]))
+              /
+              (
+                (rate(pg_stat_database_xact_rollback{datname!~"template.*|postgres",datid!="0"}[3m]))
+                +
+                (rate(pg_stat_database_xact_commit{datname!~"template.*|postgres",datid!="0"}[3m]))
+              )
+            )
+          > 0.02
+        for: 0m
+        labels:
+          severity: warning
+        annotations:
+          summary: PostgreSQL is at a high rollback rate (instance {{ $labels.instance }})
+          description: |-
+            Ratio of transactions being aborted compared to committed is > 2%.
+
+              VALUE = {{ $value }}
+              LABELS = {{ $labels }}
+
+      # Commit rate below 10 per second for 2 minutes. The selector is the same
+      # one PostgresqlHighRollbackRate uses: template databases, the `postgres`
+      # database and the datid="0" series are skipped. They are normally idle
+      # and would otherwise keep this critical alert firing permanently.
+      - alert: PostgresqlCommitRateLow
+        expr: >-
+          rate(pg_stat_database_xact_commit{datname!~"template.*|postgres",datid!="0"}[1m])
+          < 10
+        for: 2m
+        labels:
+          severity: critical
+        annotations:
+          summary: PostgreSQL commit rate low (instance {{ $labels.instance }})
+          description: |-
+            PostgreSQL seems to be processing very few transactions.
+
+              VALUE = {{ $value }}
+              LABELS = {{ $labels }}
+
+      # pg_txid_current advancing by fewer than 5 per second (1-minute rate),
+      # held for 2 minutes.
+      - alert: PostgresqlLowXidConsumption
+        expr: rate(pg_txid_current[1m]) < 5
+        for: 2m
+        labels:
+          severity: warning
+        annotations:
+          summary: 'PostgreSQL low XID consumption (instance {{ $labels.instance }})'
+          description: |-
+            PostgreSQL seems to be consuming transaction IDs very slowly.
+
+              VALUE = {{ $value }}
+              LABELS = {{ $labels }}
+
+      # More than 3 statement_timeout errors per second (1-minute rate).
+      - alert: PostgresqlHighRateStatementTimeout
+        expr: >-
+          rate(postgresql_errors_total{type="statement_timeout"}[1m]) > 3
+        for: 0m
+        labels:
+          severity: critical
+        annotations:
+          summary: 'PostgreSQL high rate statement timeout (instance {{ $labels.instance }})'
+          description: |-
+            PostgreSQL transactions showing high rate of statement timeouts.
+
+              VALUE = {{ $value }}
+              LABELS = {{ $labels }}
+
+      # More than 1 deadlock_detected error counted within a 1-minute window.
+      - alert: PostgresqlHighRateDeadlock
+        expr: >-
+          increase(postgresql_errors_total{type="deadlock_detected"}[1m]) > 1
+        for: 0m
+        labels:
+          severity: critical
+        annotations:
+          summary: 'PostgreSQL high rate deadlock (instance {{ $labels.instance }})'
+          description: |-
+            PostgreSQL detected deadlocks.
+
+              VALUE = {{ $value }}
+              LABELS = {{ $labels }}
+
+      # A replication slot whose pg_replication_slots_active value has been 0
+      # for 1 minute.
+      - alert: PostgresqlUnusedReplicationSlot
+        expr: pg_replication_slots_active == 0
+        for: 1m
+        labels:
+          severity: warning
+        annotations:
+          summary: 'PostgreSQL unused replication slot (instance {{ $labels.instance }})'
+          description: |-
+            Unused Replication Slots.
+
+              VALUE = {{ $value }}
+              LABELS = {{ $labels }}
+
+      # Tables holding more than 10000 dead tuples where dead tuples make up at
+      # least 10% of all (live + dead) tuples, held for 2 minutes. The
+      # `> 10000` comparison acts as a filter, so smaller tables never reach
+      # the ratio check.
+      - alert: PostgresqlTooManyDeadTuples
+        expr: >-
+          (
+            (pg_stat_user_tables_n_dead_tup > 10000)
+            /
+            (pg_stat_user_tables_n_live_tup + pg_stat_user_tables_n_dead_tup)
+          )
+          >= 0.1
+        for: 2m
+        labels:
+          severity: warning
+        annotations:
+          summary: 'PostgreSQL too many dead tuples (instance {{ $labels.instance }})'
+          description: |-
+            PostgreSQL number of dead tuples is too large.
+
+              VALUE = {{ $value }}
+              LABELS = {{ $labels }}
+
+      # Any connection reporting SSL compression. The sum is grouped by
+      # instance: a bare sum() strips every label, which would leave the
+      # instance in the summary empty and merge all servers into one alert.
+      - alert: PostgresqlSslCompressionActive
+        expr: >-
+          sum by (instance) (pg_stat_ssl_compression) > 0
+        for: 0m
+        labels:
+          severity: critical
+        annotations:
+          summary: Postgresql SSL compression active (instance {{ $labels.instance }})
+          description: |-
+            Database connections with SSL compression is enabled. This may add a
+            significant jitter in the replication delay.
+
+              VALUE = {{ $value }}
+              LABELS = {{ $labels }}
+
+      # Locks held as a fraction of the lock-table capacity
+      # (max_locks_per_transaction * max_connections), alerting above 20% for
+      # 2 minutes. Locks are summed per instance and the division is matched
+      # `on (instance)`: a label-less sum() has no label set in common with the
+      # settings series, so the division would produce no result at all.
+      # NOTE(review): assumes each pg_settings_* metric has exactly one series
+      # per instance - confirm against the exporter.
+      - alert: PostgresqlTooManyLocksAcquired
+        expr: >-
+          (
+            sum by (instance) (pg_locks_count)
+            / on (instance)
+            (pg_settings_max_locks_per_transaction * pg_settings_max_connections)
+          )
+          > 0.20
+        for: 2m
+        labels:
+          severity: critical
+        annotations:
+          summary: PostgreSQL too many locks acquired (instance {{ $labels.instance }})
+          description: |-
+            Too many locks acquired on the database.
+
+              VALUE = {{ $value }}
+              LABELS = {{ $labels }}
+
+      # Indexes whose bloat percentage is above 80 and whose real size is above
+      # 100000000 (presumably bytes - confirm), held for 1 hour. The two
+      # conditions are joined on the idxname label only.
+      - alert: PostgresqlBloatIndexHigh
+        expr: >-
+          pg_bloat_btree_bloat_pct > 80
+          and on (idxname) (pg_bloat_btree_real_size > 100000000)
+        for: 1h
+        labels:
+          severity: warning
+        annotations:
+          summary: 'PostgreSQL bloat index high (> 80%) (instance {{ $labels.instance }})'
+          description: |-
+            The index {{ $labels.idxname }} is bloated. You should execute
+            `REINDEX INDEX CONCURRENTLY {{ $labels.idxname }};`
+
+              VALUE = {{ $value }}
+              LABELS = {{ $labels }}
+
+      # Tables whose bloat percentage is above 80 and whose real size is above
+      # 200000000 (presumably bytes - confirm), held for 1 hour. The two
+      # conditions are joined on the relname label only.
+      - alert: PostgresqlBloatTableHigh
+        expr: >-
+          pg_bloat_table_bloat_pct > 80
+          and on (relname) (pg_bloat_table_real_size > 200000000)
+        for: 1h
+        labels:
+          severity: warning
+        annotations:
+          summary: 'PostgreSQL bloat table high (> 80%) (instance {{ $labels.instance }})'
+          description: |-
+            The table {{ $labels.relname }} is bloated. You should execute
+            `VACUUM {{ $labels.relname }};`
+
+              VALUE = {{ $value }}
+              LABELS = {{ $labels }}

Consider giving Nix/NixOS a try! <3