Skip to content
Merged
Show file tree
Hide file tree
Changes from 6 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
254 changes: 247 additions & 7 deletions postgres_mixin/alerts/postgres.libsonnet
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
{
prometheusAlerts+:: {
prometheusAlerts+: {
groups+: [
{
name: 'PostgreSQL',
Expand Down Expand Up @@ -63,7 +63,7 @@
expr: |||
avg by (datname) (
rate (
pg_stat_activity_max_tx_duration{datname!~"template.*",%(postgresExporterSelector)s}[2m]
pg_stat_activity_max_tx_duration{%(dbNameFilter)s,%(postgresExporterSelector)s}[2m]
)
) > 2 * 60
||| % $._config,
Expand All @@ -81,11 +81,11 @@
expr: |||
avg by (datname) (
irate(
pg_stat_database_xact_commit{datname!~"template.*",%(postgresExporterSelector)s}[5m]
pg_stat_database_xact_commit{%(dbNameFilter)s,%(postgresExporterSelector)s}[5m]
)
+
irate(
pg_stat_database_xact_rollback{datname!~"template.*",%(postgresExporterSelector)s}[5m]
pg_stat_database_xact_rollback{%(dbNameFilter)s,%(postgresExporterSelector)s}[5m]
)
) > 10000
||| % $._config,
Expand All @@ -102,15 +102,15 @@
},
expr: |||
avg by (datname) (
rate(pg_stat_database_blks_hit{datname!~"template.*",%(postgresExporterSelector)s}[5m])
rate(pg_stat_database_blks_hit{%(dbNameFilter)s,%(postgresExporterSelector)s}[5m])
/
(
rate(
pg_stat_database_blks_hit{datname!~"template.*",%(postgresExporterSelector)s}[5m]
pg_stat_database_blks_hit{%(dbNameFilter)s,%(postgresExporterSelector)s}[5m]
)
+
rate(
pg_stat_database_blks_read{datname!~"template.*",%(postgresExporterSelector)s}[5m]
pg_stat_database_blks_read{%(dbNameFilter)s,%(postgresExporterSelector)s}[5m]
)
)
) < 0.98
Expand All @@ -120,6 +120,246 @@
severity: 'warning',
},
},
// Warns when rollbacks exceed 10% of all transactions (rollbacks / (commits + rollbacks),
// each taken as a 5m rate), averaged across pod and instance, sustained for 5 minutes.
// Every selector carries both dbNameFilter and postgresExporterSelector, matching the
// other templated alerts in this group, so a configured exporter selector scopes this alert too.
{
  alert: 'PostgresHasTooManyRollbacks',
  annotations: {
    description: 'PostgreSQL has too many rollbacks on {{ $labels.cluster }} for database {{ $labels.datname }} with a value of {{ $value }}',
    summary: 'PostgreSQL has too many rollbacks.',
  },
  expr: |||
    avg without(pod, instance)
    (rate(pg_stat_database_xact_rollback{%(dbNameFilter)s,%(postgresExporterSelector)s}[5m]) /
    (rate(pg_stat_database_xact_commit{%(dbNameFilter)s,%(postgresExporterSelector)s}[5m]) + rate(pg_stat_database_xact_rollback{%(dbNameFilter)s,%(postgresExporterSelector)s}[5m]))) > 0.10
  ||| % $._config,
  'for': '5m',
  labels: {
    severity: 'warning',
  },
},
// Warns when the deadlock rate (5m rate scaled by 60, i.e. per minute) exceeds 5,
// taking the maximum across pod and instance, sustained for 5 minutes.
// The selector carries both dbNameFilter and postgresExporterSelector, matching the
// other templated alerts in this group.
{
  alert: 'PostgresHasHighDeadLocks',
  annotations: {
    description: 'PostgreSQL has too high deadlocks on {{ $labels.cluster }} for database {{ $labels.datname }} with a value of {{ $value }}',
    summary: 'PostgreSQL has high number of deadlocks.',
  },
  expr: |||
    max without(pod, instance) (rate(pg_stat_database_deadlocks{%(dbNameFilter)s,%(postgresExporterSelector)s}[5m]) * 60) > 5
  ||| % $._config,
  'for': '5m',
  labels: {
    severity: 'warning',
  },
},
// Warns when the lock count exceeds 20% of
// pg_settings_max_locks_per_transaction * pg_settings_max_connections, joined on
// (instance, namespace), sustained for 5 minutes.
// The pg_locks_count selector carries both dbNameFilter and postgresExporterSelector,
// matching the other templated alerts in this group. The pg_settings_* selectors are
// left unfiltered, as before.
{
  alert: 'PostgresAcquiredTooManyLocks',
  annotations: {
    description: 'PostgreSQL has acquired too many locks on {{ $labels.cluster }} for database {{ $labels.datname }} with a value of {{ $value }}',
    summary: 'PostgreSQL has high number of acquired locks.',
  },
  expr: |||
    max by( server, job, datname, namespace) ((pg_locks_count{%(dbNameFilter)s,%(postgresExporterSelector)s}) /
    on(instance, namespace) group_left(server) (pg_settings_max_locks_per_transaction{} * pg_settings_max_connections{})) > 0.20
  ||| % $._config,
  'for': '5m',
  labels: {
    severity: 'warning',
  },
},
// Critical alert: the 5m rate of pg_xlog_position_bytes stays below 200000 for 5 minutes.
// NOTE(review): the unit is presumably bytes per second, going by the metric name; confirm.
// The expression is not restricted by dbNameFilter or postgresExporterSelector.
{
alert: 'PostgresXLOGConsumptionVeryLow',
annotations: {
description: 'PostgreSQL instance {{ $labels.instance }} has a very low XLOG consumption rate.',
summary: 'PostgreSQL XLOG consumption is very low.',
},
expr: 'rate(pg_xlog_position_bytes{}[5m]) < 200000',
'for': '5m',
labels: {
severity: 'critical',
},
},
// Critical alert: the 2m rate of pg_xlog_position_bytes exceeds 36700160
// (= 35 * 1024 * 1024) for 10 minutes. The `and on (instance)` clause limits it to
// instances where pg_replication_is_replica == 0.
{
alert: 'PostgresXLOGConsumptionVeryHigh',
annotations: {
description: '{{ $labels.instance }} is experiencing very high XLOG consumption rate, which might indicate excessive write operations.',
summary: 'PostgreSQL very high XLOG consumption rate.',
},
expr: 'rate(pg_xlog_position_bytes{}[2m]) > 36700160 and on (instance) (pg_replication_is_replica{} == 0)',
'for': '10m',
labels: {
severity: 'critical',
},
},
// Critical alert: pg_stat_replication_pg_xlog_location_diff is non-zero for 5 minutes.
// NOTE(review): a non-zero location diff reads like replication lag rather than a full
// stop; confirm the threshold matches the intent stated in the annotations.
{
alert: 'PostgresReplicationStopped',
annotations: {
description: 'PostgreSQL instance {{ $labels.instance }} has stopped replication.',
summary: 'PostgreSQL replication has stopped.',
},
expr: 'pg_stat_replication_pg_xlog_location_diff{} != 0',
'for': '5m',
labels: {
severity: 'critical',
},
},
// Warning alert: pg_replication_lag is above 3600 for 5 minutes, limited via
// `and on (instance)` to instances where pg_replication_is_replica == 1.
// The annotations describe the 3600 threshold as one hour.
{
alert: 'PostgresReplicationLaggingMore1Hour',
annotations: {
description: '{{ $labels.instance }} replication lag exceeds 1 hour. Check for network issues or load imbalances.',
summary: 'PostgreSQL replication lagging more than 1 hour.',
},
expr: '(pg_replication_lag{} > 3600) and on (instance) (pg_replication_is_replica{} == 1)',
'for': '5m',
labels: {
severity: 'warning',
},
},
// Critical alert: pg_xlog_position_bytes on non-replicas (is_replica == 0) minus the
// same metric on replicas (is_replica == 1), matched on (job, service), exceeds 1e+09
// for 5 minutes. The expression is a plain text block; no $._config templating is applied.
{
alert: 'PostgresReplicationLagBytesAreTooLarge',
annotations: {
description: '{{ $labels.instance }} replication lag in bytes is too large, which might indicate replication issues or network bottlenecks.',
summary: 'PostgreSQL replication lag in bytes too large.',
},
expr: |||
(pg_xlog_position_bytes{} and pg_replication_is_replica{} == 0)
- on (job, service) group_right(instance) (
pg_xlog_position_bytes{} and pg_replication_is_replica{} == 1
) > 1e+09
|||,
'for': '5m',
labels: {
severity: 'critical',
},
},
// Critical alert: pg_replication_slots_active equals 0 for 30 minutes.
// NOTE(review): the alert name says "SlotUsed", but the expression and annotations
// describe slots that are NOT in use. The name is left as-is here because alert names
// may be referenced by routing or silencing rules; consider a coordinated rename.
{
alert: 'PostgresHasReplicationSlotUsed',
annotations: {
description: '{{ $labels.instance }} has replication slots that are not used, which might lead to replication lag or data inconsistency.',
summary: 'PostgreSQL has unused replication slots.',
},
expr: 'pg_replication_slots_active{} == 0',
'for': '30m',
labels: {
severity: 'critical',
},
},
// Critical alert: pg_replication_slots_xmin_age exceeds 20000 for 30 minutes.
// Only slots whose slot_name matches ^repmgr_slot_[0-9]+ are considered, so slots
// with other names never trigger this alert.
{
alert: 'PostgresReplicationIsStale',
annotations: {
description: '{{ $labels.instance }} replication slots have not been updated for a significant period, indicating potential issues with replication.',
summary: 'PostgreSQL replication slots are stale.',
},
expr: 'pg_replication_slots_xmin_age{slot_name =~ "^repmgr_slot_[0-9]+"} > 20000',
'for': '30m',
labels: {
severity: 'critical',
},
},
// Warning alert: pg_replication_is_replica changed value at least once within the
// last 1 minute. No 'for' field is set on this rule, unlike its neighbours.
{
alert: 'PostgresReplicationRoleChanged',
annotations: {
description: '{{ $labels.instance }} replication role has changed. Verify if this is expected or if it indicates a failover.',
summary: 'PostgreSQL replication role change detected.',
},
expr: 'pg_replication_is_replica{} and changes(pg_replication_is_replica{}[1m]) > 0',
labels: {
severity: 'warning',
},
},
// Critical alert: pg_exporter_last_scrape_error is greater than 0 for 30 minutes.
{
alert: 'PostgresHasExporterErrors',
annotations: {
description: '{{ $labels.instance }} exporter is experiencing errors. Verify exporter health and configuration.',
summary: 'PostgreSQL exporter errors detected.',
},
expr: 'pg_exporter_last_scrape_error{} > 0',
'for': '30m',
labels: {
severity: 'critical',
},
},
// Warns when dead tuples (summed over relname) number more than 10000 AND make up at
// least 10% of live + dead tuples, sustained for 5 minutes. The `> 0` guard on the
// denominator drops series with no tuples, and `unless on(instance)` excludes
// instances where pg_replication_is_replica == 1.
// The pg_stat_user_tables_* selectors carry both dbNameFilter and
// postgresExporterSelector, matching the other templated alerts in this group.
{
  alert: 'PostgresHasTooManyDeadTuples',
  annotations: {
    description: '{{ $labels.instance }} has too many dead tuples, which may lead to inefficient query performance. Consider vacuuming the database.',
    summary: 'PostgreSQL has too many dead tuples.',
  },
  expr: |||
    (sum without(relname) (
    pg_stat_user_tables_n_dead_tup{%(dbNameFilter)s,%(postgresExporterSelector)s}
    ) > 10000) /
    ((sum without(relname) (
    pg_stat_user_tables_n_live_tup{%(dbNameFilter)s,%(postgresExporterSelector)s}
    ) + sum without(relname)(
    pg_stat_user_tables_n_dead_tup{%(dbNameFilter)s,%(postgresExporterSelector)s}
    )) > 0) >= 0.1 unless on(instance) (
    pg_replication_is_replica{} == 1
    )
  ||| % $._config,
  'for': '5m',
  labels: {
    severity: 'warning',
  },
},
// Critical alert: for 30 minutes, tables whose dead-tuple count exceeds
// n_live_tup * autovacuum_vacuum_scale_factor + autovacuum_vacuum_threshold have a
// sample timestamp older than time() - 36000 (10 hours).
// This rule was previously defined twice, byte-for-byte identical, in the same group;
// it is declared once here so the group does not evaluate the same rule twice.
// NOTE(review): the alert name misspells "Vacuumed"; it is kept unchanged because alert
// names may be referenced by routing or silencing rules.
// NOTE(review): timestamp() returns the timestamp of the sample itself; confirm that
// comparing it with time() - 36000 can actually become true in practice.
{
  alert: 'PostgresTablesNotVaccumed',
  annotations: {
    description: '{{ $labels.instance }} tables have not been vacuumed recently, which may lead to performance degradation.',
    summary: 'PostgreSQL tables not vacuumed.',
  },
  expr: |||
    group without(pod, instance)(
    timestamp(
    pg_stat_user_tables_n_dead_tup{} >
    pg_stat_user_tables_n_live_tup{}
    * on(namespace, job, service, instance, server) group_left pg_settings_autovacuum_vacuum_scale_factor{}
    + on(namespace, job, service, instance, server) group_left pg_settings_autovacuum_vacuum_threshold{}
    )
    < time() - 36000
    )
  |||,
  'for': '30m',
  labels: {
    severity: 'critical',
  },
},
// Warning alert: timed checkpoints fall below 50% of all checkpoints for 5 minutes.
// The ratio is timed / (timed + requested), each taken as a 5m rate, so a value
// under 0.5 means requested checkpoints outnumber timed ones.
{
alert: 'PostgresTooManyCheckpointsRequested',
annotations: {
description: '{{ $labels.instance }} is requesting too many checkpoints, which may lead to performance degradation.',
summary: 'PostgreSQL too many checkpoints requested.',
},
expr: |||
rate(pg_stat_bgwriter_checkpoints_timed_total{}[5m]) /
(rate(pg_stat_bgwriter_checkpoints_timed_total{}[5m]) + rate(pg_stat_bgwriter_checkpoints_req_total{}[5m]))
< 0.5
|||,
'for': '5m',
labels: {
severity: 'warning',
},
},
],
},
],
Expand Down
1 change: 1 addition & 0 deletions postgres_mixin/config.libsonnet
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
// Mixin configuration defaults. Alert expressions interpolate these fields with
// `% $._config`, so each value must be a valid PromQL label-matcher fragment.
{
_config+:: {
// Label matcher applied to per-database metrics; the default excludes databases
// whose datname matches template.*.
dbNameFilter: 'datname!~"template.*"',
// Additional label matcher(s) appended after dbNameFilter in alert selectors.
// Empty by default, which adds no extra restriction.
postgresExporterSelector: '',
},
}
2 changes: 1 addition & 1 deletion postgres_mixin/dashboards/dashboards.libsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,4 @@
grafanaDashboards+:: {
'postgres-overview.json': (import 'postgres-overview.json'),
},
}
}