Skip to content

Add variables for grouping - Postgres mixin #22

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 6 commits on Aug 21, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
63 changes: 35 additions & 28 deletions postgres_mixin/alerts/postgres.libsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -7,16 +7,16 @@
{
alert: 'PostgreSQLMaxConnectionsReached',
annotations: {
description: '{{ $labels.instance }} is exceeding the currently configured maximum Postgres connection limit (current value: {{ $value }}s). Services may be degraded - please take immediate action (you probably need to increase max_connections in the Docker image and re-deploy.',
description: '{{ $labels.instance }} is exceeding the currently configured maximum Postgres connection limit (current value: {{ $value }}s). Services may be degraded - please take immediate action (you probably need to increase max_connections in the Docker image and re-deploy).',
summary: 'Postgres connections count is over the maximum amount.',
},
expr: |||
sum by (instance) (pg_stat_activity_count{%(postgresExporterSelector)s})
sum by (%(agg)s) (pg_stat_activity_count{%(postgresExporterSelector)s})
>=
sum by (instance) (pg_settings_max_connections{%(postgresExporterSelector)s})
sum by (%(agg)s) (pg_settings_max_connections{%(postgresExporterSelector)s})
-
sum by (instance) (pg_settings_superuser_reserved_connections{%(postgresExporterSelector)s})
||| % $._config,
sum by (%(agg)s) (pg_settings_superuser_reserved_connections{%(postgresExporterSelector)s})
||| % $._config { agg: std.join(', ', $._config.groupLabels + $._config.instanceLabels) },
'for': '1m',
labels: {
severity: 'warning',
Expand All @@ -29,14 +29,14 @@
summary: 'Postgres connections count is over 80% of maximum amount.',
},
expr: |||
sum by (instance) (pg_stat_activity_count{%(postgresExporterSelector)s})
sum by (%(agg)s) (pg_stat_activity_count{%(postgresExporterSelector)s})
>
(
sum by (instance) (pg_settings_max_connections{%(postgresExporterSelector)s})
sum by (%(agg)s) (pg_settings_max_connections{%(postgresExporterSelector)s})
-
sum by (instance) (pg_settings_superuser_reserved_connections{%(postgresExporterSelector)s})
sum by (%(agg)s) (pg_settings_superuser_reserved_connections{%(postgresExporterSelector)s})
) * 0.8
||| % $._config,
||| % $._config { agg: std.join(', ', $._config.groupLabels + $._config.instanceLabels) },
'for': '10m',
labels: {
severity: 'warning',
Expand All @@ -61,12 +61,12 @@
summary: 'PostgreSQL high number of slow queries.',
},
expr: |||
avg by (datname) (
avg by (datname, %(agg)s) (
rate (
pg_stat_activity_max_tx_duration{%(dbNameFilter)s,%(postgresExporterSelector)s}[2m]
pg_stat_activity_max_tx_duration{%(dbNameFilter)s, %(postgresExporterSelector)s}[2m]
)
) > 2 * 60
||| % $._config,
||| % $._config { agg: std.join(', ', $._config.groupLabels + $._config.instanceLabels) },
'for': '2m',
labels: {
severity: 'warning',
Expand All @@ -79,16 +79,16 @@
summary: 'PostgreSQL high number of queries per second.',
},
expr: |||
avg by (datname) (
avg by (datname, %(agg)s) (
irate(
pg_stat_database_xact_commit{%(dbNameFilter)s,%(postgresExporterSelector)s}[5m]
pg_stat_database_xact_commit{%(dbNameFilter)s, %(postgresExporterSelector)s}[5m]
)
+
irate(
pg_stat_database_xact_rollback{%(dbNameFilter)s,%(postgresExporterSelector)s}[5m]
pg_stat_database_xact_rollback{%(dbNameFilter)s, %(postgresExporterSelector)s}[5m]
)
) > 10000
||| % $._config,
||| % $._config { agg: std.join(', ', $._config.groupLabels + $._config.instanceLabels) },
'for': '5m',
labels: {
severity: 'warning',
Expand All @@ -101,20 +101,20 @@
summary: 'PostgreSQL low cache hit rate.',
},
expr: |||
avg by (datname) (
rate(pg_stat_database_blks_hit{%(dbNameFilter)s,%(postgresExporterSelector)s}[5m])
avg by (datname, %(agg)s) (
rate(pg_stat_database_blks_hit{%(dbNameFilter)s, %(postgresExporterSelector)s}[5m])
/
(
rate(
pg_stat_database_blks_hit{%(dbNameFilter)s,%(postgresExporterSelector)s}[5m]
pg_stat_database_blks_hit{%(dbNameFilter)s, %(postgresExporterSelector)s}[5m]
)
+
rate(
pg_stat_database_blks_read{%(dbNameFilter)s,%(postgresExporterSelector)s}[5m]
pg_stat_database_blks_read{%(dbNameFilter)s, %(postgresExporterSelector)s}[5m]
)
)
) < 0.98
||| % $._config,
||| % $._config { agg: std.join(', ', $._config.groupLabels + $._config.instanceLabels) },
'for': '5m',
labels: {
severity: 'warning',
Expand Down Expand Up @@ -157,9 +157,14 @@
summary: 'PostgreSQL has high number of acquired locks.',
},
expr: |||
max by( server, job, datname, namespace) ((pg_locks_count{%(dbNameFilter)s}) /
on(instance, namespace) group_left(server) (pg_settings_max_locks_per_transaction{} * pg_settings_max_connections{})) > 0.20
||| % $._config,
max by(datname, %(agg)s) (
(pg_locks_count{%(dbNameFilter)s})
/
on(%(aggWithoutServer)s) group_left(server) (
pg_settings_max_locks_per_transaction{} * pg_settings_max_connections{}
)
) > 0.20
||| % $._config { agg: std.join(',', $._config.groupLabels + $._config.instanceLabels), aggWithoutServer: std.join(',', std.filter(function(x) x != "server", $._config.groupLabels + $._config.instanceLabels)) },
'for': '5m',
labels: {
severity: 'warning',
Expand All @@ -171,7 +176,9 @@
description: '{{ $labels.instance }} replication lag exceeds 1 hour. Check for network issues or load imbalances.',
summary: 'PostgreSQL replication lagging more than 1 hour.',
},
expr: '(pg_replication_lag{} > 3600) and on (instance) (pg_replication_is_replica{} == 1)',
expr: |||
(pg_replication_lag{} > 3600) and on (%(agg)s) (pg_replication_is_replica{} == 1)
||| % $._config { agg: std.join(', ', $._config.groupLabels + $._config.instanceLabels) },
'for': '5m',
labels: {
severity: 'warning',
Expand Down Expand Up @@ -223,12 +230,12 @@
timestamp(
pg_stat_user_tables_n_dead_tup{} >
pg_stat_user_tables_n_live_tup{}
* on(namespace, job, service, instance, server) group_left pg_settings_autovacuum_vacuum_scale_factor{}
+ on(namespace, job, service, instance, server) group_left pg_settings_autovacuum_vacuum_threshold{}
* on(%(agg)s) group_left pg_settings_autovacuum_vacuum_scale_factor{}
+ on(%(agg)s) group_left pg_settings_autovacuum_vacuum_threshold{}
)
< time() - 36000
)
|||,
||| % $._config { agg: std.join(', ', $._config.groupLabels + $._config.instanceLabels) },
'for': '30m',
labels: {
severity: 'critical',
Expand Down
5 changes: 4 additions & 1 deletion postgres_mixin/config.libsonnet
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
{
_config+:: {
dbNameFilter: 'datname!~"template.*"',
postgresExporterSelector: '',
postgresExporterSelector: 'job="integrations/postgres_exporter"',
groupLabels: if self.enableMultiCluster then ['job', 'cluster'] else ['job'],
instanceLabels: ['instance', 'server'],
enableMultiCluster: false,
},
}
2 changes: 1 addition & 1 deletion postgres_mixin/dashboards/dashboards.libsonnet
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
{
grafanaDashboards+:: {
'postgres-overview.json': (import 'postgres-overview.json'),
'postgresql-overview.json': (import 'postgresql-overview.json'),
},
}
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@
"repeatRowId": null,
"showTitle": true,
"span": 4,
"title": "Postgres Overview",
"title": "PostgreSQL overview",
"titleSize": "h6",
"type": "row"
},
Expand Down
Loading