Skip to content

Commit 9fe6b6d

Browse files
v-zhuravlev authored and cristiangreco committed
Update mixin to latest changes from grafana/postgres_exporter
Porting a bunch of PRs accumulated over time in grafana/postgres_exporter:
- grafana#11 (@v-zhuravlev)
- grafana#12 (@gaantunes)
- grafana#13 (@gaantunes)
- grafana#14 (@gaantunes)
- grafana#15 (@gaantunes)
- grafana#16 (@gaantunes)
- grafana#17 (@gaantunes)
- grafana#20 (@gaantunes)
- grafana#21 (@mshahzeb)
- grafana#22 (@mshahzeb)

Signed-off-by: Cristian Greco <cristian@regolo.cc>
1 parent 06a553c commit 9fe6b6d

File tree

4 files changed

+758
-552
lines changed

4 files changed

+758
-552
lines changed

postgres_mixin/.lint

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
---
2+
exclusions:
3+
panel-units-rule:
4+
reason: Ignoring so far, need to address this in future
5+
panel-title-description-rule:
6+
reason: Ignoring so far, need to address this in future
7+
panel-datasource-rule:
8+
reason: "Loki datasource variable is being named as loki_datasource now while linter expects 'datasource'"
9+
template-datasource-rule:
10+
reason: "Based on the new convention, we are using variable names prometheus_datasource and loki_datasource, whereas the linter expects 'datasource'"
11+
alert-name-camelcase:
12+
reason: QPS is a common acronym (Queries Per Second) and should be allowed
13+
entries:
14+
- alert: PostgreSQLQPS
15+

postgres_mixin/alerts/postgres.libsonnet

Lines changed: 163 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -7,16 +7,16 @@
77
{
88
alert: 'PostgreSQLMaxConnectionsReached',
99
annotations: {
10-
description: '{{ $labels.instance }} is exceeding the currently configured maximum Postgres connection limit (current value: {{ $value }}s). Services may be degraded - please take immediate action (you probably need to increase max_connections in the Docker image and re-deploy.',
11-
summary: '{{ $labels.instance }} has maxed out Postgres connections.',
10+
description: '{{ $labels.instance }} is exceeding the currently configured maximum Postgres connection limit (current value: {{ $value }}s). Services may be degraded - please take immediate action (you probably need to increase max_connections in the Docker image and re-deploy).',
11+
summary: 'Postgres connections count is over the maximum amount.',
1212
},
1313
expr: |||
14-
sum by (instance) (pg_stat_activity_count{%(postgresExporterSelector)s})
14+
sum by (%(agg)s) (pg_stat_activity_count{%(postgresExporterSelector)s})
1515
>=
16-
sum by (instance) (pg_settings_max_connections{%(postgresExporterSelector)s})
16+
sum by (%(agg)s) (pg_settings_max_connections{%(postgresExporterSelector)s})
1717
-
18-
sum by (instance) (pg_settings_superuser_reserved_connections{%(postgresExporterSelector)s})
19-
||| % $._config,
18+
sum by (%(agg)s) (pg_settings_superuser_reserved_connections{%(postgresExporterSelector)s})
19+
||| % $._config { agg: std.join(', ', $._config.groupLabels + $._config.instanceLabels) },
2020
'for': '1m',
2121
labels: {
2222
severity: 'warning',
@@ -26,17 +26,17 @@
2626
alert: 'PostgreSQLHighConnections',
2727
annotations: {
2828
description: '{{ $labels.instance }} is exceeding 80% of the currently configured maximum Postgres connection limit (current value: {{ $value }}s). Please check utilization graphs and confirm if this is normal service growth, abuse or an otherwise temporary condition or if new resources need to be provisioned (or the limits increased, which is most likely).',
29-
summary: '{{ $labels.instance }} is over 80% of max Postgres connections.',
29+
summary: 'Postgres connections count is over 80% of maximum amount.',
3030
},
3131
expr: |||
32-
sum by (instance) (pg_stat_activity_count{%(postgresExporterSelector)s})
32+
sum by (%(agg)s) (pg_stat_activity_count{%(postgresExporterSelector)s})
3333
>
3434
(
35-
sum by (instance) (pg_settings_max_connections{%(postgresExporterSelector)s})
35+
sum by (%(agg)s) (pg_settings_max_connections{%(postgresExporterSelector)s})
3636
-
37-
sum by (instance) (pg_settings_superuser_reserved_connections{%(postgresExporterSelector)s})
37+
sum by (%(agg)s) (pg_settings_superuser_reserved_connections{%(postgresExporterSelector)s})
3838
) * 0.8
39-
||| % $._config,
39+
||| % $._config { agg: std.join(', ', $._config.groupLabels + $._config.instanceLabels) },
4040
'for': '10m',
4141
labels: {
4242
severity: 'warning',
@@ -46,7 +46,7 @@
4646
alert: 'PostgreSQLDown',
4747
annotations: {
4848
description: '{{ $labels.instance }} is rejecting query requests from the exporter, and thus probably not allowing DNS requests to work either. User services should not be affected provided at least 1 node is still alive.',
49-
summary: 'PostgreSQL is not processing queries: {{ $labels.instance }}',
49+
summary: 'PostgreSQL is not processing queries.',
5050
},
5151
expr: 'pg_up{%(postgresExporterSelector)s} != 1' % $._config,
5252
'for': '1m',
@@ -58,15 +58,15 @@
5858
alert: 'PostgreSQLSlowQueries',
5959
annotations: {
6060
description: 'PostgreSQL high number of slow queries {{ $labels.cluster }} for database {{ $labels.datname }} with a value of {{ $value }} ',
61-
summary: 'PostgreSQL high number of slow on {{ $labels.cluster }} for database {{ $labels.datname }} ',
61+
summary: 'PostgreSQL high number of slow queries.',
6262
},
6363
expr: |||
64-
avg by (datname) (
64+
avg by (datname, %(agg)s) (
6565
rate (
66-
pg_stat_activity_max_tx_duration{datname!~"template.*",%(postgresExporterSelector)s}[2m]
66+
pg_stat_activity_max_tx_duration{%(dbNameFilter)s, %(postgresExporterSelector)s}[2m]
6767
)
6868
) > 2 * 60
69-
||| % $._config,
69+
||| % $._config { agg: std.join(', ', $._config.groupLabels + $._config.instanceLabels) },
7070
'for': '2m',
7171
labels: {
7272
severity: 'warning',
@@ -76,19 +76,19 @@
7676
alert: 'PostgreSQLQPS',
7777
annotations: {
7878
description: 'PostgreSQL high number of queries per second on {{ $labels.cluster }} for database {{ $labels.datname }} with a value of {{ $value }}',
79-
summary: 'PostgreSQL high number of queries per second {{ $labels.cluster }} for database {{ $labels.datname }}',
79+
summary: 'PostgreSQL high number of queries per second.',
8080
},
8181
expr: |||
82-
avg by (datname) (
82+
avg by (datname, %(agg)s) (
8383
irate(
84-
pg_stat_database_xact_commit{datname!~"template.*",%(postgresExporterSelector)s}[5m]
84+
pg_stat_database_xact_commit{%(dbNameFilter)s, %(postgresExporterSelector)s}[5m]
8585
)
8686
+
8787
irate(
88-
pg_stat_database_xact_rollback{datname!~"template.*",%(postgresExporterSelector)s}[5m]
88+
pg_stat_database_xact_rollback{%(dbNameFilter)s, %(postgresExporterSelector)s}[5m]
8989
)
9090
) > 10000
91-
||| % $._config,
91+
||| % $._config { agg: std.join(', ', $._config.groupLabels + $._config.instanceLabels) },
9292
'for': '5m',
9393
labels: {
9494
severity: 'warning',
@@ -98,28 +98,165 @@
9898
alert: 'PostgreSQLCacheHitRatio',
9999
annotations: {
100100
description: 'PostgreSQL low on cache hit rate on {{ $labels.cluster }} for database {{ $labels.datname }} with a value of {{ $value }}',
101-
summary: 'PostgreSQL low cache hit rate on {{ $labels.cluster }} for database {{ $labels.datname }}',
101+
summary: 'PostgreSQL low cache hit rate.',
102102
},
103103
expr: |||
104-
avg by (datname) (
105-
rate(pg_stat_database_blks_hit{datname!~"template.*",%(postgresExporterSelector)s}[5m])
104+
avg by (datname, %(agg)s) (
105+
rate(pg_stat_database_blks_hit{%(dbNameFilter)s, %(postgresExporterSelector)s}[5m])
106106
/
107107
(
108108
rate(
109-
pg_stat_database_blks_hit{datname!~"template.*",%(postgresExporterSelector)s}[5m]
109+
pg_stat_database_blks_hit{%(dbNameFilter)s, %(postgresExporterSelector)s}[5m]
110110
)
111111
+
112112
rate(
113-
pg_stat_database_blks_read{datname!~"template.*",%(postgresExporterSelector)s}[5m]
113+
pg_stat_database_blks_read{%(dbNameFilter)s, %(postgresExporterSelector)s}[5m]
114114
)
115115
)
116116
) < 0.98
117+
||| % $._config { agg: std.join(', ', $._config.groupLabels + $._config.instanceLabels) },
118+
'for': '5m',
119+
labels: {
120+
severity: 'warning',
121+
},
122+
},
123+
{
124+
alert: 'PostgresHasTooManyRollbacks',
125+
annotations: {
126+
description: 'PostgreSQL has too many rollbacks on {{ $labels.cluster }} for database {{ $labels.datname }} with a value of {{ $value }}',
127+
summary: 'PostgreSQL has too many rollbacks.',
128+
},
129+
expr: |||
130+
avg without(pod, instance)
131+
(rate(pg_stat_database_xact_rollback{%(dbNameFilter)s}[5m]) /
132+
(rate(pg_stat_database_xact_commit{%(dbNameFilter)s}[5m]) + rate(pg_stat_database_xact_rollback{%(dbNameFilter)s}[5m]))) > 0.10
133+
||| % $._config,
134+
'for': '5m',
135+
labels: {
136+
severity: 'warning',
137+
},
138+
},
139+
{
140+
alert: 'PostgresHasHighDeadLocks',
141+
annotations: {
142+
description: 'PostgreSQL has too high deadlocks on {{ $labels.cluster }} for database {{ $labels.datname }} with a value of {{ $value }}',
143+
summary: 'PostgreSQL has high number of deadlocks.',
144+
},
145+
expr: |||
146+
max without(pod, instance) (rate(pg_stat_database_deadlocks{%(dbNameFilter)s}[5m]) * 60) > 5
117147
||| % $._config,
118148
'for': '5m',
119149
labels: {
120150
severity: 'warning',
121151
},
122152
},
153+
{
154+
alert: 'PostgresAcquiredTooManyLocks',
155+
annotations: {
156+
description: 'PostgreSQL has acquired too many locks on {{ $labels.cluster }} for database {{ $labels.datname }} with a value of {{ $value }}',
157+
summary: 'PostgreSQL has high number of acquired locks.',
158+
},
159+
expr: |||
160+
max by(datname, %(agg)s) (
161+
(pg_locks_count{%(dbNameFilter)s})
162+
/
163+
on(%(aggWithoutServer)s) group_left(server) (
164+
pg_settings_max_locks_per_transaction{} * pg_settings_max_connections{}
165+
)
166+
) > 0.20
167+
||| % $._config { agg: std.join(',', $._config.groupLabels + $._config.instanceLabels), aggWithoutServer: std.join(',', std.filter(function(x) x != "server", $._config.groupLabels + $._config.instanceLabels)) },
168+
'for': '5m',
169+
labels: {
170+
severity: 'warning',
171+
},
172+
},
173+
{
174+
alert: 'PostgresReplicationLaggingMore1Hour',
175+
annotations: {
176+
description: '{{ $labels.instance }} replication lag exceeds 1 hour. Check for network issues or load imbalances.',
177+
summary: 'PostgreSQL replication lagging more than 1 hour.',
178+
},
179+
expr: |||
180+
(pg_replication_lag{} > 3600) and on (%(agg)s) (pg_replication_is_replica{} == 1)
181+
||| % $._config { agg: std.join(', ', $._config.groupLabels + $._config.instanceLabels) },
182+
'for': '5m',
183+
labels: {
184+
severity: 'warning',
185+
},
186+
},
187+
{
188+
alert: 'PostgresHasReplicationSlotUsed',
189+
annotations: {
190+
description: '{{ $labels.instance }} has replication slots that are not used, which might lead to replication lag or data inconsistency.',
191+
summary: 'PostgreSQL has unused replication slots.',
192+
},
193+
expr: 'pg_replication_slots_active{} == 0',
194+
'for': '30m',
195+
labels: {
196+
severity: 'critical',
197+
},
198+
},
199+
{
200+
alert: 'PostgresReplicationRoleChanged',
201+
annotations: {
202+
description: '{{ $labels.instance }} replication role has changed. Verify if this is expected or if it indicates a failover.',
203+
summary: 'PostgreSQL replication role change detected.',
204+
},
205+
expr: 'pg_replication_is_replica{} and changes(pg_replication_is_replica{}[1m]) > 0',
206+
labels: {
207+
severity: 'warning',
208+
},
209+
},
210+
{
211+
alert: 'PostgresHasExporterErrors',
212+
annotations: {
213+
description: '{{ $labels.instance }} exporter is experiencing errors. Verify exporter health and configuration.',
214+
summary: 'PostgreSQL exporter errors detected.',
215+
},
216+
expr: 'pg_exporter_last_scrape_error{} > 0',
217+
'for': '30m',
218+
labels: {
219+
severity: 'critical',
220+
},
221+
},
222+
{
223+
alert: 'PostgresTablesNotVaccumed',
224+
annotations: {
225+
description: '{{ $labels.instance }} tables have not been vacuumed within the last 10 hours, which may lead to performance degradation.',
226+
summary: 'PostgreSQL tables not vacuumed.',
227+
},
228+
expr: |||
229+
group without(pod, instance)(
230+
timestamp(
231+
pg_stat_user_tables_n_dead_tup{} >
232+
pg_stat_user_tables_n_live_tup{}
233+
* on(%(agg)s) group_left pg_settings_autovacuum_vacuum_scale_factor{}
234+
+ on(%(agg)s) group_left pg_settings_autovacuum_vacuum_threshold{}
235+
)
236+
< time() - 36000
237+
)
238+
||| % $._config { agg: std.join(', ', $._config.groupLabels + $._config.instanceLabels) },
239+
'for': '30m',
240+
labels: {
241+
severity: 'critical',
242+
},
243+
},
244+
{
245+
alert: 'PostgresTooManyCheckpointsRequested',
246+
annotations: {
247+
description: '{{ $labels.instance }} is requesting too many checkpoints, which may lead to performance degradation.',
248+
summary: 'PostgreSQL too many checkpoints requested.',
249+
},
250+
expr: |||
251+
rate(pg_stat_bgwriter_checkpoints_timed_total{}[5m]) /
252+
(rate(pg_stat_bgwriter_checkpoints_timed_total{}[5m]) + rate(pg_stat_bgwriter_checkpoints_req_total{}[5m]))
253+
< 0.5
254+
|||,
255+
'for': '5m',
256+
labels: {
257+
severity: 'warning',
258+
},
259+
},
123260
],
124261
},
125262
],

postgres_mixin/config.libsonnet

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,9 @@
11
{
22
_config+:: {
3-
postgresExporterSelector: '',
3+
dbNameFilter: 'datname!~"template.*"',
4+
postgresExporterSelector: 'job="integrations/postgres_exporter"',
5+
groupLabels: if self.enableMultiCluster then ['job', 'cluster'] else ['job'],
6+
instanceLabels: ['instance', 'server'],
7+
enableMultiCluster: false,
48
},
59
}

0 commit comments

Comments
 (0)