Skip to content

Commit 13710e2

Browse files
darora and w3b6x9 committed
feat: add queries for additional metrics (#59)
* feat: add queries for additional metrics
* chore: filter for realtime replication lag
* chore: add realtime replication slot active status

Co-authored-by: Wen Bo Xie <wenbo.xie3@gmail.com>
1 parent 6fdf095 commit 13710e2

File tree

1 file changed

+150
-1
lines changed

1 file changed

+150
-1
lines changed

ansible/files/queries.yml.j2

Lines changed: 150 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -7,13 +7,151 @@ pg_database:
77
usage: "GAUGE"
88
description: "Disk space used by the database"
99

10+
pg_stat_bgwriter:
11+
query: |
12+
select checkpoints_timed as checkpoints_timed_total,
13+
checkpoints_req as checkpoints_req_total,
14+
checkpoint_write_time as checkpoint_write_time_total,
15+
checkpoint_sync_time as checkpoint_sync_time_total,
16+
buffers_checkpoint as buffers_checkpoint_total,
17+
buffers_clean as buffers_clean_total,
18+
maxwritten_clean as maxwritten_clean_total,
19+
buffers_backend as buffers_backend_total,
20+
buffers_backend_fsync as buffers_backend_fsync_total,
21+
buffers_alloc as buffers_alloc_total,
22+
stats_reset
23+
from pg_stat_bgwriter
24+
cache_seconds: 30
25+
master: true
26+
metrics:
27+
- checkpoints_timed_total:
28+
usage: "COUNTER"
29+
description: "Scheduled checkpoints performed"
30+
- checkpoints_req_total:
31+
usage: "COUNTER"
32+
description: "Requested checkpoints performed"
33+
- checkpoint_write_time_total:
34+
usage: "COUNTER"
35+
description: "Time spent writing checkpoint files to disk"
36+
- checkpoint_sync_time_total:
37+
usage: "COUNTER"
38+
description: "Time spent synchronizing checkpoint files to disk"
39+
- buffers_checkpoint_total:
40+
usage: "COUNTER"
41+
description: "Buffers written during checkpoints"
42+
- buffers_clean_total:
43+
usage: "COUNTER"
44+
description: "Buffers written by bg writer"
45+
- maxwritten_clean_total:
46+
usage: "COUNTER"
47+
description: "Number of times bg writer stopped a cleaning scan because it had written too many buffers"
48+
- buffers_backend_total:
49+
usage: "COUNTER"
50+
description: "Buffers written directly by a backend"
51+
- buffers_backend_fsync_total:
52+
usage: "COUNTER"
53+
description: "fsync calls executed by a backend directly"
54+
- buffers_alloc_total:
55+
usage: "COUNTER"
56+
description: "Buffers allocated"
57+
- stats_reset:
58+
usage: "COUNTER"
59+
description: "Most recent stat reset time"
60+
61+
1062
pg_stat_database:
11-
query: "SELECT sum(numbackends) as num_backends FROM pg_stat_database"
63+
cache_seconds: 30
64+
query: |
65+
SELECT sum(numbackends) as num_backends,
66+
sum(xact_commit) as xact_commit_total,
67+
sum(xact_rollback) as xact_rollback_total,
68+
sum(blks_read) as blks_read_total,
69+
sum(blks_hit) as blks_hit_total,
70+
sum(tup_returned) as tup_returned_total,
71+
sum(tup_fetched) as tup_fetched_total,
72+
sum(tup_inserted) as tup_inserted_total,
73+
sum(tup_updated) as tup_updated_total,
74+
sum(tup_deleted) as tup_deleted_total,
75+
sum(conflicts) as conflicts_total,
76+
sum(temp_files) as temp_files_total,
77+
sum(temp_bytes) as temp_bytes_total,
78+
sum(deadlocks) as deadlocks_total,
79+
max(stats_reset) as most_recent_reset
80+
FROM pg_stat_database
1281
master: true
1382
metrics:
1483
- num_backends:
1584
usage: "GAUGE"
1685
description: "The number of active backends"
86+
- xact_commit_total:
87+
usage: "COUNTER"
88+
description: "Transactions committed"
89+
- xact_rollback_total:
90+
usage: "COUNTER"
91+
description: "Transactions rolled back"
92+
- blks_read_total:
93+
usage: "COUNTER"
94+
description: "Number of disk blocks read"
95+
- blks_hit_total:
96+
usage: "COUNTER"
97+
description: "Disk blocks found in buffer cache"
98+
- tup_returned_total:
99+
usage: "COUNTER"
100+
description: "Rows returned by queries"
101+
- tup_fetched_total:
102+
usage: "COUNTER"
103+
description: "Rows fetched by queries"
104+
- tup_inserted_total:
105+
usage: "COUNTER"
106+
description: "Rows inserted"
107+
- tup_updated_total:
108+
usage: "COUNTER"
109+
description: "Rows updated"
110+
- tup_deleted_total:
111+
usage: "COUNTER"
112+
description: "Rows deleted"
113+
- conflicts_total:
114+
usage: "COUNTER"
115+
description: "Queries canceled due to conflicts with recovery"
116+
- temp_files_total:
117+
usage: "COUNTER"
118+
description: "Temp files created by queries"
119+
- temp_bytes_total:
120+
usage: "COUNTER"
121+
description: "Temp data written by queries"
122+
- deadlocks_total:
123+
usage: "COUNTER"
124+
description: "Deadlocks detected"
125+
- most_recent_reset:
126+
usage: "COUNTER"
127+
description: "The most recent time one of the databases had its statistics reset"
128+
129+
pg_stat_database_conflicts:
130+
query: |
131+
SELECT sum(confl_tablespace) as confl_tablespace_total,
132+
sum(confl_lock) as confl_lock_total,
133+
sum(confl_snapshot) as confl_snapshot_total,
134+
sum(confl_bufferpin) as confl_bufferpin_total,
135+
sum(confl_deadlock) as confl_deadlock_total
136+
from pg_stat_database_conflicts
137+
cache_seconds: 30
138+
master: true
139+
metrics:
140+
- confl_tablespace_total:
141+
usage: "COUNTER"
142+
description: "Queries cancelled due to dropped tablespaces"
143+
- confl_lock_total:
144+
usage: "COUNTER"
145+
description: "Queries cancelled due to lock timeouts"
146+
- confl_snapshot_total:
147+
usage: "COUNTER"
148+
description: "Queries cancelled due to old snapshots"
149+
- confl_bufferpin_total:
150+
usage: "COUNTER"
151+
description: "Queries cancelled due to pinned buffers"
152+
- confl_deadlock_total:
153+
usage: "COUNTER"
154+
description: "Queries cancelled due to deadlocks"
17155

18156
pg_stat_statements:
19157
query: "SELECT sum(calls) as total_queries, sum(total_time / 1000) as total_time_seconds FROM extensions.pg_stat_statements t1 JOIN pg_database t3 ON (t1.dbid=t3.oid)"
@@ -35,6 +173,17 @@ auth_users:
35173
usage: "GAUGE"
36174
description: "Number of users in the project db"
37175

176+
replication:
177+
query: "SELECT pg_wal_lsn_diff(pg_current_wal_lsn(), restart_lsn) AS realtime_lag_bytes, active AS realtime_slot_status FROM pg_replication_slots where slot_name = 'realtime'"
178+
master: true
179+
metrics:
180+
- realtime_lag_bytes:
181+
usage: "GAUGE"
182+
description: "Replication Lag for Realtime"
183+
- realtime_slot_status:
184+
usage: "GAUGE"
185+
description: "Replication Slot active status"
186+
38187
storage:
39188
query: "select sum(size) / (1024 * 1024) as storage_size_mb from storage.get_size_by_bucket()"
40189
master: true

0 commit comments

Comments
 (0)