-
Notifications
You must be signed in to change notification settings - Fork 287
/
ticdc.rules.yml
137 lines (127 loc) · 5 KB
/
ticdc.rules.yml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
# Prometheus rule-file root: a list of rule groups.
groups:
# The only rule group visible in this file; its alerts are listed under `rules`.
- name: alert.rules
# Alerting rules of the group. Each entry uses the keys alert / expr / for /
# labels / annotations.
# NOTE(review): ENV_LABELS_ENV in the entries below looks like a placeholder
# replaced at deploy time — confirm with the deployment tooling.
rules:
# server-related alert rules
- alert: cdc_multiple_owners
  # Sums, across all series, the per-second rate of
  # ticdc_owner_ownership_counter over a 240s window and fires once that sum
  # has been >= 2 for one minute.
  # NOTE(review): presumably each owner advances the counter about once per
  # second, so the sum approximates the number of owners — confirm.
  expr: 'sum(rate(ticdc_owner_ownership_counter[240s])) >= 2'
  for: 1m
  labels:
    level: warning
    env: ENV_LABELS_ENV
    # Copy of the rule expression, attached to the alert as a label.
    expr: 'sum(rate(ticdc_owner_ownership_counter[240s])) >= 2'
  annotations:
    summary: 'cdc cluster has multiple owners'
    # NOTE(review): sum() without a by() clause drops the instance label, so
    # {{ $labels.instance }} likely renders empty here — confirm.
    description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, values: {{ $value }}'
    value: '{{ $value }}'
- alert: cdc_no_owner
  # Same aggregate as cdc_multiple_owners, but fires when the summed rate of
  # ticdc_owner_ownership_counter has stayed below 0.5 for ten minutes.
  expr: 'sum(rate(ticdc_owner_ownership_counter[240s])) < 0.5'
  for: 10m
  labels:
    level: warning
    env: ENV_LABELS_ENV
    # Copy of the rule expression, attached to the alert as a label.
    expr: 'sum(rate(ticdc_owner_ownership_counter[240s])) < 0.5'
  annotations:
    summary: 'cdc cluster has no owner for more than 10 minutes'
    # NOTE(review): sum() without a by() clause drops the instance label, so
    # {{ $labels.instance }} likely renders empty here — confirm.
    description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, values: {{ $value }}'
    value: '{{ $value }}'
# changefeed-related alert rules
- alert: ticdc_changefeed_failed
  # Keeps every series whose highest ticdc_owner_status sample in the last
  # minute equals 2, and fires after that has held for one minute.
  # The trailing `> 0` always passes, because the filtered value is 2.
  # NOTE(review): status 2 presumably means "failed", per the summary — confirm.
  expr: '(max_over_time(ticdc_owner_status[1m]) == 2) > 0'
  for: 1m
  labels:
    level: critical
    env: ENV_LABELS_ENV
    # Copy of the rule expression, attached to the alert as a label.
    expr: '(max_over_time(ticdc_owner_status[1m]) == 2) > 0'
  annotations:
    summary: 'cdc changefeed failed, it can not be automatically resumed'
    description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, values: {{ $value }}'
    value: '{{ $value }}'
- alert: cdc_checkpoint_high_delay
  # Fires for each series whose ticdc_owner_checkpoint_ts_lag has been above
  # 600 for one minute.
  # NOTE(review): the summary's "10 minutes" implies the metric is in
  # seconds — confirm.
  expr: 'ticdc_owner_checkpoint_ts_lag > 600'
  for: 1m
  labels:
    level: critical
    env: ENV_LABELS_ENV
    # Copy of the rule expression, attached to the alert as a label.
    expr: 'ticdc_owner_checkpoint_ts_lag > 600'
  annotations:
    summary: 'cdc owner checkpoint delay more than 10 minutes'
    description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, values: {{ $value }}'
    value: '{{ $value }}'
- alert: cdc_resolvedts_high_delay
  # Fires for each series whose ticdc_owner_resolved_ts_lag has been above
  # 300 for one minute.
  # NOTE(review): the summary's "5 minutes" implies the metric is in
  # seconds — confirm.
  expr: 'ticdc_owner_resolved_ts_lag > 300'
  for: 1m
  labels:
    level: critical
    env: ENV_LABELS_ENV
    # Copy of the rule expression, attached to the alert as a label.
    expr: 'ticdc_owner_resolved_ts_lag > 300'
  annotations:
    summary: 'cdc owner resolved ts delay more than 5 minutes'
    description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, values: {{ $value }}'
    value: '{{ $value }}'
- alert: ticdc_sink_execution_error
  # changes() counts how often the sampled value of
  # ticdc_sink_execution_error changed inside the 1m window; the rule fires
  # when at least one change has been observed continuously for one minute.
  expr: 'changes(ticdc_sink_execution_error[1m]) > 0'
  for: 1m
  labels:
    level: warning
    env: ENV_LABELS_ENV
    # Copy of the rule expression, attached to the alert as a label.
    expr: 'changes(ticdc_sink_execution_error[1m]) > 0'
  annotations:
    summary: 'cdc sink execution meets errors'
    description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, values: {{ $value }}'
    value: '{{ $value }}'
- alert: ticdc_processor_exit_with_error_count
  # Fires when ticdc_processor_exit_with_error_count changed value at least
  # once within the 1m window, sustained for one minute.
  expr: 'changes(ticdc_processor_exit_with_error_count[1m]) > 0'
  for: 1m
  labels:
    level: warning
    env: ENV_LABELS_ENV
    # Copy of the rule expression, attached to the alert as a label.
    expr: 'changes(ticdc_processor_exit_with_error_count[1m]) > 0'
  annotations:
    summary: 'cdc processor exits with error'
    description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, values: {{ $value }}'
    value: '{{ $value }}'
- alert: ticdc_changefeed_meet_error
  # Keeps every series whose highest ticdc_owner_status sample in the last
  # minute equals 1, and fires after that has held for one minute.
  # The trailing `> 0` always passes, because the filtered value is 1.
  # NOTE(review): status 1 presumably means "error", per the summary — confirm.
  expr: '(max_over_time(ticdc_owner_status[1m]) == 1) > 0'
  for: 1m
  labels:
    level: warning
    env: ENV_LABELS_ENV
    # Copy of the rule expression, attached to the alert as a label.
    expr: '(max_over_time(ticdc_owner_status[1m]) == 1) > 0'
  annotations:
    summary: 'cdc changefeed meet error'
    description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, values: {{ $value }}'
    value: '{{ $value }}'
# tikv-related alert rules
- alert: tikv_cdc_min_resolved_ts_no_change_for_1m
  # Fires per instance (clauses are joined with `and ON (instance)`) when all
  # of the following have held for one minute:
  #   1. tikv_cdc_min_resolved_ts did not change during the last minute,
  #   2. tikv_cdc_region_resolve_status{status="resolved"} is above 0,
  #   3. tikv_cdc_captured_region_total is above 0.
  expr: changes(tikv_cdc_min_resolved_ts[1m]) < 1 and ON (instance) tikv_cdc_region_resolve_status{status="resolved"} > 0 and ON (instance) tikv_cdc_captured_region_total > 0
  for: 1m
  labels:
    env: ENV_LABELS_ENV
    level: warning
    # Must stay identical to the rule expression above, including the
    # tikv_cdc_captured_region_total clause, so the label reports the real
    # firing condition.
    expr: changes(tikv_cdc_min_resolved_ts[1m]) < 1 and ON (instance) tikv_cdc_region_resolve_status{status="resolved"} > 0 and ON (instance) tikv_cdc_captured_region_total > 0
  annotations:
    description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, values: {{ $value }}'
    # Unlike the other rules, this carries the instance instead of the value.
    # NOTE(review): presumably deliberate, since `changes(...) < 1` can only
    # yield 0 when the alert fires — confirm.
    value: '{{ $labels.instance }}'
    summary: tikv cdc min resolved ts no change for 1m
- alert: tikv_cdc_scan_duration_seconds_more_than_10min
  # Estimates the 0.9 quantile of tikv_cdc_scan_duration_seconds from the
  # per-second bucket rates over a 1m window and fires when it has been above
  # 600 for one minute (600 s matches the "10 min" in the summary).
  expr: 'histogram_quantile(0.9, rate(tikv_cdc_scan_duration_seconds_bucket{}[1m])) > 600'
  for: 1m
  labels:
    level: warning
    env: ENV_LABELS_ENV
    # Copy of the rule expression, attached to the alert as a label.
    expr: 'histogram_quantile(0.9, rate(tikv_cdc_scan_duration_seconds_bucket{}[1m])) > 600'
  annotations:
    summary: 'tikv cdc scan duration seconds more than 10 min'
    description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, values: {{ $value }}'
    value: '{{ $value }}'
- alert: ticdc_memory_abnormal
  # Fires for each series of go_memstats_heap_alloc_bytes with job="ticdc"
  # whose value has been above 1e+10 bytes (the "10 GB" in the summary) for
  # one minute.
  expr: go_memstats_heap_alloc_bytes{job="ticdc"} > 1e+10
  for: 1m
  labels:
    env: ENV_LABELS_ENV
    level: warning
    # Copy of the rule expression, attached to the alert as a label.
    expr: go_memstats_heap_alloc_bytes{job="ticdc"} > 1e+10
  annotations:
    # Uses the same "values: {{ $value }}" format (with a space after the
    # colon) as every other rule in this file.
    description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, values: {{ $value }}'
    value: '{{ $value }}'
    summary: TiCDC heap memory usage is over 10 GB