Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

metrics (ticdc): add changefeed status alert rules #9265

Merged
merged 1 commit into from
Jun 20, 2023
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
65 changes: 40 additions & 25 deletions metrics/alertmanager/ticdc.rules.yml
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
groups:
- name: alert.rules
rules:
# server related alert rules
- alert: cdc_multiple_owners
expr: sum(rate(ticdc_owner_ownership_counter[30s])) >= 2
for: 1m
Expand All @@ -25,6 +26,19 @@ groups:
value: '{{ $value }}'
summary: cdc cluster has no owner for more than 10 minutes

# changefeed related alert rules
- alert: ticdc_changefeed_failed
expr: (max_over_time(ticdc_owner_status[1m]) == 2) > 0
for: 1m
labels:
env: ENV_LABELS_ENV
level: critical
expr: (max_over_time(ticdc_owner_status[1m]) == 2) > 0
annotations:
description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, values: {{ $value }}'
value: '{{ $value }}'
summary: cdc changefeed failed, it can not be automatically resumed

- alert: cdc_checkpoint_high_delay
expr: ticdc_owner_checkpoint_ts_lag > 600
for: 1m
Expand All @@ -37,62 +51,63 @@ groups:
value: '{{ $value }}'
summary: cdc owner checkpoint delay more than 10 minutes

- alert: tikv_cdc_min_resolved_ts_no_change_for_1m
expr: changes(tikv_cdc_min_resolved_ts[1m]) < 1 and ON (instance) tikv_cdc_region_resolve_status{status="resolved"} > 0
- alert: ticdc_sink_execution_error
expr: changes(ticdc_sink_execution_error[1m]) > 0
for: 1m
labels:
env: ENV_LABELS_ENV
level: warning
expr: changes(tikv_cdc_min_resolved_ts[1m]) < 1 and ON (instance) tikv_cdc_region_resolve_status{status="resolved"} > 0
expr: changes(ticdc_sink_execution_error[1m]) > 0
annotations:
description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, values: {{ $value }}'
value: '{{ $labels.instance }}'
summary: tikv cdc min resolved ts no change for 1m
value: '{{ $value }}'
summary: cdc sink execution meets errors

- alert: tikv_cdc_scan_duration_seconds_more_than_10min
expr: histogram_quantile(0.9, rate(tikv_cdc_scan_duration_seconds_bucket{}[1m])) > 600
- alert: ticdc_processor_exit_with_error_count
expr: changes(ticdc_processor_exit_with_error_count[1m]) > 0
for: 1m
labels:
env: ENV_LABELS_ENV
level: warning
expr: histogram_quantile(0.9, rate(tikv_cdc_scan_duration_seconds_bucket{}[1m])) > 600
expr: changes(ticdc_processor_exit_with_error_count[1m]) > 0
annotations:
description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, values: {{ $value }}'
value: '{{ $value }}'
summary: tikv cdc scan duration seconds more than 10 min
summary: cdc processor exits with error

- alert: ticdc_sink_mysql_execution_error
expr: changes(ticdc_sink_mysql_execution_error[1m]) > 0
- alert: ticdc_changefeed_meet_error
expr: (max_over_time(ticdc_owner_status[1m]) == 1) > 0
for: 1m
labels:
env: ENV_LABELS_ENV
level: warning
expr: changes(ticdc_sink_mysql_execution_error[1m]) > 0
expr: (max_over_time(ticdc_owner_status[1m]) == 1) > 0
annotations:
description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, values: {{ $value }}'
value: '{{ $value }}'
summary: cdc sink mysql execution meets errors

- alert: ticdc_processor_exit_with_error_count
expr: changes(ticdc_processor_exit_with_error_count[1m]) > 0
summary: cdc changefeed meet error

# tikv related alert rules
- alert: tikv_cdc_min_resolved_ts_no_change_for_1m
expr: changes(tikv_cdc_min_resolved_ts[1m]) < 1 and ON (instance) tikv_cdc_region_resolve_status{status="resolved"} > 0
for: 1m
labels:
env: ENV_LABELS_ENV
level: critical
expr: changes(ticdc_processor_exit_with_error_count[1m]) > 0
level: warning
expr: changes(tikv_cdc_min_resolved_ts[1m]) < 1 and ON (instance) tikv_cdc_region_resolve_status{status="resolved"} > 0
annotations:
description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, values: {{ $value }}'
value: '{{ $value }}'
summary: cdc processor exits with error
value: '{{ $labels.instance }}'
summary: tikv cdc min resolved ts no change for 1m

- alert: ticdc_memory_abnormal
expr: go_memstats_heap_alloc_bytes{job="ticdc"} > 1e+10
- alert: tikv_cdc_scan_duration_seconds_more_than_10min
expr: histogram_quantile(0.9, rate(tikv_cdc_scan_duration_seconds_bucket{}[1m])) > 600
for: 1m
labels:
env: ENV_LABELS_ENV
level: warning
expr: go_memstats_heap_alloc_bytes{job="ticdc"} > 1e+10
expr: histogram_quantile(0.9, rate(tikv_cdc_scan_duration_seconds_bucket{}[1m])) > 600
annotations:
description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, values:{{ $value }}'
description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, values: {{ $value }}'
value: '{{ $value }}'
summary: TiCDC heap memory usage is over 10 GB
summary: tikv cdc scan duration seconds more than 10 min