From 542c8f4525a5441df734d8ef6c80a5b83b72ebed Mon Sep 17 00:00:00 2001 From: dongmen <414110582@qq.com> Date: Mon, 19 Jun 2023 18:47:28 +0800 Subject: [PATCH] add changefeed status alert rules --- metrics/alertmanager/ticdc.rules.yml | 65 +++++++++++++++++----------- 1 file changed, 40 insertions(+), 25 deletions(-) diff --git a/metrics/alertmanager/ticdc.rules.yml b/metrics/alertmanager/ticdc.rules.yml index 1c5788882c1..db73b655938 100644 --- a/metrics/alertmanager/ticdc.rules.yml +++ b/metrics/alertmanager/ticdc.rules.yml @@ -1,6 +1,7 @@ groups: - name: alert.rules rules: + # server related alter rules - alert: cdc_multiple_owners expr: sum(rate(ticdc_owner_ownership_counter[30s])) >= 2 for: 1m @@ -25,6 +26,19 @@ groups: value: '{{ $value }}' summary: cdc cluster has no owner for more than 10 minutes + # changefeed related alter rules + - alert: ticdc_changefeed_failed + expr: (max_over_time(ticdc_owner_status[1m]) == 2) > 0 + for: 1m + labels: + env: ENV_LABELS_ENV + level: critical + expr: (max_over_time(ticdc_owner_status[1m]) == 2) > 0 + annotations: + description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, values: {{ $value }}' + value: '{{ $value }}' + summary: cdc changefeed failed, it can not be automatically resumed + - alert: cdc_checkpoint_high_delay expr: ticdc_owner_checkpoint_ts_lag > 600 for: 1m @@ -37,62 +51,63 @@ groups: value: '{{ $value }}' summary: cdc owner checkpoint delay more than 10 minutes - - alert: tikv_cdc_min_resolved_ts_no_change_for_1m - expr: changes(tikv_cdc_min_resolved_ts[1m]) < 1 and ON (instance) tikv_cdc_region_resolve_status{status="resolved"} > 0 + - alert: ticdc_sink_execution_error + expr: changes(ticdc_sink_execution_error[1m]) > 0 for: 1m labels: env: ENV_LABELS_ENV level: warning - expr: changes(tikv_cdc_min_resolved_ts[1m]) < 1 and ON (instance) tikv_cdc_region_resolve_status{status="resolved"} > 0 + expr: changes(ticdc_sink_execution_error[1m]) > 0 annotations: description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, values: {{ $value }}' - value: '{{ $labels.instance }}' - summary: tikv cdc min resolved ts no change for 1m + value: '{{ $value }}' + summary: cdc sink execution meets errors - - alert: tikv_cdc_scan_duration_seconds_more_than_10min - expr: histogram_quantile(0.9, rate(tikv_cdc_scan_duration_seconds_bucket{}[1m])) > 600 + - alert: ticdc_processor_exit_with_error_count + expr: changes(ticdc_processor_exit_with_error_count[1m]) > 0 for: 1m labels: env: ENV_LABELS_ENV level: warning - expr: histogram_quantile(0.9, rate(tikv_cdc_scan_duration_seconds_bucket{}[1m])) > 600 + expr: changes(ticdc_processor_exit_with_error_count[1m]) > 0 annotations: description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, values: {{ $value }}' value: '{{ $value }}' - summary: tikv cdc scan duration seconds more than 10 min + summary: cdc processor exits with error - - alert: ticdc_sink_mysql_execution_error - expr: changes(ticdc_sink_mysql_execution_error[1m]) > 0 + - alert: ticdc_changefeed_meet_error + expr: (max_over_time(ticdc_owner_status[1m]) == 1) > 0 for: 1m labels: env: ENV_LABELS_ENV level: warning - expr: changes(ticdc_sink_mysql_execution_error[1m]) > 0 + expr: (max_over_time(ticdc_owner_status[1m]) == 1) > 0 annotations: description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, values: {{ $value }}' value: '{{ $value }}' - summary: cdc sink mysql execution meets errors - - - alert: ticdc_processor_exit_with_error_count - expr: changes(ticdc_processor_exit_with_error_count[1m]) > 0 + summary: cdc changefeed meet error + + # tikv related alter rules + - alert: tikv_cdc_min_resolved_ts_no_change_for_1m + expr: changes(tikv_cdc_min_resolved_ts[1m]) < 1 and ON (instance) tikv_cdc_region_resolve_status{status="resolved"} > 0 for: 1m labels: env: ENV_LABELS_ENV - level: critical - expr: changes(ticdc_processor_exit_with_error_count[1m]) > 0 + level: warning + expr: changes(tikv_cdc_min_resolved_ts[1m]) < 1 and ON (instance) tikv_cdc_region_resolve_status{status="resolved"} > 0 annotations: description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, values: {{ $value }}' - value: '{{ $value }}' - summary: cdc processor exits with error + value: '{{ $labels.instance }}' + summary: tikv cdc min resolved ts no change for 1m - - alert: ticdc_memory_abnormal - expr: go_memstats_heap_alloc_bytes{job="ticdc"} > 1e+10 + - alert: tikv_cdc_scan_duration_seconds_more_than_10min + expr: histogram_quantile(0.9, rate(tikv_cdc_scan_duration_seconds_bucket{}[1m])) > 600 for: 1m labels: env: ENV_LABELS_ENV level: warning - expr: go_memstats_heap_alloc_bytes{job="ticdc"} > 1e+10 + expr: histogram_quantile(0.9, rate(tikv_cdc_scan_duration_seconds_bucket{}[1m])) > 600 annotations: - description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, values:{{ $value }}' + description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, values: {{ $value }}' value: '{{ $value }}' - summary: TiCDC heap memory usage is over 10 GB + summary: tikv cdc scan duration seconds more than 10 min \ No newline at end of file