Skip to content
This repository has been archived by the owner on Jun 24, 2021. It is now read-only.

Update TiKV alert rules #898

Merged
merged 1 commit into from
Aug 15, 2019
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 4 additions & 16 deletions roles/prometheus/files/tikv.rules.yml
Original file line number Diff line number Diff line change
Expand Up @@ -110,12 +110,12 @@ groups:
summary: TiKV coprocessor request wait seconds more than 10s

- alert: TiKV_raftstore_thread_cpu_seconds_total
expr: sum(rate(tikv_thread_cpu_seconds_total{name=~"raftstore_.*"}[1m])) by (instance, name) > 0.8  # removed (old threshold)
expr: sum(rate(tikv_thread_cpu_seconds_total{name=~"raftstore_.*"}[1m])) by (instance, name) > 1.6  # added (new threshold)
for: 1m
labels:
env: ENV_LABELS_ENV
level: critical
expr: sum(rate(tikv_thread_cpu_seconds_total{name=~"raftstore_.*"}[1m])) by (instance, name) > 0.8  # removed (old threshold)
expr: sum(rate(tikv_thread_cpu_seconds_total{name=~"raftstore_.*"}[1m])) by (instance, name) > 1.6  # added (new threshold)
annotations:
description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, values:{{ $value }}'
value: '{{ $value }}'
Expand Down Expand Up @@ -158,12 +158,12 @@ groups:
summary: TiKV scheduler latch wait duration seconds more than 1s

- alert: TiKV_thread_apply_worker_cpu_seconds
expr: sum(rate(tikv_thread_cpu_seconds_total{name="apply_worker"}[1m])) by (instance) > 0.9  # removed (old threshold)
expr: sum(rate(tikv_thread_cpu_seconds_total{name="apply_worker"}[1m])) by (instance) > 1.8  # added (new threshold)
for: 1m
labels:
env: ENV_LABELS_ENV
level: critical
expr: sum(rate(tikv_thread_cpu_seconds_total{name="apply_worker"}[1m])) by (instance) > 0.9  # removed (old threshold)
expr: sum(rate(tikv_thread_cpu_seconds_total{name="apply_worker"}[1m])) by (instance) > 1.8  # added (new threshold)
annotations:
description: 'cluster: ENV_LABELS_ENV, type: {{ $labels.type }}, instance: {{ $labels.instance }}, values: {{ $value }}'
value: '{{ $value }}'
Expand Down Expand Up @@ -241,18 +241,6 @@ groups:
value: '{{ $value }}'
summary: TiKV scheduler command duration seconds more than 1s

- alert: TiKV_thread_storage_scheduler_cpu_seconds  # this entire rule is deleted by the change
expr: sum(rate(tikv_thread_cpu_seconds_total{name=~"storage_schedul.*"}[1m])) by (instance) > 0.8
for: 1m
labels:
env: ENV_LABELS_ENV
level: warning
expr: sum(rate(tikv_thread_cpu_seconds_total{name=~"storage_schedul.*"}[1m])) by (instance) > 0.8
annotations:
description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, values:{{ $value }}'
value: '{{ $value }}'
summary: TiKV storage scheduler cpu seconds more than 80%

- alert: TiKV_coprocessor_outdated_request_wait_seconds
expr: delta( tikv_coprocessor_outdated_request_wait_seconds_count[10m] ) > 0
for: 1m
Expand Down