# metrics/alertmanager/pd.rules.yml

# Prometheus alerting-rule file. It declares one rule group, `alert.rules`,
# whose `rules` list holds the alert definitions that follow.
# NOTE(review): ENV_LABELS_ENV, used in the rules' labels and descriptions,
# looks like a placeholder substituted before deployment — confirm.
groups:
- name: alert.rules
  rules:
  # Emergency: on the instance whose etcd_server_is_leader sum is > 0, the
  # store_down_count value of pd_cluster_status has been above zero for 1m.
  - alert: PD_cluster_down_tikv_nums
    expr: >-
      (sum(pd_cluster_status{type="store_down_count"}) by (instance) > 0)
      and (sum(etcd_server_is_leader) by (instance) > 0)
    for: 1m
    labels:
      level: emergency
      env: ENV_LABELS_ENV
      # Mirrors the rule expression as an alert label.
      expr: '(sum(pd_cluster_status{type="store_down_count"}) by (instance) > 0) and (sum(etcd_server_is_leader) by (instance) > 0)'
    annotations:
      summary: PD_cluster_down_tikv_nums
      description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, values:{{ $value }}'
      value: '{{ $value }}'

  # Critical: the 0.99 quantile of etcd_disk_wal_fsync_duration_seconds,
  # computed per (instance, job) from a 1m rate window, has stayed above 1
  # for 1m.
  - alert: PD_etcd_write_disk_latency
    expr: >-
      histogram_quantile(0.99,
      sum(rate(etcd_disk_wal_fsync_duration_seconds_bucket[1m])) by (instance,job,le) ) > 1
    for: 1m
    labels:
      level: critical
      env: ENV_LABELS_ENV
      # Mirrors the rule expression as an alert label.
      expr: 'histogram_quantile(0.99, sum(rate(etcd_disk_wal_fsync_duration_seconds_bucket[1m])) by (instance,job,le) ) > 1'
    annotations:
      summary: PD_etcd_write_disk_latency
      description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, values:{{ $value }}'
      value: '{{ $value }}'

  # Critical: on the instance whose etcd_server_is_leader sum is > 0, the
  # miss-peer region count of pd_regions_status has stayed above 100 for 1m.
  #
  # The type matcher accepts both the underscore and the hyphen spelling. The
  # other pd_regions_status rules in this file (down-peer-region-count,
  # pending-peer-region-count) select hyphenated type values, so an
  # underscore-only matcher would presumably never match on the same PD build.
  # NOTE(review): confirm which spelling PD exports, then tighten the matcher.
  - alert: PD_miss_peer_region_count
    expr: >-
      (sum(pd_regions_status{type=~"miss[-_]peer[-_]region[-_]count"}) by (instance)  > 100)
      and (sum(etcd_server_is_leader) by (instance) > 0)
    for: 1m
    labels:
      env: ENV_LABELS_ENV
      level: critical
      # Mirrors the rule expression as an alert label; keep in sync with `expr`.
      expr: '(sum(pd_regions_status{type=~"miss[-_]peer[-_]region[-_]count"}) by (instance)  > 100) and (sum(etcd_server_is_leader) by (instance) > 0)'
    annotations:
      description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, values:{{ $value }}'
      value: '{{ $value }}'
      summary: PD_miss_peer_region_count

  # Warning: on the instance whose etcd_server_is_leader sum is > 0, the
  # store_disconnected_count value of pd_cluster_status has been above zero
  # for 1m.
  - alert: PD_cluster_lost_connect_tikv_nums
    expr: >-
      (sum(pd_cluster_status{type="store_disconnected_count"}) by (instance) > 0)
      and (sum(etcd_server_is_leader) by (instance) > 0)
    for: 1m
    labels:
      level: warning
      env: ENV_LABELS_ENV
      # Mirrors the rule expression as an alert label.
      expr: '(sum(pd_cluster_status{type="store_disconnected_count"}) by (instance) > 0) and (sum(etcd_server_is_leader) by (instance) > 0)'
    annotations:
      summary: PD_cluster_lost_connect_tikv_nums
      description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, values:{{ $value }}'
      value: '{{ $value }}'

  # Warning: on the instance whose etcd_server_is_leader sum is > 0, the
  # store_unhealth_count value of pd_cluster_status has been above zero for 1m.
  # (`store_unhealth_count` is the metric's type value and is left as written;
  # only the summary is aligned with the alert name.)
  - alert: PD_cluster_unhealthy_tikv_nums
    expr: >-
      (sum ( pd_cluster_status{type="store_unhealth_count"} ) by (instance) > 0)
      and (sum(etcd_server_is_leader) by (instance) > 0)
    for: 1m
    labels:
      env: ENV_LABELS_ENV
      level: warning
      # Mirrors the rule expression as an alert label.
      expr: '(sum ( pd_cluster_status{type="store_unhealth_count"} ) by (instance) > 0) and (sum(etcd_server_is_leader) by (instance) > 0)'
    annotations:
      description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, values:{{ $value }}'
      value: '{{ $value }}'
      summary: PD_cluster_unhealthy_tikv_nums

  # Warning: on the instance whose etcd_server_is_leader sum is > 0, the
  # store_low_space_count value of pd_cluster_status has been above zero
  # for 1m.
  - alert: PD_cluster_low_space
    expr: >-
      (sum(pd_cluster_status{type="store_low_space_count"}) by (instance) > 0)
      and (sum(etcd_server_is_leader) by (instance) > 0)
    for: 1m
    labels:
      level: warning
      env: ENV_LABELS_ENV
      # Mirrors the rule expression as an alert label.
      expr: '(sum(pd_cluster_status{type="store_low_space_count"}) by (instance) > 0) and (sum(etcd_server_is_leader) by (instance) > 0)'
    annotations:
      summary: PD_cluster_low_space
      description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, values:{{ $value }}'
      value: '{{ $value }}'

  # Warning: the 0.99 quantile of etcd_network_peer_round_trip_time_seconds,
  # computed per (To, instance, job) from a 1m rate window, has stayed above 1
  # for 1m.
  - alert: PD_etcd_network_peer_latency
    expr: >-
      histogram_quantile(0.99,
      sum(rate(etcd_network_peer_round_trip_time_seconds_bucket[1m])) by (To,instance,job,le) ) > 1
    for: 1m
    labels:
      level: warning
      env: ENV_LABELS_ENV
      # Mirrors the rule expression as an alert label.
      expr: 'histogram_quantile(0.99, sum(rate(etcd_network_peer_round_trip_time_seconds_bucket[1m])) by (To,instance,job,le) ) > 1'
    annotations:
      summary: PD_etcd_network_peer_latency
      description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, values:{{ $value }}'
      value: '{{ $value }}'

  # Warning: the 0.99 quantile of
  # pd_client_request_handle_requests_duration_seconds for type="tso",
  # computed per (instance, job) from a 1m rate window, has stayed above 0.1
  # for 1m.
  - alert: PD_tidb_handle_requests_duration
    expr: >-
      histogram_quantile(0.99,
      sum(rate(pd_client_request_handle_requests_duration_seconds_bucket{type="tso"}[1m])) by (instance,job,le) ) > 0.1
    for: 1m
    labels:
      level: warning
      env: ENV_LABELS_ENV
      # Mirrors the rule expression as an alert label.
      expr: 'histogram_quantile(0.99, sum(rate(pd_client_request_handle_requests_duration_seconds_bucket{type="tso"}[1m])) by (instance,job,le) ) > 0.1'
    annotations:
      summary: PD_tidb_handle_requests_duration
      description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, values:{{ $value }}'
      value: '{{ $value }}'

  # Warning: on the instance whose etcd_server_is_leader sum is > 0, the
  # down-peer-region-count value of pd_regions_status has been above zero
  # for 1m.
  - alert: PD_down_peer_region_nums
    expr: >-
      (sum(pd_regions_status{type="down-peer-region-count"}) by (instance)  > 0)
      and (sum(etcd_server_is_leader) by (instance) > 0)
    for: 1m
    labels:
      level: warning
      env: ENV_LABELS_ENV
      # Mirrors the rule expression as an alert label.
      expr: '(sum(pd_regions_status{type="down-peer-region-count"}) by (instance)  > 0) and (sum(etcd_server_is_leader) by (instance) > 0)'
    annotations:
      summary: PD_down_peer_region_nums
      description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, values:{{ $value }}'
      value: '{{ $value }}'

  # Warning: on the instance whose etcd_server_is_leader sum is > 0, the
  # pending-peer-region-count value of pd_regions_status has stayed above 100
  # for 1m.
  - alert: PD_pending_peer_region_count
    expr: >-
      (sum(pd_regions_status{type="pending-peer-region-count"}) by (instance)  > 100)
      and (sum(etcd_server_is_leader) by (instance) > 0)
    for: 1m
    labels:
      level: warning
      env: ENV_LABELS_ENV
      # Mirrors the rule expression as an alert label.
      expr: '(sum(pd_regions_status{type="pending-peer-region-count"}) by (instance)  > 100) and (sum(etcd_server_is_leader) by (instance) > 0)'
    annotations:
      summary: PD_pending_peer_region_count
      description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, values:{{ $value }}'
      value: '{{ $value }}'

  # Warning: at least two pd_tso_events{type="save"} series changed value
  # within the last 10m, and that has held for 1m.
  #
  # count() has no `by` clause, so the result carries no labels at all. The
  # alert therefore has no `instance` label, and the description reports only
  # the cluster and the count instead of an always-empty instance field.
  - alert: PD_leader_change
    expr: 'count( changes(pd_tso_events{type="save"}[10m]) > 0 )   >= 2'
    for: 1m
    labels:
      env: ENV_LABELS_ENV
      level: warning
      # Mirrors the rule expression as an alert label.
      expr: 'count( changes(pd_tso_events{type="save"}[10m]) > 0 )   >= 2'
    annotations:
      description: 'cluster: ENV_LABELS_ENV, values:{{ $value }}'
      value: '{{ $value }}'
      summary: PD_leader_change

  # Warning: summed storage_size divided by summed storage_capacity (both from
  # pd_cluster_status), times 100, has stayed above 80 for 1m.
  #
  # Both sides are sum() without a `by` clause, so the result carries no
  # labels. `type` and `instance` can never be rendered, and the description
  # reports only the cluster and the computed percentage.
  - alert: 'TiKV_space_used_more_than_80%'
    expr: >-
      sum(pd_cluster_status{type="storage_size"}) /
      sum(pd_cluster_status{type="storage_capacity"}) * 100  > 80
    for: 1m
    labels:
      env: ENV_LABELS_ENV
      level: warning
      # Mirrors the rule expression as an alert label.
      expr: 'sum(pd_cluster_status{type="storage_size"}) / sum(pd_cluster_status{type="storage_capacity"}) * 100  > 80'
    annotations:
      description: 'cluster: ENV_LABELS_ENV, values: {{ $value }}'
      value: '{{ $value }}'
      summary: 'TiKV_space_used_more_than_80%'

  # Warning: a pd_tso_events{type="system_time_slow"} series changed value at
  # least once within the last 10m, and that has held for 1m.
  - alert: PD_system_time_slow
    expr: 'changes(pd_tso_events{type="system_time_slow"}[10m]) >= 1'
    for: 1m
    labels:
      level: warning
      env: ENV_LABELS_ENV
      # Mirrors the rule expression as an alert label.
      expr: 'changes(pd_tso_events{type="system_time_slow"}[10m]) >= 1'
    annotations:
      summary: PD_system_time_slow
      description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, values: {{ $value }}'
      value: '{{ $value }}'

  # Warning: pd_checker_event_count for type="replica_checker",
  # name="no_target_store" increased within the last 1m, and that has held
  # for 1m.
  - alert: PD_no_store_for_making_replica
    expr: >-
      increase(pd_checker_event_count{type="replica_checker",
      name="no_target_store"}[1m]) > 0
    for: 1m
    labels:
      level: warning
      env: ENV_LABELS_ENV
      # Mirrors the rule expression as an alert label.
      expr: 'increase(pd_checker_event_count{type="replica_checker", name="no_target_store"}[1m]) > 0'
    annotations:
      summary: PD_no_store_for_making_replica
      description: 'cluster: ENV_LABELS_ENV, type: {{ $labels.type }}, instance: {{ $labels.instance }}, values: {{ $value }}'
      value: '{{ $value }}'

  # Warning: process_start_time_seconds for job="pd" changed value within the
  # last 5m, and that has held for 1m.
  - alert: PD_node_restart
    expr: 'changes(process_start_time_seconds{job="pd"}[5m]) > 0'
    for: 1m
    labels:
      level: warning
      env: ENV_LABELS_ENV
      # Mirrors the rule expression as an alert label.
      expr: 'changes(process_start_time_seconds{job="pd"}[5m]) > 0'
    annotations:
      summary: 'PD server has been restarted'
      description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, values:{{ $value }}'
      value: '{{ $value }}'

  # Critical: on the instance whose etcd_server_is_leader sum is > 0, the
  # store_slow_count value of pd_cluster_status has been above zero for 1m.
  - alert: PD_cluster_slow_tikv_nums
    expr: >-
      (sum(pd_cluster_status{type="store_slow_count"}) by (instance) > 0)
      and (sum(etcd_server_is_leader) by (instance) > 0)
    for: 1m
    labels:
      level: critical
      env: ENV_LABELS_ENV
      # Mirrors the rule expression as an alert label.
      expr: '(sum(pd_cluster_status{type="store_slow_count"}) by (instance) > 0) and (sum(etcd_server_is_leader) by (instance) > 0)'
    annotations:
      summary: PD_cluster_slow_tikv_nums
      description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, values:{{ $value }}'
      value: '{{ $value }}'

  # Warning: the instantaneous rate of process_cpu_seconds_total for job="pd"
  # (30s window), divided by pd_service_maxprocs, has stayed above 0.8 for 45s.
  - alert: PD_cpu_quota
    expr: >-
      irate(process_cpu_seconds_total{job="pd"}[30s])
      / pd_service_maxprocs > 0.8
    for: 45s
    labels:
      level: warning
      env: ENV_LABELS_ENV
      # Mirrors the rule expression as an alert label.
      expr: 'irate(process_cpu_seconds_total{job="pd"}[30s]) / pd_service_maxprocs > 0.8'
    annotations:
      summary: 'PD CPU usage is over 80% of CPU quota'
      description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, values:{{ $value }}'
      value: '{{ $value }}'

  # Warning: process_resident_memory_bytes for job="pd", divided by
  # pd_service_memory_quota_bytes, has stayed above 0.8 for 15s.
  - alert: PD_memory_quota
    expr: >-
      process_resident_memory_bytes{job="pd"}
      / pd_service_memory_quota_bytes > 0.8
    for: 15s
    labels:
      level: warning
      env: ENV_LABELS_ENV
      # Mirrors the rule expression as an alert label.
      expr: 'process_resident_memory_bytes{job="pd"} / pd_service_memory_quota_bytes > 0.8'
    annotations:
      summary: 'PD memory usage is over 80% of memory quota'
      description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, values:{{ $value }}'
      value: '{{ $value }}'