Skip to content

Commit

Permalink
Merge branch 'master' into labeler1
Browse files Browse the repository at this point in the history
  • Loading branch information
ti-chi-bot authored Aug 17, 2021
2 parents 4571e0f + 7a2ab50 commit f032c6c
Show file tree
Hide file tree
Showing 10 changed files with 346 additions and 149 deletions.
12 changes: 12 additions & 0 deletions metrics/alertmanager/pd.rules.yml
Original file line number Diff line number Diff line change
Expand Up @@ -168,3 +168,15 @@ groups:
description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, values:{{ $value }}'
value: '{{ $value }}'
summary: PD server has been restarted

- alert: PD_cluster_slow_tikv_nums
expr: (sum(pd_cluster_status{type="store_slow_count"}) by (instance) > 0) and (sum(etcd_server_is_leader) by (instance) > 0)
for: 1m
labels:
env: ENV_LABELS_ENV
level: critical
expr: (sum(pd_cluster_status{type="store_slow_count"}) by (instance) > 0) and (sum(etcd_server_is_leader) by (instance) > 0)
annotations:
description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, values:{{ $value }}'
value: '{{ $value }}'
summary: PD_cluster_slow_tikv_nums
146 changes: 74 additions & 72 deletions metrics/grafana/pd.json
Original file line number Diff line number Diff line change
Expand Up @@ -687,6 +687,14 @@
"legendFormat": "Tombstone Stores",
"refId": "G",
"step": 20
},
{
"expr": "sum(pd_cluster_status{tidb_cluster=\"$tidb_cluster\", instance=\"$instance\", type=\"store_slow_count\"})",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "Slow Stores",
"refId": "H",
"step": 20
}
],
"timeFrom": "1s",
Expand Down Expand Up @@ -1352,101 +1360,95 @@
"type": "table"
},
{
"cacheTimeout":null,
"colorBackground":false,
"colorValue":false,
"colors":[
"cacheTimeout": null,
"colorBackground": false,
"colorValue": false,
"colors": [
"#299c46",
"rgba(237, 129, 40, 0.89)",
"#d44a3a"
],
"datasource":"tidb-cluster",
"fieldConfig":{
"defaults":{
"custom":{

}
"datasource": "tidb-cluster",
"fieldConfig": {
"defaults": {
"custom": {}
},
"overrides":[

]
"overrides": []
},
"format":"none",
"gauge":{
"maxValue":100,
"minValue":0,
"show":false,
"thresholdLabels":false,
"thresholdMarkers":true
"format": "none",
"gauge": {
"maxValue": 100,
"minValue": 0,
"show": false,
"thresholdLabels": false,
"thresholdMarkers": true
},
"gridPos":{
"h":4,
"w":5,
"x":9,
"y":17
"gridPos": {
"h": 4,
"w": 5,
"x": 9,
"y": 17
},
"hideTimeOverride":true,
"id":115,
"interval":null,
"links":[

],
"mappingType":1,
"mappingTypes":[
"hideTimeOverride": true,
"id": 115,
"interval": null,
"links": [],
"mappingType": 1,
"mappingTypes": [
{
"name":"value to text",
"value":1
"name": "value to text",
"value": 1
},
{
"name":"range to text",
"value":2
"name": "range to text",
"value": 2
}
],
"maxDataPoints":100,
"nullPointMode":"connected",
"nullText":null,
"postfix":"",
"postfixFontSize":"50%",
"prefix":"",
"prefixFontSize":"50%",
"rangeMaps":[
"maxDataPoints": 100,
"nullPointMode": "connected",
"nullText": null,
"postfix": "",
"postfixFontSize": "50%",
"prefix": "",
"prefixFontSize": "50%",
"rangeMaps": [
{
"from":"null",
"text":"N/A",
"to":"null"
"from": "null",
"text": "N/A",
"to": "null"
}
],
"sparkline":{
"fillColor":"rgba(31, 118, 189, 0.18)",
"full":false,
"lineColor":"rgb(31, 120, 193)",
"show":false
"sparkline": {
"fillColor": "rgba(31, 118, 189, 0.18)",
"full": false,
"lineColor": "rgb(31, 120, 193)",
"show": false
},
"tableColumn":"idalloc",
"targets":[
"tableColumn": "idalloc",
"targets": [
{
"expr":"pd_cluster_id{tidb_cluster=\"$tidb_cluster\", instance=\"$instance\"}",
"format":"time_series",
"hide":false,
"instant":true,
"intervalFactor":2,
"legendFormat":"{{type}}",
"refId":"A"
"expr": "pd_cluster_id{tidb_cluster=\"$tidb_cluster\", instance=\"$instance\"}",
"format": "time_series",
"hide": false,
"instant": true,
"intervalFactor": 2,
"legendFormat": "{{type}}",
"refId": "A"
}
],
"thresholds":"",
"timeFrom":"1s",
"title":"Current ID allocation",
"type":"singlestat",
"valueFontSize":"80%",
"valueMaps":[
"thresholds": "",
"timeFrom": "1s",
"title": "Current ID allocation",
"type": "singlestat",
"valueFontSize": "80%",
"valueMaps": [
{
"op":"=",
"text":"N/A",
"value":"null"
"op": "=",
"text": "N/A",
"value": "null"
}
],
"valueName":"avg"
"valueName": "avg"
},
{
"aliasColors": {},
Expand Down
7 changes: 7 additions & 0 deletions pkg/mock/mockcluster/mockcluster.go
Original file line number Diff line number Diff line change
Expand Up @@ -733,6 +733,13 @@ func (mc *Cluster) DisableFeature(fs ...versioninfo.Feature) {
}
}

// EnableFeature marks the given features as supported in the cluster by
// removing each of them from the set of disabled features.
func (mc *Cluster) EnableFeature(fs ...versioninfo.Feature) {
	for i := range fs {
		delete(mc.disabledFeatures, fs[i])
	}
}

// IsFeatureSupported checks if the feature is supported by current cluster.
func (mc *Cluster) IsFeatureSupported(f versioninfo.Feature) bool {
_, ok := mc.disabledFeatures[f]
Expand Down
39 changes: 24 additions & 15 deletions server/replication/replication_mode.go
Original file line number Diff line number Diff line change
Expand Up @@ -369,21 +369,22 @@ func (m *ModeManager) tickDR() {

drTickCounter.Inc()

totalPrimary, totalDr := m.config.DRAutoSync.PrimaryReplicas, m.config.DRAutoSync.DRReplicas
downPrimary, downDr := m.checkStoreStatus()
totalPrimaryPeers, totalDrPeers := m.config.DRAutoSync.PrimaryReplicas, m.config.DRAutoSync.DRReplicas
downPrimaryStores, downDrStores, upPrimayStores, upDrStores := m.checkStoreStatus()

// canSync is true when every region has at least 1 replica in each DC.
canSync := downPrimary < totalPrimary && downDr < totalDr
canSync := downPrimaryStores < totalPrimaryPeers && downDrStores < totalDrPeers &&
upPrimayStores > 0 && upDrStores > 0

// hasMajority is true when every region has majority peer online.
var upPeers int
if downPrimary < totalPrimary {
upPeers += totalPrimary - downPrimary
if downPrimaryStores < totalPrimaryPeers {
upPeers += totalPrimaryPeers - downPrimaryStores
}
if downDr < totalDr {
upPeers += totalDr - downDr
if downDrStores < totalDrPeers {
upPeers += totalDrPeers - downDrStores
}
hasMajority := upPeers*2 > totalPrimary+totalDr
hasMajority := upPeers*2 > totalPrimaryPeers+totalDrPeers

// If hasMajority is false, the cluster is always unavailable. Switching to async won't help.
if !canSync && hasMajority && m.drGetState() != drStateAsync && m.drCheckAsyncTimeout() {
Expand All @@ -407,17 +408,25 @@ func (m *ModeManager) tickDR() {
}
}

func (m *ModeManager) checkStoreStatus() (primaryFailCount, drFailCount int) {
func (m *ModeManager) checkStoreStatus() (primaryDownCount, drDownCount, primaryUpCount, drUpCount int) {
m.RLock()
defer m.RUnlock()
for _, s := range m.cluster.GetStores() {
if !s.IsTombstone() && s.DownTime() >= m.config.DRAutoSync.WaitStoreTimeout.Duration {
labelValue := s.GetLabelValue(m.config.DRAutoSync.LabelKey)
if labelValue == m.config.DRAutoSync.Primary {
primaryFailCount++
down := !s.IsTombstone() && s.DownTime() >= m.config.DRAutoSync.WaitStoreTimeout.Duration
labelValue := s.GetLabelValue(m.config.DRAutoSync.LabelKey)
if labelValue == m.config.DRAutoSync.Primary {
if down {
primaryDownCount++
} else {
primaryUpCount++
}
if labelValue == m.config.DRAutoSync.DR {
drFailCount++

}
if labelValue == m.config.DRAutoSync.DR {
if down {
drDownCount++
} else {
drUpCount++
}
}
}
Expand Down
17 changes: 15 additions & 2 deletions server/replication/replication_mode_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -166,8 +166,6 @@ func (s *testReplicationMode) TestStateSwitch(c *C) {
cluster.AddLabelsStore(1, 1, map[string]string{"zone": "zone1"})
cluster.AddLabelsStore(2, 1, map[string]string{"zone": "zone1"})
cluster.AddLabelsStore(3, 1, map[string]string{"zone": "zone1"})
cluster.AddLabelsStore(4, 1, map[string]string{"zone": "zone2"})
cluster.AddLabelsStore(5, 1, map[string]string{"zone": "zone2"})

// initial state is sync
c.Assert(rep.drGetState(), Equals, drStateSync)
Expand All @@ -178,6 +176,21 @@ func (s *testReplicationMode) TestStateSwitch(c *C) {
stateID = rep.drAutoSync.StateID
}

// only one zone, sync -> async
rep.tickDR()
c.Assert(rep.drGetState(), Equals, drStateAsync)
assertStateIDUpdate()

// add new store in dr zone.
cluster.AddLabelsStore(4, 1, map[string]string{"zone": "zone2"})
cluster.AddLabelsStore(5, 1, map[string]string{"zone": "zone2"})
// async -> sync
rep.tickDR()
c.Assert(rep.drGetState(), Equals, drStateSyncRecover)
rep.drSwitchToSync()
c.Assert(rep.drGetState(), Equals, drStateSync)
assertStateIDUpdate()

// sync -> async
rep.tickDR()
c.Assert(rep.drGetState(), Equals, drStateSync)
Expand Down
Loading

0 comments on commit f032c6c

Please sign in to comment.