From efb23b85853a139145c3da13f89b56203601d63b Mon Sep 17 00:00:00 2001 From: "Eduardo J. Ortega U" Date: Sun, 18 Oct 2020 20:24:52 +0200 Subject: [PATCH] Add support for recovery of async/semisync replicas of failed replication group members. --- docs/configuration-recovery.md | 3 +- docs/faq.md | 17 +-- go/inst/analysis.go | 7 + go/inst/analysis_dao.go | 266 ++++++++++++++++++--------------- go/inst/instance_dao.go | 4 + go/logic/topology_recovery.go | 93 +++++++++++- 6 files changed, 253 insertions(+), 137 deletions(-) diff --git a/docs/configuration-recovery.md b/docs/configuration-recovery.md index af1e7645c..dfba9b536 100644 --- a/docs/configuration-recovery.md +++ b/docs/configuration-recovery.md @@ -71,7 +71,8 @@ These hooks are available for recoveries: - `PreFailoverProcesses`: executed immediately before `orchestrator` takes recovery action. Failure (nonzero exit code) of any of these processes aborts the recovery. Hint: this gives you the opportunity to abort recovery based on some internal state of your system. - `PostMasterFailoverProcesses`: executed at the end of a successful master recovery. -- `PostIntermediateMasterFailoverProcesses`: executed at the end of a successful intermediate master recovery. +- `PostIntermediateMasterFailoverProcesses`: executed at the end of a successful intermediate master or replication + group member with replicas recovery. - `PostFailoverProcesses`: executed at the end of any successful recovery (including and adding to the above two). - `PostUnsuccessfulFailoverProcesses`: executed at the end of any unsuccessful recovery. - `PostGracefulTakeoverProcesses`: executed on planned, graceful master takeover, after the old master is positioned under the newly promoted master. diff --git a/docs/faq.md b/docs/faq.md index bfdedebd2..d76d83d41 100644 --- a/docs/faq.md +++ b/docs/faq.md @@ -62,21 +62,20 @@ No. ### Does orchestrator support MySQL Group Replication? -Partially. 
Replication groups in single primary mode are somewhat supported under MySQL 8.0. The extent of the support so far is: +Partially. Replication groups in single primary mode are supported under MySQL 8.0. The extent of the support is: * Orchestrator understands that all group members are part of the same cluster, retrieves replication group information as part of instance discovery, stores it in its database, and exposes it via the API. * The orchestrator web UI displays single primary group members. They are shown like this: - * All group secondary members as replicating from the primary. - * All group members have an icon that shows they are group members (as opposed to traditional async/semi-sync replicas). + * All secondary group members as replicating from the primary. + * All group members have an icon that shows they are group members (as opposed to traditional async/semi-sync + replicas). * Hovering over the icon mentioned above provides information about the state and role of the DB instance in the group. -* Some relocation operations are forbidden for group members. In particular, orchestrator will refuse to relocate a secondary group member, as it, by definition, replicates always from the group primary. It will also reject an attempt to relocate a group primary under a secondary of the same group. - -No support has been added (yet) to handling group member failure. If all you have is a single replication group, this is fine, because you don't need it; the group will handle all failures as long as it can secure a majority. - -If, however, you have the primary of a group as a replica to another instance; or you have replicas under your group -members, know that this has not been tested and results are, therefore, unpredictable at the moment. It *might* work, but it might also create a singularity and suck your database under the event horizon. +* Some relocation operations are forbidden for group members. 
In particular, orchestrator will refuse to relocate a + secondary group member, as it, by definition, replicates always from the group primary. It will also reject an attempt + to relocate a group primary under a secondary of the same group. +* Traditional async/semisync replicas from failed group members are relocated to a different group member. ### Does orchestrator support Yet Another Type of Replication? diff --git a/go/inst/analysis.go b/go/inst/analysis.go index cdcdb02e6..70943ea95 100644 --- a/go/inst/analysis.go +++ b/go/inst/analysis.go @@ -57,6 +57,8 @@ const ( AllIntermediateMasterReplicasNotReplicating = "AllIntermediateMasterReplicasNotReplicating" FirstTierReplicaFailingToConnectToMaster = "FirstTierReplicaFailingToConnectToMaster" BinlogServerFailingToConnectToMaster = "BinlogServerFailingToConnectToMaster" + // Group replication problems + DeadReplicationGroupMemberWithReplicas = "DeadReplicationGroupMemberWithReplicas" ) const ( @@ -110,6 +112,7 @@ const ( AnalysisInstanceTypeMaster AnalysisInstanceType = "master" AnalysisInstanceTypeCoMaster AnalysisInstanceType = "co-master" AnalysisInstanceTypeIntermediateMaster AnalysisInstanceType = "intermediate-master" + AnalysisInstanceTypeGroupMember AnalysisInstanceType = "group-member" ) // ReplicationAnalysis notes analysis on replication chain status, per instance @@ -122,6 +125,7 @@ type ReplicationAnalysis struct { AnalyzedInstancePhysicalEnvironment string AnalyzedInstanceBinlogCoordinates BinlogCoordinates IsMaster bool + IsReplicationGroupMember bool IsCoMaster bool LastCheckValid bool LastCheckPartialSuccess bool @@ -213,6 +217,9 @@ func (this *ReplicationAnalysis) GetAnalysisInstanceType() AnalysisInstanceType if this.IsCoMaster { return AnalysisInstanceTypeCoMaster } + if this.IsReplicationGroupMember { + return AnalysisInstanceTypeGroupMember + } if this.IsMaster { return AnalysisInstanceTypeMaster } diff --git a/go/inst/analysis_dao.go b/go/inst/analysis_dao.go index 62cb0167a..35795021c 
100644 --- a/go/inst/analysis_dao.go +++ b/go/inst/analysis_dao.go @@ -174,6 +174,14 @@ func GetReplicationAnalysis(clusterName string, hints *ReplicationAnalysisHints) OR master_instance.replication_group_member_role = 'PRIMARY' ) ) AS is_master, + -- A host is not a group member if it has no replication group name OR if it does, but its state in the group is + -- OFFLINE (e.g. some GR configuration is in place but the host has not actually joined a group yet. Notice that + -- we DO consider it a group member if its state is ERROR (which is what happens when it gets expelled from the + -- group) + MIN( + master_instance.replication_group_name != '' + AND master_instance.replication_group_member_state != 'OFFLINE' + ) AS is_replication_group_member, MIN(master_instance.is_co_master) AS is_co_master, MIN( CONCAT( @@ -410,6 +418,7 @@ func GetReplicationAnalysis(clusterName string, hints *ReplicationAnalysisHints) } a.IsMaster = m.GetBool("is_master") + a.IsReplicationGroupMember = m.GetBool("is_replication_group_member") countCoMasterReplicas := m.GetUint("count_co_master_replicas") a.IsCoMaster = m.GetBool("is_co_master") || (countCoMasterReplicas > 0) a.AnalyzedInstanceKey = InstanceKey{Hostname: m.GetString("hostname"), Port: m.GetInt("port")} @@ -482,132 +491,141 @@ func GetReplicationAnalysis(clusterName string, hints *ReplicationAnalysisHints) log.Debugf(analysisMessage) } } - if a.IsMaster && !a.LastCheckValid && a.CountReplicas == 0 { - a.Analysis = DeadMasterWithoutReplicas - a.Description = "Master cannot be reached by orchestrator and has no replica" - // - } else if a.IsMaster && !a.LastCheckValid && a.CountValidReplicas == a.CountReplicas && a.CountValidReplicatingReplicas == 0 { - a.Analysis = DeadMaster - a.Description = "Master cannot be reached by orchestrator and none of its replicas is replicating" - // - } else if a.IsMaster && !a.LastCheckValid && a.CountReplicas > 0 && a.CountValidReplicas == 0 && a.CountValidReplicatingReplicas == 0 { - 
a.Analysis = DeadMasterAndReplicas - a.Description = "Master cannot be reached by orchestrator and none of its replicas is replicating" - // - } else if a.IsMaster && !a.LastCheckValid && a.CountValidReplicas < a.CountReplicas && a.CountValidReplicas > 0 && a.CountValidReplicatingReplicas == 0 { - a.Analysis = DeadMasterAndSomeReplicas - a.Description = "Master cannot be reached by orchestrator; some of its replicas are unreachable and none of its reachable replicas is replicating" - // - } else if a.IsMaster && !a.LastCheckValid && a.CountLaggingReplicas == a.CountReplicas && a.CountDelayedReplicas < a.CountReplicas && a.CountValidReplicatingReplicas > 0 { - a.Analysis = UnreachableMasterWithLaggingReplicas - a.Description = "Master cannot be reached by orchestrator and all of its replicas are lagging" - // - } else if a.IsMaster && !a.LastCheckValid && !a.LastCheckPartialSuccess && a.CountValidReplicas > 0 && a.CountValidReplicatingReplicas > 0 { - // partial success is here to redice noise - a.Analysis = UnreachableMaster - a.Description = "Master cannot be reached by orchestrator but it has replicating replicas; possibly a network/host issue" - // - } else if a.IsMaster && !a.LastCheckValid && a.LastCheckPartialSuccess && a.CountReplicasFailingToConnectToMaster > 0 && a.CountValidReplicas > 0 && a.CountValidReplicatingReplicas > 0 { - // there's partial success, but also at least one replica is failing to connect to master - a.Analysis = UnreachableMaster - a.Description = "Master cannot be reached by orchestrator but it has replicating replicas; possibly a network/host issue" - // - } else if a.IsMaster && a.SemiSyncMasterEnabled && a.SemiSyncMasterStatus && a.SemiSyncMasterWaitForReplicaCount > 0 && a.SemiSyncMasterClients < a.SemiSyncMasterWaitForReplicaCount { - if isStaleBinlogCoordinates { - a.Analysis = LockedSemiSyncMaster - a.Description = "Semi sync master is locked since it doesn't get enough replica acknowledgements" - } else { - a.Analysis = 
LockedSemiSyncMasterHypothesis - a.Description = "Semi sync master seems to be locked, more samplings needed to validate" + if !a.IsReplicationGroupMember /* Traditional Async/Semi-sync replication issue detection */ { + if a.IsMaster && !a.LastCheckValid && a.CountReplicas == 0 { + a.Analysis = DeadMasterWithoutReplicas + a.Description = "Master cannot be reached by orchestrator and has no replica" + // + } else if a.IsMaster && !a.LastCheckValid && a.CountValidReplicas == a.CountReplicas && a.CountValidReplicatingReplicas == 0 { + a.Analysis = DeadMaster + a.Description = "Master cannot be reached by orchestrator and none of its replicas is replicating" + // + } else if a.IsMaster && !a.LastCheckValid && a.CountReplicas > 0 && a.CountValidReplicas == 0 && a.CountValidReplicatingReplicas == 0 { + a.Analysis = DeadMasterAndReplicas + a.Description = "Master cannot be reached by orchestrator and none of its replicas is replicating" + // + } else if a.IsMaster && !a.LastCheckValid && a.CountValidReplicas < a.CountReplicas && a.CountValidReplicas > 0 && a.CountValidReplicatingReplicas == 0 { + a.Analysis = DeadMasterAndSomeReplicas + a.Description = "Master cannot be reached by orchestrator; some of its replicas are unreachable and none of its reachable replicas is replicating" + // + } else if a.IsMaster && !a.LastCheckValid && a.CountLaggingReplicas == a.CountReplicas && a.CountDelayedReplicas < a.CountReplicas && a.CountValidReplicatingReplicas > 0 { + a.Analysis = UnreachableMasterWithLaggingReplicas + a.Description = "Master cannot be reached by orchestrator and all of its replicas are lagging" + // + } else if a.IsMaster && !a.LastCheckValid && !a.LastCheckPartialSuccess && a.CountValidReplicas > 0 && a.CountValidReplicatingReplicas > 0 { + // partial success is here to redice noise + a.Analysis = UnreachableMaster + a.Description = "Master cannot be reached by orchestrator but it has replicating replicas; possibly a network/host issue" + // + } else if 
a.IsMaster && !a.LastCheckValid && a.LastCheckPartialSuccess && a.CountReplicasFailingToConnectToMaster > 0 && a.CountValidReplicas > 0 && a.CountValidReplicatingReplicas > 0 { + // there's partial success, but also at least one replica is failing to connect to master + a.Analysis = UnreachableMaster + a.Description = "Master cannot be reached by orchestrator but it has replicating replicas; possibly a network/host issue" + // + } else if a.IsMaster && a.SemiSyncMasterEnabled && a.SemiSyncMasterStatus && a.SemiSyncMasterWaitForReplicaCount > 0 && a.SemiSyncMasterClients < a.SemiSyncMasterWaitForReplicaCount { + if isStaleBinlogCoordinates { + a.Analysis = LockedSemiSyncMaster + a.Description = "Semi sync master is locked since it doesn't get enough replica acknowledgements" + } else { + a.Analysis = LockedSemiSyncMasterHypothesis + a.Description = "Semi sync master seems to be locked, more samplings needed to validate" + } + // + } else if a.IsMaster && a.LastCheckValid && a.CountReplicas == 1 && a.CountValidReplicas == a.CountReplicas && a.CountValidReplicatingReplicas == 0 { + a.Analysis = MasterSingleReplicaNotReplicating + a.Description = "Master is reachable but its single replica is not replicating" + // + } else if a.IsMaster && a.LastCheckValid && a.CountReplicas == 1 && a.CountValidReplicas == 0 { + a.Analysis = MasterSingleReplicaDead + a.Description = "Master is reachable but its single replica is dead" + // + } else if a.IsMaster && a.LastCheckValid && a.CountReplicas > 1 && a.CountValidReplicas == a.CountReplicas && a.CountValidReplicatingReplicas == 0 { + a.Analysis = AllMasterReplicasNotReplicating + a.Description = "Master is reachable but none of its replicas is replicating" + // + } else if a.IsMaster && a.LastCheckValid && a.CountReplicas > 1 && a.CountValidReplicas < a.CountReplicas && a.CountValidReplicas > 0 && a.CountValidReplicatingReplicas == 0 { + a.Analysis = AllMasterReplicasNotReplicatingOrDead + a.Description = "Master is reachable but 
none of its replicas is replicating" + // + } else /* co-master */ if a.IsCoMaster && !a.LastCheckValid && a.CountReplicas > 0 && a.CountValidReplicas == a.CountReplicas && a.CountValidReplicatingReplicas == 0 { + a.Analysis = DeadCoMaster + a.Description = "Co-master cannot be reached by orchestrator and none of its replicas is replicating" + // + } else if a.IsCoMaster && !a.LastCheckValid && a.CountReplicas > 0 && a.CountValidReplicas < a.CountReplicas && a.CountValidReplicas > 0 && a.CountValidReplicatingReplicas == 0 { + a.Analysis = DeadCoMasterAndSomeReplicas + a.Description = "Co-master cannot be reached by orchestrator; some of its replicas are unreachable and none of its reachable replicas is replicating" + // + } else if a.IsCoMaster && !a.LastCheckValid && !a.LastCheckPartialSuccess && a.CountValidReplicas > 0 && a.CountValidReplicatingReplicas > 0 { + a.Analysis = UnreachableCoMaster + a.Description = "Co-master cannot be reached by orchestrator but it has replicating replicas; possibly a network/host issue" + // + } else if a.IsCoMaster && a.LastCheckValid && a.CountReplicas > 0 && a.CountValidReplicatingReplicas == 0 { + a.Analysis = AllCoMasterReplicasNotReplicating + a.Description = "Co-master is reachable but none of its replicas is replicating" + // + } else /* intermediate-master */ if !a.IsMaster && !a.LastCheckValid && a.CountReplicas == 1 && a.CountValidReplicas == a.CountReplicas && a.CountReplicasFailingToConnectToMaster == a.CountReplicas && a.CountValidReplicatingReplicas == 0 { + a.Analysis = DeadIntermediateMasterWithSingleReplicaFailingToConnect + a.Description = "Intermediate master cannot be reached by orchestrator and its (single) replica is failing to connect" + // + } else if !a.IsMaster && !a.LastCheckValid && a.CountReplicas == 1 && a.CountValidReplicas == a.CountReplicas && a.CountValidReplicatingReplicas == 0 { + a.Analysis = DeadIntermediateMasterWithSingleReplica + a.Description = "Intermediate master cannot be reached by 
orchestrator and its (single) replica is not replicating" + // + } else if !a.IsMaster && !a.LastCheckValid && a.CountReplicas > 1 && a.CountValidReplicas == a.CountReplicas && a.CountValidReplicatingReplicas == 0 { + a.Analysis = DeadIntermediateMaster + a.Description = "Intermediate master cannot be reached by orchestrator and none of its replicas is replicating" + // + } else if !a.IsMaster && !a.LastCheckValid && a.CountValidReplicas < a.CountReplicas && a.CountValidReplicas > 0 && a.CountValidReplicatingReplicas == 0 { + a.Analysis = DeadIntermediateMasterAndSomeReplicas + a.Description = "Intermediate master cannot be reached by orchestrator; some of its replicas are unreachable and none of its reachable replicas is replicating" + // + } else if !a.IsMaster && !a.LastCheckValid && a.CountReplicas > 0 && a.CountValidReplicas == 0 { + a.Analysis = DeadIntermediateMasterAndReplicas + a.Description = "Intermediate master cannot be reached by orchestrator and all of its replicas are unreachable" + // + } else if !a.IsMaster && !a.LastCheckValid && a.CountLaggingReplicas == a.CountReplicas && a.CountDelayedReplicas < a.CountReplicas && a.CountValidReplicatingReplicas > 0 { + a.Analysis = UnreachableIntermediateMasterWithLaggingReplicas + a.Description = "Intermediate master cannot be reached by orchestrator and all of its replicas are lagging" + // + } else if !a.IsMaster && !a.LastCheckValid && !a.LastCheckPartialSuccess && a.CountValidReplicas > 0 && a.CountValidReplicatingReplicas > 0 { + a.Analysis = UnreachableIntermediateMaster + a.Description = "Intermediate master cannot be reached by orchestrator but it has replicating replicas; possibly a network/host issue" + // + } else if !a.IsMaster && a.LastCheckValid && a.CountReplicas > 1 && a.CountValidReplicatingReplicas == 0 && + a.CountReplicasFailingToConnectToMaster > 0 && a.CountReplicasFailingToConnectToMaster == a.CountValidReplicas { + // All replicas are either failing to connect to master (and at least 
one of these have to exist) + // or completely dead. + // Must have at least two replicas to reach such conclusion -- do note that the intermediate master is still + // reachable to orchestrator, so we base our conclusion on replicas only at this point. + a.Analysis = AllIntermediateMasterReplicasFailingToConnectOrDead + a.Description = "Intermediate master is reachable but all of its replicas are failing to connect" + // + } else if !a.IsMaster && a.LastCheckValid && a.CountReplicas > 0 && a.CountValidReplicatingReplicas == 0 { + a.Analysis = AllIntermediateMasterReplicasNotReplicating + a.Description = "Intermediate master is reachable but none of its replicas is replicating" + // + } else if a.IsBinlogServer && a.IsFailingToConnectToMaster { + a.Analysis = BinlogServerFailingToConnectToMaster + a.Description = "Binlog server is unable to connect to its master" + // + } else if a.ReplicationDepth == 1 && a.IsFailingToConnectToMaster { + a.Analysis = FirstTierReplicaFailingToConnectToMaster + a.Description = "1st tier replica (directly replicating from topology master) is unable to connect to the master" + // + } + // else if a.IsMaster && a.CountReplicas == 0 { + // a.Analysis = MasterWithoutReplicas + // a.Description = "Master has no replicas" + // } + + } else /* Group replication issue detection */ { + // Group member is not reachable, has replicas, and none of its reachable replicas can replicate from it + if !a.LastCheckValid && a.CountReplicas > 0 && a.CountValidReplicatingReplicas == 0 { + a.Analysis = DeadReplicationGroupMemberWithReplicas + a.Description = "Group member is unreachable and all its reachable replicas are not replicating" } - // - } else if a.IsMaster && a.LastCheckValid && a.CountReplicas == 1 && a.CountValidReplicas == a.CountReplicas && a.CountValidReplicatingReplicas == 0 { - a.Analysis = MasterSingleReplicaNotReplicating - a.Description = "Master is reachable but its single replica is not replicating" - // - } else if a.IsMaster && 
a.LastCheckValid && a.CountReplicas == 1 && a.CountValidReplicas == 0 { - a.Analysis = MasterSingleReplicaDead - a.Description = "Master is reachable but its single replica is dead" - // - } else if a.IsMaster && a.LastCheckValid && a.CountReplicas > 1 && a.CountValidReplicas == a.CountReplicas && a.CountValidReplicatingReplicas == 0 { - a.Analysis = AllMasterReplicasNotReplicating - a.Description = "Master is reachable but none of its replicas is replicating" - // - } else if a.IsMaster && a.LastCheckValid && a.CountReplicas > 1 && a.CountValidReplicas < a.CountReplicas && a.CountValidReplicas > 0 && a.CountValidReplicatingReplicas == 0 { - a.Analysis = AllMasterReplicasNotReplicatingOrDead - a.Description = "Master is reachable but none of its replicas is replicating" - // - } else /* co-master */ if a.IsCoMaster && !a.LastCheckValid && a.CountReplicas > 0 && a.CountValidReplicas == a.CountReplicas && a.CountValidReplicatingReplicas == 0 { - a.Analysis = DeadCoMaster - a.Description = "Co-master cannot be reached by orchestrator and none of its replicas is replicating" - // - } else if a.IsCoMaster && !a.LastCheckValid && a.CountReplicas > 0 && a.CountValidReplicas < a.CountReplicas && a.CountValidReplicas > 0 && a.CountValidReplicatingReplicas == 0 { - a.Analysis = DeadCoMasterAndSomeReplicas - a.Description = "Co-master cannot be reached by orchestrator; some of its replicas are unreachable and none of its reachable replicas is replicating" - // - } else if a.IsCoMaster && !a.LastCheckValid && !a.LastCheckPartialSuccess && a.CountValidReplicas > 0 && a.CountValidReplicatingReplicas > 0 { - a.Analysis = UnreachableCoMaster - a.Description = "Co-master cannot be reached by orchestrator but it has replicating replicas; possibly a network/host issue" - // - } else if a.IsCoMaster && a.LastCheckValid && a.CountReplicas > 0 && a.CountValidReplicatingReplicas == 0 { - a.Analysis = AllCoMasterReplicasNotReplicating - a.Description = "Co-master is reachable but none of 
its replicas is replicating" - // - } else /* intermediate-master */ if !a.IsMaster && !a.LastCheckValid && a.CountReplicas == 1 && a.CountValidReplicas == a.CountReplicas && a.CountReplicasFailingToConnectToMaster == a.CountReplicas && a.CountValidReplicatingReplicas == 0 { - a.Analysis = DeadIntermediateMasterWithSingleReplicaFailingToConnect - a.Description = "Intermediate master cannot be reached by orchestrator and its (single) replica is failing to connect" - // - } else if !a.IsMaster && !a.LastCheckValid && a.CountReplicas == 1 && a.CountValidReplicas == a.CountReplicas && a.CountValidReplicatingReplicas == 0 { - a.Analysis = DeadIntermediateMasterWithSingleReplica - a.Description = "Intermediate master cannot be reached by orchestrator and its (single) replica is not replicating" - // - } else if !a.IsMaster && !a.LastCheckValid && a.CountReplicas > 1 && a.CountValidReplicas == a.CountReplicas && a.CountValidReplicatingReplicas == 0 { - a.Analysis = DeadIntermediateMaster - a.Description = "Intermediate master cannot be reached by orchestrator and none of its replicas is replicating" - // - } else if !a.IsMaster && !a.LastCheckValid && a.CountValidReplicas < a.CountReplicas && a.CountValidReplicas > 0 && a.CountValidReplicatingReplicas == 0 { - a.Analysis = DeadIntermediateMasterAndSomeReplicas - a.Description = "Intermediate master cannot be reached by orchestrator; some of its replicas are unreachable and none of its reachable replicas is replicating" - // - } else if !a.IsMaster && !a.LastCheckValid && a.CountReplicas > 0 && a.CountValidReplicas == 0 { - a.Analysis = DeadIntermediateMasterAndReplicas - a.Description = "Intermediate master cannot be reached by orchestrator and all of its replicas are unreachable" - // - } else if !a.IsMaster && !a.LastCheckValid && a.CountLaggingReplicas == a.CountReplicas && a.CountDelayedReplicas < a.CountReplicas && a.CountValidReplicatingReplicas > 0 { - a.Analysis = UnreachableIntermediateMasterWithLaggingReplicas - 
a.Description = "Intermediate master cannot be reached by orchestrator and all of its replicas are lagging" - // - } else if !a.IsMaster && !a.LastCheckValid && !a.LastCheckPartialSuccess && a.CountValidReplicas > 0 && a.CountValidReplicatingReplicas > 0 { - a.Analysis = UnreachableIntermediateMaster - a.Description = "Intermediate master cannot be reached by orchestrator but it has replicating replicas; possibly a network/host issue" - // - } else if !a.IsMaster && a.LastCheckValid && a.CountReplicas > 1 && a.CountValidReplicatingReplicas == 0 && - a.CountReplicasFailingToConnectToMaster > 0 && a.CountReplicasFailingToConnectToMaster == a.CountValidReplicas { - // All replicas are either failing to connect to master (and at least one of these have to exist) - // or completely dead. - // Must have at least two replicas to reach such conclusion -- do note that the intermediate master is still - // reachable to orchestrator, so we base our conclusion on replicas only at this point. - a.Analysis = AllIntermediateMasterReplicasFailingToConnectOrDead - a.Description = "Intermediate master is reachable but all of its replicas are failing to connect" - // - } else if !a.IsMaster && a.LastCheckValid && a.CountReplicas > 0 && a.CountValidReplicatingReplicas == 0 { - a.Analysis = AllIntermediateMasterReplicasNotReplicating - a.Description = "Intermediate master is reachable but none of its replicas is replicating" - // - } else if a.IsBinlogServer && a.IsFailingToConnectToMaster { - a.Analysis = BinlogServerFailingToConnectToMaster - a.Description = "Binlog server is unable to connect to its master" - // - } else if a.ReplicationDepth == 1 && a.IsFailingToConnectToMaster { - a.Analysis = FirstTierReplicaFailingToConnectToMaster - a.Description = "1st tier replica (directly replicating from topology master) is unable to connect to the master" - // - } - // else if a.IsMaster && a.CountReplicas == 0 { - // a.Analysis = MasterWithoutReplicas - // a.Description = "Master has no 
replicas" - // } + } appendAnalysis := func(analysis *ReplicationAnalysis) { if a.Analysis == NoProblem && len(a.StructureAnalysis) == 0 && !hints.IncludeNoProblem { return diff --git a/go/inst/instance_dao.go b/go/inst/instance_dao.go index 2f3187b74..27ad73d76 100644 --- a/go/inst/instance_dao.go +++ b/go/inst/instance_dao.go @@ -3173,6 +3173,8 @@ func FigureInstanceKey(instanceKey *InstanceKey, thisInstanceKey *InstanceKey) ( // PopulateGroupReplicationInformation obtains information about Group Replication for this host as well as other hosts // who are members of the same group (if any). func PopulateGroupReplicationInformation(instance *Instance, db *sql.DB) error { + // We exclude below hosts with state OFFLINE because they have joined no group yet, so there is no point in getting + // any group replication information from them q := ` SELECT MEMBER_ID, @@ -3184,6 +3186,8 @@ func PopulateGroupReplicationInformation(instance *Instance, db *sql.DB) error { @@global.group_replication_single_primary_mode FROM performance_schema.replication_group_members + WHERE + MEMBER_STATE != 'OFFLINE' ` rows, err := db.Query(q) if err != nil { diff --git a/go/logic/topology_recovery.go b/go/logic/topology_recovery.go index 7cbb67097..ff53adf3e 100644 --- a/go/logic/topology_recovery.go +++ b/go/logic/topology_recovery.go @@ -18,6 +18,7 @@ package logic import ( "encoding/json" + "errors" "fmt" "math/rand" goos "os" @@ -45,9 +46,10 @@ var countPendingRecoveries int64 type RecoveryType string const ( - MasterRecovery RecoveryType = "MasterRecovery" - CoMasterRecovery = "CoMasterRecovery" - IntermediateMasterRecovery = "IntermediateMasterRecovery" + MasterRecovery RecoveryType = "MasterRecovery" + CoMasterRecovery = "CoMasterRecovery" + IntermediateMasterRecovery = "IntermediateMasterRecovery" + ReplicationGroupMemberRecovery = "ReplicationGroupMemberRecovery" ) type RecoveryAcknowledgement struct { @@ -189,6 +191,9 @@ var recoverDeadIntermediateMasterFailureCounter = 
metrics.NewCounter() var recoverDeadCoMasterCounter = metrics.NewCounter() var recoverDeadCoMasterSuccessCounter = metrics.NewCounter() var recoverDeadCoMasterFailureCounter = metrics.NewCounter() +var recoverDeadReplicationGroupMemberCounter = metrics.NewCounter() +var recoverDeadReplicationGroupMemberSuccessCounter = metrics.NewCounter() +var recoverDeadReplicationGroupMemberFailureCounter = metrics.NewCounter() var countPendingRecoveriesGauge = metrics.NewGauge() func init() { @@ -201,6 +206,9 @@ func init() { metrics.Register("recover.dead_co_master.start", recoverDeadCoMasterCounter) metrics.Register("recover.dead_co_master.success", recoverDeadCoMasterSuccessCounter) metrics.Register("recover.dead_co_master.fail", recoverDeadCoMasterFailureCounter) + metrics.Register("recover.dead_replication_group_member.start", recoverDeadReplicationGroupMemberCounter) + metrics.Register("recover.dead_replication_group_member.success", recoverDeadReplicationGroupMemberSuccessCounter) + metrics.Register("recover.dead_replication_group_member.fail", recoverDeadReplicationGroupMemberFailureCounter) metrics.Register("recover.pending", countPendingRecoveriesGauge) go initializeTopologyRecoveryPostConfiguration() @@ -1198,6 +1206,42 @@ func RecoverDeadIntermediateMaster(topologyRecovery *TopologyRecovery, skipProce return successorInstance, err } +// RecoverDeadReplicationGroupMemberWithReplicas performs dead group member recovery. It does so by finding members of +// the same replication group as that of the failed instance, picking a random one and relocating replicas to it. 
+func RecoverDeadReplicationGroupMemberWithReplicas(topologyRecovery *TopologyRecovery, skipProcesses bool) (successorInstance *inst.Instance, err error) { + topologyRecovery.Type = ReplicationGroupMemberRecovery + analysisEntry := &topologyRecovery.AnalysisEntry + failedGroupMemberInstanceKey := &analysisEntry.AnalyzedInstanceKey + inst.AuditOperation("recover-dead-replication-group-member-with-replicas", failedGroupMemberInstanceKey, "problem found; will recover") + if !skipProcesses { + if err := executeProcesses(config.Config.PreFailoverProcesses, "PreFailoverProcesses", topologyRecovery, true); err != nil { + return nil, topologyRecovery.AddError(err) + } + } + failedGroupMember, _, err := inst.ReadInstance(failedGroupMemberInstanceKey) + if err != nil { + return nil, topologyRecovery.AddError(err) + } + // Find a group member under which we can relocate the replicas of the failed one. + groupMembers := failedGroupMember.ReplicationGroupMembers.GetInstanceKeys() + if len(groupMembers) == 0 { + return nil, topologyRecovery.AddError(errors.New("RecoverDeadReplicationGroupMemberWithReplicas: unable to find a candidate group member to relocate replicas to")) + } + // We have a group member to move replicas to, go ahead and do that + AuditTopologyRecovery(topologyRecovery, "Finding a candidate group member to relocate replicas to") + candidateGroupMemberInstanceKey := &groupMembers[rand.Intn(len(failedGroupMember.ReplicationGroupMembers.GetInstanceKeys()))] + AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("Found group member %+v", candidateGroupMemberInstanceKey)) + relocatedReplicas, successorInstance, err, errs := inst.RelocateReplicas(failedGroupMemberInstanceKey, candidateGroupMemberInstanceKey, "") + topologyRecovery.AddErrors(errs) + if len(relocatedReplicas) != len(failedGroupMember.Replicas.GetInstanceKeys()) { + AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("- RecoverDeadReplicationGroupMemberWithReplicas: failed to move all replicas to 
candidate group member (%+v)", candidateGroupMemberInstanceKey)) + return nil, topologyRecovery.AddError(errors.New(fmt.Sprintf("RecoverDeadReplicationGroupMemberWithReplicas: Unable to relocate replicas to %+v", candidateGroupMemberInstanceKey))) + } + AuditTopologyRecovery(topologyRecovery, "All replicas successfully relocated") + resolveRecovery(topologyRecovery, successorInstance) + return successorInstance, err +} + // checkAndRecoverDeadIntermediateMaster checks a given analysis, decides whether to take action, and possibly takes action // Returns true when action was taken. func checkAndRecoverDeadIntermediateMaster(analysisEntry inst.ReplicationAnalysis, candidateInstanceKey *inst.InstanceKey, forceInstanceRecovery bool, skipProcesses bool) (bool, *TopologyRecovery, error) { @@ -1411,6 +1455,46 @@ func checkAndRecoverGenericProblem(analysisEntry inst.ReplicationAnalysis, candi return false, nil, nil } +// checkAndRecoverDeadGroupMemberWithReplicas checks whether action needs to be taken for an analysis involving a dead +// replication group member, and takes the action if applicable. Notice that under our view of the world, a primary +// replication group member is akin to a master in traditional async/semisync replication; whereas secondary group +// members are akin to intermediate masters. Considering also that a failed group member can always be considered as a +// secondary (even if it was primary, the group should have detected its failure and elected a new primary), then +// failure of a group member with replicas is akin to failure of an intermediate master. +func checkAndRecoverDeadGroupMemberWithReplicas(analysisEntry inst.ReplicationAnalysis, candidateInstanceKey *inst.InstanceKey, forceInstanceRecovery bool, skipProcesses bool) (bool, *TopologyRecovery, error) { + // Don't proceed with recovery unless it was forced or automatic intermediate source recovery is enabled. 
+ // We consider failed group members akin to failed intermediate masters, so we re-use the configuration for + intermediates. + if !(forceInstanceRecovery || analysisEntry.ClusterDetails.HasAutomatedIntermediateMasterRecovery) { + return false, nil, nil + } + // Try to record the recovery. If it fails to be recorded, it is because it is already being dealt with. + topologyRecovery, err := AttemptRecoveryRegistration(&analysisEntry, !forceInstanceRecovery, !forceInstanceRecovery) + if err != nil { + return false, nil, err + } + // Proceed with recovery + recoverDeadReplicationGroupMemberCounter.Inc(1) + + recoveredToGroupMember, err := RecoverDeadReplicationGroupMemberWithReplicas(topologyRecovery, skipProcesses) + + if recoveredToGroupMember != nil { + // success + recoverDeadReplicationGroupMemberSuccessCounter.Inc(1) + + if !skipProcesses { + // Execute post failover processes + topologyRecovery.SuccessorKey = &recoveredToGroupMember.Key + topologyRecovery.SuccessorAlias = recoveredToGroupMember.InstanceAlias + // For the same reasons that were mentioned above, we re-use the post intermediate master fail-over hooks + executeProcesses(config.Config.PostIntermediateMasterFailoverProcesses, "PostIntermediateMasterFailoverProcesses", topologyRecovery, false) + } + } else { + recoverDeadReplicationGroupMemberFailureCounter.Inc(1) + } + return true, topologyRecovery, err +} + + // Force a re-read of a topology instance; this is done because we need to substantiate a suspicion // that we may have a failover scenario. we want to speed up reading the complete picture. 
func emergentlyReadTopologyInstance(instanceKey *inst.InstanceKey, analysisCode inst.AnalysisCode) (instance *inst.Instance, err error) { @@ -1549,6 +1633,9 @@ func getCheckAndRecoverFunction(analysisCode inst.AnalysisCode, analyzedInstance return checkAndRecoverGenericProblem, false case inst.UnreachableIntermediateMasterWithLaggingReplicas: return checkAndRecoverGenericProblem, false + // replication group members + case inst.DeadReplicationGroupMemberWithReplicas: + return checkAndRecoverDeadGroupMemberWithReplicas, true } // Right now this is mostly causing noise with no clear action. // Will revisit this in the future.