Skip to content

Commit

Permalink
Merge pull request #2845 from hashicorp/stale-raft-servers
Browse files Browse the repository at this point in the history
Clean up raft servers without a corresponding serf entry
  • Loading branch information
kyhavlov authored Mar 29, 2017
2 parents e34559a + e74e83f commit 73f0e6f
Show file tree
Hide file tree
Showing 2 changed files with 107 additions and 6 deletions.
50 changes: 44 additions & 6 deletions consul/autopilot.go
Original file line number Diff line number Diff line change
Expand Up @@ -78,17 +78,37 @@ func (s *Server) pruneDeadServers() error {

// Find any failed servers
var failed []string
staleRaftServers := make(map[string]raft.Server)
if autopilotConf.CleanupDeadServers {
future := s.raft.GetConfiguration()
if future.Error() != nil {
return err
}

for _, server := range future.Configuration().Servers {
staleRaftServers[string(server.Address)] = server
}

for _, member := range s.serfLAN.Members() {
valid, _ := agent.IsConsulServer(member)
if valid && member.Status == serf.StatusFailed {
failed = append(failed, member.Name)
valid, parts := agent.IsConsulServer(member)

if valid {
// Remove this server from the stale list; it has a serf entry
if _, ok := staleRaftServers[parts.Addr.String()]; ok {
delete(staleRaftServers, parts.Addr.String())
}

if member.Status == serf.StatusFailed {
failed = append(failed, member.Name)
}
}
}
}

removalCount := len(failed) + len(staleRaftServers)

// Nothing to remove, return early
if len(failed) == 0 {
if removalCount == 0 {
return nil
}

Expand All @@ -98,13 +118,31 @@ func (s *Server) pruneDeadServers() error {
}

// Only do removals if a minority of servers will be affected
if len(failed) < peers/2 {
if removalCount < peers/2 {
for _, server := range failed {
s.logger.Printf("[INFO] consul: Attempting removal of failed server: %v", server)
go s.serfLAN.RemoveFailedNode(server)
}

minRaftProtocol, err := ServerMinRaftProtocol(s.serfLAN.Members())
if err != nil {
return err
}
for _, raftServer := range staleRaftServers {
var future raft.Future
if minRaftProtocol >= 2 {
s.logger.Printf("[INFO] consul: Attempting removal of stale raft server : %v", raftServer.ID)
future = s.raft.RemoveServer(raftServer.ID, 0, 0)
} else {
s.logger.Printf("[INFO] consul: Attempting removal of stale raft server : %v", raftServer.ID)
future = s.raft.RemovePeer(raftServer.Address)
}
if err := future.Error(); err != nil {
return err
}
}
} else {
s.logger.Printf("[DEBUG] consul: Failed to remove dead servers: too many dead servers: %d/%d", len(failed), peers)
s.logger.Printf("[DEBUG] consul: Failed to remove dead servers: too many dead servers: %d/%d", removalCount, peers)
}

return nil
Expand Down
63 changes: 63 additions & 0 deletions consul/autopilot_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -153,6 +153,69 @@ func TestAutopilot_CleanupDeadServerPeriodic(t *testing.T) {
}
}

func TestAutopilot_CleanupStaleRaftServer(t *testing.T) {
dir1, s1 := testServerDCBootstrap(t, "dc1", true)
defer os.RemoveAll(dir1)
defer s1.Shutdown()

dir2, s2 := testServerDCBootstrap(t, "dc1", false)
defer os.RemoveAll(dir2)
defer s2.Shutdown()

dir3, s3 := testServerDCBootstrap(t, "dc1", false)
defer os.RemoveAll(dir3)
defer s3.Shutdown()

dir4, s4 := testServerDCBootstrap(t, "dc1", false)
defer os.RemoveAll(dir4)
defer s4.Shutdown()

servers := []*Server{s1, s2, s3}

// Join the servers to s1
addr := fmt.Sprintf("127.0.0.1:%d",
s1.config.SerfLANConfig.MemberlistConfig.BindPort)

for _, s := range servers[1:] {
if _, err := s.JoinLAN([]string{addr}); err != nil {
t.Fatalf("err: %v", err)
}
}

for _, s := range servers {
if err := testutil.WaitForResult(func() (bool, error) {
peers, _ := s.numPeers()
return peers == 3, nil
}); err != nil {
t.Fatal(err)
}
}

// Add s4 to peers directly
s4addr := fmt.Sprintf("127.0.0.1:%d",
s4.config.SerfLANConfig.MemberlistConfig.BindPort)
s1.raft.AddVoter(raft.ServerID(s4.config.NodeID), raft.ServerAddress(s4addr),0, 0)

// Verify we have 4 peers
peers, err := s1.numPeers()
if err != nil {
t.Fatal(err)
}
if peers != 4 {
t.Fatalf("bad: %v", peers)
}

// Wait for s4 to be removed
for _, s := range []*Server{s1, s2, s3} {
if err := testutil.WaitForResult(func() (bool, error) {
peers, _ := s.numPeers()
return peers == 3, nil
}); err != nil {
t.Fatal(err)
}
}
}

func TestAutopilot_PromoteNonVoter(t *testing.T) {
dir1, s1 := testServerWithConfig(t, func(c *Config) {
c.Datacenter = "dc1"
Expand Down

0 comments on commit 73f0e6f

Please sign in to comment.