Skip to content

Commit

Permalink
Merge pull request #2319 from hashicorp/f-bootstrap-abort
Browse files Browse the repository at this point in the history
Adds check that aborts bootstrap mode if there's an existing cluster.
  • Loading branch information
slackpad authored Sep 1, 2016
2 parents 0c798e4 + 40e1553 commit 1488af4
Show file tree
Hide file tree
Showing 2 changed files with 75 additions and 19 deletions.
54 changes: 41 additions & 13 deletions consul/serf.go
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
package consul

import (
"net"
"strings"

"github.com/hashicorp/consul/consul/agent"
Expand Down Expand Up @@ -192,7 +191,7 @@ func (s *Server) wanNodeJoin(me serf.MemberEvent) {
}
}

// maybeBootsrap is used to handle bootstrapping when a new consul server joins
// maybeBootstrap is used to handle bootstrapping when a new consul server joins.
func (s *Server) maybeBootstrap() {
// Bootstrap can only be done if there are no committed logs, remove our
// expectations of bootstrapping. This is slightly cheaper than the full
Expand All @@ -203,13 +202,14 @@ func (s *Server) maybeBootstrap() {
return
}
if index != 0 {
s.logger.Printf("[INFO] consul: Raft data found, disabling bootstrap mode")
s.config.BootstrapExpect = 0
return
}

// Scan for all the known servers.
members := s.serfLAN.Members()
addrs := make([]string, 0)
var servers []agent.Server
for _, member := range members {
valid, p := agent.IsConsulServer(member)
if !valid {
Expand All @@ -227,34 +227,62 @@ func (s *Server) maybeBootstrap() {
s.logger.Printf("[ERR] consul: Member %v has bootstrap mode. Expect disabled.", member)
return
}
addr := &net.TCPAddr{IP: member.Addr, Port: p.Port}
addrs = append(addrs, addr.String())
servers = append(servers, *p)
}

// Skip if we haven't met the minimum expect count.
if len(addrs) < s.config.BootstrapExpect {
if len(servers) < s.config.BootstrapExpect {
return
}

// Query each of the servers and make sure they report no Raft peers.
for _, server := range servers {
var peers []string
if err := s.connPool.RPC(s.config.Datacenter, server.Addr, server.Version,
"Status.Peers", &struct{}{}, &peers); err != nil {
s.logger.Printf("[ERR] consul: Failed to confirm peer status for %s: %v", server.Name, err)
return
}

// Found a node with some Raft peers, stop bootstrap since there's
// evidence of an existing cluster. We should get folded in by the
// existing servers if that's the case, so it's cleaner to sit as a
// candidate with no peers so we don't cause spurious elections.
// It's OK that this is racy: even for an initial bootstrap, things
// will work as long as one peer runs bootstrap, and it's fine if
// multiple peers bootstrap in the same way. We
// just don't want a server added much later to do a live bootstrap
// and interfere with the cluster. This isn't required for Raft's
// correctness because no server in the existing cluster will vote
// for this server, but it makes things much more stable.
if len(peers) > 0 {
s.logger.Printf("[INFO] consul: Existing Raft peers reported by %s, disabling bootstrap mode", server.Name)
s.config.BootstrapExpect = 0
return
}
}

// Attempt a live bootstrap!
var configuration raft.Configuration
for _, addr := range addrs {
// TODO (slackpad) - This will need to be updated once we support
// node IDs.
server := raft.Server{
var addrs []string
for _, server := range servers {
addr := server.Addr.String()
addrs = append(addrs, addr)
peer := raft.Server{
ID: raft.ServerID(addr),
Address: raft.ServerAddress(addr),
}
configuration.Servers = append(configuration.Servers, server)
configuration.Servers = append(configuration.Servers, peer)
}
s.logger.Printf("[INFO] consul: Found expected number of peers (%s), attempting to bootstrap cluster...",
s.logger.Printf("[INFO] consul: Found expected number of peers, attempting bootstrap: %s",
strings.Join(addrs, ","))
future := s.raft.BootstrapCluster(configuration)
if err := future.Error(); err != nil {
s.logger.Printf("[ERR] consul: Failed to bootstrap cluster: %v", err)
}

// Bootstrapping complete, don't enter this again.
// Bootstrapping complete, or failed for some reason, don't enter this
// again.
s.config.BootstrapExpect = 0
}

Expand Down
40 changes: 34 additions & 6 deletions consul/server_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -500,7 +500,9 @@ func TestServer_JoinLAN_TLS(t *testing.T) {
}

func TestServer_Expect(t *testing.T) {
// all test servers should be in expect=3 mode
// All test servers should be in expect=3 mode, except for the 3rd one.
// As currently implemented, a server with expect=0 can still cause a
// bootstrap to occur from the other servers.
dir1, s1 := testServerDCExpect(t, "dc1", 3)
defer os.RemoveAll(dir1)
defer s1.Shutdown()
Expand All @@ -513,7 +515,11 @@ func TestServer_Expect(t *testing.T) {
defer os.RemoveAll(dir3)
defer s3.Shutdown()

// Try to join
dir4, s4 := testServerDCExpect(t, "dc1", 3)
defer os.RemoveAll(dir4)
defer s4.Shutdown()

// Join the first two servers.
addr := fmt.Sprintf("127.0.0.1:%d",
s1.config.SerfLANConfig.MemberlistConfig.BindPort)
if _, err := s2.JoinLAN([]string{addr}); err != nil {
Expand All @@ -523,7 +529,7 @@ func TestServer_Expect(t *testing.T) {
var p1 int
var p2 int

// should have no peers yet
// Should have no peers yet since the bootstrap didn't occur.
testutil.WaitForResult(func() (bool, error) {
p1, _ = s1.numPeers()
return p1 == 0, errors.New(fmt.Sprintf("%d", p1))
Expand All @@ -538,14 +544,14 @@ func TestServer_Expect(t *testing.T) {
t.Fatalf("should have 0 peers: %v", err)
})

// join the third node
// Join the third node.
if _, err := s3.JoinLAN([]string{addr}); err != nil {
t.Fatalf("err: %v", err)
}

var p3 int

// should now have all three peers
// Now we have three servers so we should bootstrap.
testutil.WaitForResult(func() (bool, error) {
p1, _ = s1.numPeers()
return p1 == 3, errors.New(fmt.Sprintf("%d", p1))
Expand All @@ -567,8 +573,30 @@ func TestServer_Expect(t *testing.T) {
t.Fatalf("should have 3 peers: %v", err)
})

// check if there is one leader now
// Make sure a leader is elected, grab the current term and then add in
// the fourth server.
testutil.WaitForLeader(t, s1.RPC, "dc1")
termBefore := s1.raft.Stats()["last_log_term"]
if _, err := s4.JoinLAN([]string{addr}); err != nil {
t.Fatalf("err: %v", err)
}

// Wait for the new server to see itself added to the cluster.
var p4 int
testutil.WaitForResult(func() (bool, error) {
p4, _ = s4.numPeers()
return p4 == 4, errors.New(fmt.Sprintf("%d", p4))
}, func(err error) {
t.Fatalf("should have 4 peers: %v", err)
})

// Make sure there's still a leader and that the term didn't change,
// so we know an election didn't occur.
testutil.WaitForLeader(t, s1.RPC, "dc1")
termAfter := s1.raft.Stats()["last_log_term"]
if termAfter != termBefore {
t.Fatalf("looks like an election took place")
}
}

func TestServer_BadExpect(t *testing.T) {
Expand Down

0 comments on commit 1488af4

Please sign in to comment.