diff --git a/agent/agent.go b/agent/agent.go index 289653cd1bf2..e28bf8b17bf6 100644 --- a/agent/agent.go +++ b/agent/agent.go @@ -992,6 +992,9 @@ func (a *Agent) consulConfig() (*consul.Config, error) { if a.config.NonVotingServer { base.NonVoter = a.config.NonVotingServer } + if a.config.NodeRenamingPolicy != "" { + base.NodeRenamingPolicy = a.config.NodeRenamingPolicy + } // These are fully specified in the agent defaults, so we can simply // copy them over. diff --git a/agent/config/builder.go b/agent/config/builder.go index 01040fb76aed..817c906e158e 100644 --- a/agent/config/builder.go +++ b/agent/config/builder.go @@ -790,6 +790,7 @@ func (b *Builder) Build() (rt RuntimeConfig, err error) { NodeID: types.NodeID(b.stringVal(c.NodeID)), NodeMeta: c.NodeMeta, NodeName: b.nodeName(c.NodeName), + NodeRenamingPolicy: b.stringValWithDefault(c.NodeRenamingPolicy, "legacy"), NonVotingServer: b.boolVal(c.NonVotingServer), PidFile: b.stringVal(c.PidFile), PrimaryDatacenter: primaryDatacenter, diff --git a/agent/config/config.go b/agent/config/config.go index e71aaa1d0bda..044f48723bad 100644 --- a/agent/config/config.go +++ b/agent/config/config.go @@ -221,6 +221,7 @@ type Config struct { NodeID *string `json:"node_id,omitempty" hcl:"node_id" mapstructure:"node_id"` NodeMeta map[string]string `json:"node_meta,omitempty" hcl:"node_meta" mapstructure:"node_meta"` NodeName *string `json:"node_name,omitempty" hcl:"node_name" mapstructure:"node_name"` + NodeRenamingPolicy *string `json:"node_renaming_policy,omitempty" hcl:"node_renaming_policy" mapstructure:"node_renaming_policy"` NonVotingServer *bool `json:"non_voting_server,omitempty" hcl:"non_voting_server" mapstructure:"non_voting_server"` Performance Performance `json:"performance,omitempty" hcl:"performance" mapstructure:"performance"` PidFile *string `json:"pid_file,omitempty" hcl:"pid_file" mapstructure:"pid_file"` diff --git a/agent/config/flags.go b/agent/config/flags.go index 53b255b859af..3aa0fb806971 100644 --- a/agent/config/flags.go +++ b/agent/config/flags.go @@ -84,6 +84,7 @@ func AddFlags(fs *flag.FlagSet, f *Flags) { add(&f.Config.NodeName, "node", "Name of this node. Must be unique in the cluster.") add(&f.Config.NodeID, "node-id", "A unique ID for this node across space and time. Defaults to a randomly-generated ID that persists in the data-dir.") add(&f.Config.NodeMeta, "node-meta", "An arbitrary metadata key/value pair for this node, of the format `key:value`. Can be specified multiple times.") + add(&f.Config.NodeRenamingPolicy, "node-renaming-policy", "Choose between behaviours when renaming nodes, can be any of legacy|dead|strict") add(&f.Config.NonVotingServer, "non-voting-server", "(Enterprise-only) This flag is used to make the server not participate in the Raft quorum, and have it only receive the data replication stream. This can be used to add read scalability to a cluster in cases where a high volume of reads to servers are needed.") add(&f.Config.PidFile, "pid-file", "Path to file to store agent PID.") add(&f.Config.RPCProtocol, "protocol", "Sets the protocol version. Defaults to latest.") diff --git a/agent/config/runtime.go b/agent/config/runtime.go index 4d912cb1d25f..8637e24e423e 100644 --- a/agent/config/runtime.go +++ b/agent/config/runtime.go @@ -819,6 +819,12 @@ type RuntimeConfig struct { // flag: -node-meta "key:value" -node-meta "key:value" ... NodeMeta map[string]string + // NodeRenamingPolicy exposes how renaming is handled. + // + // hcl: node_renaming_policy = (legacy|dead|strict) + // flag: -node-renaming-policy + NodeRenamingPolicy string + // NonVotingServer is whether this server will act as a non-voting member // of the cluster to help provide read scalability. (Enterprise-only) // diff --git a/agent/config/runtime_test.go b/agent/config/runtime_test.go index f318448c7e07..df0dd3d9aecb 100644 --- a/agent/config/runtime_test.go +++ b/agent/config/runtime_test.go @@ -500,6 +500,17 @@ func TestConfigFlagsAndEdgecases(t *testing.T) { rt.DataDir = dataDir }, }, + { + desc: "-node-renaming-policy", + args: []string{ + `-node-renaming-policy=dead`, + `-data-dir=` + dataDir, + }, + patch: func(rt *RuntimeConfig) { + rt.NodeRenamingPolicy = "dead" + rt.DataDir = dataDir + }, + }, { desc: "-non-voting-server", args: []string{ @@ -4196,6 +4207,7 @@ func TestFullConfig(t *testing.T) { NodeID: types.NodeID("AsUIlw99"), NodeMeta: map[string]string{"5mgGQMBk": "mJLtVMSG", "A7ynFMJB": "0Nx6RGab"}, NodeName: "otlLxGaI", + NodeRenamingPolicy: "legacy", NonVotingServer: true, PidFile: "43xN80Km", PrimaryDatacenter: "ejtmd43d", @@ -4994,6 +5006,7 @@ func TestSanitize(t *testing.T) { "NodeID": "", "NodeMeta": {}, "NodeName": "", + "NodeRenamingPolicy": "", "NonVotingServer": false, "PidFile": "", "PrimaryDatacenter": "", diff --git a/agent/connect/ca/provider_consul_test.go b/agent/connect/ca/provider_consul_test.go index 9352e698d897..cccfe0421545 100644 --- a/agent/connect/ca/provider_consul_test.go +++ b/agent/connect/ca/provider_consul_test.go @@ -46,7 +46,7 @@ func (c *consulCAMockDelegate) ApplyCARequest(req *structs.CARequest) error { } func newMockDelegate(t *testing.T, conf *structs.CAConfiguration) *consulCAMockDelegate { - s, err := state.NewStateStore(nil) + s, err := state.NewStateStore(nil, state.NodeRenamingDefault) if err != nil { t.Fatalf("err: %s", err) } diff --git a/agent/consul/config.go b/agent/consul/config.go index e3ebd8862783..eebfc6cc6633 100644 --- a/agent/consul/config.go +++ b/agent/consul/config.go @@ -336,6 +336,9 @@ type Config struct { // warning and discard the remaining updates. CoordinateUpdateMaxBatches int + // NodeRenamingPolicy allow to handle node renaming behavior + NodeRenamingPolicy string + // RPCHoldTimeout is how long an RPC can be "held" before it is errored. // This is used to paper over a loss of leadership by instead holding RPCs, // so that the caller experiences a slow response rather than an error. diff --git a/agent/consul/fsm/commands_oss_test.go b/agent/consul/fsm/commands_oss_test.go index 6778a3f7e8f8..33dd8d3142ff 100644 --- a/agent/consul/fsm/commands_oss_test.go +++ b/agent/consul/fsm/commands_oss_test.go @@ -41,7 +41,7 @@ func generateRandomCoordinate() *coordinate.Coordinate { func TestFSM_RegisterNode(t *testing.T) { t.Parallel() - fsm, err := New(nil, os.Stderr) + fsm, err := New(nil, os.Stderr, "") if err != nil { t.Fatalf("err: %v", err) } @@ -85,7 +85,7 @@ func TestFSM_RegisterNode(t *testing.T) { func TestFSM_RegisterNode_Service(t *testing.T) { t.Parallel() - fsm, err := New(nil, os.Stderr) + fsm, err := New(nil, os.Stderr, "") if err != nil { t.Fatalf("err: %v", err) } @@ -148,7 +148,7 @@ func TestFSM_RegisterNode_Service(t *testing.T) { func TestFSM_DeregisterService(t *testing.T) { t.Parallel() - fsm, err := New(nil, os.Stderr) + fsm, err := New(nil, os.Stderr, "") if err != nil { t.Fatalf("err: %v", err) } @@ -210,7 +210,7 @@ func TestFSM_DeregisterService(t *testing.T) { func TestFSM_DeregisterCheck(t *testing.T) { t.Parallel() - fsm, err := New(nil, os.Stderr) + fsm, err := New(nil, os.Stderr, "") if err != nil { t.Fatalf("err: %v", err) } @@ -272,7 +272,7 @@ func TestFSM_DeregisterCheck(t *testing.T) { func TestFSM_DeregisterNode(t *testing.T) { t.Parallel() - fsm, err := New(nil, os.Stderr) + fsm, err := New(nil, os.Stderr, "") if err != nil { t.Fatalf("err: %v", err) } @@ -349,7 +349,7 @@ func TestFSM_DeregisterNode(t *testing.T) { func TestFSM_KVSDelete(t *testing.T) { t.Parallel() - fsm, err := New(nil, os.Stderr) + fsm, err := New(nil, os.Stderr, "") if err != nil { t.Fatalf("err: %v", err) } @@ -395,7 +395,7 @@ func TestFSM_KVSDelete(t *testing.T) { func TestFSM_KVSDeleteTree(t *testing.T) { t.Parallel() - fsm, err := New(nil, os.Stderr) + fsm, err := New(nil, os.Stderr, "") if err != nil { t.Fatalf("err: %v", err) } @@ -442,7 +442,7 @@ func TestFSM_KVSDeleteTree(t *testing.T) { func TestFSM_KVSDeleteCheckAndSet(t *testing.T) { t.Parallel() - fsm, err := New(nil, os.Stderr) + fsm, err := New(nil, os.Stderr, "") if err != nil { t.Fatalf("err: %v", err) } @@ -498,7 +498,7 @@ func TestFSM_KVSDeleteCheckAndSet(t *testing.T) { func TestFSM_KVSCheckAndSet(t *testing.T) { t.Parallel() - fsm, err := New(nil, os.Stderr) + fsm, err := New(nil, os.Stderr, "") if err != nil { t.Fatalf("err: %v", err) } @@ -555,7 +555,7 @@ func TestFSM_KVSCheckAndSet(t *testing.T) { func TestFSM_KVSLock(t *testing.T) { t.Parallel() - fsm, err := New(nil, os.Stderr) + fsm, err := New(nil, os.Stderr, "") if err != nil { t.Fatalf("err: %v", err) } @@ -600,7 +600,7 @@ func TestFSM_KVSLock(t *testing.T) { func TestFSM_KVSUnlock(t *testing.T) { t.Parallel() - fsm, err := New(nil, os.Stderr) + fsm, err := New(nil, os.Stderr, "") if err != nil { t.Fatalf("err: %v", err) } @@ -663,7 +663,7 @@ func TestFSM_KVSUnlock(t *testing.T) { func TestFSM_CoordinateUpdate(t *testing.T) { t.Parallel() - fsm, err := New(nil, os.Stderr) + fsm, err := New(nil, os.Stderr, "") if err != nil { t.Fatalf("err: %v", err) } @@ -704,7 +704,7 @@ func TestFSM_CoordinateUpdate(t *testing.T) { func TestFSM_SessionCreate_Destroy(t *testing.T) { t.Parallel() - fsm, err := New(nil, os.Stderr) + fsm, err := New(nil, os.Stderr, "") if err != nil { t.Fatalf("err: %v", err) } @@ -784,7 +784,7 @@ func TestFSM_SessionCreate_Destroy(t *testing.T) { func TestFSM_ACL_CRUD(t *testing.T) { t.Parallel() - fsm, err := New(nil, os.Stderr) + fsm, err := New(nil, os.Stderr, "") if err != nil { t.Fatalf("err: %v", err) } @@ -902,7 +902,7 @@ func TestFSM_ACL_CRUD(t *testing.T) { func TestFSM_PreparedQuery_CRUD(t *testing.T) { t.Parallel() - fsm, err := New(nil, os.Stderr) + fsm, err := New(nil, os.Stderr, "") if err != nil { t.Fatalf("err: %v", err) } @@ -1000,7 +1000,7 @@ func TestFSM_PreparedQuery_CRUD(t *testing.T) { func TestFSM_TombstoneReap(t *testing.T) { t.Parallel() - fsm, err := New(nil, os.Stderr) + fsm, err := New(nil, os.Stderr, "") if err != nil { t.Fatalf("err: %v", err) } @@ -1048,7 +1048,7 @@ func TestFSM_TombstoneReap(t *testing.T) { func TestFSM_Txn(t *testing.T) { t.Parallel() - fsm, err := New(nil, os.Stderr) + fsm, err := New(nil, os.Stderr, "") if err != nil { t.Fatalf("err: %v", err) } @@ -1090,7 +1090,7 @@ func TestFSM_Txn(t *testing.T) { func TestFSM_Autopilot(t *testing.T) { t.Parallel() - fsm, err := New(nil, os.Stderr) + fsm, err := New(nil, os.Stderr, "") if err != nil { t.Fatalf("err: %v", err) } @@ -1154,7 +1154,7 @@ func TestFSM_Intention_CRUD(t *testing.T) { t.Parallel() assert := assert.New(t) - fsm, err := New(nil, os.Stderr) + fsm, err := New(nil, os.Stderr, "") assert.Nil(err) // Create a new intention. @@ -1223,7 +1223,7 @@ func TestFSM_CAConfig(t *testing.T) { t.Parallel() assert := assert.New(t) - fsm, err := New(nil, os.Stderr) + fsm, err := New(nil, os.Stderr, "") assert.Nil(err) // Set the autopilot config using a request. @@ -1290,7 +1290,7 @@ func TestFSM_CARoots(t *testing.T) { t.Parallel() assert := assert.New(t) - fsm, err := New(nil, os.Stderr) + fsm, err := New(nil, os.Stderr, "") assert.Nil(err) // Roots @@ -1322,7 +1322,7 @@ func TestFSM_CABuiltinProvider(t *testing.T) { t.Parallel() assert := assert.New(t) - fsm, err := New(nil, os.Stderr) + fsm, err := New(nil, os.Stderr, "") assert.Nil(err) // Provider state. diff --git a/agent/consul/fsm/fsm.go b/agent/consul/fsm/fsm.go index 58c126b22f2d..605d6e042166 100644 --- a/agent/consul/fsm/fsm.go +++ b/agent/consul/fsm/fsm.go @@ -59,12 +59,13 @@ type FSM struct { stateLock sync.RWMutex state *state.Store - gc *state.TombstoneGC + gc *state.TombstoneGC + nodeRenamingPolicy string } // New is used to construct a new FSM with a blank state. -func New(gc *state.TombstoneGC, logOutput io.Writer) (*FSM, error) { - stateNew, err := state.NewStateStore(gc) +func New(gc *state.TombstoneGC, logOutput io.Writer, nodeRenamingPolicy string) (*FSM, error) { + stateNew, err := state.NewStateStore(gc, nodeRenamingPolicy) if err != nil { return nil, err } @@ -136,7 +137,7 @@ func (c *FSM) Restore(old io.ReadCloser) error { defer old.Close() // Create a new state store. - stateNew, err := state.NewStateStore(c.gc) + stateNew, err := state.NewStateStore(c.gc, c.nodeRenamingPolicy) if err != nil { return err } diff --git a/agent/consul/fsm/fsm_test.go b/agent/consul/fsm/fsm_test.go index 47ff7ac9283c..1413087b9aa9 100644 --- a/agent/consul/fsm/fsm_test.go +++ b/agent/consul/fsm/fsm_test.go @@ -39,7 +39,7 @@ func makeLog(buf []byte) *raft.Log { func TestFSM_IgnoreUnknown(t *testing.T) { t.Parallel() - fsm, err := New(nil, os.Stderr) + fsm, err := New(nil, os.Stderr, "") assert.Nil(t, err) // Create a new reap request diff --git a/agent/consul/fsm/snapshot_oss_test.go b/agent/consul/fsm/snapshot_oss_test.go index 7c00ba767881..b2f44c509e64 100644 --- a/agent/consul/fsm/snapshot_oss_test.go +++ b/agent/consul/fsm/snapshot_oss_test.go @@ -22,7 +22,7 @@ func TestFSM_SnapshotRestore_OSS(t *testing.T) { t.Parallel() assert := assert.New(t) - fsm, err := New(nil, os.Stderr) + fsm, err := New(nil, os.Stderr, "") if err != nil { t.Fatalf("err: %v", err) } @@ -217,7 +217,7 @@ func TestFSM_SnapshotRestore_OSS(t *testing.T) { } // Try to restore on a new FSM - fsm2, err := New(nil, os.Stderr) + fsm2, err := New(nil, os.Stderr, "") if err != nil { t.Fatalf("err: %v", err) } @@ -418,7 +418,7 @@ func TestFSM_SnapshotRestore_OSS(t *testing.T) { func TestFSM_BadRestore_OSS(t *testing.T) { t.Parallel() // Create an FSM with some state. - fsm, err := New(nil, os.Stderr) + fsm, err := New(nil, os.Stderr, "") if err != nil { t.Fatalf("err: %v", err) } diff --git a/agent/consul/issue_test.go b/agent/consul/issue_test.go index f514642ab9ca..c88dd09626a8 100644 --- a/agent/consul/issue_test.go +++ b/agent/consul/issue_test.go @@ -23,7 +23,7 @@ func makeLog(buf []byte) *raft.Log { // Testing for GH-300 and GH-279 func TestHealthCheckRace(t *testing.T) { t.Parallel() - fsm, err := consulfsm.New(nil, os.Stderr) + fsm, err := consulfsm.New(nil, os.Stderr, "") if err != nil { t.Fatalf("err: %v", err) } diff --git a/agent/consul/server.go b/agent/consul/server.go index c3893b207250..0e1526f3fef3 100644 --- a/agent/consul/server.go +++ b/agent/consul/server.go @@ -496,7 +496,7 @@ func (s *Server) setupRaft() error { // Create the FSM. var err error - s.fsm, err = fsm.New(s.tombstoneGC, s.config.LogOutput) + s.fsm, err = fsm.New(s.tombstoneGC, s.config.LogOutput, s.config.NodeRenamingPolicy) if err != nil { return err } @@ -603,7 +603,7 @@ func (s *Server) setupRaft() error { return fmt.Errorf("recovery failed to parse peers.json: %v", err) } - tmpFsm, err := fsm.New(s.tombstoneGC, s.config.LogOutput) + tmpFsm, err := fsm.New(s.tombstoneGC, s.config.LogOutput, s.config.NodeRenamingPolicy) if err != nil { return fmt.Errorf("recovery failed to make temp FSM: %v", err) } diff --git a/agent/consul/state/catalog.go b/agent/consul/state/catalog.go index 162eed79cf66..912c549d551b 100644 --- a/agent/consul/state/catalog.go +++ b/agent/consul/state/catalog.go @@ -354,36 +354,103 @@ func (s *Store) EnsureNode(idx uint64, node *structs.Node) error { return nil } -// ensureNoNodeWithSimilarNameTxn checks that no other node has conflict in its name -// If allowClashWithoutID then, getting a conflict on another node without ID will be allowed -func (s *Store) ensureNoNodeWithSimilarNameTxn(tx *memdb.Txn, node *structs.Node, allowClashWithoutID bool) error { - // Retrieve all of the nodes +func (s *Store) findSimilarNode(tx *memdb.Txn, node *structs.Node, allowClashWithoutID bool) (*structs.Node, error) { enodes, err := tx.Get("nodes", "id") if err != nil { - return fmt.Errorf("Cannot lookup all nodes: %s", err) + return nil, fmt.Errorf("Cannot lookup all nodes: %s", err) } for nodeIt := enodes.Next(); nodeIt != nil; nodeIt = enodes.Next() { enode := nodeIt.(*structs.Node) if strings.EqualFold(node.Node, enode.Node) && node.ID != enode.ID { if !(enode.ID == "" && allowClashWithoutID) { - return fmt.Errorf("Node name %s is reserved by node %s with name %s", node.Node, enode.ID, enode.Node) + return enode, nil } } } + return nil, nil +} + +// ensureNoNodeWithSimilarNameTxn checks that no other node has conflict in its name +// If allowClashWithoutID then, getting a conflict on another node without ID will be allowed +func (s *Store) ensureNoNodeWithSimilarNameTxn(tx *memdb.Txn, node *structs.Node, allowClashWithoutID bool) error { + enode, err := s.findSimilarNode(tx, node, allowClashWithoutID) + if err != nil { + return err + } + if enode != nil { + return fmt.Errorf("Node name %s is reserved by node %s with name %s", node.Node, enode.ID, enode.Node) + } return nil } -// ensureNodeTxn is the inner function called to actually create a node -// registration or modify an existing one in the state store. It allows -// passing in a memdb transaction so it may be part of a larger txn. -func (s *Store) ensureNodeTxn(tx *memdb.Txn, idx uint64, node *structs.Node) error { +func (s *Store) findExistingNodeRenameDeadNodes(tx *memdb.Txn, idx uint64, node *structs.Node) (*structs.Node, error) { + if node.ID == "" { + return nil, fmt.Errorf("Empty Node IDs is not supported for : %s", node.Node) + } + existing, err := getNodeIDTxn(tx, node.ID) + if err != nil { + return nil, fmt.Errorf("node lookup failed: %s", err) + } + if existing != nil { + if existing.Node == node.Node { + return existing, nil + } + } + // We are either adding a new node OR renaming a node + // Lets first get all nodes and check whether name do match, we do not allow clash on nodes without ID + dupNode, err := s.findSimilarNode(tx, node, false) + if err != nil { + return nil, err + } + if dupNode != nil { + // There is a dup node, lets check if the node is healthy + dupNodeCheck, err := tx.First("checks", "id", dupNode.Node, string("serfHealth")) + if err != nil { + return nil, fmt.Errorf("Cannot get status of node %s due to: %s", dupNode.Node, err) + } + if dupNodeCheck == nil { + return nil, fmt.Errorf("Cannot validate whether node %s is healthy, cannot rename node", dupNode.Node) + } + existingDupNodeSerf := dupNodeCheck.(*structs.HealthCheck) + if existingDupNodeSerf.Status != "critical" { + // This is ok, we allow to take the identity of that node + return dupNode, fmt.Errorf("Cannot rename since node %s serfHealth is '%s'", dupNode.Node, existingDupNodeSerf.Status) + } + existing = dupNode + } + return existing, nil +} + +func (s *Store) findExistingNodeStrict(tx *memdb.Txn, idx uint64, node *structs.Node) (*structs.Node, error) { + if node.ID == "" { + return nil, fmt.Errorf("Empty Node IDs is not supported for : %s", node.Node) + } + existing, err := getNodeIDTxn(tx, node.ID) + if err != nil { + return nil, fmt.Errorf("node lookup failed: %s", err) + } + if existing != nil { + // If name is not renamed, we do not need a full checks on all nodes + if node.Node == existing.Node { + return existing, nil + } + } + // We ensure no-one has a similar name at all (case insensitive comparison on all nodes) + dupNameError := s.ensureNoNodeWithSimilarNameTxn(tx, node, false) + if dupNameError != nil { + return existing, fmt.Errorf("Error while renaming Node ID: %q: %s", node.ID, dupNameError) + } + return existing, nil +} + +func (s *Store) findExistingNodeLegacy(tx *memdb.Txn, idx uint64, node *structs.Node) (*structs.Node, error) { // See if there's an existing node with this UUID, and make sure the // name is the same. var n *structs.Node if node.ID != "" { existing, err := getNodeIDTxn(tx, node.ID) if err != nil { - return fmt.Errorf("node lookup failed: %s", err) + return nil, fmt.Errorf("node lookup failed: %s", err) } if existing != nil { n = existing @@ -391,22 +458,20 @@ func (s *Store) ensureNodeTxn(tx *memdb.Txn, idx uint64, node *structs.Node) err // Lets first get all nodes and check whether name do match, we do not allow clash on nodes without ID dupNameError := s.ensureNoNodeWithSimilarNameTxn(tx, node, false) if dupNameError != nil { - return fmt.Errorf("Error while renaming Node ID: %q: %s", node.ID, dupNameError) - } - // We are actually renaming a node, remove its reference first - err := s.deleteNodeTxn(tx, idx, n.Node) - if err != nil { - return fmt.Errorf("Error while renaming Node ID: %q from %s to %s", - node.ID, n.Node, node.Node) + return existing, fmt.Errorf("Error while renaming Node ID: %q: %s", node.ID, dupNameError) } } } else { // We allow to "steal" another node name that would have no ID // It basically means that we allow upgrading a node without ID and add the ID - dupNameError := s.ensureNoNodeWithSimilarNameTxn(tx, node, true) - if dupNameError != nil { - return fmt.Errorf("Error while renaming Node ID: %q: %s", node.ID, dupNameError) + similarNode, err := s.findSimilarNode(tx, node, true) + if err != nil { + return nil, fmt.Errorf("Error while renaming Node ID: %q: %s", node.ID, err) + } + if similarNode != nil && similarNode.ID != "" { + return nil, fmt.Errorf("Cannot rename node[%s](%s) to [%s](%s) because existing node has an ID", similarNode.ID, similarNode.Node, node.ID, node.Node) } + n = similarNode } } // TODO: else Node.ID == "" should be forbidden in future Consul releases @@ -416,7 +481,7 @@ func (s *Store) ensureNodeTxn(tx *memdb.Txn, idx uint64, node *structs.Node) err if n == nil { existing, err := tx.First("nodes", "id", node.Node) if err != nil { - return fmt.Errorf("node name lookup failed: %s", err) + return nil, fmt.Errorf("node name lookup failed: %s", err) } if existing != nil { @@ -426,6 +491,32 @@ func (s *Store) ensureNodeTxn(tx *memdb.Txn, idx uint64, node *structs.Node) err // for case insensitive matches, which may lead to DB corruption // See https://github.com/hashicorp/consul/pull/3983 for context } + return n, nil +} + +// findExistingNode Find a Node with a similar name, returned value might be the node itself or another node +func (s *Store) findExistingNode(tx *memdb.Txn, idx uint64, node *structs.Node) (*structs.Node, error) { + if s.nodeRenameSetting == NodeRenamingLegacy || s.nodeRenameSetting == "" { + return s.findExistingNodeLegacy(tx, idx, node) + } else if s.nodeRenameSetting == NodeRenamingRenameDeadNodes { + return s.findExistingNodeRenameDeadNodes(tx, idx, node) + } else if s.nodeRenameSetting == NodeRenamingStrict { + return s.findExistingNodeStrict(tx, idx, node) + } else { + return nil, fmt.Errorf("Unknown Rename Policy %s", s.nodeRenameSetting) + } +} + +// ensureNodeTxn is the inner function called to actually create a node +// registration or modify an existing one in the state store. It allows +// passing in a memdb transaction so it may be part of a larger txn. +func (s *Store) ensureNodeTxn(tx *memdb.Txn, idx uint64, node *structs.Node) error { + // See if there's an existing node with this UUID, and make sure the + // name is the same. + n, err := s.findExistingNode(tx, idx, node) + if err != nil { + return err + } // Get the indexes. if n != nil { @@ -436,6 +527,12 @@ func (s *Store) ensureNodeTxn(tx *memdb.Txn, idx uint64, node *structs.Node) err return nil } node.ModifyIndex = idx + if n.Node != node.Node { + // We are renaming with the same ID + if err := s.deleteNodeTxn(tx, idx, n.Node); err != nil { + return err + } + } } else { node.CreateIndex = idx node.ModifyIndex = idx diff --git a/agent/consul/state/catalog_test.go b/agent/consul/state/catalog_test.go index 3b1314f960da..846a5749b9f2 100644 --- a/agent/consul/state/catalog_test.go +++ b/agent/consul/state/catalog_test.go @@ -690,6 +690,73 @@ func TestNodeRenamingNodes(t *testing.T) { } } +func TestStateStore_EnsureNodeRenamingWorks(t *testing.T) { + + testFullRegistration := func(policy string, allowStealWithoutID, allowStealDeadNode bool) { + s := testStateStoreWithNodeRenamePolicy(t, policy) + + registerNode := func(idx uint64, nodeName, nodeID, status, ip string) error { + + // Create a node registration request + in := &structs.Node{ + ID: types.NodeID(nodeID), + Node: nodeName, + Address: ip, + } + + // Ensure the node is registered in the db + if err := s.EnsureNode(idx, in); err != nil { + return err + } + + healthIn := &structs.HealthCheck{ + CheckID: "serfHealth", + Name: "serfHealth", + Node: nodeName, + Status: status, + ServiceID: "", + } + return s.EnsureCheck(idx, healthIn) + } + if err := registerNode(1, "mysupernode", "3fc7621b-614a-4cdf-b9bd-0c5df7811385", "passing", "192.168.0.1"); err != nil { + t.Fatalf("err: %s", err) + } + if err := registerNode(2, "nodeHealthy1", "54974264-fd85-4afe-8ff4-e83701b6c664", "passing", "192.168.0.1"); err != nil { + t.Fatalf("err: %s", err) + } + if err := registerNode(3, "nodeSerfDown", "54974264-fd85-4afe-8ff4-e83701b6c665", "critical", "192.168.0.2"); err != nil { + t.Fatalf("err: %s", err) + } + // Try stealing node with similar name and an ID, should all fail + if err := registerNode(4, "MySuperNode", "29676151-51fb-4e51-817f-904a092012ac", "passing", "192.168.0.66"); err == nil { + t.Fatalf("Stealing a node with similar name should always fail, policy=%s", policy) + } + // Should always work + if err := registerNode(4, "nodeHealthyRenamed", "54974264-fd85-4afe-8ff4-e83701b6c664", "passing", "192.168.0.1"); err != nil { + t.Fatalf("err: %s", err) + } + // Revert the change, should always work as well + if err := registerNode(5, "nodeHealthy1", "54974264-fd85-4afe-8ff4-e83701b6c664", "passing", "192.168.0.1"); err != nil { + t.Fatalf("err: %s", err) + } + // Steal without ID + err := registerNode(6, "nodeHealthy1", "", "passing", "192.168.0.1") + if (err != nil) == allowStealWithoutID { + t.Fatalf("allowStealWithoutID:=%v, policy=%s: %s", allowStealWithoutID, policy, err) + } + // Rename dead node + err = registerNode(7, "nodeSerfDown", "b56fdb78-da30-4e82-8068-a87879d328bd", "passing", "192.168.0.7") + if (err != nil) == allowStealDeadNode { + t.Fatalf("nodeSeftDown:=%v, policy=%s: %s", allowStealDeadNode, policy, err) + } + } + testFullRegistration("legacy", true, false) + // "legacy" == "" (default value) + testFullRegistration("", true, false) + testFullRegistration("dead", false, true) + testFullRegistration("strict", false, false) +} + func TestStateStore_EnsureNode(t *testing.T) { s := testStateStore(t) @@ -1015,7 +1082,7 @@ func TestStateStore_GetNodes(t *testing.T) { } func BenchmarkGetNodes(b *testing.B) { - s, err := NewStateStore(nil) + s, err := NewStateStore(nil, NodeRenamingDefault) if err != nil { b.Fatalf("err: %s", err) } @@ -3051,7 +3118,7 @@ func TestStateStore_CheckConnectServiceNodes(t *testing.T) { } func BenchmarkCheckServiceNodes(b *testing.B) { - s, err := NewStateStore(nil) + s, err := NewStateStore(nil, NodeRenamingDefault) if err != nil { b.Fatalf("err: %s", err) } diff --git a/agent/consul/state/kvs_test.go b/agent/consul/state/kvs_test.go index 1ff2d6394f97..caa16026813e 100644 --- a/agent/consul/state/kvs_test.go +++ b/agent/consul/state/kvs_test.go @@ -21,7 +21,7 @@ func TestStateStore_GC(t *testing.T) { // Enable it and attach it to the state store. gc.SetEnabled(true) - s, err := NewStateStore(gc) + s, err := NewStateStore(gc, NodeRenamingDefault) if err != nil { t.Fatalf("err: %s", err) } diff --git a/agent/consul/state/state_store.go b/agent/consul/state/state_store.go index 78a397e24882..e932c96c1700 100644 --- a/agent/consul/state/state_store.go +++ b/agent/consul/state/state_store.go @@ -60,6 +60,12 @@ const ( // Given the current size of aFew == 32 in memdb's watch_few.go, this // will allow for up to ~64 goroutines per blocking query. watchLimit = 2048 + + // LegacyMode for Node Renaming - allow empty IDs + NodeRenamingLegacy = "legacy" + NodeRenamingRenameDeadNodes = "dead" + NodeRenamingStrict = "strict" + NodeRenamingDefault = NodeRenamingLegacy ) // Store is where we store all of Consul's state, including @@ -79,6 +85,9 @@ type Store struct { // lockDelay holds expiration times for locks associated with keys. lockDelay *Delay + + // Max number of watches, per store, defaults to 2048 + nodeRenameSetting string } // Snapshot is used to provide a point-in-time snapshot. It @@ -113,7 +122,7 @@ type sessionCheck struct { } // NewStateStore creates a new in-memory state storage layer. -func NewStateStore(gc *TombstoneGC) (*Store, error) { +func NewStateStore(gc *TombstoneGC, nodeRenameSetting string) (*Store, error) { // Create the in-memory DB. schema := stateStoreSchema() db, err := memdb.NewMemDB(schema) @@ -121,13 +130,17 @@ func NewStateStore(gc *TombstoneGC) (*Store, error) { return nil, fmt.Errorf("Failed setting up state store: %s", err) } + if nodeRenameSetting == "" { + nodeRenameSetting = NodeRenamingDefault + } // Create and return the state store. s := &Store{ - schema: schema, - db: db, - abandonCh: make(chan struct{}), - kvsGraveyard: NewGraveyard(gc), - lockDelay: NewDelay(), + schema: schema, + db: db, + abandonCh: make(chan struct{}), + kvsGraveyard: NewGraveyard(gc), + lockDelay: NewDelay(), + nodeRenameSetting: nodeRenameSetting, } return s, nil } diff --git a/agent/consul/state/state_store_test.go b/agent/consul/state/state_store_test.go index 368f460bc03e..f53b7a2c16ca 100644 --- a/agent/consul/state/state_store_test.go +++ b/agent/consul/state/state_store_test.go @@ -26,7 +26,11 @@ func testUUID() string { } func testStateStore(t *testing.T) *Store { - s, err := NewStateStore(nil) + return testStateStoreWithNodeRenamePolicy(t, NodeRenamingDefault) +} + +func testStateStoreWithNodeRenamePolicy(t *testing.T, nodeRenamingPolicy string) *Store { + s, err := NewStateStore(nil, nodeRenamingPolicy) if err != nil { t.Fatalf("err: %s", err) }