Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Makes reap time configurable for LAN and WAN. #1935

Merged
merged 3 commits into from
Apr 20, 2016
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions command/agent/agent.go
Original file line number Diff line number Diff line change
Expand Up @@ -299,6 +299,12 @@ func (a *Agent) consulConfig() *consul.Config {
base.SerfWANConfig.MemberlistConfig.AdvertiseAddr = a.config.AdvertiseAddrs.SerfWan.IP.String()
base.SerfWANConfig.MemberlistConfig.AdvertisePort = a.config.AdvertiseAddrs.SerfWan.Port
}
if a.config.ReconnectTimeoutLan != 0 {
base.SerfLANConfig.ReconnectTimeout = a.config.ReconnectTimeoutLan
}
if a.config.ReconnectTimeoutWan != 0 {
base.SerfWANConfig.ReconnectTimeout = a.config.ReconnectTimeoutWan
}
if a.config.AdvertiseAddrs.RPC != nil {
base.RPCAdvertise = a.config.AdvertiseAddrs.RPC
}
Expand Down
37 changes: 37 additions & 0 deletions command/agent/agent_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -176,6 +176,43 @@ func TestAgent_CheckAdvertiseAddrsSettings(t *testing.T) {
}
}

// TestAgent_ReconnectConfigSettings checks the Serf reconnect timeouts that
// consulConfig reports for the LAN and WAN pools. With the config returned by
// nextConfig left untouched, both are expected to be 72 hours; after
// ReconnectTimeoutLan and ReconnectTimeoutWan are set on that config, a freshly
// made agent is expected to report those values instead.
func TestAgent_ReconnectConfigSettings(t *testing.T) {
	conf := nextConfig()

	// verify makes an agent from the current state of conf, compares both
	// reconnect timeouts against the expected values, and tears the agent
	// down (shutdown first, then data dir removal) before returning.
	verify := func(wantLAN, wantWAN time.Duration) {
		dir, agent := makeAgent(t, conf)
		defer os.RemoveAll(dir)
		defer agent.Shutdown()

		if got := agent.consulConfig().SerfLANConfig.ReconnectTimeout; got != wantLAN {
			t.Fatalf("bad: %s", got.String())
		}

		if got := agent.consulConfig().SerfWANConfig.ReconnectTimeout; got != wantWAN {
			t.Fatalf("bad: %s", got.String())
		}
	}

	// Unmodified config: both pools should report three days.
	verify(3*24*time.Hour, 3*24*time.Hour)

	// Explicit, distinct values so a LAN/WAN mix-up would be caught.
	conf.ReconnectTimeoutLan = 24 * time.Hour
	conf.ReconnectTimeoutWan = 36 * time.Hour
	verify(24*time.Hour, 36*time.Hour)
}

func TestAgent_AddService(t *testing.T) {
dir, agent := makeAgent(t, nextConfig())
defer os.RemoveAll(dir)
Expand Down
38 changes: 38 additions & 0 deletions command/agent/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -312,6 +312,14 @@ type Config struct {
RetryIntervalWan time.Duration `mapstructure:"-" json:"-"`
RetryIntervalWanRaw string `mapstructure:"retry_interval_wan"`

// ReconnectTimeout* specify the amount of time to wait to reconnect with
// another agent before deciding it's permanently gone. This can be used to
// control the time it takes to reap failed nodes from the cluster.
ReconnectTimeoutLan time.Duration `mapstructure:"-"`
ReconnectTimeoutLanRaw string `mapstructure:"reconnect_timeout"`
ReconnectTimeoutWan time.Duration `mapstructure:"-"`
ReconnectTimeoutWanRaw string `mapstructure:"reconnect_timeout_wan"`

// EnableUi enables the statically-compiled assets for the Consul web UI and
// serves them at the default /ui/ endpoint automatically.
EnableUi bool `mapstructure:"ui"`
Expand Down Expand Up @@ -778,6 +786,28 @@ func DecodeConfig(r io.Reader) (*Config, error) {
result.RetryIntervalWan = dur
}

const reconnectTimeoutMin = 8 * time.Hour
if raw := result.ReconnectTimeoutLanRaw; raw != "" {
dur, err := time.ParseDuration(raw)
if err != nil {
return nil, fmt.Errorf("ReconnectTimeoutLan invalid: %v", err)
}
if dur < reconnectTimeoutMin {
return nil, fmt.Errorf("ReconnectTimeoutLan must be >= %s", reconnectTimeoutMin.String())
}
result.ReconnectTimeoutLan = dur
}
if raw := result.ReconnectTimeoutWanRaw; raw != "" {
dur, err := time.ParseDuration(raw)
if err != nil {
return nil, fmt.Errorf("ReconnectTimeoutWan invalid: %v", err)
}
if dur < reconnectTimeoutMin {
return nil, fmt.Errorf("ReconnectTimeoutWan must be >= %s", reconnectTimeoutMin.String())
}
result.ReconnectTimeoutWan = dur
}

// Merge the single recursor
if result.DNSRecursor != "" {
result.DNSRecursors = append(result.DNSRecursors, result.DNSRecursor)
Expand Down Expand Up @@ -1131,6 +1161,14 @@ func MergeConfig(a, b *Config) *Config {
if b.RetryIntervalWan != 0 {
result.RetryIntervalWan = b.RetryIntervalWan
}
if b.ReconnectTimeoutLan != 0 {
result.ReconnectTimeoutLan = b.ReconnectTimeoutLan
result.ReconnectTimeoutLanRaw = b.ReconnectTimeoutLanRaw
}
if b.ReconnectTimeoutWan != 0 {
result.ReconnectTimeoutWan = b.ReconnectTimeoutWan
result.ReconnectTimeoutWanRaw = b.ReconnectTimeoutWanRaw
}
if b.DNSConfig.NodeTTL != 0 {
result.DNSConfig.NodeTTL = b.DNSConfig.NodeTTL
}
Expand Down
27 changes: 27 additions & 0 deletions command/agent/config_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -462,6 +462,29 @@ func TestDecodeConfig(t *testing.T) {
t.Fatalf("bad: %#v", config)
}

// Reconnect timeout LAN and WAN
input = `{"reconnect_timeout": "8h", "reconnect_timeout_wan": "10h"}`
config, err = DecodeConfig(bytes.NewReader([]byte(input)))
if err != nil {
t.Fatalf("err: %s", err)
}
if config.ReconnectTimeoutLanRaw != "8h" ||
config.ReconnectTimeoutLan.String() != "8h0m0s" ||
config.ReconnectTimeoutWanRaw != "10h" ||
config.ReconnectTimeoutWan.String() != "10h0m0s" {
t.Fatalf("bad: %#v", config)
}
input = `{"reconnect_timeout": "7h"}`
config, err = DecodeConfig(bytes.NewReader([]byte(input)))
if err == nil {
t.Fatalf("decode should have failed")
}
input = `{"reconnect_timeout_wan": "7h"}`
config, err = DecodeConfig(bytes.NewReader([]byte(input)))
if err == nil {
t.Fatalf("decode should have failed")
}

// Static UI server
input = `{"ui": true}`
config, err = DecodeConfig(bytes.NewReader([]byte(input)))
Expand Down Expand Up @@ -1351,6 +1374,10 @@ func TestMergeConfig(t *testing.T) {
RetryJoinWan: []string{"1.1.1.1"},
RetryIntervalWanRaw: "10s",
RetryIntervalWan: 10 * time.Second,
ReconnectTimeoutLanRaw: "24h",
ReconnectTimeoutLan: 24 * time.Hour,
ReconnectTimeoutWanRaw: "36h",
ReconnectTimeoutWan: 36 * time.Hour,
CheckUpdateInterval: 8 * time.Minute,
CheckUpdateIntervalRaw: "8m",
ACLToken: "1234",
Expand Down
2 changes: 1 addition & 1 deletion website/source/docs/agent/basics.html.markdown
Original file line number Diff line number Diff line change
Expand Up @@ -137,5 +137,5 @@ a server, replication to it will stop.

To prevent an accumulation of dead nodes (nodes in either _failed_ or _left_ states),
Consul will automatically remove dead nodes out of the catalog. This process is
called _reaping_. This is currently done on a non-configurable interval of 72 hours.
called _reaping_. This is currently done on a configurable interval that defaults to 72 hours.
Reaping is similar to leaving, causing all associated services to be deregistered.
13 changes: 13 additions & 0 deletions website/source/docs/agent/options.html.markdown
Original file line number Diff line number Diff line change
Expand Up @@ -580,6 +580,19 @@ Consul will not enable TLS for the HTTP API unless the `https` port has been ass
automatically reap child processes if it detects it is running as PID 1. If this is set to true or false, then
it controls reaping regardless of Consul's PID (forces reaping on or off, respectively).

* <a name="reconnect_timeout"></a><a href="#reconnect_timeout">`reconnect_timeout`</a> This controls
how long it takes for a failed node to be completely removed from the cluster. This defaults to
72 hours and it is recommended that this is set to at least double the maximum expected recoverable
outage time for a node or network partition. WARNING: Setting this time too low could cause Consul
servers to be removed from quorum during an extended node failure or partition, which could complicate
recovery of the cluster. The value is a time with a unit suffix, which can be "s", "m", "h" for seconds,
minutes, or hours. The value must be >= 8 hours.

* <a name="reconnect_timeout_wan"></a><a href="#reconnect_timeout_wan">`reconnect_timeout_wan`</a> This
is the WAN equivalent of the <a href="#reconnect_timeout">`reconnect_timeout`</a> parameter, which
controls how long it takes for a failed server to be completely removed from the WAN pool. This also
defaults to 72 hours, and must be >= 8 hours.

* <a name="recursor"></a><a href="#recursor">`recursor`</a> Provides a single recursor address.
This has been deprecated, and the value is appended to the [`recursors`](#recursors) list for
backwards compatibility.
Expand Down
2 changes: 1 addition & 1 deletion website/source/docs/faq.html.markdown
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,7 @@ the current state of the catalog can lag behind until the state is reconciled.

To prevent an accumulation of dead nodes (nodes in either _failed_ or _left_ states),
Consul will automatically remove dead nodes out of the catalog. This process is
called _reaping_. This is currently done on a non-configurable interval of 72 hours.
called _reaping_. This is currently done on a configurable interval that defaults to 72 hours.
Reaping is similar to leaving, causing all associated services to be deregistered.

## Q: Does Consul support delta updates for watchers or blocking queries?
Expand Down