diff --git a/agent/agent.go b/agent/agent.go
index 603ac23c7f0d..4bd8e215f4ef 100644
--- a/agent/agent.go
+++ b/agent/agent.go
@@ -782,10 +782,12 @@ func (a *Agent) consulConfig() (*consul.Config, error) {
base.SerfLANConfig.MemberlistConfig.AdvertisePort = a.config.SerfAdvertiseAddrLAN.Port
base.SerfLANConfig.MemberlistConfig.GossipVerifyIncoming = a.config.EncryptVerifyIncoming
base.SerfLANConfig.MemberlistConfig.GossipVerifyOutgoing = a.config.EncryptVerifyOutgoing
- base.SerfLANConfig.MemberlistConfig.GossipInterval = a.config.ConsulSerfLANGossipInterval
- base.SerfLANConfig.MemberlistConfig.ProbeInterval = a.config.ConsulSerfLANProbeInterval
- base.SerfLANConfig.MemberlistConfig.ProbeTimeout = a.config.ConsulSerfLANProbeTimeout
- base.SerfLANConfig.MemberlistConfig.SuspicionMult = a.config.ConsulSerfLANSuspicionMult
+ base.SerfLANConfig.MemberlistConfig.GossipInterval = a.config.GossipLANGossipInterval
+ base.SerfLANConfig.MemberlistConfig.GossipNodes = a.config.GossipLANGossipNodes
+ base.SerfLANConfig.MemberlistConfig.ProbeInterval = a.config.GossipLANProbeInterval
+ base.SerfLANConfig.MemberlistConfig.ProbeTimeout = a.config.GossipLANProbeTimeout
+ base.SerfLANConfig.MemberlistConfig.SuspicionMult = a.config.GossipLANSuspicionMult
+ base.SerfLANConfig.MemberlistConfig.RetransmitMult = a.config.GossipLANRetransmitMult
if a.config.SerfBindAddrWAN != nil {
base.SerfWANConfig.MemberlistConfig.BindAddr = a.config.SerfBindAddrWAN.IP.String()
@@ -794,10 +796,12 @@ func (a *Agent) consulConfig() (*consul.Config, error) {
base.SerfWANConfig.MemberlistConfig.AdvertisePort = a.config.SerfAdvertiseAddrWAN.Port
base.SerfWANConfig.MemberlistConfig.GossipVerifyIncoming = a.config.EncryptVerifyIncoming
base.SerfWANConfig.MemberlistConfig.GossipVerifyOutgoing = a.config.EncryptVerifyOutgoing
- base.SerfWANConfig.MemberlistConfig.GossipInterval = a.config.ConsulSerfWANGossipInterval
- base.SerfWANConfig.MemberlistConfig.ProbeInterval = a.config.ConsulSerfWANProbeInterval
- base.SerfWANConfig.MemberlistConfig.ProbeTimeout = a.config.ConsulSerfWANProbeTimeout
- base.SerfWANConfig.MemberlistConfig.SuspicionMult = a.config.ConsulSerfWANSuspicionMult
+ base.SerfWANConfig.MemberlistConfig.GossipInterval = a.config.GossipWANGossipInterval
+ base.SerfWANConfig.MemberlistConfig.GossipNodes = a.config.GossipWANGossipNodes
+ base.SerfWANConfig.MemberlistConfig.ProbeInterval = a.config.GossipWANProbeInterval
+ base.SerfWANConfig.MemberlistConfig.ProbeTimeout = a.config.GossipWANProbeTimeout
+ base.SerfWANConfig.MemberlistConfig.SuspicionMult = a.config.GossipWANSuspicionMult
+ base.SerfWANConfig.MemberlistConfig.RetransmitMult = a.config.GossipWANRetransmitMult
} else {
// Disable serf WAN federation
base.SerfWANConfig = nil
diff --git a/agent/config/builder.go b/agent/config/builder.go
index d76196795e7b..25cb9b9c596f 100644
--- a/agent/config/builder.go
+++ b/agent/config/builder.go
@@ -579,16 +579,22 @@ func (b *Builder) Build() (rt RuntimeConfig, err error) {
ConsulRaftElectionTimeout: consulRaftElectionTimeout,
ConsulRaftHeartbeatTimeout: consulRaftHeartbeatTimeout,
ConsulRaftLeaderLeaseTimeout: consulRaftLeaderLeaseTimeout,
- ConsulSerfLANGossipInterval: b.durationVal("consul.serf_lan.gossip_interval", c.Consul.SerfLAN.Memberlist.GossipInterval),
- ConsulSerfLANProbeInterval: b.durationVal("consul.serf_lan.probe_interval", c.Consul.SerfLAN.Memberlist.ProbeInterval),
- ConsulSerfLANProbeTimeout: b.durationVal("consul.serf_lan.probe_timeout", c.Consul.SerfLAN.Memberlist.ProbeTimeout),
- ConsulSerfLANSuspicionMult: b.intVal(c.Consul.SerfLAN.Memberlist.SuspicionMult),
- ConsulSerfWANGossipInterval: b.durationVal("consul.serf_wan.gossip_interval", c.Consul.SerfWAN.Memberlist.GossipInterval),
- ConsulSerfWANProbeInterval: b.durationVal("consul.serf_wan.probe_interval", c.Consul.SerfWAN.Memberlist.ProbeInterval),
- ConsulSerfWANProbeTimeout: b.durationVal("consul.serf_wan.probe_timeout", c.Consul.SerfWAN.Memberlist.ProbeTimeout),
- ConsulSerfWANSuspicionMult: b.intVal(c.Consul.SerfWAN.Memberlist.SuspicionMult),
ConsulServerHealthInterval: b.durationVal("consul.server.health_interval", c.Consul.Server.HealthInterval),
+ // gossip configuration (sourced from the gossip_lan / gossip_wan config blocks)
+ GossipLANGossipInterval: b.durationVal("gossip_lan.gossip_interval", c.GossipLAN.GossipInterval),
+ GossipLANGossipNodes: b.intVal(c.GossipLAN.GossipNodes),
+ GossipLANProbeInterval: b.durationVal("gossip_lan.probe_interval", c.GossipLAN.ProbeInterval),
+ GossipLANProbeTimeout: b.durationVal("gossip_lan.probe_timeout", c.GossipLAN.ProbeTimeout),
+ GossipLANSuspicionMult: b.intVal(c.GossipLAN.SuspicionMult),
+ GossipLANRetransmitMult: b.intVal(c.GossipLAN.RetransmitMult),
+ GossipWANGossipInterval: b.durationVal("gossip_wan.gossip_interval", c.GossipWAN.GossipInterval),
+ GossipWANGossipNodes: b.intVal(c.GossipWAN.GossipNodes),
+ GossipWANProbeInterval: b.durationVal("gossip_wan.probe_interval", c.GossipWAN.ProbeInterval),
+ GossipWANProbeTimeout: b.durationVal("gossip_wan.probe_timeout", c.GossipWAN.ProbeTimeout),
+ GossipWANSuspicionMult: b.intVal(c.GossipWAN.SuspicionMult),
+ GossipWANRetransmitMult: b.intVal(c.GossipWAN.RetransmitMult),
+
// ACL
ACLAgentMasterToken: b.stringVal(c.ACLAgentMasterToken),
ACLAgentToken: b.stringVal(c.ACLAgentToken),
diff --git a/agent/config/config.go b/agent/config/config.go
index e49b7f9d638b..e0468f2d1f10 100644
--- a/agent/config/config.go
+++ b/agent/config/config.go
@@ -184,6 +184,8 @@ type Config struct {
EncryptKey *string `json:"encrypt,omitempty" hcl:"encrypt" mapstructure:"encrypt"`
EncryptVerifyIncoming *bool `json:"encrypt_verify_incoming,omitempty" hcl:"encrypt_verify_incoming" mapstructure:"encrypt_verify_incoming"`
EncryptVerifyOutgoing *bool `json:"encrypt_verify_outgoing,omitempty" hcl:"encrypt_verify_outgoing" mapstructure:"encrypt_verify_outgoing"`
+ GossipLAN GossipLANConfig `json:"gossip_lan,omitempty" hcl:"gossip_lan" mapstructure:"gossip_lan"`
+ GossipWAN GossipWANConfig `json:"gossip_wan,omitempty" hcl:"gossip_wan" mapstructure:"gossip_wan"`
HTTPConfig HTTPConfig `json:"http_config,omitempty" hcl:"http_config" mapstructure:"http_config"`
KeyFile *string `json:"key_file,omitempty" hcl:"key_file" mapstructure:"key_file"`
LeaveOnTerm *bool `json:"leave_on_terminate,omitempty" hcl:"leave_on_terminate" mapstructure:"leave_on_terminate"`
@@ -259,6 +261,24 @@ type Config struct {
VersionPrerelease *string `json:"version_prerelease,omitempty" hcl:"version_prerelease" mapstructure:"version_prerelease"`
}
+type GossipLANConfig struct {
+ GossipNodes *int `json:"gossip_nodes,omitempty" hcl:"gossip_nodes" mapstructure:"gossip_nodes"`
+ GossipInterval *string `json:"gossip_interval,omitempty" hcl:"gossip_interval" mapstructure:"gossip_interval"`
+ ProbeInterval *string `json:"probe_interval,omitempty" hcl:"probe_interval" mapstructure:"probe_interval"`
+ ProbeTimeout *string `json:"probe_timeout,omitempty" hcl:"probe_timeout" mapstructure:"probe_timeout"`
+ SuspicionMult *int `json:"suspicion_mult,omitempty" hcl:"suspicion_mult" mapstructure:"suspicion_mult"`
+ RetransmitMult *int `json:"retransmit_mult,omitempty" hcl:"retransmit_mult" mapstructure:"retransmit_mult"`
+}
+
+type GossipWANConfig struct {
+ GossipNodes *int `json:"gossip_nodes,omitempty" hcl:"gossip_nodes" mapstructure:"gossip_nodes"`
+ GossipInterval *string `json:"gossip_interval,omitempty" hcl:"gossip_interval" mapstructure:"gossip_interval"`
+ ProbeInterval *string `json:"probe_interval,omitempty" hcl:"probe_interval" mapstructure:"probe_interval"`
+ ProbeTimeout *string `json:"probe_timeout,omitempty" hcl:"probe_timeout" mapstructure:"probe_timeout"`
+ SuspicionMult *int `json:"suspicion_mult,omitempty" hcl:"suspicion_mult" mapstructure:"suspicion_mult"`
+ RetransmitMult *int `json:"retransmit_mult,omitempty" hcl:"retransmit_mult" mapstructure:"retransmit_mult"`
+}
+
type Consul struct {
Coordinate struct {
UpdateBatchSize *int `json:"update_batch_size,omitempty" hcl:"update_batch_size" mapstructure:"update_batch_size"`
@@ -272,24 +292,6 @@ type Consul struct {
LeaderLeaseTimeout *string `json:"leader_lease_timeout,omitempty" hcl:"leader_lease_timeout" mapstructure:"leader_lease_timeout"`
} `json:"raft,omitempty" hcl:"raft" mapstructure:"raft"`
- SerfLAN struct {
- Memberlist struct {
- GossipInterval *string `json:"gossip_interval,omitempty" hcl:"gossip_interval" mapstructure:"gossip_interval"`
- ProbeInterval *string `json:"probe_interval,omitempty" hcl:"probe_interval" mapstructure:"probe_interval"`
- ProbeTimeout *string `json:"probe_timeout,omitempty" hcl:"probe_timeout" mapstructure:"probe_timeout"`
- SuspicionMult *int `json:"suspicion_mult,omitempty" hcl:"suspicion_mult" mapstructure:"suspicion_mult"`
- } `json:"memberlist,omitempty" hcl:"memberlist" mapstructure:"memberlist"`
- } `json:"serf_lan,omitempty" hcl:"serf_lan" mapstructure:"serf_lan"`
-
- SerfWAN struct {
- Memberlist struct {
- GossipInterval *string `json:"gossip_interval,omitempty" hcl:"gossip_interval" mapstructure:"gossip_interval"`
- ProbeInterval *string `json:"probe_interval,omitempty" hcl:"probe_interval" mapstructure:"probe_interval"`
- ProbeTimeout *string `json:"probe_timeout,omitempty" hcl:"probe_timeout" mapstructure:"probe_timeout"`
- SuspicionMult *int `json:"suspicion_mult,omitempty" hcl:"suspicion_mult" mapstructure:"suspicion_mult"`
- } `json:"memberlist,omitempty" hcl:"memberlist" mapstructure:"memberlist"`
- } `json:"serf_wan,omitempty" hcl:"serf_wan" mapstructure:"serf_wan"`
-
Server struct {
HealthInterval *string `json:"health_interval,omitempty" hcl:"health_interval" mapstructure:"health_interval"`
} `json:"server,omitempty" hcl:"server" mapstructure:"server"`
diff --git a/agent/config/default.go b/agent/config/default.go
index c95a0c6434f5..017120d8ee45 100644
--- a/agent/config/default.go
+++ b/agent/config/default.go
@@ -26,6 +26,10 @@ func DefaultRPCProtocol() (int, error) {
// todo(fs): IMO, this should be the definitive default for all configurable values
// todo(fs): and whatever is in here should clobber every default value. Hence, no sourcing.
func DefaultSource() Source {
+ cfg := consul.DefaultConfig()
+ serfLAN := cfg.SerfLANConfig.MemberlistConfig
+ serfWAN := cfg.SerfWANConfig.MemberlistConfig
+
return Source{
Name: "default",
Format: "hcl",
@@ -62,6 +66,22 @@ func DefaultSource() Source {
max_trailing_logs = 250
server_stabilization_time = "10s"
}
+ gossip_lan = {
+ gossip_interval = "` + serfLAN.GossipInterval.String() + `"
+ gossip_nodes = ` + strconv.Itoa(serfLAN.GossipNodes) + `
+ retransmit_mult = ` + strconv.Itoa(serfLAN.RetransmitMult) + `
+ probe_interval = "` + serfLAN.ProbeInterval.String() + `"
+ probe_timeout = "` + serfLAN.ProbeTimeout.String() + `"
+ suspicion_mult = ` + strconv.Itoa(serfLAN.SuspicionMult) + `
+ }
+ gossip_wan = {
+ gossip_interval = "` + serfWAN.GossipInterval.String() + `"
+ gossip_nodes = ` + strconv.Itoa(serfWAN.GossipNodes) + `
+ retransmit_mult = ` + strconv.Itoa(serfWAN.RetransmitMult) + `
+ probe_interval = "` + serfWAN.ProbeInterval.String() + `"
+ probe_timeout = "` + serfWAN.ProbeTimeout.String() + `"
+ suspicion_mult = ` + strconv.Itoa(serfWAN.SuspicionMult) + `
+ }
dns_config = {
allow_stale = true
a_record_limit = 0
@@ -92,6 +112,7 @@ func DefaultSource() Source {
metrics_prefix = "consul"
filter_default = true
}
+
`,
}
}
@@ -111,6 +132,18 @@ func DevSource() Source {
log_level = "DEBUG"
server = true
+ gossip_lan = {
+ gossip_interval = "100ms"
+ probe_interval = "100ms"
+ probe_timeout = "100ms"
+ suspicion_mult = 3
+ }
+ gossip_wan = {
+ gossip_interval = "100ms"
+ probe_interval = "100ms"
+ probe_timeout = "100ms"
+ suspicion_mult = 3
+ }
connect = {
enabled = true
}
@@ -166,8 +199,6 @@ func DefaultVersionSource() Source {
func DefaultConsulSource() Source {
cfg := consul.DefaultConfig()
raft := cfg.RaftConfig
- serfLAN := cfg.SerfLANConfig.MemberlistConfig
- serfWAN := cfg.SerfWANConfig.MemberlistConfig
return Source{
Name: "consul",
Format: "hcl",
@@ -183,22 +214,6 @@ func DefaultConsulSource() Source {
heartbeat_timeout = "` + raft.HeartbeatTimeout.String() + `"
leader_lease_timeout = "` + raft.LeaderLeaseTimeout.String() + `"
}
- serf_lan = {
- memberlist = {
- gossip_interval = "` + serfLAN.GossipInterval.String() + `"
- probe_interval = "` + serfLAN.ProbeInterval.String() + `"
- probe_timeout = "` + serfLAN.ProbeTimeout.String() + `"
- suspicion_mult = ` + strconv.Itoa(serfLAN.SuspicionMult) + `
- }
- }
- serf_wan = {
- memberlist = {
- gossip_interval = "` + serfWAN.GossipInterval.String() + `"
- probe_interval = "` + serfWAN.ProbeInterval.String() + `"
- probe_timeout = "` + serfWAN.ProbeTimeout.String() + `"
- suspicion_mult = ` + strconv.Itoa(serfWAN.SuspicionMult) + `
- }
- }
server = {
health_interval = "` + cfg.ServerHealthInterval.String() + `"
}
@@ -223,22 +238,6 @@ func DevConsulSource() Source {
heartbeat_timeout = "35ms"
leader_lease_timeout = "20ms"
}
- serf_lan = {
- memberlist = {
- gossip_interval = "100ms"
- probe_interval = "100ms"
- probe_timeout = "100ms"
- suspicion_mult = 3
- }
- }
- serf_wan = {
- memberlist = {
- gossip_interval = "100ms"
- probe_interval = "100ms"
- probe_timeout = "100ms"
- suspicion_mult = 3
- }
- }
server = {
health_interval = "10ms"
}
diff --git a/agent/config/runtime.go b/agent/config/runtime.go
index dfb7893e0729..4c26ecd4dcab 100644
--- a/agent/config/runtime.go
+++ b/agent/config/runtime.go
@@ -47,14 +47,6 @@ type RuntimeConfig struct {
ConsulRaftElectionTimeout time.Duration
ConsulRaftHeartbeatTimeout time.Duration
ConsulRaftLeaderLeaseTimeout time.Duration
- ConsulSerfLANGossipInterval time.Duration
- ConsulSerfLANProbeInterval time.Duration
- ConsulSerfLANProbeTimeout time.Duration
- ConsulSerfLANSuspicionMult int
- ConsulSerfWANGossipInterval time.Duration
- ConsulSerfWANProbeInterval time.Duration
- ConsulSerfWANProbeTimeout time.Duration
- ConsulSerfWANSuspicionMult int
ConsulServerHealthInterval time.Duration
// ACLAgentMasterToken is a special token that has full read and write
@@ -964,6 +956,160 @@ type RuntimeConfig struct {
// hcl: ports { serf_wan = int }
SerfPortWAN int
+ // GossipLANGossipInterval is the interval between sending messages that need
+ // to be gossiped that haven't been able to piggyback on probing messages.
+ // If this is set to zero, non-piggyback gossip is disabled. By lowering
+ // this value (more frequent) gossip messages are propagated across
+ // the cluster more quickly at the expense of increased bandwidth. This
+ // configuration only applies to LAN gossip communications
+ //
+ // The default is: 200ms
+ //
+ // hcl: gossip_lan { gossip_interval = duration }
+ GossipLANGossipInterval time.Duration
+
+ // GossipLANGossipNodes is the number of random nodes to send gossip messages to
+ // per GossipInterval. Increasing this number causes the gossip messages to
+ // propagate across the cluster more quickly at the expense of increased
+ // bandwidth. This configuration only applies to LAN gossip communications
+ //
+ // The default is: 3
+ //
+ // hcl: gossip_lan { gossip_nodes = int }
+ GossipLANGossipNodes int
+
+ // GossipLANProbeInterval is the interval between random node probes. Setting
+ // this lower (more frequent) will cause the memberlist cluster to detect
+ // failed nodes more quickly at the expense of increased bandwidth usage.
+ // This configuration only applies to LAN gossip communications
+ //
+ // The default is: 1s
+ //
+ // hcl: gossip_lan { probe_interval = duration }
+ GossipLANProbeInterval time.Duration
+
+ // GossipLANProbeTimeout is the timeout to wait for an ack from a probed node
+ // before assuming it is unhealthy. This should be set to the 99th percentile
+ // of RTT (round-trip time) on your network. This configuration
+ // only applies to the LAN gossip communications
+ //
+ // The default is: 500ms
+ //
+ // hcl: gossip_lan { probe_timeout = duration }
+ GossipLANProbeTimeout time.Duration
+
+ // GossipLANSuspicionMult is the multiplier for determining the time an
+ // inaccessible node is considered suspect before declaring it dead. This
+ // configuration only applies to LAN gossip communications
+ //
+ // The actual timeout is calculated using the formula:
+ //
+ // SuspicionTimeout = SuspicionMult * log(N+1) * ProbeInterval
+ //
+ // This allows the timeout to scale properly with expected propagation
+ // delay with a larger cluster size. The higher the multiplier, the longer
+ // an inaccessible node is considered part of the cluster before declaring
+ // it dead, giving that suspect node more time to refute if it is indeed
+ // still alive.
+ //
+ // The default is: 4
+ //
+ // hcl: gossip_lan { suspicion_mult = int }
+ GossipLANSuspicionMult int
+
+ // GossipLANRetransmitMult is the multiplier for the number of retransmissions
+ // that are attempted for messages broadcasted over gossip. This
+ // configuration only applies to LAN gossip communications. The actual
+ // count of retransmissions is calculated using the formula:
+ //
+ // Retransmits = RetransmitMult * log(N+1)
+ //
+ // This allows the retransmits to scale properly with cluster size. The
+ // higher the multiplier, the more likely a failed broadcast is to converge
+ // at the expense of increased bandwidth.
+ //
+ // The default is: 4
+ //
+ // hcl: gossip_lan { retransmit_mult = int }
+ GossipLANRetransmitMult int
+
+ // GossipWANGossipInterval is the interval between sending messages that need
+ // to be gossiped that haven't been able to piggyback on probing messages.
+ // If this is set to zero, non-piggyback gossip is disabled. By lowering
+ // this value (more frequent) gossip messages are propagated across
+ // the cluster more quickly at the expense of increased bandwidth. This
+ // configuration only applies to WAN gossip communications
+ //
+ // The default is: 500ms
+ //
+ // hcl: gossip_wan { gossip_interval = duration }
+ GossipWANGossipInterval time.Duration
+
+ // GossipWANGossipNodes is the number of random nodes to send gossip messages to
+ // per GossipInterval. Increasing this number causes the gossip messages to
+ // propagate across the cluster more quickly at the expense of increased
+ // bandwidth. This configuration only applies to WAN gossip communications
+ //
+ // The default is: 3
+ //
+ // hcl: gossip_wan { gossip_nodes = int }
+ GossipWANGossipNodes int
+
+ // GossipWANProbeInterval is the interval between random node probes. Setting
+ // this lower (more frequent) will cause the memberlist cluster to detect
+ // failed nodes more quickly at the expense of increased bandwidth usage.
+ // This configuration only applies to WAN gossip communications
+ //
+ // The default is: 5s
+ //
+ // hcl: gossip_wan { probe_interval = duration }
+ GossipWANProbeInterval time.Duration
+
+ // GossipWANProbeTimeout is the timeout to wait for an ack from a probed node
+ // before assuming it is unhealthy. This should be set to the 99th percentile
+ // of RTT (round-trip time) on your network. This configuration
+ // only applies to the WAN gossip communications
+ //
+ // The default is: 3s
+ //
+ // hcl: gossip_wan { probe_timeout = duration }
+ GossipWANProbeTimeout time.Duration
+
+ // GossipWANSuspicionMult is the multiplier for determining the time an
+ // inaccessible node is considered suspect before declaring it dead. This
+ // configuration only applies to WAN gossip communications
+ //
+ // The actual timeout is calculated using the formula:
+ //
+ // SuspicionTimeout = SuspicionMult * log(N+1) * ProbeInterval
+ //
+ // This allows the timeout to scale properly with expected propagation
+ // delay with a larger cluster size. The higher the multiplier, the longer
+ // an inaccessible node is considered part of the cluster before declaring
+ // it dead, giving that suspect node more time to refute if it is indeed
+ // still alive.
+ //
+ // The default is: 6
+ //
+ // hcl: gossip_wan { suspicion_mult = int }
+ GossipWANSuspicionMult int
+
+ // GossipWANRetransmitMult is the multiplier for the number of retransmissions
+ // that are attempted for messages broadcasted over gossip. This
+ // configuration only applies to WAN gossip communications. The actual
+ // count of retransmissions is calculated using the formula:
+ //
+ // Retransmits = RetransmitMult * log(N+1)
+ //
+ // This allows the retransmits to scale properly with cluster size. The
+ // higher the multiplier, the more likely a failed broadcast is to converge
+ // at the expense of increased bandwidth.
+ //
+ // The default is: 4
+ //
+ // hcl: gossip_wan { retransmit_mult = int }
+ GossipWANRetransmitMult int
+
// ServerMode controls if this agent acts like a Consul server,
// or merely as a client. Servers have more state, take part
// in leader election, etc.
diff --git a/agent/config/runtime_test.go b/agent/config/runtime_test.go
index 04a528b61810..75658a2507b8 100644
--- a/agent/config/runtime_test.go
+++ b/agent/config/runtime_test.go
@@ -286,14 +286,14 @@ func TestConfigFlagsAndEdgecases(t *testing.T) {
rt.ConsulRaftElectionTimeout = 52 * time.Millisecond
rt.ConsulRaftHeartbeatTimeout = 35 * time.Millisecond
rt.ConsulRaftLeaderLeaseTimeout = 20 * time.Millisecond
- rt.ConsulSerfLANGossipInterval = 100 * time.Millisecond
- rt.ConsulSerfLANProbeInterval = 100 * time.Millisecond
- rt.ConsulSerfLANProbeTimeout = 100 * time.Millisecond
- rt.ConsulSerfLANSuspicionMult = 3
- rt.ConsulSerfWANGossipInterval = 100 * time.Millisecond
- rt.ConsulSerfWANProbeInterval = 100 * time.Millisecond
- rt.ConsulSerfWANProbeTimeout = 100 * time.Millisecond
- rt.ConsulSerfWANSuspicionMult = 3
+ rt.GossipLANGossipInterval = 100 * time.Millisecond
+ rt.GossipLANProbeInterval = 100 * time.Millisecond
+ rt.GossipLANProbeTimeout = 100 * time.Millisecond
+ rt.GossipLANSuspicionMult = 3
+ rt.GossipWANGossipInterval = 100 * time.Millisecond
+ rt.GossipWANProbeInterval = 100 * time.Millisecond
+ rt.GossipWANProbeTimeout = 100 * time.Millisecond
+ rt.GossipWANSuspicionMult = 3
rt.ConsulServerHealthInterval = 10 * time.Millisecond
},
},
@@ -2617,6 +2617,22 @@ func TestFullConfig(t *testing.T) {
}
}
},
+ "gossip_lan" : {
+ "gossip_nodes": 6,
+ "gossip_interval" : "25252s",
+ "retransmit_mult" : 1234,
+ "suspicion_mult" : 1235,
+ "probe_interval" : "101ms",
+ "probe_timeout" : "102ms"
+ },
+ "gossip_wan" : {
+ "gossip_nodes" : 2,
+ "gossip_interval" : "6966s",
+ "retransmit_mult" : 16384,
+ "suspicion_mult" : 16385,
+ "probe_interval" : "103ms",
+ "probe_timeout" : "104ms"
+ },
"data_dir": "` + dataDir + `",
"datacenter": "rzo029wg",
"disable_anonymous_signature": true,
@@ -3092,6 +3108,22 @@ func TestFullConfig(t *testing.T) {
}
}
}
+ gossip_lan {
+ gossip_nodes = 6
+ gossip_interval = "25252s"
+ retransmit_mult = 1234
+ suspicion_mult = 1235
+ probe_interval = "101ms"
+ probe_timeout = "102ms"
+ }
+ gossip_wan {
+ gossip_nodes = 2
+ gossip_interval = "6966s"
+ retransmit_mult = 16384
+ suspicion_mult = 16385
+ probe_interval = "103ms"
+ probe_timeout = "104ms"
+ }
data_dir = "` + dataDir + `"
datacenter = "rzo029wg"
disable_anonymous_signature = true
@@ -3473,22 +3505,6 @@ func TestFullConfig(t *testing.T) {
"heartbeat_timeout": "25699s",
"leader_lease_timeout": "15351s"
},
- "serf_lan": {
- "memberlist": {
- "gossip_interval": "25252s",
- "probe_interval": "5105s",
- "probe_timeout": "29179s",
- "suspicion_mult": 8263
- }
- },
- "serf_wan": {
- "memberlist": {
- "gossip_interval": "6966s",
- "probe_interval": "20148s",
- "probe_timeout": "3007s",
- "suspicion_mult": 32096
- }
- },
"server": {
"health_interval": "17455s"
}
@@ -3527,22 +3543,6 @@ func TestFullConfig(t *testing.T) {
heartbeat_timeout = "25699s"
leader_lease_timeout = "15351s"
}
- serf_lan = {
- memberlist = {
- gossip_interval = "25252s"
- probe_interval = "5105s"
- probe_timeout = "29179s"
- suspicion_mult = 8263
- }
- }
- serf_wan = {
- memberlist = {
- gossip_interval = "6966s"
- probe_interval = "20148s"
- probe_timeout = "3007s"
- suspicion_mult = 32096
- }
- }
server = {
health_interval = "17455s"
}
@@ -3574,14 +3574,18 @@ func TestFullConfig(t *testing.T) {
ConsulRaftElectionTimeout: 5 * 31947 * time.Second,
ConsulRaftHeartbeatTimeout: 5 * 25699 * time.Second,
ConsulRaftLeaderLeaseTimeout: 5 * 15351 * time.Second,
- ConsulSerfLANGossipInterval: 25252 * time.Second,
- ConsulSerfLANProbeInterval: 5105 * time.Second,
- ConsulSerfLANProbeTimeout: 29179 * time.Second,
- ConsulSerfLANSuspicionMult: 8263,
- ConsulSerfWANGossipInterval: 6966 * time.Second,
- ConsulSerfWANProbeInterval: 20148 * time.Second,
- ConsulSerfWANProbeTimeout: 3007 * time.Second,
- ConsulSerfWANSuspicionMult: 32096,
+ GossipLANGossipInterval: 25252 * time.Second,
+ GossipLANGossipNodes: 6,
+ GossipLANProbeInterval: 101 * time.Millisecond,
+ GossipLANProbeTimeout: 102 * time.Millisecond,
+ GossipLANSuspicionMult: 1235,
+ GossipLANRetransmitMult: 1234,
+ GossipWANGossipInterval: 6966 * time.Second,
+ GossipWANGossipNodes: 2,
+ GossipWANProbeInterval: 103 * time.Millisecond,
+ GossipWANProbeTimeout: 104 * time.Millisecond,
+ GossipWANSuspicionMult: 16385,
+ GossipWANRetransmitMult: 16384,
ConsulServerHealthInterval: 17455 * time.Second,
// user configurable values
@@ -4407,14 +4411,18 @@ func TestSanitize(t *testing.T) {
"ConsulRaftElectionTimeout": "0s",
"ConsulRaftHeartbeatTimeout": "0s",
"ConsulRaftLeaderLeaseTimeout": "0s",
- "ConsulSerfLANGossipInterval": "0s",
- "ConsulSerfLANProbeInterval": "0s",
- "ConsulSerfLANProbeTimeout": "0s",
- "ConsulSerfLANSuspicionMult": 0,
- "ConsulSerfWANGossipInterval": "0s",
- "ConsulSerfWANProbeInterval": "0s",
- "ConsulSerfWANProbeTimeout": "0s",
- "ConsulSerfWANSuspicionMult": 0,
+ "GossipLANGossipInterval": "0s",
+ "GossipLANGossipNodes": 0,
+ "GossipLANProbeInterval": "0s",
+ "GossipLANProbeTimeout": "0s",
+ "GossipLANRetransmitMult": 0,
+ "GossipLANSuspicionMult": 0,
+ "GossipWANGossipInterval": "0s",
+ "GossipWANGossipNodes": 0,
+ "GossipWANProbeInterval": "0s",
+ "GossipWANProbeTimeout": "0s",
+ "GossipWANRetransmitMult": 0,
+ "GossipWANSuspicionMult": 0,
"ConsulServerHealthInterval": "0s",
"DNSARecordLimit": 0,
"DNSAddrs": [
diff --git a/website/source/docs/agent/options.html.md b/website/source/docs/agent/options.html.md
index 75029f63da06..6cd75d605b2c 100644
--- a/website/source/docs/agent/options.html.md
+++ b/website/source/docs/agent/options.html.md
@@ -918,6 +918,76 @@ Consul will not enable TLS for the HTTP API unless the `https` port has been ass
* `disable_keyring_file` - Equivalent to the
[`-disable-keyring-file` command-line flag](#_disable_keyring_file).
+* `gossip_lan` - **(Advanced)** This object contains a number of sub-keys
+ which can be set to tune the LAN gossip communications. These are only provided for users running especially large
+ clusters that need fine tuning and are prepared to spend significant effort correctly tuning them for their
+ environment and workload. **Tuning these improperly can cause Consul to fail in unexpected ways**.
+ The default values are appropriate in almost all deployments.
+
+ * `gossip_nodes` - The number of random nodes to send
+ gossip messages to per gossip_interval. Increasing this number causes the gossip messages to propagate
+ across the cluster more quickly at the expense of increased bandwidth. The default is 3.
+
+ * `gossip_interval` - The interval between sending
+ messages that need to be gossiped that haven't been able to piggyback on probing messages. If this is set to
+ zero, non-piggyback gossip is disabled. By lowering this value (more frequent) gossip messages are propagated
+ across the cluster more quickly at the expense of increased bandwidth. The default is 200ms.
+
+ * `probe_interval` - The interval between random node
+ probes. Setting this lower (more frequent) will cause the cluster to detect failed nodes more quickly
+ at the expense of increased bandwidth usage. The default is 1s.
+
+ * `probe_timeout` - The timeout to wait for an ack from
+ a probed node before assuming it is unhealthy. This should be at least the 99th percentile of RTT (round-trip time) on
+ your network. The default is 500ms and is a conservative value suitable for almost all realistic deployments.
+
+ * `retransmit_mult` - The multiplier for the number
+ of retransmissions that are attempted for messages broadcasted over gossip. The number of retransmits is scaled
+ using this multiplier and the cluster size. The higher the multiplier, the more likely a failed broadcast is to
+ converge at the expense of increased bandwidth. The default is 4.
+
+ * `suspicion_mult` - The multiplier for determining the
+ time an inaccessible node is considered suspect before declaring it dead. The timeout is scaled with the cluster
+ size and the probe_interval. This allows the timeout to scale properly with expected propagation delay with a
+ larger cluster size. The higher the multiplier, the longer an inaccessible node is considered part of the
+ cluster before declaring it dead, giving that suspect node more time to refute if it is indeed still alive. The
+ default is 4.
+
+* `gossip_wan` - **(Advanced)** This object contains a number of sub-keys
+ which can be set to tune the WAN gossip communications. These are only provided for users running especially large
+ clusters that need fine tuning and are prepared to spend significant effort correctly tuning them for their
+ environment and workload. **Tuning these improperly can cause Consul to fail in unexpected ways**.
+ The default values are appropriate in almost all deployments.
+
+ * `gossip_nodes` - The number of random nodes to send
+ gossip messages to per gossip_interval. Increasing this number causes the gossip messages to propagate
+ across the cluster more quickly at the expense of increased bandwidth. The default is 3.
+
+ * `gossip_interval` - The interval between sending
+ messages that need to be gossiped that haven't been able to piggyback on probing messages. If this is set to
+ zero, non-piggyback gossip is disabled. By lowering this value (more frequent) gossip messages are propagated
+ across the cluster more quickly at the expense of increased bandwidth. The default is 500ms.
+
+ * `probe_interval` - The interval between random node
+ probes. Setting this lower (more frequent) will cause the cluster to detect failed nodes more quickly
+ at the expense of increased bandwidth usage. The default is 5s.
+
+ * `probe_timeout` - The timeout to wait for an ack from
+ a probed node before assuming it is unhealthy. This should be at least the 99th percentile of RTT (round-trip time) on
+ your network. The default is 3s and is a conservative value suitable for almost all realistic deployments.
+
+ * `retransmit_mult` - The multiplier for the number
+ of retransmissions that are attempted for messages broadcasted over gossip. The number of retransmits is scaled
+ using this multiplier and the cluster size. The higher the multiplier, the more likely a failed broadcast is to
+ converge at the expense of increased bandwidth. The default is 4.
+
+ * `suspicion_mult` - The multiplier for determining the
+ time an inaccessible node is considered suspect before declaring it dead. The timeout is scaled with the cluster
+ size and the probe_interval. This allows the timeout to scale properly with expected propagation delay with a
+ larger cluster size. The higher the multiplier, the longer an inaccessible node is considered part of the
+ cluster before declaring it dead, giving that suspect node more time to refute if it is indeed still alive. The
+ default is 6.
+
* `key_file` This provides the file path to a
PEM-encoded private key. The key is used with the certificate to verify the agent's authenticity.
This must be provided along with [`cert_file`](#cert_file).