Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

agent: prevent very old servers re-joining a cluster with stale data #17171

Merged
merged 24 commits into from
May 15, 2023
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .changelog/17171.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
```release-note:improvement
agent: prevent very old servers re-joining a cluster with stale data
loshz marked this conversation as resolved.
Show resolved Hide resolved
```
93 changes: 84 additions & 9 deletions agent/agent.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ import (
"context"
"crypto/tls"
"encoding/json"
"errors"
"fmt"
"io"
"net"
Expand All @@ -22,8 +23,6 @@ import (

"github.com/armon/go-metrics"
"github.com/armon/go-metrics/prometheus"
"github.com/hashicorp/consul/agent/rpcclient"
"github.com/hashicorp/consul/agent/rpcclient/configentry"
"github.com/hashicorp/go-connlimit"
"github.com/hashicorp/go-hclog"
"github.com/hashicorp/go-memdb"
Expand All @@ -50,12 +49,13 @@ import (
grpcDNS "github.com/hashicorp/consul/agent/grpc-external/services/dns"
middleware "github.com/hashicorp/consul/agent/grpc-middleware"
"github.com/hashicorp/consul/agent/hcp/scada"
libscada "github.com/hashicorp/consul/agent/hcp/scada"
"github.com/hashicorp/consul/agent/local"
"github.com/hashicorp/consul/agent/proxycfg"
proxycfgglue "github.com/hashicorp/consul/agent/proxycfg-glue"
catalogproxycfg "github.com/hashicorp/consul/agent/proxycfg-sources/catalog"
localproxycfg "github.com/hashicorp/consul/agent/proxycfg-sources/local"
"github.com/hashicorp/consul/agent/rpcclient"
"github.com/hashicorp/consul/agent/rpcclient/configentry"
"github.com/hashicorp/consul/agent/rpcclient/health"
"github.com/hashicorp/consul/agent/structs"
"github.com/hashicorp/consul/agent/systemd"
Expand Down Expand Up @@ -105,6 +105,9 @@ const (
// defaultQueryTime is the amount of time we block waiting for a change
// if no time is specified. Previously we would wait the maxQueryTime.
defaultQueryTime = 300 * time.Second

// Name of the file to store a server's last seen timestamp.
serverLastSeenFile = "server_last_seen"
)

var (
Expand Down Expand Up @@ -575,11 +578,11 @@ func (a *Agent) Start(ctx context.Context) error {
return err
}

// copy over the existing node id, this cannot be
// changed while running anyways but this prevents
// breaking some existing behavior. then overwrite
// the configuration
// Copy over the existing node id. This cannot be
// changed while running, but this prevents
// breaking some existing behavior.
c.NodeID = a.config.NodeID
// Overwrite the configuration.
a.config = c

if err := a.tlsConfigurator.Update(a.config.TLS); err != nil {
Expand Down Expand Up @@ -625,6 +628,12 @@ func (a *Agent) Start(ctx context.Context) error {
if c.ServerMode {
serverLogger := a.baseDeps.Logger.NamedIntercept(logging.ConsulServer)

// TODO: maybe this is called too early?
loshz marked this conversation as resolved.
Show resolved Hide resolved
if err := a.checkServerLastSeen(); err != nil {
// TODO: log a bunch of times first?
loshz marked this conversation as resolved.
Show resolved Hide resolved
return err
}

incomingRPCLimiter := consul.ConfiguredIncomingRPCLimiter(
&lib.StopChannelContext{StopCh: a.shutdownCh},
serverLogger,
Expand Down Expand Up @@ -661,7 +670,6 @@ func (a *Agent) Start(ctx context.Context) error {
return fmt.Errorf("failed to start server cert manager: %w", err)
}
}

} else {
a.externalGRPCServer = external.NewServer(
a.logger.Named("grpc.external"),
Expand Down Expand Up @@ -752,6 +760,10 @@ func (a *Agent) Start(ctx context.Context) error {
},
)

// Start writing last seen timestamps to file in order to age on next startup.
// TODO: maybe we should do this earlier above in the "if c.ServerMode" block?
loshz marked this conversation as resolved.
Show resolved Hide resolved
go a.persistServerLastSeen()

// Start watching for critical services to deregister, based on their
// checks.
go a.reapServices()
Expand Down Expand Up @@ -1094,7 +1106,7 @@ func (a *Agent) listenHTTP() ([]apiServer, error) {
MaxHeaderBytes: a.config.HTTPMaxHeaderBytes,
}

if libscada.IsCapability(l.Addr()) {
if scada.IsCapability(l.Addr()) {
// wrap in http2 server handler
httpServer.Handler = h2c.NewHandler(srv.handler(a.config.EnableDebug), &http2.Server{})
}
Expand Down Expand Up @@ -4527,7 +4539,70 @@ func (a *Agent) proxyDataSources() proxycfg.DataSources {

a.fillEnterpriseProxyDataSources(&sources)
return sources
}

// persistServerLastSeen writes a server's last seen Unix timestamp to a file
// in the configured data directory, and then periodically updates the timestamp
// every hour.
func (a *Agent) persistServerLastSeen() {
file := filepath.Join(a.config.DataDir, serverLastSeenFile)

// Create a timer with no initial tick to allow the timestamp to be written immediately.
t := time.NewTimer(0)
defer t.Stop()

for {
select {
case <-t.C:
// Reset the timer to the larger periodic interval.
t.Reset(1 * time.Hour)

// TODO: should we do this properly using binary encoding?
loshz marked this conversation as resolved.
Show resolved Hide resolved
now := strconv.FormatInt(time.Now().Unix(), 10)

if err := os.WriteFile(file, []byte(now), 0600); err != nil {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

While it wouldn't be the end of the world if we fail to read this on a crash, do you think it's worth attempting to write it more robustly? (e.g. write to a temp file and rename)

// TODO: should we exit if this has happened too many times?
loshz marked this conversation as resolved.
Show resolved Hide resolved
a.logger.Error("failed to write last seen timestamp: %w", err)
}
case <-a.shutdownCh:
return
}
}
}

// checkServerLastSeen is a safety check for preventing old servers from rejoining an existing cluster.
//
// It attempts to read a server's last seen file and check the Unix timestamp against a
// configurable max age. If the last seen file does not exist, we treat this as an initial startup
// and return no error.
//
// Example: if the server recorded a last seen timestamp of now-7d, and we configure a max age
// of 3d, then we should prevent the server from rejoining.
func (a *Agent) checkServerLastSeen() error {
file := filepath.Join(a.config.DataDir, serverLastSeenFile)

// Check if the last seen timestamp file exists, and return early if it doesn't
// as this indicates the server is starting for the first time.
if _, err := os.Stat(file); errors.Is(err, os.ErrNotExist) {
return nil
}
loshz marked this conversation as resolved.
Show resolved Hide resolved

// Read timestamp from last seen file.
b, err := os.ReadFile(file)
if err != nil {
return fmt.Errorf("error reading server last seen timestamp: %w", err)
}

// TODO: again, we probably want to do this more efficiently using binary encoding.
i, _ := strconv.Atoi(string(b))
lastSeen := time.Unix(int64(i), 0)
maxAge := time.Now().Add(-a.config.ServerRejoinAgeMax)

if lastSeen.Before(maxAge) {
return fmt.Errorf("server has not been seen since %s, will not rejoin", lastSeen.Format(time.DateTime))
}

return nil
}

func listenerPortKey(svcID structs.ServiceID, checkID structs.CheckID) string {
Expand Down
4 changes: 2 additions & 2 deletions agent/config/builder.go
Original file line number Diff line number Diff line change
Expand Up @@ -28,15 +28,14 @@ import (
"github.com/hashicorp/memberlist"
"golang.org/x/time/rate"

hcpconfig "github.com/hashicorp/consul/agent/hcp/config"

"github.com/hashicorp/consul/agent/cache"
"github.com/hashicorp/consul/agent/checks"
"github.com/hashicorp/consul/agent/connect/ca"
"github.com/hashicorp/consul/agent/consul"
"github.com/hashicorp/consul/agent/consul/authmethod/ssoauth"
consulrate "github.com/hashicorp/consul/agent/consul/rate"
"github.com/hashicorp/consul/agent/dns"
hcpconfig "github.com/hashicorp/consul/agent/hcp/config"
"github.com/hashicorp/consul/agent/rpc/middleware"
"github.com/hashicorp/consul/agent/structs"
"github.com/hashicorp/consul/agent/token"
Expand Down Expand Up @@ -1090,6 +1089,7 @@ func (b *builder) build() (rt RuntimeConfig, err error) {
ServerMode: serverMode,
ServerName: stringVal(c.ServerName),
ServerPort: serverPort,
ServerRejoinAgeMax: b.durationValWithDefault("server_rejoin_age_max", c.ServerRejoinAgeMax, 24*7*time.Hour),
loshz marked this conversation as resolved.
Show resolved Hide resolved
Services: services,
SessionTTLMin: b.durationVal("session_ttl_min", c.SessionTTLMin),
SkipLeaveOnInt: skipLeaveOnInt,
Expand Down
1 change: 1 addition & 0 deletions agent/config/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -228,6 +228,7 @@ type Config struct {
SerfBindAddrWAN *string `mapstructure:"serf_wan" json:"serf_wan,omitempty"`
ServerMode *bool `mapstructure:"server" json:"server,omitempty"`
ServerName *string `mapstructure:"server_name" json:"server_name,omitempty"`
ServerRejoinAgeMax *string `mapstructure:"server_rejoin_age_max" json:"server_rejoin_age_max,omitempty"`
Service *ServiceDefinition `mapstructure:"service" json:"-"`
Services []ServiceDefinition `mapstructure:"services" json:"-"`
SessionTTLMin *string `mapstructure:"session_ttl_min" json:"session_ttl_min,omitempty"`
Expand Down
12 changes: 12 additions & 0 deletions agent/config/runtime.go
Original file line number Diff line number Diff line change
Expand Up @@ -1358,6 +1358,18 @@ type RuntimeConfig struct {
// hcl: ports { server = int }
ServerPort int

// ServerRejoinAgeMax is used to specify the duration of time a server
// is allowed to be down/offline before a startup operation is refused.
//
// For example: if a server has been offline for 5 days, and this option
// is configured to 3 days, then any subsequent startup operation will fail
// and require an operator to manually intervene.
//
// The default is: 7 days
//
// hcl: server_rejoin_age_max = "duration"
ServerRejoinAgeMax time.Duration

loshz marked this conversation as resolved.
Show resolved Hide resolved
// Services contains the provided service definitions:
//
// hcl: services = [
Expand Down
3 changes: 1 addition & 2 deletions agent/config/runtime_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -25,13 +25,12 @@ import (
"github.com/stretchr/testify/require"
"golang.org/x/time/rate"

hcpconfig "github.com/hashicorp/consul/agent/hcp/config"

"github.com/hashicorp/consul/acl"
"github.com/hashicorp/consul/agent/cache"
"github.com/hashicorp/consul/agent/checks"
"github.com/hashicorp/consul/agent/consul"
consulrate "github.com/hashicorp/consul/agent/consul/rate"
hcpconfig "github.com/hashicorp/consul/agent/hcp/config"
"github.com/hashicorp/consul/agent/structs"
"github.com/hashicorp/consul/agent/token"
"github.com/hashicorp/consul/lib"
Expand Down
3 changes: 2 additions & 1 deletion agent/config/testdata/TestRuntimeConfig_Sanitize.golden
Original file line number Diff line number Diff line change
Expand Up @@ -330,6 +330,7 @@
"ServerMode": false,
"ServerName": "",
"ServerPort": 0,
"ServerRejoinAgeMax": "168h0m0s",
"Services": [
{
"Address": "",
Expand Down Expand Up @@ -507,4 +508,4 @@
"VersionPrerelease": "",
"Watches": [],
"XDSUpdateRateLimit": 0
}
}