client: de-duplicate alloc updates and gate during restore #17074
@@ -0,0 +1,3 @@
+```release-note:improvement
+client: de-duplicate allocation client status updates and prevent allocation client status updates from being sent until clients have first synchronized with the server
+```
@@ -160,6 +160,8 @@ type AllocRunner interface {
 	Signal(taskName, signal string) error
 	GetTaskEventHandler(taskName string) drivermanager.EventHandler
 	PersistState() error
+	AcknowledgeState(*arstate.State)
+	LastAcknowledgedStateIsCurrent(*structs.Allocation) bool

 	RestartTask(taskName string, taskEvent *structs.TaskEvent) error
 	RestartRunning(taskEvent *structs.TaskEvent) error
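Neither new method's implementation appears in this diff. As a rough sketch only, assuming the alloc runner keeps the last server-acknowledged state under a lock and compares it field by field (the field names come from the `arstate.State` literal later in this diff; the struct layout, locking, and use of `reflect.DeepEqual` are guesses, not Nomad's actual code):

```go
// Hypothetical sketch, not the real allocrunner implementation.
package allocrunner

import (
	"reflect"
	"sync"

	arstate "github.com/hashicorp/nomad/client/allocrunner/state"
	"github.com/hashicorp/nomad/nomad/structs"
)

type allocRunner struct {
	stateLock             sync.RWMutex
	lastAcknowledgedState *arstate.State
}

// AcknowledgeState records the alloc state most recently confirmed by the
// server, or restored from the client state DB after a restart.
func (ar *allocRunner) AcknowledgeState(s *arstate.State) {
	ar.stateLock.Lock()
	defer ar.stateLock.Unlock()
	ar.lastAcknowledgedState = s
}

// LastAcknowledgedStateIsCurrent reports whether a pending update matches
// what the server already acknowledged, so the caller can drop it.
func (ar *allocRunner) LastAcknowledgedStateIsCurrent(a *structs.Allocation) bool {
	ar.stateLock.RLock()
	defer ar.stateLock.RUnlock()

	last := ar.lastAcknowledgedState
	if last == nil {
		return false
	}
	// Cheap scalar fields first, so unequal states can return early.
	if last.ClientStatus != a.ClientStatus ||
		last.ClientDescription != a.ClientDescription {
		return false
	}
	// Deep comparison of the structured fields; the real code can be
	// smarter than a blanket reflect.DeepEqual.
	return reflect.DeepEqual(last.DeploymentStatus, a.DeploymentStatus) &&
		reflect.DeepEqual(last.TaskStates, a.TaskStates) &&
		reflect.DeepEqual(last.NetworkStatus, a.NetworkStatus)
}
```

The review thread further down discusses exactly this storage-versus-comparison trade-off.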
@@ -512,7 +514,7 @@ func NewClient(cfg *config.Config, consulCatalog consul.CatalogAPI, consulProxie
 	c.serviceRegWrapper = wrapper.NewHandlerWrapper(c.logger, c.consulService, c.nomadService)

 	// Batching of initial fingerprints is done to reduce the number of node
-	// updates sent to the server on startup. This is the first RPC to the servers
+	// updates sent to the server on startup.
 	go c.batchFirstFingerprints()

 	// create heartbeatStop. We go after the first attempt to connect to the server, so
@@ -1270,6 +1272,14 @@ func (c *Client) restoreState() error {
 			continue
 		}

+		allocState, err := c.stateDB.GetAcknowledgedState(alloc.ID)
+		if err != nil {
+			c.logger.Error("error restoring last acknowledged alloc state, will update again",
+				"error", err, "alloc_id", alloc.ID)
+		} else {
+			ar.AcknowledgeState(allocState)
+		}
+
 		// Maybe mark the alloc for halt on missing server heartbeats
 		if c.heartbeatStop.shouldStop(alloc) {
 			err = c.heartbeatStop.stopAlloc(alloc.ID)
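Only the read side of the acknowledged-state store appears in this hunk. A toy in-memory stand-in may clarify the restore flow; `GetAcknowledgedState` matches the call above, while `PutAcknowledgedState` is a hypothetical write-side counterpart I'm assuming for illustration:

```go
// Toy in-memory stand-in for the client state DB's acknowledged-state
// storage; not Nomad's actual (persistent) implementation.
package main

import (
	"fmt"
	"sync"

	arstate "github.com/hashicorp/nomad/client/allocrunner/state"
)

type memStateDB struct {
	mu    sync.RWMutex
	acked map[string]*arstate.State
}

// PutAcknowledgedState persists the state the server last acknowledged so
// that it survives a client restart. (Hypothetical name.)
func (db *memStateDB) PutAcknowledgedState(allocID string, s *arstate.State) error {
	db.mu.Lock()
	defer db.mu.Unlock()
	db.acked[allocID] = s
	return nil
}

// GetAcknowledgedState returns the stored state; a nil result just means
// the alloc has never been acknowledged.
func (db *memStateDB) GetAcknowledgedState(allocID string) (*arstate.State, error) {
	db.mu.RLock()
	defer db.mu.RUnlock()
	return db.acked[allocID], nil
}

func main() {
	db := &memStateDB{acked: make(map[string]*arstate.State)}
	_ = db.PutAcknowledgedState("alloc-1", &arstate.State{ClientStatus: "running"})

	s, _ := db.GetAcknowledgedState("alloc-1")
	fmt.Println(s.ClientStatus) // running
}
```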
@@ -2144,10 +2154,20 @@ func (c *Client) allocSync() {
 			if len(updates) == 0 {
 				continue
 			}
+			// Ensure we never send an update before we've had at least one sync
+			// from the server
+			select {
+			case <-c.serversContactedCh:
+			default:
+				continue
+			}

-			sync := make([]*structs.Allocation, 0, len(updates))
-			for _, alloc := range updates {
-				sync = append(sync, alloc)
+			sync := c.filterAcknowledgedUpdates(updates)
+			if len(sync) == 0 {
+				// No updates to send
+				updates = make(map[string]*structs.Allocation, len(updates))
+				syncTicker.Reset(allocSyncIntv)
+				continue
 			}

 			// Send to server.
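The `select` with an empty `default` is a non-blocking receive: until `serversContactedCh` is closed (after the client's first successful sync with the servers), the `default` branch wins and the batch is held for a later tick. A self-contained sketch of the pattern, with the channel name borrowed from the diff and the rest illustrative:

```go
// Sketch of the gate above: a non-blocking receive on a channel that is
// closed exactly once.
package main

import "fmt"

func main() {
	serversContactedCh := make(chan struct{})

	// Mirrors the select in allocSync: proceed only if the channel is
	// already closed, otherwise fall through to default immediately.
	ready := func() bool {
		select {
		case <-serversContactedCh:
			return true
		default:
			return false
		}
	}

	fmt.Println(ready()) // false: no sync with the servers yet

	close(serversContactedCh) // first successful sync closes the channel

	fmt.Println(ready()) // true: receive on a closed channel never blocks
}
```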
@@ -2162,21 +2182,51 @@
 				// Error updating allocations, do *not* clear
 				// updates and retry after backoff
 				c.logger.Error("error updating allocations", "error", err)
-				syncTicker.Stop()
-				syncTicker = time.NewTicker(c.retryIntv(allocSyncRetryIntv))
+				syncTicker.Reset(c.retryIntv(allocSyncRetryIntv))
 				continue
 			}

+			c.allocLock.RLock()
+			for _, update := range sync {
+				if ar, ok := c.allocs[update.ID]; ok {
+					ar.AcknowledgeState(&arstate.State{
+						ClientStatus:      update.ClientStatus,
+						ClientDescription: update.ClientDescription,
+						DeploymentStatus:  update.DeploymentStatus,
+						TaskStates:        update.TaskStates,
+						NetworkStatus:     update.NetworkStatus,
+					})
+				}
+			}
+			c.allocLock.RUnlock()
+

Review comment (on the `ar.AcknowledgeState(...)` call): If you wanted to avoid storing a bunch of duplicated client alloc state and implementing a deep equality, I think we could make this a […]. That relies on ARs/TRs not calling […].

Reply: I really wanted to do exactly that sort of thing by retaining an index on both the current client state and the last-acknowledged state. But that doesn't help across a restore, because the AR/TR will make repeat updates as their hooks complete. If the index were a hash it'd require less storage and potentially be less complex, but we wouldn't be able to return early on the comparison checks (whereas with the check we've got, if e.g. the […]). I tried pushing the […].
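To make the trade-off in this thread concrete, here is a rough sketch of the hash-index idea the reviewer appears to be floating: store one digest per alloc instead of a copy of the acknowledged state, and compare digests. All names and the JSON encoding are made up for this sketch; as the reply notes, a single hash cannot return early when a cheap scalar field already differs:

```go
// Illustrative only: a hash-based "last acknowledged" index in place of a
// stored copy of the state.
package main

import (
	"crypto/sha256"
	"encoding/json"
	"fmt"
)

// ackIndex is all that would need to be stored per alloc.
type ackIndex [sha256.Size]byte

// hashState collapses the update-relevant fields into one digest. JSON is
// a simplification; real code would want a stable, canonical encoding.
func hashState(clientStatus, clientDescription string, taskStates map[string]string) ackIndex {
	buf, _ := json.Marshal(struct {
		ClientStatus      string
		ClientDescription string
		TaskStates        map[string]string // stands in for the real task state
	}{clientStatus, clientDescription, taskStates})
	return sha256.Sum256(buf)
}

func main() {
	last := hashState("running", "tasks are running", map[string]string{"web": "running"})
	next := hashState("running", "tasks are running", map[string]string{"web": "running"})

	// One array comparison replaces the field-by-field deep equality, but
	// the whole state must be hashed first: no early return when a cheap
	// scalar field already differs.
	fmt.Println(last == next) // true: duplicate update, skip it
}
```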
 			// Successfully updated allocs, reset map and ticker.
 			// Always reset ticker to give loop time to receive
 			// alloc updates. If the RPC took the ticker interval
 			// we may call it in a tight loop before draining
 			// buffered updates.
 			updates = make(map[string]*structs.Allocation, len(updates))
-			syncTicker.Stop()
-			syncTicker = time.NewTicker(allocSyncIntv)
+			syncTicker.Reset(allocSyncIntv)
 		}
 	}
 }
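An aside on the `syncTicker` changes in this hunk: since Go 1.15, `(*time.Ticker).Reset` changes the period in place, which is what lets both the retry and success paths drop the `Stop` plus `time.NewTicker` pair. A minimal standalone example:

```go
// Demonstrates Reset replacing Stop + NewTicker: the same ticker (and
// channel) is reused with a new period.
package main

import (
	"fmt"
	"time"
)

func main() {
	ticker := time.NewTicker(50 * time.Millisecond)
	defer ticker.Stop()

	<-ticker.C
	fmt.Println("tick at the base interval")

	// Back off: equivalent to the old Stop + NewTicker pair, without
	// dropping the channel or allocating a second ticker.
	ticker.Reset(200 * time.Millisecond)

	<-ticker.C
	fmt.Println("tick at the backoff interval")
}
```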
+
+func (c *Client) filterAcknowledgedUpdates(updates map[string]*structs.Allocation) []*structs.Allocation {
+	sync := make([]*structs.Allocation, 0, len(updates))
+	c.allocLock.RLock()
+	defer c.allocLock.RUnlock()
+	for allocID, update := range updates {
+		if ar, ok := c.allocs[allocID]; ok {
+			if !ar.LastAcknowledgedStateIsCurrent(update) {
+				sync = append(sync, update)
+			}
+		} else {
+			// no allocrunner (typically a failed placement), so we need
+			// to send update
+			sync = append(sync, update)
+		}
+	}
+	return sync
+}
+
 // allocUpdates holds the results of receiving updated allocations from the
Review comment (on `updates = make(map[string]*structs.Allocation, len(updates))`): in Go 1.21 this can become `clear(updates)`, which... doesn't really matter, because a single malloc and a single piece of garbage here isn't going to make any noticeable performance difference 😅
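For completeness, a tiny demonstration of the builtin the commenter mentions, next to the allocation-based reset the diff actually uses; both leave the map empty:

```go
// clear (Go 1.21+) reuses the map's existing storage instead of
// allocating a fresh map as make does.
package main

import "fmt"

func main() {
	updates := map[string]int{"a": 1, "b": 2}

	clear(updates)            // Go 1.21+: delete every entry in place
	fmt.Println(len(updates)) // 0

	// Pre-1.21 pattern used in the diff: allocate a new map.
	updates = make(map[string]int, 2)
	fmt.Println(len(updates)) // 0
}
```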