client: de-duplicate alloc updates and gate during restore #17074

Merged: 1 commit, May 11, 2023
3 changes: 3 additions & 0 deletions .changelog/17074.txt
@@ -0,0 +1,3 @@
```release-note:improvement
client: de-duplicate allocation client status updates and prevent them from being sent until the client has first synchronized with the server
```
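The "gate during restore" half of this change hinges on the client refusing to send any allocation updates until its serversContactedCh has been closed by the first successful server sync. Below is a minimal, self-contained sketch of that gating pattern (illustrative names only, not Nomad code); the de-duplication half is handled separately by the LastAcknowledgedStateIsCurrent comparison added in alloc_runner.go below.

```go
package main

import (
	"fmt"
	"time"
)

func main() {
	// Stand-in for the client's serversContactedCh: closed exactly once,
	// after the first successful sync with a server.
	serversContacted := make(chan struct{})

	// Simulate the first server contact arriving a little later.
	go func() {
		time.Sleep(150 * time.Millisecond)
		close(serversContacted)
	}()

	ticker := time.NewTicker(50 * time.Millisecond)
	defer ticker.Stop()

	for i := 0; i < 6; i++ {
		<-ticker.C
		// Non-blocking check: until the channel is closed the default
		// branch is taken and the update is skipped; once it is closed
		// the first case always wins.
		select {
		case <-serversContacted:
			fmt.Println("tick", i, "- gate open, would send alloc updates")
		default:
			fmt.Println("tick", i, "- gate closed, skipping update")
		}
	}
}
```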
57 changes: 55 additions & 2 deletions client/allocrunner/alloc_runner.go
@@ -11,6 +11,7 @@ import (

log "github.com/hashicorp/go-hclog"
multierror "github.com/hashicorp/go-multierror"
"golang.org/x/exp/maps"

"github.com/hashicorp/nomad/client/allocdir"
"github.com/hashicorp/nomad/client/allocrunner/interfaces"
@@ -123,6 +124,10 @@ type allocRunner struct {
state *state.State
stateLock sync.RWMutex

// lastAcknowledgedState is the alloc runner state that was last
// acknowledged by the server (may lag behind ar.state)
lastAcknowledgedState *state.State

stateDB cstate.StateDB

// allocDir is used to build the allocations directory structure.
@@ -738,8 +743,9 @@ func (ar *allocRunner) killTasks() map[string]*structs.TaskState {
return states
}

// clientAlloc takes in the task states and returns an Allocation populated
// with Client specific fields
// clientAlloc takes in the task states and returns an Allocation populated with
// Client specific fields. Note: this mutates the allocRunner's state to store
// the taskStates!
func (ar *allocRunner) clientAlloc(taskStates map[string]*structs.TaskState) *structs.Allocation {
ar.stateLock.Lock()
defer ar.stateLock.Unlock()
@@ -1394,3 +1400,50 @@ func (ar *allocRunner) GetTaskDriverCapabilities(taskName string) (*drivers.Capa

return tr.DriverCapabilities()
}

// AcknowledgeState is called by the client's alloc sync when a given client
// state has been acknowledged by the server
func (ar *allocRunner) AcknowledgeState(a *state.State) {
ar.stateLock.Lock()
defer ar.stateLock.Unlock()
ar.lastAcknowledgedState = a
ar.persistLastAcknowledgedState(a)
}

// persistLastAcknowledgedState stores the last client state acknowledged by the server
func (ar *allocRunner) persistLastAcknowledgedState(a *state.State) {
if err := ar.stateDB.PutAcknowledgedState(ar.id, a); err != nil {
// While any persistence errors are very bad, the worst case scenario
// for failing to persist last acknowledged state is that if the agent
// is restarted it will send the update again.
ar.logger.Error("error storing acknowledged allocation status", "error", err)
}
}

// LastAcknowledgedStateIsCurrent returns true if the current state matches the
// state that was last acknowledged from a server update. This is called from
// the client in the same goroutine that called AcknowledgeState so that we
// can't get a TOCTOU error.
func (ar *allocRunner) LastAcknowledgedStateIsCurrent(a *structs.Allocation) bool {
ar.stateLock.RLock()
defer ar.stateLock.RUnlock()

last := ar.lastAcknowledgedState
if last == nil {
return false
}

switch {
case last.ClientStatus != a.ClientStatus:
return false
case last.ClientDescription != a.ClientDescription:
return false
case !last.DeploymentStatus.Equal(a.DeploymentStatus):
return false
case !last.NetworkStatus.Equal(a.NetworkStatus):
return false
}
return maps.EqualFunc(last.TaskStates, a.TaskStates, func(st, o *structs.TaskState) bool {
return st.Equal(o)
})
}
64 changes: 61 additions & 3 deletions client/allocrunner/alloc_runner_test.go
@@ -14,8 +14,13 @@ import (

"github.com/hashicorp/consul/api"
multierror "github.com/hashicorp/go-multierror"
"github.com/shoenig/test/must"
"github.com/shoenig/test/wait"
"github.com/stretchr/testify/require"

"github.com/hashicorp/nomad/ci"
"github.com/hashicorp/nomad/client/allochealth"
arstate "github.com/hashicorp/nomad/client/allocrunner/state"
"github.com/hashicorp/nomad/client/allocrunner/tasklifecycle"
"github.com/hashicorp/nomad/client/allocrunner/taskrunner"
"github.com/hashicorp/nomad/client/allocwatcher"
@@ -26,9 +31,6 @@ import (
"github.com/hashicorp/nomad/nomad/mock"
"github.com/hashicorp/nomad/nomad/structs"
"github.com/hashicorp/nomad/testutil"
"github.com/shoenig/test/must"
"github.com/shoenig/test/wait"
"github.com/stretchr/testify/require"
)

// destroy does a blocking destroy on an alloc runner
@@ -2443,3 +2445,59 @@ func TestAllocRunner_PreKill_RunOnDone(t *testing.T) {
wait.Gap(500*time.Millisecond),
))
}

func TestAllocRunner_LastAcknowledgedStateIsCurrent(t *testing.T) {
ci.Parallel(t)

alloc := mock.Alloc()
task := alloc.Job.TaskGroups[0].Tasks[0]
task.Driver = "mock_driver"
task.Config = map[string]interface{}{"run_for": "2ms"}
alloc.DesiredStatus = "stop"

conf, cleanup := testAllocRunnerConfig(t, alloc.Copy())
t.Cleanup(cleanup)

ar, err := NewAllocRunner(conf)
must.NoError(t, err)

ar.SetNetworkStatus(&structs.AllocNetworkStatus{
InterfaceName: "eth0",
Address: "192.168.1.1",
DNS: &structs.DNSConfig{},
})

calloc := ar.clientAlloc(map[string]*structs.TaskState{})
ar.AcknowledgeState(&arstate.State{
ClientStatus: calloc.ClientStatus,
ClientDescription: calloc.ClientDescription,
DeploymentStatus: calloc.DeploymentStatus,
TaskStates: calloc.TaskStates,
NetworkStatus: calloc.NetworkStatus,
})

must.True(t, ar.LastAcknowledgedStateIsCurrent(calloc))

// clientAlloc mutates the state, so verify this doesn't break the check
// without state having been updated
calloc = ar.clientAlloc(map[string]*structs.TaskState{})
must.True(t, ar.LastAcknowledgedStateIsCurrent(calloc))

// make a no-op state update
ar.SetNetworkStatus(&structs.AllocNetworkStatus{
InterfaceName: "eth0",
Address: "192.168.1.1",
DNS: &structs.DNSConfig{},
})
calloc = ar.clientAlloc(map[string]*structs.TaskState{})
must.True(t, ar.LastAcknowledgedStateIsCurrent(calloc))

// make a state update that should be detected as a change
ar.SetNetworkStatus(&structs.AllocNetworkStatus{
InterfaceName: "eth0",
Address: "192.168.2.1",
DNS: &structs.DNSConfig{},
})
calloc = ar.clientAlloc(map[string]*structs.TaskState{})
must.False(t, ar.LastAcknowledgedStateIsCurrent(calloc))
}
66 changes: 58 additions & 8 deletions client/client.go
@@ -160,6 +160,8 @@ type AllocRunner interface {
Signal(taskName, signal string) error
GetTaskEventHandler(taskName string) drivermanager.EventHandler
PersistState() error
AcknowledgeState(*arstate.State)
LastAcknowledgedStateIsCurrent(*structs.Allocation) bool

RestartTask(taskName string, taskEvent *structs.TaskEvent) error
RestartRunning(taskEvent *structs.TaskEvent) error
@@ -512,7 +514,7 @@ func NewClient(cfg *config.Config, consulCatalog consul.CatalogAPI, consulProxie
c.serviceRegWrapper = wrapper.NewHandlerWrapper(c.logger, c.consulService, c.nomadService)

// Batching of initial fingerprints is done to reduce the number of node
// updates sent to the server on startup. This is the first RPC to the servers
// updates sent to the server on startup.
go c.batchFirstFingerprints()

// create heartbeatStop. We go after the first attempt to connect to the server, so
@@ -1270,6 +1272,14 @@ func (c *Client) restoreState() error {
continue
}

allocState, err := c.stateDB.GetAcknowledgedState(alloc.ID)
if err != nil {
c.logger.Error("error restoring last acknowledged alloc state, will update again",
err, "alloc_id", alloc.ID)
} else {
ar.AcknowledgeState(allocState)
}

// Maybe mark the alloc for halt on missing server heartbeats
if c.heartbeatStop.shouldStop(alloc) {
err = c.heartbeatStop.stopAlloc(alloc.ID)
@@ -2144,10 +2154,20 @@ func (c *Client) allocSync() {
if len(updates) == 0 {
continue
}
// Ensure we never send an update before we've had at least one sync
// from the server
select {
case <-c.serversContactedCh:
default:
continue
}

sync := make([]*structs.Allocation, 0, len(updates))
for _, alloc := range updates {
sync = append(sync, alloc)
sync := c.filterAcknowledgedUpdates(updates)
if len(sync) == 0 {
// No updates to send
updates = make(map[string]*structs.Allocation, len(updates))
Review comment (Member):
In Go 1.21 this can become clear(updates) which... doesn't really matter because a single malloc and single piece of garbage here isn't going to make any noticeable performance difference 😅
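For context, clear is a builtin added in Go 1.21 that empties a map in place; a tiny self-contained sketch of the simplification the reviewer is alluding to (the merged code keeps the make call):

```go
package main

import "fmt"

func main() {
	// Hypothetical stand-in for the client's pending `updates` map.
	updates := map[string]int{"alloc-1": 1, "alloc-2": 2}

	// Go 1.21+: clear removes every entry but reuses the existing map
	// allocation, rather than replacing it with make(...).
	clear(updates)

	fmt.Println(len(updates)) // prints 0
}
```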

syncTicker.Reset(allocSyncIntv)
continue
}

// Send to server.
@@ -2162,21 +2182,51 @@
// Error updating allocations, do *not* clear
// updates and retry after backoff
c.logger.Error("error updating allocations", "error", err)
syncTicker.Stop()
syncTicker = time.NewTicker(c.retryIntv(allocSyncRetryIntv))
syncTicker.Reset(c.retryIntv(allocSyncRetryIntv))
continue
}

c.allocLock.RLock()
for _, update := range sync {
if ar, ok := c.allocs[update.ID]; ok {
ar.AcknowledgeState(&arstate.State{
Review comment (Member):
If you wanted to avoid storing a bunch of duplicated client alloc state and implementing a deep equality, I think we could make this a uint64, increment it on every Client.AllocStateUpdated call, and have the Client persist it here.

That relies on ARs/TRs not calling Client.AllocStateUpdated even when nothing has been updated though, so I think your approach is safer.

Reply (Member Author, @tgross, May 9, 2023):
I really wanted to do exactly that sort of thing by retaining an index on both the current client state and the last-acknowledged state, but that doesn't help across a restore because the AR/TR will make repeat updates as their hooks complete.

If the index were a hash it'd require less storage and potentially be less complex, but we wouldn't be able to return early on the comparison checks (whereas with the check we've got, if e.g. the ClientStatus field changes we can immediately bail out and say "yes, need to update"). That said, we could maybe cut down on the complexity of the equality check by not making it an equality check -- a bunch of the ClientState is immutable events, so we could get away with a simple length check on those. (edit: as it turns out, not so much)

As for relying on ARs/TRs not calling Client.AllocStateUpdated when nothing has changed: I tried pushing the LastAcknowledgedStateIsCurrent check into the AllocStateUpdated method in an earlier draft, but then the check and the write+ACK happen in different goroutines and there's a TOCTOU bug.

(A rough sketch of this counter-based alternative appears after the allocSync function below.)

ClientStatus: update.ClientStatus,
ClientDescription: update.ClientDescription,
DeploymentStatus: update.DeploymentStatus,
TaskStates: update.TaskStates,
NetworkStatus: update.NetworkStatus,
})
}
}
c.allocLock.RUnlock()

// Successfully updated allocs, reset map and ticker.
// Always reset ticker to give loop time to receive
// alloc updates. If the RPC took the ticker interval
// we may call it in a tight loop before draining
// buffered updates.
updates = make(map[string]*structs.Allocation, len(updates))
syncTicker.Stop()
syncTicker = time.NewTicker(allocSyncIntv)
syncTicker.Reset(allocSyncIntv)
}
}
}
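For comparison, here is a rough, self-contained sketch of the index-counter alternative floated in the review thread above (hypothetical names; not what this PR implements). As the thread notes, the in-memory index is lost on client restart, which is why the PR persists and compares the last acknowledged state instead.

```go
package sketch

import "sync"

// indexedRunner sketches the rejected alternative: bump a counter on every
// client-side state change and record the counter value the server last
// acknowledged, instead of storing a copy of the acknowledged state.
type indexedRunner struct {
	mu         sync.RWMutex
	stateIndex uint64 // incremented on each state change reported to the client
	ackedIndex uint64 // index covered by the last successful server update
}

// noteStateChanged would be called wherever the alloc runner mutates its
// client-visible state (roughly where Client.AllocStateUpdated fires today).
func (r *indexedRunner) noteStateChanged() {
	r.mu.Lock()
	defer r.mu.Unlock()
	r.stateIndex++
}

// acknowledge records the index that was current when the server update
// succeeded.
func (r *indexedRunner) acknowledge(idx uint64) {
	r.mu.Lock()
	defer r.mu.Unlock()
	r.ackedIndex = idx
}

// isCurrent reports whether the server has already acknowledged the latest
// state; if true, the update could be dropped from the next sync batch.
func (r *indexedRunner) isCurrent() bool {
	r.mu.RLock()
	defer r.mu.RUnlock()
	return r.ackedIndex == r.stateIndex
}
```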

func (c *Client) filterAcknowledgedUpdates(updates map[string]*structs.Allocation) []*structs.Allocation {
sync := make([]*structs.Allocation, 0, len(updates))
c.allocLock.RLock()
defer c.allocLock.RUnlock()
for allocID, update := range updates {
if ar, ok := c.allocs[allocID]; ok {
if !ar.LastAcknowledgedStateIsCurrent(update) {
sync = append(sync, update)
}
} else {
// no allocrunner (typically a failed placement), so we need
// to send the update
sync = append(sync, update)
}
}
return sync
}

// allocUpdates holds the results of receiving updated allocations from the