Skip to content
This repository was archived by the owner on Aug 23, 2023. It is now read-only.

Commit 8d392cb

Browse files
authored
Merge pull request #979 from grafana/speculative-fixes
Speculative execution fixes
2 parents adbae33 + 3428869 commit 8d392cb

File tree

2 files changed

+28
-15
lines changed

2 files changed

+28
-15
lines changed

api/cluster.go

+18-12
Original file line numberDiff line numberDiff line change
@@ -306,7 +306,7 @@ func (s *Server) peerQuery(ctx context.Context, data cluster.Traceable, name, pa
306306
// across the cluster, except to the local peer. If any peer fails requests to
307307
// other peers are aborted. If enough peers have been heard from (based on
308308
// speculation-threshold configuration), and we are missing the others, try to
309-
// speculatively query other members of the shard group.
309+
// speculatively query each other member of each shard group.
310310
// ctx: request context
311311
// data: request to be submitted
312312
// name: name to be used in logging & tracing
@@ -323,7 +323,6 @@ func (s *Server) peerQuerySpeculative(ctx context.Context, data cluster.Traceabl
323323
defer cancel()
324324

325325
originalPeers := make(map[string]struct{}, len(peerGroups))
326-
pendingResponses := make(map[int32]struct{}, len(peerGroups))
327326
receivedResponses := make(map[int32]struct{}, len(peerGroups))
328327

329328
responses := make(chan struct {
@@ -357,15 +356,20 @@ func (s *Server) peerQuerySpeculative(ctx context.Context, data cluster.Traceabl
357356
for group, peers := range peerGroups {
358357
peer := peers[0]
359358
originalPeers[peer.GetName()] = struct{}{}
360-
pendingResponses[group] = struct{}{}
361359
go askPeer(group, peer)
362360
}
363361

364362
result := make(map[string]PeerResponse)
365363

366-
specCheckTicker := time.NewTicker(5 * time.Millisecond)
364+
var ticker *time.Ticker
365+
var tickChan <-chan time.Time
366+
if speculationThreshold != 1 {
367+
ticker = time.NewTicker(5 * time.Millisecond)
368+
tickChan = ticker.C
369+
defer ticker.Stop()
370+
}
367371

368-
for len(pendingResponses) > 0 {
372+
for len(receivedResponses) < len(peerGroups) {
369373
select {
370374
case resp := <-responses:
371375
if _, ok := receivedResponses[resp.shardGroup]; ok {
@@ -379,18 +383,20 @@ func (s *Server) peerQuerySpeculative(ctx context.Context, data cluster.Traceabl
379383

380384
result[resp.data.peer.GetName()] = resp.data
381385
receivedResponses[resp.shardGroup] = struct{}{}
382-
delete(pendingResponses, resp.shardGroup)
383386
delete(originalPeers, resp.data.peer.GetName())
384387

385-
case <-specCheckTicker.C:
388+
case <-tickChan:
386389
// Check if it's time to speculate!
387-
percentReceived := 1 - (float64(len(pendingResponses)) / float64(len(peerGroups)))
388-
if percentReceived > speculationThreshold {
390+
percentReceived := float64(len(receivedResponses)) / float64(len(peerGroups))
391+
if percentReceived >= speculationThreshold {
389392
// kick off speculative queries to other members now
390-
specCheckTicker.Stop()
393+
ticker.Stop()
391394
speculativeAttempts.Inc()
392-
for shardGroup := range pendingResponses {
393-
eligiblePeers := peerGroups[shardGroup][1:]
395+
for shardGroup, peers := range peerGroups {
396+
if _, ok := receivedResponses[shardGroup]; ok {
397+
continue
398+
}
399+
eligiblePeers := peers[1:]
394400
for _, peer := range eligiblePeers {
395401
speculativeRequests.Inc()
396402
go askPeer(shardGroup, peer)

cluster/cluster.go

+10-3
Original file line numberDiff line numberDiff line change
@@ -71,7 +71,7 @@ type partitionCandidates struct {
7171
nodes []Node
7272
}
7373

74-
// return the list of nodes to broadcast requests to
74+
// MembersForQuery returns the list of nodes to broadcast requests to
7575
// If partitions are assinged to nodes in groups
7676
// (a[0,1], b[0,1], c[2,3], d[2,3] as opposed to a[0,1], b[0,2], c[1,3], d[2,3]),
7777
// only 1 member per partition is returned.
@@ -133,7 +133,10 @@ func MembersForQuery() ([]Node, error) {
133133
count := int(atomic.AddUint32(&counter, 1))
134134

135135
LOOP:
136+
// for every partition...
136137
for _, candidates := range membersMap {
138+
139+
// prefer the local node if it serves this partition
137140
if candidates.nodes[0].GetName() == thisNode.GetName() {
138141
if _, ok := selectedMembers[thisNode.GetName()]; !ok {
139142
selectedMembers[thisNode.GetName()] = struct{}{}
@@ -142,14 +145,18 @@ LOOP:
142145
continue LOOP
143146
}
144147

148+
// for remote nodes, try to pick one we've already included
149+
145150
for _, n := range candidates.nodes {
146151
if _, ok := selectedMembers[n.GetName()]; ok {
147152
continue LOOP
148153
}
149154
}
155+
150156
// if no nodes have been selected yet then grab a node from
151157
// the set of available nodes in such a way that nodes are
152158
// weighted fairly across MembersForQuery calls
159+
153160
selected := candidates.nodes[count%len(candidates.nodes)]
154161
selectedMembers[selected.GetName()] = struct{}{}
155162
answer = append(answer, selected)
@@ -159,7 +166,7 @@ LOOP:
159166
}
160167

161168
// MembersForSpeculativeQuery returns a prioritized list of nodes for each shard group
162-
// TODO - this assumes that the partition set for each node is perfectly aligned
169+
// keyed by the first (lowest) partition of their shard group
163170
func MembersForSpeculativeQuery() (map[int32][]Node, error) {
164171
thisNode := Manager.ThisNode()
165172
allNodes := Manager.MemberList()
@@ -192,7 +199,7 @@ func MembersForSpeculativeQuery() (map[int32][]Node, error) {
192199
}
193200

194201
for _, shard := range membersMap {
195-
// Shuffle to avoid always choosing the same peer firsts
202+
// Shuffle to avoid always choosing the same peer first
196203
for i := len(shard) - 1; i > 0; i-- {
197204
j := rand.Intn(i + 1)
198205
shard[i], shard[j] = shard[j], shard[i]

0 commit comments

Comments
 (0)