Skip to content

Commit 45aeb6e

Browse files
committed
feat(sync): split new journal requests from rescheduled
Rescheduled requests only traversed when at least one is ready.
1 parent da81647 commit 45aeb6e

File tree

3 files changed

+122
-31
lines changed

3 files changed

+122
-31
lines changed

rst/sync/internal/workmgr/manager.go

Lines changed: 71 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -200,12 +200,23 @@ func NewAndStart(log *zap.Logger, config Config, beeRemoteClient *beeremote.Clie
200200

201201
allEntriesFound := 0
202202
for priority := range priorityLevels {
203-
entriesFound, err := m.initScheduler(SubmissionIdPriorityRange(priority))
203+
lastSubmissionId, nextRescheduledTime, entriesFound, err := m.initScheduler(SubmissionIdPriorityRange(priority))
204204
if err != nil {
205205
m.log.Error("failed to initialize scheduler", zap.Error(err))
206206
break
207207
}
208+
208209
allEntriesFound += entriesFound
210+
if !nextRescheduledTime.IsZero() {
211+
m.scheduler.SetNextRescheduledTime(nextRescheduledTime, priority)
212+
}
213+
if lastSubmissionId != nil {
214+
if nextSubmissionId, _, err := IncrementSubmissionId(*lastSubmissionId); err != nil {
215+
m.log.Error("failed to initialize scheduler", zap.Error(err))
216+
} else {
217+
m.scheduler.SetNextSubmissionId(nextSubmissionId, priority)
218+
}
219+
}
209220
}
210221
if allEntriesFound > 0 {
211222
m.log.Info("discovered work requests from previous run", zap.Int("requests", allEntriesFound))
@@ -285,7 +296,7 @@ func (m *Manager) manage(deferredFuncs []func() error) {
285296
workJournal: m.workJournal,
286297
jobStore: m.jobStore,
287298
beeRemoteClient: m.beeRemoteClient,
288-
rescheduleWork: m.scheduler.AddWorkToken,
299+
rescheduleWork: m.scheduler.RescheduleWork,
289300
}
290301
m.workerWG.Add(1)
291302
go w.run(m.workerCtx, m.workerWG)
@@ -299,17 +310,33 @@ func (m *Manager) manage(deferredFuncs []func() error) {
299310
case <-m.mgrCtx.Done():
300311
return
301312
case allowedTokens := <-m.scheduler.tokensReleased:
313+
currentTime := time.Now()
302314
for priority, ok := nextPriority(); ok; priority, ok = nextPriority() {
303315
tokensDistributed := allowedTokens[priority]
304316
if tokensDistributed == 0 {
305317
continue
306318
}
307319

308320
start, stop := SubmissionIdPriorityRange(priority)
309-
err = m.pullInWork(start, stop, allowedTokens[priority])
310-
if err != nil {
321+
nextSubmissionId := m.scheduler.GetNextSubmissionId(priority)
322+
323+
// Add rescheduled work to the activeWork map
324+
nextRescheduledTime := m.scheduler.GetNextRescheduledTime(priority)
325+
if currentTime.After(nextRescheduledTime) {
326+
if _, nextRescheduledTime, err = m.pullInWork(start, nextSubmissionId, &allowedTokens[priority]); err != nil {
327+
m.log.Error("failed to pull in new work", zap.Error(err))
328+
break
329+
} else {
330+
m.scheduler.SetNextRescheduledTime(nextRescheduledTime, priority)
331+
}
332+
}
333+
334+
// Add new work to the activeWork map
335+
if next, _, err := m.pullInWork(nextSubmissionId, stop, &allowedTokens[priority]); err != nil {
311336
m.log.Error("failed to pull in new work", zap.Error(err))
312337
break
338+
} else if next != nil {
339+
m.scheduler.SetNextSubmissionId(*next, priority)
313340
}
314341
}
315342
case completion := <-completedWork:
@@ -335,9 +362,13 @@ func (m *Manager) manage(deferredFuncs []func() error) {
335362
}
336363

337364
// pullInWork moves ready work from the priority range to the activeWork map.
338-
func (m *Manager) pullInWork(start string, stop string, availableTokens int) error {
339-
if availableTokens <= 0 {
340-
return nil
365+
func (m *Manager) pullInWork(start string, stop string, availableTokens *int) (lastSubmissionId *string, nextExecuteAfter time.Time, err error) {
366+
if availableTokens == nil {
367+
err = fmt.Errorf("availableTokens was unexpectedly nil: this is a bug")
368+
return
369+
}
370+
if *availableTokens <= 0 {
371+
return
341372
}
342373

343374
m.activeWorkMu.Lock()
@@ -348,26 +379,29 @@ func (m *Manager) pullInWork(start string, stop string, availableTokens int) err
348379
kvstore.WithStopKey(stop),
349380
)
350381
if err != nil {
351-
return fmt.Errorf("unable to get work journal entries: %w", err)
382+
err = fmt.Errorf("unable to get work journal entries: %w", err)
383+
return
352384
}
353385
defer cleanupNext()
354386

355387
item, err := nextItem()
356388
if err != nil {
357-
return fmt.Errorf("unable to get work journal entry: %w", err)
389+
err = fmt.Errorf("unable to get work journal entry: %w", err)
390+
return
358391
}
359392
if item == nil {
360-
return nil
393+
return
361394
}
362395

396+
lastSubmissionId = new(string)
363397
currentTime := time.Now()
364-
for item != nil && availableTokens > 0 {
365-
submissionID := item.Key
398+
for item != nil && *availableTokens > 0 {
399+
*lastSubmissionId = item.Key
366400
entry := item.Entry.Value
367401

368402
if currentTime.After(entry.ExecuteAfter) {
369403
workId := workIdentifier{
370-
submissionID: submissionID,
404+
submissionID: *lastSubmissionId,
371405
jobID: entry.WorkRequest.JobId,
372406
workRequestID: entry.WorkRequest.RequestId,
373407
}
@@ -380,11 +414,15 @@ func (m *Manager) pullInWork(start string, stop string, availableTokens int) err
380414
activeWork := workAssignment{ctx: workCtx, workIdentifier: workId}
381415
m.activeWork[activeWork.workIdentifier] = workContext{ctx: workCtx, cancel: workCtxCancel}
382416
m.activeWorkQueue <- activeWork
383-
availableTokens -= 1
384-
m.scheduler.RemoveWorkToken(submissionID)
417+
*availableTokens -= 1
418+
m.scheduler.RemoveWorkToken(*lastSubmissionId)
385419
priority := priorityIdMap[entry.WorkRequest.GetPriority()]
386420
beeSyncActiveQueue.Add(priority, 1)
387421
}
422+
} else {
423+
if nextExecuteAfter.IsZero() || entry.ExecuteAfter.Before(nextExecuteAfter) {
424+
nextExecuteAfter = entry.ExecuteAfter
425+
}
388426
}
389427

390428
item, err = nextItem()
@@ -393,41 +431,49 @@ func (m *Manager) pullInWork(start string, stop string, availableTokens int) err
393431
}
394432
}
395433

396-
return err
434+
return
397435
}
398436

399-
// initScheduler adds tokens for unfinished work requests so they can be handled when the Sync
400-
// node starts.
401-
func (m *Manager) initScheduler(start string, stop string) (int, error) {
437+
// initScheduler adds tokens for unfinished work requests so they can be handled when the Sync node
438+
// starts. It returns the last entry's submissionId, entries found and an error if there was one.
439+
func (m *Manager) initScheduler(start string, stop string) (submissionId *string, nextExecuteAfter time.Time, entriesFound int, err error) {
402440
nextItem, cleanupNext, err := m.workJournal.GetEntries(
403441
kvstore.WithStartingKey(start),
404442
kvstore.WithStopKey(stop),
405443
)
406444
if err != nil {
407-
return 0, fmt.Errorf("unable to get work journal entries: %w", err)
445+
err = fmt.Errorf("unable to get work journal entries: %w", err)
446+
return
408447
}
409448
defer cleanupNext()
410449

411450
submission, err := nextItem()
412451
if err != nil {
413-
return 0, fmt.Errorf("unable to get work journal entry: %w", err)
452+
err = fmt.Errorf("unable to get work journal entry: %w", err)
453+
return
414454
}
415455
if submission == nil {
416-
return 0, nil
456+
return
417457
}
418458

419-
entriesFound := 0
459+
submissionId = new(string)
420460
for submission != nil {
421-
m.scheduler.AddWorkToken(submission.Key)
461+
*submissionId = submission.Key
462+
entry := submission.Entry.Value
463+
m.scheduler.AddWorkToken(*submissionId)
422464
entriesFound++
423465

466+
if nextExecuteAfter.IsZero() || entry.ExecuteAfter.Before(nextExecuteAfter) {
467+
nextExecuteAfter = entry.ExecuteAfter
468+
}
469+
424470
submission, err = nextItem()
425471
if err != nil {
426472
break
427473
}
428474
}
429475

430-
return entriesFound, err
476+
return
431477
}
432478

433479
func (m *Manager) SubmitWorkRequest(wr *flex.WorkRequest) (*flex.Work, error) {

rst/sync/internal/workmgr/scheduler.go

Lines changed: 49 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@ package workmgr
22

33
import (
44
"context"
5+
"fmt"
56
"math"
67
"strconv"
78
"sync/atomic"
@@ -60,7 +61,9 @@ type scheduler struct {
6061
// tokensReleased is a buffered channel releases the priority tokens to the sync manager. This
6162
// must be buffered to avoid deadlocking with manager. The channel must hold every sendWork that
6263
// can occur before the manager loop reads m.scheduler.tokensReleased.
63-
tokensReleased chan [priorityLevels]int
64+
tokensReleased chan [priorityLevels]int
65+
nextRescheduledTimes [priorityLevels]*time.Time
66+
nextSubmissionIds [priorityLevels]string
6467
}
6568

6669
func NewScheduler(ctx context.Context, log *zap.Logger, queue chan workAssignment, fairness gemetricRatio, opts ...schedulerOpt) (s *scheduler, close func() error) {
@@ -148,25 +151,53 @@ func NewScheduler(ctx context.Context, log *zap.Logger, queue chan workAssignmen
148151
return
149152
}
150153

154+
func (s *scheduler) GetNextRescheduledTime(priority int) time.Time {
155+
if s.nextRescheduledTimes[priority] == nil {
156+
return time.Time{}
157+
}
158+
return *s.nextRescheduledTimes[priority]
159+
}
160+
func (s *scheduler) SetNextRescheduledTime(ExecuteAfter time.Time, priority int) {
161+
if s.nextRescheduledTimes[priority] == nil {
162+
s.nextRescheduledTimes[priority] = new(time.Time)
163+
*s.nextRescheduledTimes[priority] = ExecuteAfter
164+
} else if ExecuteAfter.Before(*s.nextRescheduledTimes[priority]) {
165+
*s.nextRescheduledTimes[priority] = ExecuteAfter
166+
}
167+
}
168+
func (s *scheduler) GetNextSubmissionId(priority int) string {
169+
return s.nextSubmissionIds[priority]
170+
}
171+
func (s *scheduler) SetNextSubmissionId(submissionId string, priority int) {
172+
s.nextSubmissionIds[priority] = submissionId
173+
}
174+
175+
func (s *scheduler) RescheduleWork(submissionId string, ExecuteAfter time.Time) {
176+
priority := s.AddWorkToken(submissionId)
177+
s.SetNextRescheduledTime(ExecuteAfter, int(priority))
178+
}
179+
151180
// AddWorkToken(submissionID) tells the scheduler about a submission in the journal that is eligible
152181
// to be scheduled. It should be called whenever a WR is created, rediscovered on startup, or
153182
// rescheduled. The scheduler decodes the submission ID to determine the priority and increment that
154183
// bucket's token count. These counts are used to keep track of pending work at each priority to
155184
// ensure no priority queue is starved. Tokens represent work that is ready but not yet dispatched
156185
// to a worker.
157-
func (s *scheduler) AddWorkToken(submissionId string) {
186+
func (s *scheduler) AddWorkToken(submissionId string) int32 {
158187
priority := submissionIdPriority(submissionId)
159188
s.workTokens[priority].Add(1)
160189
s.allWorkTokens.Add(1)
190+
return priority
161191
}
162192

163193
// RemoveWorkToken(submissionID) is called once a work assignment has been added to the active work
164194
// queue (not when it actually completes). This tells the scheduler a request at the given priority
165195
// has been handed to a worker, allowing it to internally adjust how it assigns new work.
166-
func (s *scheduler) RemoveWorkToken(submissionId string) {
196+
func (s *scheduler) RemoveWorkToken(submissionId string) int32 {
167197
priority := submissionIdPriority(submissionId)
168198
s.workTokens[priority].Add(-1)
169199
s.allWorkTokens.Add(-1)
200+
return priority
170201
}
171202

172203
func (s *scheduler) getUpdateStatsFn(
@@ -346,8 +377,22 @@ func CreateSubmissionId(baseKey string, workRequestPriority int32) (string, int3
346377
if priority < 0 || priority > priorityLevels-1 {
347378
priority = 2
348379
}
380+
381+
workRequestPriority = priority + 1
349382
leadByte := baseKey[0] + submissionIdPriorityOffsetTable[priority]
350-
return string(leadByte) + baseKey[1:], priority + 1
383+
return string(leadByte) + baseKey[1:], workRequestPriority
384+
}
385+
386+
func IncrementSubmissionId(key string) (string, int32, error) {
387+
workRequestPriority := submissionIdPriority(key) + 1
388+
value, err := strconv.ParseUint(submissionBaseKey(key), 36, 64)
389+
if err != nil {
390+
return "", 0, fmt.Errorf("unable to cast last submission ID to an integer '%s': %w", key, err)
391+
}
392+
393+
baseKey := fmt.Sprintf("%013s", strconv.FormatUint(value+1, 36))
394+
submissionId, priority := CreateSubmissionId(baseKey, workRequestPriority)
395+
return submissionId, priority, nil
351396
}
352397

353398
func DemoteSubmissionId(key string) (string, int32) {

rst/sync/internal/workmgr/work.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -130,7 +130,7 @@ type worker struct {
130130
workJournal *kvstore.MapStore[workEntry]
131131
jobStore *kvstore.MapStore[map[string]string]
132132
beeRemoteClient *beeremote.Client
133-
rescheduleWork func(string)
133+
rescheduleWork func(submissionId string, ExecuteAfter time.Time)
134134
}
135135

136136
func (w *worker) run(ctx context.Context, wg *sync.WaitGroup) {
@@ -296,7 +296,7 @@ func (w *worker) processWork(work workAssignment) {
296296
status.SetState(flex.Work_RESCHEDULED)
297297
status.SetMessage("waiting for work request to be ready")
298298
w.sendWorkResult(work, result.Work)
299-
w.rescheduleWork(work.submissionID)
299+
w.rescheduleWork(work.submissionID, journalEntry.Value.ExecuteAfter)
300300
beeSyncWaitQueue.Add(mappedPriorityId, 1)
301301
return
302302
}

0 commit comments

Comments
 (0)