Skip to content

Commit 4f37bcf

Browse files
committed
cmd/coordinator: add tests for the scheduler, and resulting fixes
It now passes thousands of iterations in race mode. Updates golang/go#19178 Change-Id: I210277abd084bdfd3c7ada538189722ff9543e3f Reviewed-on: https://go-review.googlesource.com/c/build/+/207079 Run-TryBot: Brad Fitzpatrick <bradfitz@golang.org> TryBot-Result: Gobot Gobot <gobot@golang.org> Reviewed-by: Bryan C. Mills <bcmills@google.com>
1 parent 9424ac6 commit 4f37bcf

File tree

3 files changed

+278
-33
lines changed

3 files changed

+278
-33
lines changed

cmd/coordinator/coordinator.go

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1593,6 +1593,9 @@ func poolForConf(conf *dashboard.HostConfig) BuildletPool {
15931593
if testPoolHook != nil {
15941594
return testPoolHook(conf)
15951595
}
1596+
if conf == nil {
1597+
panic("nil conf")
1598+
}
15961599
switch {
15971600
case conf.IsVM():
15981601
return gcePool

cmd/coordinator/sched.go

Lines changed: 66 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@ package main
99

1010
import (
1111
"context"
12+
"fmt"
1213
"log"
1314
"sync"
1415
"time"
@@ -26,7 +27,7 @@ import (
2627
// If false, any GetBuildlet call to the schedule delegates directly
2728
// to the BuildletPool's GetBuildlet and we make a bunch of callers
2829
// fight over a mutex and a random one wins, like we used to do it.
29-
const useScheduler = false
30+
var useScheduler = false
3031

3132
// The Scheduler prioritizes access to buidlets. It accepts requests
3233
// for buildlets, starts the creation of buildlets from BuildletPools,
@@ -79,8 +80,9 @@ func (s *Scheduler) matchBuildlet(res getBuildletResult) {
7980
return
8081
}
8182
select {
82-
case waiter.res <- res.Client:
83+
case ch := <-waiter.wantRes:
8384
// Normal happy case. Something gets its buildlet.
85+
ch <- res.Client
8486
return
8587
case <-waiter.ctxDone:
8688
// Waiter went away in the tiny window between
@@ -138,28 +140,45 @@ func (l stderrLogger) CreateSpan(event string, optText ...string) spanlog.Span {
138140
return createSpan(l, event, optText...)
139141
}
140142

143+
// getPoolBuildlet is launched as its own goroutine to do a
144+
// potentially long blocking cal to pool.GetBuildlet.
141145
func (s *Scheduler) getPoolBuildlet(pool BuildletPool, hostType string) {
142146
res := getBuildletResult{
143147
Pool: pool,
144148
HostType: hostType,
145149
}
146150
ctx := context.Background() // TODO: make these cancelable and cancel unneeded ones earlier?
147151
res.Client, res.Err = pool.GetBuildlet(ctx, hostType, stderrLogger{})
152+
153+
// This is still slightly racy, but probably ok for now.
154+
// (We might invoke the schedule method right after
155+
// GetBuildlet returns and dial an extra buildlet, but if so
156+
// we'll close it without using it.)
157+
s.mu.Lock()
158+
s.hostsCreating[res.HostType]--
159+
s.mu.Unlock()
160+
148161
s.matchBuildlet(res)
149162
}
150163

151-
// matchWaiter returns (and removes from the waiting queue) the highest priority SchedItem
164+
// matchWaiter returns (and removes from the waiting set) the highest priority SchedItem
152165
// that matches the provided host type.
153166
func (s *Scheduler) matchWaiter(hostType string) (_ *SchedItem, ok bool) {
154167
s.mu.Lock()
155168
defer s.mu.Unlock()
169+
waiters := s.waiting[hostType]
170+
156171
var best *SchedItem
157-
for si := range s.waiting[hostType] {
172+
for si := range waiters {
158173
if best == nil || schedLess(si, best) {
159174
best = si
160175
}
161176
}
162-
return best, best != nil
177+
if best != nil {
178+
delete(waiters, best)
179+
return best, true
180+
}
181+
return nil, false
163182
}
164183

165184
func (s *Scheduler) removeWaiter(si *SchedItem) {
@@ -170,7 +189,7 @@ func (s *Scheduler) removeWaiter(si *SchedItem) {
170189
}
171190
}
172191

173-
func (s *Scheduler) enqueueWaiter(si *SchedItem) {
192+
func (s *Scheduler) addWaiter(si *SchedItem) {
174193
s.mu.Lock()
175194
defer s.mu.Unlock()
176195
if _, ok := s.waiting[si.HostType]; !ok {
@@ -180,6 +199,12 @@ func (s *Scheduler) enqueueWaiter(si *SchedItem) {
180199
s.scheduleLocked()
181200
}
182201

202+
func (s *Scheduler) hasWaiter(si *SchedItem) bool {
203+
s.mu.Lock()
204+
defer s.mu.Unlock()
205+
return s.waiting[si.HostType][si]
206+
}
207+
183208
// schedLess reports whether scheduled item ia is "less" (more
184209
// important) than scheduled item ib.
185210
func schedLess(ia, ib *SchedItem) bool {
@@ -217,30 +242,44 @@ type SchedItem struct {
217242
IsTry bool
218243
IsHelper bool
219244

220-
// We set in GetBuildlet:
245+
// The following unexported fields are set by the Scheduler in
246+
// Scheduler.GetBuildlet.
247+
221248
s *Scheduler
222249
requestTime time.Time
223-
tryFor string // which user. (user with 1 trybot >> user with 50 trybots)
250+
commitTime time.Time // TODO: populate post-submit commit time from maintnerd
251+
branch string // TODO: populate from maintnerd
252+
tryFor string // TODO: which user. (user with 1 trybot >> user with 50 trybots)
224253
pool BuildletPool
225254
ctxDone <-chan struct{}
226-
// TODO: track the commit time of the BuilderRev, via call to maintnerd probably
227-
// commitTime time.Time
228255

229-
// res is the result channel, containing either a
230-
// *buildlet.Client or an error. It is read by GetBuildlet and
231-
// written by assignBuildlet.
232-
res chan interface{}
233-
}
234-
235-
func (si *SchedItem) cancel() {
236-
si.s.removeWaiter(si)
256+
// wantRes is the unbuffered channel that's passed
257+
// synchronously from Scheduler.GetBuildlet to
258+
// Scheduler.matchBuildlet. Its value is a channel (whose
259+
// buffering doesn't matter) to pass over a *buildlet.Client
260+
// just obtained from a BuildletPool. The contract to use
261+
// wantRes is that the sender must have a result already
262+
// available to send on the inner channel, and the receiver
263+
// still wants it (their context hasn't expired).
264+
wantRes chan chan<- *buildlet.Client
237265
}
238266

239267
// GetBuildlet requests a buildlet with the parameters described in si.
240268
//
241269
// The provided si must be newly allocated; ownership passes to the scheduler.
242270
func (s *Scheduler) GetBuildlet(ctx context.Context, lg logger, si *SchedItem) (*buildlet.Client, error) {
243-
pool := poolForConf(dashboard.Hosts[si.HostType])
271+
// TODO: once we remove the useScheduler const, we can remove
272+
// the "lg" logger parameter. We don't need to log anything
273+
// during the buildlet creation process anymore because we
274+
// don't know which build it'll be for. So all we can say in
275+
// the logs for is "Asking for a buildlet" and "Got one",
276+
// which the caller already does. I think. Verify that.
277+
278+
hostConf, ok := dashboard.Hosts[si.HostType]
279+
if !ok && testPoolHook == nil {
280+
return nil, fmt.Errorf("invalid SchedItem.HostType %q", si.HostType)
281+
}
282+
pool := poolForConf(hostConf)
244283

245284
if !useScheduler {
246285
return pool.GetBuildlet(ctx, si.HostType, lg)
@@ -249,25 +288,19 @@ func (s *Scheduler) GetBuildlet(ctx context.Context, lg logger, si *SchedItem) (
249288
si.pool = pool
250289
si.s = s
251290
si.requestTime = time.Now()
252-
si.res = make(chan interface{}) // NOT buffered
253291
si.ctxDone = ctx.Done()
292+
si.wantRes = make(chan chan<- *buildlet.Client) // unbuffered
254293

255-
// TODO: once we remove the useScheduler const, we can
256-
// remove the "lg" logger parameter. We don't need to
257-
// log anything during the buildlet creation process anymore
258-
// because we don't which build it'll be for. So all we can
259-
// say in the logs for is "Asking for a buildlet" and "Got
260-
// one", which the caller already does. I think. Verify that.
294+
s.addWaiter(si)
261295

262-
s.enqueueWaiter(si)
296+
ch := make(chan *buildlet.Client)
263297
select {
264-
case v := <-si.res:
265-
if bc, ok := v.(*buildlet.Client); ok {
266-
return bc, nil
267-
}
268-
return nil, v.(error)
298+
case si.wantRes <- ch:
299+
// No need to call removeWaiter. If we're here, the
300+
// sender has already done so.
301+
return <-ch, nil
269302
case <-ctx.Done():
270-
si.cancel()
303+
s.removeWaiter(si)
271304
return nil, ctx.Err()
272305
}
273306
}

0 commit comments

Comments
 (0)