Skip to content

Commit

Permalink
Merge pull request #4674 from mpawliszyn/mikepaw.not-serving-infinite…
Browse files Browse the repository at this point in the history
…-loop

We should always time out health connections to a vttablet.
  • Loading branch information
sougou authored Feb 28, 2019
2 parents eece355 + ec6a753 commit 7afe66c
Show file tree
Hide file tree
Showing 2 changed files with 14 additions and 18 deletions.
11 changes: 2 additions & 9 deletions go/vt/discovery/healthcheck.go
Original file line number Diff line number Diff line change
Expand Up @@ -527,24 +527,17 @@ func (hc *HealthCheckImpl) checkConn(hcc *healthCheckConn, name string) {
// between the goroutine that sets it and the check for its value
// later.
timedout := sync2.NewAtomicBool(false)
serving := hcc.tabletStats.Serving
go func() {
for {
select {
case serving = <-servingStatus:
case <-servingStatus:
continue
case <-time.After(hc.healthCheckTimeout):
// Ignore if not serving.
if !serving {
continue
}
timedout.Set(true)
streamCancel()
return
case <-streamCtx.Done():
// If stream returns while serving is false, the function
// will get stuck in an infinite loop. This code path
// breaks the loop.
// If the stream is done, stop watching.
return
}
}
Expand Down
21 changes: 12 additions & 9 deletions go/vt/discovery/healthcheck_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -545,28 +545,31 @@ func TestHealthCheckTimeout(t *testing.T) {
t.Errorf("StreamHealth should be canceled after timeout, but is not")
}

// repeat the wait. There should be no error or cancelation.
// repeat the wait. It will timeout one more time trying to get the connection.
fc.resetCanceledFlag()
time.Sleep(2 * timeout)
time.Sleep(timeout)
t.Logf(`Sleep(2 * timeout)`)

select {
case res = <-l.output:
t.Errorf(`<-l.output: %+v; want not message`, res)
default:
res = <-l.output
if res.Serving {
t.Errorf(`<-l.output: %+v; want not serving`, res)
}

if err := checkErrorCounter("k", "s", topodatapb.TabletType_MASTER, 1); err != nil {
if err := checkErrorCounter("k", "s", topodatapb.TabletType_MASTER, 2); err != nil {
t.Errorf("%v", err)
}

if fc.isCanceled() {
t.Errorf("StreamHealth should not be canceled after timeout")
if !fc.isCanceled() {
t.Errorf("StreamHealth should be canceled again after timeout")
}

// send a healthcheck response, it should be serving again
fc.resetCanceledFlag()
input <- shr
t.Logf(`input <- {{Keyspace: "k", Shard: "s", TabletType: MASTER}, Serving: true, TabletExternallyReparentedTimestamp: 10, {SecondsBehindMaster: 1, CpuUsage: 0.2}}`)

// wait for the exponential backoff to wear off and health monitoring to resume.
time.Sleep(timeout)
res = <-l.output
if !reflect.DeepEqual(res, want) {
t.Errorf(`<-l.output: %+v; want %+v`, res, want)
Expand Down

0 comments on commit 7afe66c

Please sign in to comment.