Skip to content

Commit

Permalink
Merge #30496 #30497
Browse files Browse the repository at this point in the history
30496: roachtest: de-flake acceptance/rapid-restart r=petermattis a=tschottdorf

The kill signal was sometimes a noop (when issued before the process to be
killed started). Prior to this patch, that would leave the test stuck.

Fixes #30475.

Release note: None

30497: roachtest: wait for teardown if local test times out r=petermattis a=tschottdorf

I suspect not doing so previously can cause issues like #30397.
Local roachtests are simply not set up to run concurrently, but once a
test times out we're doing that (though we attempt to teardown the
cluster, tests can apparently still "do stuff" with it, for example
the rapid restarts test was running one-off invocations of CockroachDB
via Exec; also note the comment near the added lines suggesting that
in local clusters we may not actually be destroying them).

We can revisit this when we allow multiple local clusters in parallel,
as we can then use a different one.

Optimistically:

Fixes #30397.

Release note: None

Co-authored-by: Tobias Schottdorf <tobias.schottdorf@gmail.com>
  • Loading branch information
craig[bot] and tbg committed Sep 21, 2018
3 parents 8a64fad + 0e0e1b5 + 91438cb commit 361b5f1
Show file tree
Hide file tree
Showing 3 changed files with 32 additions and 30 deletions.
2 changes: 1 addition & 1 deletion build/teamcity-local-roachtest.sh
Original file line number Diff line number Diff line change
Expand Up @@ -28,5 +28,5 @@ run build/builder.sh ./bin/roachtest run '(acceptance|kv/splits)' \
--cockroach "cockroach" \
--workload "bin/workload" \
--artifacts artifacts \
--teamcity
--teamcity 2>&1 | tee artifacts/roachtest.log
tc_end_block "Run local roachtests"
54 changes: 26 additions & 28 deletions pkg/cmd/roachtest/rapid_restart.go
Original file line number Diff line number Diff line change
Expand Up @@ -61,38 +61,36 @@ func runRapidRestart(ctx context.Context, t *test, c *cluster) {
}

waitTime := time.Duration(rand.Int63n(int64(time.Second)))
if !c.isLocal() {
// TODO(peter): This is hacky: the signal might be sent before the
// cockroach process starts, which is especially true on remote
// clusters. Perhaps combine this with a monitor so that we can detect
// as soon as the process starts before killing it. Or a custom kill
// script which loops looking for a cockroach process and kills it as
// soon as it appears. Using --pid_file or --background isn't quite
// right as we want to be able to kill the process before it is ready.
waitTime += time.Second
}
time.Sleep(waitTime)

sig := [2]string{"2", "9"}[rand.Intn(2)]
c.Stop(ctx, nodes, stopArgs("--sig="+sig))
select {
case <-ctx.Done():
return
case err := <-exitCh:
cause := errors.Cause(err)
if exitErr, ok := cause.(*exec.ExitError); ok {
switch status := sysutil.ExitStatus(exitErr); status {
case -1:
// Received SIGINT before setting up our own signal handlers or
// SIGKILL.
case 1:
// Exit code from a SIGINT received by our signal handlers.
default:
t.Fatalf("unexpected exit status %d", status)
}
} else {
t.Fatalf("unexpected exit err: %v", err)

var err error
for err == nil {
c.Stop(ctx, nodes, stopArgs("--sig="+sig))
select {
case <-ctx.Done():
return
case err = <-exitCh:
case <-time.After(10 * time.Second):
// We likely ended up killing before the process spawned.
// Loop around.
c.l.Printf("no exit status yet, killing again")
}
}
cause := errors.Cause(err)
if exitErr, ok := cause.(*exec.ExitError); ok {
switch status := sysutil.ExitStatus(exitErr); status {
case -1:
// Received SIGINT before setting up our own signal handlers or
// SIGKILL.
case 1:
// Exit code from a SIGINT received by our signal handlers.
default:
t.Fatalf("unexpected exit status %d", status)
}
} else {
t.Fatalf("unexpected exit err: %v", err)
}
}

Expand Down
6 changes: 5 additions & 1 deletion pkg/cmd/roachtest/test.go
Original file line number Diff line number Diff line change
Expand Up @@ -776,13 +776,17 @@ func (r *registry) run(

select {
case <-time.After(timeout):
t.printf("test timed out (%s)", timeout)
t.printf("test timed out (%s)\n", timeout)
if c != nil {
c.FetchLogs(ctx)
// NB: c.destroyed is nil for cloned clusters (i.e. in subtests).
if !debugEnabled && c.destroyed != nil {
c.Destroy(ctx)
}
if local {
t.printf("waiting for test to tear down since cluster is local\n")
<-done
}
}
case <-done:
}
Expand Down

0 comments on commit 361b5f1

Please sign in to comment.