Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

roachtest: de-flake acceptance/rapid-restart #30496

Merged
merged 2 commits into from
Sep 21, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion build/teamcity-local-roachtest.sh
Original file line number Diff line number Diff line change
Expand Up @@ -28,5 +28,5 @@ run build/builder.sh ./bin/roachtest run '(acceptance|kv/splits)' \
--cockroach "cockroach" \
--workload "bin/workload" \
--artifacts artifacts \
--teamcity
--teamcity 2>&1 | tee artifacts/roachtest.log
tc_end_block "Run local roachtests"
54 changes: 26 additions & 28 deletions pkg/cmd/roachtest/rapid_restart.go
Original file line number Diff line number Diff line change
Expand Up @@ -61,38 +61,36 @@ func runRapidRestart(ctx context.Context, t *test, c *cluster) {
}

waitTime := time.Duration(rand.Int63n(int64(time.Second)))
if !c.isLocal() {
// TODO(peter): This is hacky: the signal might be sent before the
// cockroach process starts, which is especially true on remote
// clusters. Perhaps combine this with a monitor so that we can detect
// as soon as the process starts before killing it. Or a custom kill
// script which loops looking for a cockroach process and kills it as
// soon as it appears. Using --pid_file or --background isn't quite
// right as we want to be able to kill the process before it is ready.
waitTime += time.Second
}
time.Sleep(waitTime)

sig := [2]string{"2", "9"}[rand.Intn(2)]
c.Stop(ctx, nodes, stopArgs("--sig="+sig))
select {
case <-ctx.Done():
return
case err := <-exitCh:
cause := errors.Cause(err)
if exitErr, ok := cause.(*exec.ExitError); ok {
switch status := sysutil.ExitStatus(exitErr); status {
case -1:
// Received SIGKILL, or received SIGINT before our own signal handlers
// were set up.
case 1:
// Exit code from a SIGINT received by our signal handlers.
default:
t.Fatalf("unexpected exit status %d", status)
}
} else {
t.Fatalf("unexpected exit err: %v", err)

var err error
for err == nil {
c.Stop(ctx, nodes, stopArgs("--sig="+sig))
select {
case <-ctx.Done():
return
case err = <-exitCh:
case <-time.After(10 * time.Second):
// We likely ended up killing before the process spawned.
// Loop around.
c.l.Printf("no exit status yet, killing again")
}
}
cause := errors.Cause(err)
if exitErr, ok := cause.(*exec.ExitError); ok {
switch status := sysutil.ExitStatus(exitErr); status {
case -1:
// Received SIGKILL, or received SIGINT before our own signal handlers
// were set up.
case 1:
// Exit code from a SIGINT received by our signal handlers.
default:
t.Fatalf("unexpected exit status %d", status)
}
} else {
t.Fatalf("unexpected exit err: %v", err)
}
}

Expand Down