From 8bde25ef579768607244e7940f20391b8f7c64ec Mon Sep 17 00:00:00 2001 From: Tobias Schottdorf Date: Fri, 21 Sep 2018 11:28:40 +0200 Subject: [PATCH 1/2] roachtest: squirrel away pre-teamcity logs The teamcity output is so much less pleasant to read. Better to just open the artifact sometimes. Release note: None --- build/teamcity-local-roachtest.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/build/teamcity-local-roachtest.sh b/build/teamcity-local-roachtest.sh index 5a15c8339985..60e224df5d04 100755 --- a/build/teamcity-local-roachtest.sh +++ b/build/teamcity-local-roachtest.sh @@ -28,5 +28,5 @@ run build/builder.sh ./bin/roachtest run '(acceptance|kv/splits)' \ --cockroach "cockroach" \ --workload "bin/workload" \ --artifacts artifacts \ - --teamcity + --teamcity 2>&1 | tee artifacts/roachtest.log tc_end_block "Run local roachtests" From 0e0e1b5e05f7813709548d6a3c0d2a525bcb1772 Mon Sep 17 00:00:00 2001 From: Tobias Schottdorf Date: Fri, 21 Sep 2018 11:40:42 +0200 Subject: [PATCH 2/2] roachtest: de-flake acceptance/rapid-restart The kill signal was sometimes a noop (when issued before the process to be killed started). Prior to this patch, that would leave the test stuck. Fixes #30475. Release note: None --- pkg/cmd/roachtest/rapid_restart.go | 54 ++++++++++++++---------------- 1 file changed, 26 insertions(+), 28 deletions(-) diff --git a/pkg/cmd/roachtest/rapid_restart.go b/pkg/cmd/roachtest/rapid_restart.go index b8f02167b7b3..299809d62579 100644 --- a/pkg/cmd/roachtest/rapid_restart.go +++ b/pkg/cmd/roachtest/rapid_restart.go @@ -61,38 +61,36 @@ func runRapidRestart(ctx context.Context, t *test, c *cluster) { } waitTime := time.Duration(rand.Int63n(int64(time.Second))) - if !c.isLocal() { - // TODO(peter): This is hacky: the signal might be sent before the - // cockroach process starts, which is especially true on remote - // clusters. Perhaps combine this with a monitor so that we can detect - // as soon as the process starts before killing it. Or a custom kill - // script which loops looking for a cockroach process and kills it as - // soon as it appears. Using --pid_file or --background isn't quite - // right as we want to be able to kill the process before it is ready. - waitTime += time.Second - } time.Sleep(waitTime) sig := [2]string{"2", "9"}[rand.Intn(2)] - c.Stop(ctx, nodes, stopArgs("--sig="+sig)) - select { - case <-ctx.Done(): - return - case err := <-exitCh: - cause := errors.Cause(err) - if exitErr, ok := cause.(*exec.ExitError); ok { - switch status := sysutil.ExitStatus(exitErr); status { - case -1: - // Received SIGINT before setting up our own signal handlers or - // SIGKILL. - case 1: - // Exit code from a SIGINT received by our signal handlers. - default: - t.Fatalf("unexpected exit status %d", status) - } - } else { - t.Fatalf("unexpected exit err: %v", err) + + var err error + for err == nil { + c.Stop(ctx, nodes, stopArgs("--sig="+sig)) + select { + case <-ctx.Done(): + return + case err = <-exitCh: + case <-time.After(10 * time.Second): + // We likely ended up killing before the process spawned. + // Loop around. + c.l.Printf("no exit status yet, killing again") + } + } + cause := errors.Cause(err) + if exitErr, ok := cause.(*exec.ExitError); ok { + switch status := sysutil.ExitStatus(exitErr); status { + case -1: + // Received SIGINT before setting up our own signal handlers or + // SIGKILL. + case 1: + // Exit code from a SIGINT received by our signal handlers. + default: + t.Fatalf("unexpected exit status %d", status) } + } else { + t.Fatalf("unexpected exit err: %v", err) } }