Commit e01b5bb

knz and tbg committed
storage: deflake TestNodeLivenessStatusMap
Prior to this patch, this test would fail `stressrace` after a few dozen iterations. The root cause of this was the invalid call to `t.Parallel()`, which this patch removes. Additionally, this patch adapts TimeUntilStoreDead for each test case to avoid flakes, and removes a previous hack obviated by this simplification.

Release note: None

Co-authored-by: Tobias Schottdorf <tobias.schottdorf@gmail.com>
1 parent d50ebea commit e01b5bb
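
For context, here is a minimal, self-contained Go sketch of the general hazard with calling t.Parallel() inside a loop of subtests: a parallel subtest pauses until its parent test function returns, so any cleanup deferred in the parent runs before the subtests resume. The commit message does not spell out that this exact interaction caused the flake here; the sketch only illustrates why such a call can be invalid. The name newTestCluster and its cleanup function are hypothetical stand-ins for starting and stopping a real test cluster.

package example

import (
	"fmt"
	"testing"
)

// newTestCluster is a hypothetical stand-in for starting a multi-node test
// cluster; the returned cleanup stands in for a deferred Stopper().Stop call.
func newTestCluster() (cluster string, cleanup func()) {
	return "cluster", func() { fmt.Println("cluster stopped") }
}

func TestParallelSubtestsOutliveCluster(t *testing.T) {
	_, cleanup := newTestCluster()
	defer cleanup() // runs as soon as this function returns...

	for _, name := range []string{"LIVE", "DEAD"} {
		name := name
		t.Run(name, func(t *testing.T) {
			t.Parallel()
			// ...which is before this point is reached: parallel subtests are
			// paused until the parent function returns, so by the time they run
			// the deferred cleanup has already stopped the cluster.
			t.Logf("checking status %s against an already-stopped cluster", name)
		})
	}
	// The parent returns here, the deferred cleanup fires, and only then do
	// the parallel subtests resume.
}

In the patched test below, dropping t.Parallel() sidesteps this class of problem, and the per-case TimeUntilStoreDead override keeps the short store-death timeout from making still-live nodes look dead when the test is run under heavy stress.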


pkg/storage/node_liveness_test.go

Lines changed: 48 additions & 34 deletions
@@ -855,48 +855,62 @@ func TestNodeLivenessStatusMap(t *testing.T) {
 	// See what comes up in the status.
 	callerNodeLiveness := firstServer.GetNodeLiveness()
 
-	type expectedStatus struct {
+	type testCase struct {
 		nodeID         roachpb.NodeID
 		expectedStatus storagepb.NodeLivenessStatus
-	}
-	testData := []expectedStatus{
-		{liveNodeID, storagepb.NodeLivenessStatus_LIVE},
-		{deadNodeID, storagepb.NodeLivenessStatus_DEAD},
-		{decommissioningNodeID, storagepb.NodeLivenessStatus_DECOMMISSIONING},
-		{removedNodeID, storagepb.NodeLivenessStatus_DECOMMISSIONED},
+
+		// This is a bit of a hack: we want to run with a low TimeUntilStoreDead
+		// if we know that the node is dead to speed up the test. However, doing
+		// so for all tests gives us false test failures in the opposite case in
+		// which the node remains live because when stressing the test
+		// sufficiently hard nodes can fail to become live over extended periods
+		// of time. So we run with a short duration only if running.
+		//
+		// NB: the test still takes >5s because it has to wait for liveness
+		// record expiration (~5s) before it can possibly declare a node as
+		// dead. We could try to lower the liveness duration but this isn't
+		// trivial and might lead to new test flakes, though.
+		running bool
+	}
+
+	// Below we're going to check that all statuses converge and stabilize
+	// to a known situation.
+	testData := []testCase{
+		{liveNodeID, storagepb.NodeLivenessStatus_LIVE, true},
+		{deadNodeID, storagepb.NodeLivenessStatus_DEAD, false},
+		{decommissioningNodeID, storagepb.NodeLivenessStatus_DECOMMISSIONING, true},
+		{removedNodeID, storagepb.NodeLivenessStatus_DECOMMISSIONED, false},
 	}
 
 	for _, test := range testData {
-		t.Run(test.expectedStatus.String(), func(t *testing.T) {
-			nodeID, expectedStatus := test.nodeID, test.expectedStatus
-			t.Parallel()
-
+		t.Run(fmt.Sprintf("n%d->%s", test.nodeID, test.expectedStatus), func(t *testing.T) {
 			testutils.SucceedsSoon(t, func() error {
-				// Ensure that dead nodes are quickly recognized as dead by
-				// gossip. Overriding cluster settings is generally a really bad
-				// idea as they are also populated via Gossip and so our update
-				// is possibly going to be wiped out. But going through SQL
-				// doesn't allow durations below 1m15s, which is much too long
-				// for a test.
-				// We do this in every SucceedsSoon attempt, so we'll be good.
-				storage.TimeUntilStoreDead.Override(&firstServer.ClusterSettings().SV,
-					storage.TestTimeUntilStoreDead)
-
-				log.Infof(ctx, "checking expected status for node %d", nodeID)
+				dur := 5 * time.Minute
+				if !test.running {
+					// Ensure that dead nodes are quickly recognized as dead by
+					// gossip. Overriding cluster settings is generally a really bad
+					// idea as they are also populated via Gossip and so our update
+					// is possibly going to be wiped out. But going through SQL
+					// doesn't allow durations below 1m15s, which is much too long
+					// for a test.
+					// We do this in every SucceedsSoon attempt, so we'll be good.
+					dur = storage.TestTimeUntilStoreDead
+				}
+				storage.TimeUntilStoreDead.Override(&firstServer.ClusterSettings().SV, dur)
+
+				nodeID, expectedStatus := test.nodeID, test.expectedStatus
+
+				log.Infof(ctx, "checking expected status (%s) for node %d", expectedStatus, nodeID)
 				nodeStatuses := callerNodeLiveness.GetLivenessStatusMap()
-				if st, ok := nodeStatuses[nodeID]; !ok {
-					return fmt.Errorf("%s node not in statuses", expectedStatus)
-				} else {
-					if st != expectedStatus {
-						if expectedStatus == storagepb.NodeLivenessStatus_DECOMMISSIONING && st == storagepb.NodeLivenessStatus_DECOMMISSIONED {
-							// Server somehow shut down super-fast. Tolerating the mismatch.
-							return nil
-						}
-						return fmt.Errorf("unexpected status: got %s, expected %s",
-							st, expectedStatus)
-					}
+				st, ok := nodeStatuses[nodeID]
+				if !ok {
+					return errors.Errorf("node %d: not in statuses\n", nodeID)
+				}
+				if st != expectedStatus {
+					return errors.Errorf("node %d: unexpected status: got %s, expected %s\n",
+						nodeID, st, expectedStatus,
+					)
 				}
-				log.Infof(ctx, "node %d status ok", nodeID)
 				return nil
 			})
 		})
