@@ -855,48 +855,62 @@ func TestNodeLivenessStatusMap(t *testing.T) {
855855 // See what comes up in the status.
856856 callerNodeLiveness := firstServer .GetNodeLiveness ()
857857
858- type expectedStatus struct {
858+ type testCase struct {
859859 nodeID roachpb.NodeID
860860 expectedStatus storagepb.NodeLivenessStatus
861- }
862- testData := []expectedStatus {
863- {liveNodeID , storagepb .NodeLivenessStatus_LIVE },
864- {deadNodeID , storagepb .NodeLivenessStatus_DEAD },
865- {decommissioningNodeID , storagepb .NodeLivenessStatus_DECOMMISSIONING },
866- {removedNodeID , storagepb .NodeLivenessStatus_DECOMMISSIONED },
861+
862+ // This is a bit of a hack: we want to run with a low TimeUntilStoreDead
863+ // if we know that the node is dead to speed up the test. However, doing
864+ // so for all tests gives us false test failures in the opposite case in
865+ // which the node remains live because when stressing the test
866+ // sufficiently hard nodes can fail to become live over extended periods
867+ // of time. So we run with the short duration only when the node is not expected to be running (i.e. when we expect it to be declared dead).
868+ //
869+ // NB: the test still takes >5s because it has to wait for liveness
870+ // record expiration (~5s) before it can possibly declare a node as
871+ // dead. We could try to lower the liveness duration but this isn't
872+ // trivial and might lead to new test flakes, though.
873+ running bool
874+ }
875+
876+ // Below we're going to check that all statuses converge and stabilize
877+ // to a known situation.
878+ testData := []testCase {
879+ {liveNodeID , storagepb .NodeLivenessStatus_LIVE , true },
880+ {deadNodeID , storagepb .NodeLivenessStatus_DEAD , false },
881+ {decommissioningNodeID , storagepb .NodeLivenessStatus_DECOMMISSIONING , true },
882+ {removedNodeID , storagepb .NodeLivenessStatus_DECOMMISSIONED , false },
867883 }
868884
869885 for _ , test := range testData {
870- t .Run (test .expectedStatus .String (), func (t * testing.T ) {
871- nodeID , expectedStatus := test .nodeID , test .expectedStatus
872- t .Parallel ()
873-
886+ t .Run (fmt .Sprintf ("n%d->%s" , test .nodeID , test .expectedStatus ), func (t * testing.T ) {
874887 testutils .SucceedsSoon (t , func () error {
875- // Ensure that dead nodes are quickly recognized as dead by
876- // gossip. Overriding cluster settings is generally a really bad
877- // idea as they are also populated via Gossip and so our update
878- // is possibly going to be wiped out. But going through SQL
879- // doesn't allow durations below 1m15s, which is much too long
880- // for a test.
881- // We do this in every SucceedsSoon attempt, so we'll be good.
882- storage .TimeUntilStoreDead .Override (& firstServer .ClusterSettings ().SV ,
883- storage .TestTimeUntilStoreDead )
884-
885- log .Infof (ctx , "checking expected status for node %d" , nodeID )
888+ dur := 5 * time .Minute
889+ if ! test .running {
890+ // Ensure that dead nodes are quickly recognized as dead by
891+ // gossip. Overriding cluster settings is generally a really bad
892+ // idea as they are also populated via Gossip and so our update
893+ // is possibly going to be wiped out. But going through SQL
894+ // doesn't allow durations below 1m15s, which is much too long
895+ // for a test.
896+ // We do this in every SucceedsSoon attempt, so we'll be good.
897+ dur = storage .TestTimeUntilStoreDead
898+ }
899+ storage .TimeUntilStoreDead .Override (& firstServer .ClusterSettings ().SV , dur )
900+
901+ nodeID , expectedStatus := test .nodeID , test .expectedStatus
902+
903+ log .Infof (ctx , "checking expected status (%s) for node %d" , expectedStatus , nodeID )
886904 nodeStatuses := callerNodeLiveness .GetLivenessStatusMap ()
887- if st , ok := nodeStatuses [nodeID ]; ! ok {
888- return fmt .Errorf ("%s node not in statuses" , expectedStatus )
889- } else {
890- if st != expectedStatus {
891- if expectedStatus == storagepb .NodeLivenessStatus_DECOMMISSIONING && st == storagepb .NodeLivenessStatus_DECOMMISSIONED {
892- // Server somehow shut down super-fast. Tolerating the mismatch.
893- return nil
894- }
895- return fmt .Errorf ("unexpected status: got %s, expected %s" ,
896- st , expectedStatus )
897- }
905+ st , ok := nodeStatuses [nodeID ]
906+ if ! ok {
907+ return errors .Errorf ("node %d: not in statuses\n " , nodeID )
908+ }
909+ if st != expectedStatus {
910+ return errors .Errorf ("node %d: unexpected status: got %s, expected %s\n " ,
911+ nodeID , st , expectedStatus ,
912+ )
898913 }
899- log .Infof (ctx , "node %d status ok" , nodeID )
900914 return nil
901915 })
902916 })
0 commit comments