Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

roachtest: de-flake decommission/mixed-versions #56568

Merged
merged 1 commit into from
Nov 16, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions pkg/cmd/roachtest/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ go_library(
"cluster_init.go",
"copy.go",
"decommission.go",
"decommission_self.go",
"disk_full.go",
"disk_stall.go",
"django.go",
Expand Down
9 changes: 9 additions & 0 deletions pkg/cmd/roachtest/acceptance.go
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,15 @@ func registerAcceptance(r *testRegistry) {
{name: "build-analyze", fn: runBuildAnalyze},
{name: "cli/node-status", fn: runCLINodeStatus},
{name: "cluster-init", fn: runClusterInit},
{name: "decommission-self",
fn: runDecommissionSelf,
// Decommissioning self was observed to hang, though not in this test
// when run locally. More investigation is needed; there is a small
// chance that the original observation was in error. However, it
// seems likely that the problem exists even if it is rarely reproduced,
// so this test is skipped.
skip: "https://github.com/cockroachdb/cockroach/issues/56718",
},
{name: "event-log", fn: runEventLog},
{name: "gossip/peerings", fn: runGossipPeerings},
{name: "gossip/restart", fn: runGossipRestart},
Expand Down
38 changes: 38 additions & 0 deletions pkg/cmd/roachtest/decommission_self.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
// Copyright 2020 The Cockroach Authors.
//
// Use of this software is governed by the Business Source License
// included in the file licenses/BSL.txt.
//
// As of the Change Date specified in that file, in accordance with
// the Business Source License, use of this software will be governed
// by the Apache License, Version 2.0, included in the file
// licenses/APL.txt.

package main

import "context"

// runDecommissionSelf decommissions n2 through n2. This is an acceptance test.
//
// See https://github.com/cockroachdb/cockroach/issues/56718
// runDecommissionSelf decommissions n2 through n2. This is an acceptance test.
//
// See https://github.com/cockroachdb/cockroach/issues/56718
func runDecommissionSelf(ctx context.Context, t *test, c *cluster) {
	// The empty version string tells the harness to fall back to the
	// cockroach binary supplied via the `cockroach` flag.
	const mainVersion = ""

	nodes := c.All()
	vu := newVersionUpgradeTest(c,
		uploadVersion(nodes, mainVersion),
		startVersion(nodes, mainVersion),
		// Have n2 decommission itself (runNode == targetNode == 2).
		fullyDecommissionStep(2, 2, mainVersion),
		func(ctx context.Context, t *test, u *versionUpgradeTest) {
			// Wipe n2 so the post-test consistency checks leave it alone;
			// once decommissioned it can no longer reach the cluster and
			// operations against it would hang.
			u.c.Wipe(ctx, c.Node(2))
		},
		// Verify, via n1, that n2 now reports as decommissioned.
		checkOneMembership(1, "decommissioned"),
	)

	vu.run(ctx, t)
}
62 changes: 18 additions & 44 deletions pkg/cmd/roachtest/mixed_version_decommission.go
Original file line number Diff line number Diff line change
Expand Up @@ -52,71 +52,45 @@ func runDecommissionMixedVersions(
waitForUpgradeStep(allNodes),
preventAutoUpgradeStep(h.nodeIDs[0]),

// We upgrade a subset of the cluster to v20.2.
// We upgrade a pinnedUpgrade and one other random node of the cluster to v20.2.
binaryUpgradeStep(c.Node(pinnedUpgrade), mainVersion),
binaryUpgradeStep(c.Node(h.getRandNodeOtherThan(pinnedUpgrade)), mainVersion),
checkAllMembership(pinnedUpgrade, "active"),

// 1. Partially decommission a random node from another random node. We
// use the v20.1 CLI to do so.
// Partially decommission a random node from another random node. We
// use the predecessor CLI to do so.
partialDecommissionStep(h.getRandNode(), h.getRandNode(), predecessorVersion),
checkOneDecommissioning(h.getRandNode()),
checkOneMembership(pinnedUpgrade, "decommissioning"),

// 2. Recommission all nodes, including the partially decommissioned
// one, from a random node. Use the v20.1 CLI to do so.
// Recommission all nodes, including the partially decommissioned
// one, from a random node. Use the predecessor CLI to do so.
recommissionAllStep(h.getRandNode(), predecessorVersion),
checkNoDecommissioning(h.getRandNode()),
checkAllMembership(pinnedUpgrade, "active"),
//
// 3. Attempt to fully decommission a from a random node, again using
// the v20.1 CLI.
fullyDecommissionStep(h.getRandNode(), h.getRandNode(), predecessorVersion),
checkOneDecommissioning(h.getRandNode()),
checkOneMembership(pinnedUpgrade, "decommissioning"),

// Roll back, which should to be fine because the cluster upgrade was
// not finalized.
binaryUpgradeStep(allNodes, predecessorVersion),
checkOneDecommissioning(h.getRandNode()),

// Repeat similar recommission/decommission cycles as above. We can no
// longer assert against the `membership` column as none of the servers
// are running v20.2.
recommissionAllStep(h.getRandNode(), predecessorVersion),
checkNoDecommissioning(h.getRandNode()),

partialDecommissionStep(h.getRandNode(), h.getRandNode(), predecessorVersion),
checkOneDecommissioning(h.getRandNode()),

// Roll all nodes forward, and finalize upgrade.
binaryUpgradeStep(allNodes, mainVersion),
allowAutoUpgradeStep(1),
waitForUpgradeStep(allNodes),

checkOneMembership(h.getRandNode(), "decommissioning"),

// Use the v20.2 CLI here on forth. Lets start with recommissioning all
// the nodes in the cluster.
recommissionAllStep(h.getRandNode(), mainVersion),
checkNoDecommissioning(h.getRandNode()),
checkAllMembership(h.getRandNode(), "active"),

// We partially decommission a random node.
partialDecommissionStep(h.getRandNode(), h.getRandNode(), mainVersion),
checkOneDecommissioning(h.getRandNode()),
checkOneMembership(h.getRandNode(), "decommissioning"),

// We check that recommissioning is still functional.
recommissionAllStep(h.getRandNode(), mainVersion),
checkNoDecommissioning(h.getRandNode()),
checkAllMembership(h.getRandNode(), "active"),

// We fully decommission a random node. We need to use the v20.2 CLI to
// do so.
fullyDecommissionStep(h.getRandNode(), h.getRandNode(), mainVersion),
checkOneDecommissioning(h.getRandNode()),
checkOneMembership(h.getRandNode(), "decommissioned"),
// Fully decommission a random node. Note that we can no longer use the
// predecessor cli, as the cluster has upgraded and won't allow connections
// from the predecessor version binary.
//
// Note also that this has to remain the last step unless we want this test to
// handle the fact that the decommissioned node will no longer be able
// to communicate with the cluster (i.e. most commands against it will fail).
// This is also why we're making sure to avoid decommissioning pinnedUpgrade
// itself, as we use it to check the membership after.
//
// NB: we avoid runNode == targetNode here to temporarily avoid #56718.
fullyDecommissionStep(h.getRandNodeOtherThan(pinnedUpgrade), pinnedUpgrade, ""),
checkOneMembership(pinnedUpgrade, "decommissioned"),
)

u.run(ctx, t)
Expand Down