Skip to content

Commit

Permalink
roachtest: improvements to mixedversion package
Browse files Browse the repository at this point in the history
This commit introduces a few improvements to the `mixedversion`
package, the recently introduced framework for mixed-version (and
mixed-binary) roachtests.

Specifically, the following improvements are made:

* Removal of `DBNode()` function: having the planner pick a database
that the individual steps will connect to is insufficient in many
cases and could be misleading. The idea was that the user would be
able to see, from the test plan itself, what node a certain step would
be interacting with. However, the reality is that steps often need to
run statements on multiple different nodes, or perhaps they need to
pick one node specifically (e.g., the statement needs to run on a node
in the old version). For that reason, the `DBNode()` function was
dropped. Instead, steps have access to a random number generator that
they can use to pick an arbitrary node themselves. The random number
generators are unique to each user function, meaning each test run
will see the same numbers being generated even if other steps are
scheduled concurrently. The numbers observed by a user function will
also be the same if the seed passed to `mixedversion.Test` is the
same.

* Definition of a "test context" that is available to mixed-version
tests. For now, the test context includes things like which version we
are upgrading (or downgrading) to and from and which nodes are running
which version. This allows tests to take actions based on, for
example, the number of nodes upgraded. It also allows them to run
certain operations on nodes that are known to be in a specific
version.

* Introduction of a `helper` struct that is passed to user-functions.
For now, the helper includes functions to connect to a specific node
and get the current test context. The struct will help us provide
common functionality to tests so that they don't have to duplicate
code.

* Log cached binary and cluster versions before executing a step. This
makes it easier to understand the state of the cluster when looking at
the logs of one specific step.

* Internal improvement to the test runner: instead of assuming the
first step of a mixed-version test plan will start the the cockroach
nodes, we now check that that is the case, providing a clear error
message if/when that assumption doesn't hold anymore (instead of a
cryptic connection failure error).

Epic: CRDB-19321

Release note: None
  • Loading branch information
renatolabs committed Feb 24, 2023
1 parent f3796ef commit 304e0b4
Show file tree
Hide file tree
Showing 5 changed files with 198 additions and 109 deletions.
107 changes: 47 additions & 60 deletions pkg/cmd/roachtest/roachtestutil/mixedversion/mixedversion.go
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,6 @@ package mixedversion

import (
"context"
gosql "database/sql"
"fmt"
"math/rand"
"strings"
Expand Down Expand Up @@ -93,11 +92,6 @@ const (
// of migration steps before the new cluster version can be
// finalized.
runWhileMigratingProbability = 0.5

// noDBNeeded is an internal sentinel value expected to be returned
// by steps generated by the test plan; it indicates that the step
// requires no connection to a cockroach node.
noDBNeeded = -1
)

var (
Expand Down Expand Up @@ -131,7 +125,7 @@ type (
Finalizing bool
}

userFunc func(*logger.Logger, *gosql.DB) error
userFunc func(*logger.Logger, *rand.Rand, *Helper) error
predicateFunc func(Context) bool

// versionUpgradeHook is a hook that can be called at any time
Expand Down Expand Up @@ -159,17 +153,12 @@ type (
// ID returns a unique ID associated with the step, making it easy
// to reference test output with the exact step it relates to
ID() int
// DBNode returns the database node that that step connects to
// during its execution. If the step does not require a database
// connection, this function should return the `noDBNeeded`
// constant
DBNode() int
// Description is a string representation of the step, intended
// for human-consumption. Displayed when pretty-printing the test
// plan.
Description() string
// Run implements the actual functionality of the step.
Run(context.Context, *logger.Logger, cluster.Cluster, func(int) *gosql.DB) error
Run(context.Context, *logger.Logger, cluster.Cluster, *Helper) error
}

hooks []versionUpgradeHook
Expand Down Expand Up @@ -218,7 +207,7 @@ func NewTest(
}

prng, seed := randutil.NewPseudoRand()
testLogger.Printf("random seed: %d", seed)
testLogger.Printf("mixed-version random seed: %d", seed)

return &Test{
ctx: ctx,
Expand Down Expand Up @@ -339,8 +328,7 @@ type startFromCheckpointStep struct {
crdbNodes option.NodeListOption
}

func (s startFromCheckpointStep) ID() int { return s.id }
func (s startFromCheckpointStep) DBNode() int { return noDBNeeded }
func (s startFromCheckpointStep) ID() int { return s.id }

func (s startFromCheckpointStep) Description() string {
return fmt.Sprintf("starting cluster from fixtures for version %q", s.version)
Expand All @@ -350,7 +338,7 @@ func (s startFromCheckpointStep) Description() string {
// upload the binary associated with that given version, and finally
// start the cockroach binary on these nodes.
func (s startFromCheckpointStep) Run(
ctx context.Context, l *logger.Logger, c cluster.Cluster, dbFunc func(int) *gosql.DB,
ctx context.Context, l *logger.Logger, c cluster.Cluster, helper *Helper,
) error {
if err := clusterupgrade.InstallFixtures(ctx, l, c, s.crdbNodes, s.version); err != nil {
return err
Expand All @@ -376,8 +364,7 @@ type waitForStableClusterVersionStep struct {
nodes option.NodeListOption
}

func (s waitForStableClusterVersionStep) ID() int { return s.id }
func (s waitForStableClusterVersionStep) DBNode() int { return noDBNeeded }
func (s waitForStableClusterVersionStep) ID() int { return s.id }

func (s waitForStableClusterVersionStep) Description() string {
return fmt.Sprintf(
Expand All @@ -387,38 +374,38 @@ func (s waitForStableClusterVersionStep) Description() string {
}

func (s waitForStableClusterVersionStep) Run(
ctx context.Context, l *logger.Logger, c cluster.Cluster, dbFunc func(int) *gosql.DB,
ctx context.Context, l *logger.Logger, c cluster.Cluster, helper *Helper,
) error {
return clusterupgrade.WaitForClusterUpgrade(ctx, l, s.nodes, dbFunc)
return clusterupgrade.WaitForClusterUpgrade(ctx, l, s.nodes, helper.Connect)
}

// preserveDowngradeOptionStep sets the `preserve_downgrade_option`
// cluster setting to the binary version running in `node`.
type preserveDowngradeOptionStep struct {
id int
node int
id int
crdbNodes option.NodeListOption
prng *rand.Rand
}

func (s preserveDowngradeOptionStep) ID() int { return s.id }
func (s preserveDowngradeOptionStep) DBNode() int { return s.node }
func (s preserveDowngradeOptionStep) ID() int { return s.id }

func (s preserveDowngradeOptionStep) Description() string {
return "preventing auto-upgrades by setting `preserve_downgrade_option`"
}

func (s preserveDowngradeOptionStep) Run(
ctx context.Context, l *logger.Logger, c cluster.Cluster, dbFunc func(int) *gosql.DB,
ctx context.Context, l *logger.Logger, c cluster.Cluster, helper *Helper,
) error {
db := dbFunc(s.node)

l.Printf("checking binary version on node %d", s.node)
node, db := helper.RandomDB(s.prng, s.crdbNodes)
l.Printf("checking binary version (via node %d)", node)
bv, err := clusterupgrade.BinaryVersion(db)
if err != nil {
return err
}

node, db = helper.RandomDB(s.prng, s.crdbNodes)
downgradeOption := bv.String()
l.Printf("setting `preserve_downgrade_option` to %s", downgradeOption)
l.Printf("setting `preserve_downgrade_option` to %s (via node %d)", downgradeOption, node)
_, err = db.ExecContext(ctx, "SET CLUSTER SETTING cluster.preserve_downgrade_option = $1", downgradeOption)
return err
}
Expand All @@ -434,15 +421,14 @@ type restartWithNewBinaryStep struct {
node int
}

func (s restartWithNewBinaryStep) ID() int { return s.id }
func (s restartWithNewBinaryStep) DBNode() int { return noDBNeeded }
func (s restartWithNewBinaryStep) ID() int { return s.id }

func (s restartWithNewBinaryStep) Description() string {
return fmt.Sprintf("restart node %d with binary version %s", s.node, versionMsg(s.version))
}

func (s restartWithNewBinaryStep) Run(
ctx context.Context, l *logger.Logger, c cluster.Cluster, dbFunc func(int) *gosql.DB,
ctx context.Context, l *logger.Logger, c cluster.Cluster, helper *Helper,
) error {
return clusterupgrade.RestartNodesWithNewBinary(
ctx,
Expand All @@ -459,45 +445,46 @@ func (s restartWithNewBinaryStep) Run(
// setting, allowing the upgrade migrations to run and the cluster
// version to eventually reach the binary version on the nodes.
type finalizeUpgradeStep struct {
id int
node int
id int
crdbNodes option.NodeListOption
prng *rand.Rand
}

func (s finalizeUpgradeStep) ID() int { return s.id }
func (s finalizeUpgradeStep) DBNode() int { return s.node }
func (s finalizeUpgradeStep) ID() int { return s.id }

func (s finalizeUpgradeStep) Description() string {
return "finalize upgrade by resetting `preserve_downgrade_option`"
}

func (s finalizeUpgradeStep) Run(
ctx context.Context, l *logger.Logger, c cluster.Cluster, dbFunc func(int) *gosql.DB,
ctx context.Context, l *logger.Logger, c cluster.Cluster, helper *Helper,
) error {
db := dbFunc(s.node)
l.Printf("resetting preserve_downgrade_option")
node, db := helper.RandomDB(s.prng, s.crdbNodes)
l.Printf("resetting preserve_downgrade_option (via node %d)", node)
_, err := db.ExecContext(ctx, "RESET CLUSTER SETTING cluster.preserve_downgrade_option")
return err
}

// runHookStep is a step used to run a user-provided hook (i.e.,
// callbacks passed to `OnStartup`, `InMixedVersion`, or `AfterTest`).
type runHookStep struct {
id int
node int
hook versionUpgradeHook
id int
testContext Context
prng *rand.Rand
hook versionUpgradeHook
}

func (s runHookStep) ID() int { return s.id }
func (s runHookStep) DBNode() int { return s.node }
func (s runHookStep) ID() int { return s.id }

func (s runHookStep) Description() string {
return fmt.Sprintf("run %q", s.hook.name)
}

func (s runHookStep) Run(
ctx context.Context, l *logger.Logger, c cluster.Cluster, dbFunc func(int) *gosql.DB,
ctx context.Context, l *logger.Logger, c cluster.Cluster, helper *Helper,
) error {
return s.hook.fn(l, dbFunc(s.node))
helper.SetContext(&s.testContext)
return s.hook.fn(l, s.prng, helper)
}

// sequentialRunStep is a "meta-step" that indicates that a sequence
Expand Down Expand Up @@ -556,11 +543,6 @@ func prefixedLogger(l *logger.Logger, prefix string) (*logger.Logger, error) {
return l.ChildLogger(fileName, logger.LogPrefix(formattedPrefix))
}

func randomNode(rng *rand.Rand, nodes option.NodeListOption) int {
idx := rng.Intn(len(nodes))
return nodes[idx]
}

func (h hooks) Filter(testContext Context) hooks {
var result hooks
for _, hook := range h {
Expand All @@ -577,19 +559,20 @@ func (h hooks) Filter(testContext Context) hooks {
// returned. Otherwise, a `concurrentRunStep` is returned, where every
// hook is run concurrently.
func (h hooks) AsSteps(
label string, idGen func() int, rng *rand.Rand, nodes option.NodeListOption,
label string, idGen func() int, prng *rand.Rand, nodes option.NodeListOption, testContext Context,
) []testStep {
steps := make([]testStep, 0, len(h))
for _, hook := range h {
rhs := runHookStep{id: idGen(), node: randomNode(rng, nodes), hook: hook}
hookPrng := rngFromRNG(prng)
rhs := runHookStep{id: idGen(), prng: hookPrng, hook: hook, testContext: testContext}
steps = append(steps, rhs)
}

if len(steps) <= 1 {
return steps
}

return []testStep{newConcurrentRunStep(label, steps, rng)}
return []testStep{newConcurrentRunStep(label, steps, prng)}
}

func (th *testHooks) AddStartup(hook versionUpgradeHook) {
Expand All @@ -604,23 +587,27 @@ func (th *testHooks) AddAfterUpgradeFinalized(hook versionUpgradeHook) {
th.afterUpgradeFinalized = append(th.afterUpgradeFinalized, hook)
}

func (th *testHooks) StartupSteps(idGen func() int) []testStep {
return th.startup.AsSteps(startupLabel, idGen, th.prng, th.crdbNodes)
func (th *testHooks) StartupSteps(idGen func() int, testContext Context) []testStep {
return th.startup.AsSteps(startupLabel, idGen, th.prng, th.crdbNodes, testContext)
}

func (th *testHooks) MixedVersionSteps(testContext Context, idGen func() int) []testStep {
return th.mixedVersion.Filter(testContext).AsSteps(mixedVersionLabel, idGen, th.prng, th.crdbNodes)
return th.mixedVersion.Filter(testContext).AsSteps(mixedVersionLabel, idGen, th.prng, th.crdbNodes, testContext)
}

func (th *testHooks) AfterUpgradeFinalizedSteps(idGen func() int) []testStep {
return th.afterUpgradeFinalized.AsSteps(afterTestLabel, idGen, th.prng, th.crdbNodes)
func (th *testHooks) AfterUpgradeFinalizedSteps(idGen func() int, testContext Context) []testStep {
return th.afterUpgradeFinalized.AsSteps(afterTestLabel, idGen, th.prng, th.crdbNodes, testContext)
}

func randomDelay(rng *rand.Rand) time.Duration {
idx := rng.Intn(len(possibleDelaysMs))
return time.Duration(possibleDelaysMs[idx]) * time.Millisecond
}

func rngFromRNG(rng *rand.Rand) *rand.Rand {
return rand.New(rand.NewSource(rng.Int63()))
}

func versionMsg(version string) string {
return clusterupgrade.VersionMsg(version)
}
38 changes: 26 additions & 12 deletions pkg/cmd/roachtest/roachtestutil/mixedversion/planner.go
Original file line number Diff line number Diff line change
Expand Up @@ -95,17 +95,33 @@ func (p *testPlanner) Plan() *TestPlan {
}
}

func (p *testPlanner) initialContext() Context {
return Context{
FromVersion: p.initialVersion,
ToVersion: clusterupgrade.MainVersion,
FromVersionNodes: p.crdbNodes,
}
}

func (p *testPlanner) finalContext(finalizing bool) Context {
return Context{
FromVersion: p.initialVersion,
ToVersion: clusterupgrade.MainVersion,
ToVersionNodes: p.crdbNodes,
Finalizing: finalizing,
}
}

// initSteps returns the sequence of steps that should be executed
// before we start changing binaries on nodes in the process of
// upgrading/downgrading. It will also run any startup hooks the user
// may have provided.
func (p *testPlanner) initSteps() []testStep {
preserveDowngradeNode := randomNode(p.prng, p.crdbNodes)
return append([]testStep{
startFromCheckpointStep{id: p.nextID(), version: p.initialVersion, rt: p.rt, crdbNodes: p.crdbNodes},
waitForStableClusterVersionStep{id: p.nextID(), nodes: p.crdbNodes},
preserveDowngradeOptionStep{id: p.nextID(), node: preserveDowngradeNode},
}, p.hooks.StartupSteps(p.nextID)...)
preserveDowngradeOptionStep{id: p.nextID(), prng: p.newRNG(), crdbNodes: p.crdbNodes},
}, p.hooks.StartupSteps(p.nextID, p.initialContext())...)
}

// finalSteps are the steps to be run once the nodes have been
Expand All @@ -115,7 +131,7 @@ func (p *testPlanner) initSteps() []testStep {
func (p *testPlanner) finalSteps() []testStep {
return append([]testStep{
waitForStableClusterVersionStep{id: p.nextID(), nodes: p.crdbNodes},
}, p.hooks.AfterUpgradeFinalizedSteps(p.nextID)...)
}, p.hooks.AfterUpgradeFinalizedSteps(p.nextID, p.finalContext(false /* finalizing */))...)
}

func (p *testPlanner) upgradeSteps(from, to string) []testStep {
Expand Down Expand Up @@ -165,18 +181,20 @@ func (p *testPlanner) changeVersionSteps(from, to, label string) []testStep {
// `preserve_downgrade_option` and potentially running mixed-version
// hooks while the cluster version is changing.
func (p *testPlanner) finalizeUpgradeSteps() []testStep {
testContext := Context{Finalizing: true}
finalizeNode := randomNode(p.prng, p.crdbNodes)
return append([]testStep{
finalizeUpgradeStep{id: p.nextID(), node: finalizeNode},
}, p.hooks.MixedVersionSteps(testContext, p.nextID)...)
finalizeUpgradeStep{id: p.nextID(), prng: p.newRNG(), crdbNodes: p.crdbNodes},
}, p.hooks.MixedVersionSteps(p.finalContext(true /* finalizing */), p.nextID)...)
}

func (p *testPlanner) nextID() int {
p.stepCount++
return p.stepCount
}

func (p *testPlanner) newRNG() *rand.Rand {
return rngFromRNG(p.prng)
}

// PrettyPrint displays a tree-like view of the mixed-version test
// plan, useful when debugging mixed-version test failures. Each step
// is assigned an ID, making it easy to correlate the step that
Expand Down Expand Up @@ -211,10 +229,6 @@ func (plan *TestPlan) prettyPrintStep(out *strings.Builder, step testStep, prefi
// concurrent execution), and what database node the step is
// connecting to.
writeSingle := func(rs singleStep, extraContext ...string) {
if node := rs.DBNode(); node != noDBNeeded {
dbinfo := fmt.Sprintf("with connection to node %d", node)
extraContext = append([]string{dbinfo}, extraContext...)
}
var extras string
if contextStr := strings.Join(extraContext, ", "); contextStr != "" {
extras = ", " + contextStr
Expand Down
Loading

0 comments on commit 304e0b4

Please sign in to comment.