sql: only validate new regions when adding/dropping
When we added validation logic to make sure every region corresponded to
a known node locality, we were a little too aggressive. The validation
made it possible to end up in a state where any ALTER ... REGION
operation could hang. This could happen in a few situations; for example:

- a node is restarted with a different --locality flag.
- a multi-region (MR) cluster is restored into a non-MR cluster.
- cluster-to-cluster (c2c) streaming is used with an MR source and a
  non-MR destination.

In all these cases, the problem was that the zone configuration could
reference a region that no longer had any nodes with the corresponding
locality. The validation was overly broad, since it also re-validated
regions that already existed in the zone configuration.

Now, only the newly added region is validated.
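
For illustration, here is a minimal, self-contained sketch of the idea
behind the fix; plain structs stand in for the zonepb protos, and the real
implementation is accumulateNewUniqueConstraints in pkg/sql/set_zone_config.go
below. Only constraints that appear in the updated zone config but not the
current one are validated, so regions whose nodes have disappeared can no
longer wedge the statement.

package main

import "fmt"

// Constraint is a stand-in for zonepb.Constraint; only the fields needed
// for the illustration are kept. (Hypothetical simplification, not CRDB code.)
type Constraint struct {
	Key, Value string
}

// newUniqueConstraints mirrors the set-difference idea of the fix: anything
// already present in the current zone config is assumed to have been checked
// when it was added, so only constraints that are new in the updated config
// need to be validated against live node localities.
func newUniqueConstraints(current, updated []Constraint) []Constraint {
	seen := make(map[Constraint]struct{}, len(current))
	for _, c := range current {
		seen[c] = struct{}{}
	}
	var toValidate []Constraint
	for _, c := range updated {
		if _, ok := seen[c]; ok {
			continue // already known; skip re-validation
		}
		seen[c] = struct{}{} // also de-duplicates within the updated config
		toValidate = append(toValidate, c)
	}
	return toValidate
}

func main() {
	current := []Constraint{{"region", "east"}, {"region", "central"}}
	updated := []Constraint{{"region", "east"}, {"region", "central"}, {"region", "venus"}}
	// Prints [{region venus}]: only the newly added region must match a node
	// locality, even if no node matches east/central anymore.
	fmt.Println(newUniqueConstraints(current, updated))
}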

Release note (bug fix): Fixed a bug that could cause ALTER DATABASE ...
ADD/DROP REGION to hang if node localities were changed after regions
were added.
rafiss committed Nov 16, 2023
1 parent 24bd011 commit 6d740e0
Showing 7 changed files with 190 additions and 40 deletions.
1 change: 1 addition & 0 deletions pkg/cmd/roachtest/tests/BUILD.bazel
@@ -90,6 +90,7 @@ go_library(
"liquibase_blocklist.go",
"loss_of_quorum_recovery.go",
"many_splits.go",
"mismatched_locality.go",
"mixed_version_backup.go",
"mixed_version_cdc.go",
"mixed_version_change_replicas.go",
6 changes: 6 additions & 0 deletions pkg/cmd/roachtest/tests/acceptance.go
@@ -83,6 +83,12 @@ func registerAcceptance(r registry.Registry) {
fn: runValidateSystemSchemaAfterVersionUpgrade,
timeout: 30 * time.Minute,
defaultLeases: true,
numNodes: 1,
},
{
name: "mismatched-locality",
fn: runMismatchedLocalityTest,
numNodes: 3,
},
},
}
86 changes: 86 additions & 0 deletions pkg/cmd/roachtest/tests/mismatched_locality.go
@@ -0,0 +1,86 @@
// Copyright 2023 The Cockroach Authors.
//
// Use of this software is governed by the Business Source License
// included in the file licenses/BSL.txt.
//
// As of the Change Date specified in that file, in accordance with
// the Business Source License, use of this software will be governed
// by the Apache License, Version 2.0, included in the file
// licenses/APL.txt.

package tests

import (
"context"

"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/cluster"
"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/option"
"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/test"
"github.com/cockroachdb/cockroach/pkg/roachprod/install"
)

// This test verifies that we can add and drop regions even if the locality
// flags of the nodes no longer match the regions that were already added to
// the database.
func runMismatchedLocalityTest(ctx context.Context, t test.Test, c cluster.Cluster) {
// Start 3 nodes, each with a different locality.
c.Put(ctx, t.Cockroach(), "./cockroach", c.All())
startOpts := option.DefaultStartOpts()
startOpts.RoachprodOpts.ExtraArgs = []string{"--locality=region=east"}
c.Start(ctx, t.L(), startOpts, install.MakeClusterSettings(), c.Nodes(1))
startOpts.RoachprodOpts.ExtraArgs = []string{"--locality=region=central"}
c.Start(ctx, t.L(), startOpts, install.MakeClusterSettings(), c.Nodes(2))
startOpts.RoachprodOpts.ExtraArgs = []string{"--locality=region=west"}
c.Start(ctx, t.L(), startOpts, install.MakeClusterSettings(), c.Nodes(3))

// Add the 3 regions to the database.
db := c.Conn(ctx, t.L(), 1)
if _, err := db.Exec(`ALTER DATABASE defaultdb PRIMARY REGION "east";`); err != nil {
t.Fatal(err)
}
if _, err := db.Exec(`ALTER DATABASE defaultdb ADD REGION "central";`); err != nil {
t.Fatal(err)
}
if _, err := db.Exec(`ALTER DATABASE defaultdb ADD REGION "west";`); err != nil {
t.Fatal(err)
}

// Restart all the nodes with new localities.
c.Stop(ctx, t.L(), option.DefaultStopOpts(), c.Nodes(1))
startOpts.RoachprodOpts.ExtraArgs = []string{"--locality=region=mars"}
c.Start(ctx, t.L(), startOpts, install.MakeClusterSettings(), c.Nodes(1))
c.Stop(ctx, t.L(), option.DefaultStopOpts(), c.Nodes(2))
startOpts.RoachprodOpts.ExtraArgs = []string{"--locality=region=jupiter"}
c.Start(ctx, t.L(), startOpts, install.MakeClusterSettings(), c.Nodes(2))
c.Stop(ctx, t.L(), option.DefaultStopOpts(), c.Nodes(3))
startOpts.RoachprodOpts.ExtraArgs = []string{"--locality=region=venus"}
c.Start(ctx, t.L(), startOpts, install.MakeClusterSettings(), c.Nodes(3))

// Verify that we can add and drop regions for the database. There's no longer
// any node with the old localities, but that's fine.
db = c.Conn(ctx, t.L(), 3)
if err := WaitFor3XReplication(ctx, t, db); err != nil {
t.Fatal(err)
}
if _, err := db.Exec(`ALTER DATABASE defaultdb ADD REGION "venus";`); err != nil {
t.Fatal(err)
}
if _, err := db.Exec(`ALTER DATABASE defaultdb DROP REGION "central";`); err != nil {
t.Fatal(err)
}
if _, err := db.Exec(`ALTER DATABASE defaultdb ADD REGION "jupiter";`); err != nil {
t.Fatal(err)
}
if _, err := db.Exec(`ALTER DATABASE defaultdb ADD REGION "mars";`); err != nil {
t.Fatal(err)
}
if _, err := db.Exec(`ALTER DATABASE defaultdb SET PRIMARY REGION "mars";`); err != nil {
t.Fatal(err)
}
if _, err := db.Exec(`ALTER DATABASE defaultdb DROP REGION "west";`); err != nil {
t.Fatal(err)
}
if _, err := db.Exec(`ALTER DATABASE defaultdb DROP REGION "east";`); err != nil {
t.Fatal(err)
}
}
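
Assuming the usual acceptance-suite registration scheme (tests named
acceptance/<name>), the new test should be selectable with an invocation
along the lines of "roachtest run acceptance/mismatched-locality"; the exact
flags and binary paths depend on the local roachtest setup.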
13 changes: 11 additions & 2 deletions pkg/sql/alter_database.go
@@ -2326,6 +2326,15 @@ func (n *alterDatabaseSetZoneConfigExtensionNode) startExec(params runParams) er
}
}

currentZone := zonepb.NewZoneConfig()
if currentZoneConfigWithRaw, err := params.p.Descriptors().GetZoneConfig(
params.ctx, params.p.Txn(), n.desc.ID,
); err != nil {
return err
} else if currentZoneConfigWithRaw != nil {
currentZone = currentZoneConfigWithRaw.ZoneConfigProto()
}

if deleteZone {
switch n.n.LocalityLevel {
case tree.LocalityLevelGlobal:
@@ -2388,7 +2397,7 @@ func (n *alterDatabaseSetZoneConfigExtensionNode) startExec(params runParams) er
}

if err := validateZoneAttrsAndLocalities(
params.ctx, params.p.InternalSQLTxn().Regions(), params.p.ExecCfg(), newZone,
params.ctx, params.p.InternalSQLTxn().Regions(), params.p.ExecCfg(), currentZone, newZone,
); err != nil {
return err
}
@@ -2433,7 +2442,7 @@ func (n *alterDatabaseSetZoneConfigExtensionNode) startExec(params runParams) er

// Validate if the zone config extension is compatible with the database.
dbZoneConfig, err := generateAndValidateZoneConfigForMultiRegionDatabase(
params.ctx, params.p.InternalSQLTxn().Regions(), params.ExecCfg(), updatedRegionConfig, true, /* validateLocalities */
params.ctx, params.p.InternalSQLTxn().Regions(), params.ExecCfg(), updatedRegionConfig, currentZone, true, /* validateLocalities */
)
if err != nil {
return err
25 changes: 23 additions & 2 deletions pkg/sql/region_util.go
@@ -877,6 +877,7 @@ func generateAndValidateZoneConfigForMultiRegionDatabase(
regionProvider descs.RegionProvider,
execConfig *ExecutorConfig,
regionConfig multiregion.RegionConfig,
currentZoneConfig *zonepb.ZoneConfig,
validateLocalities bool,
) (zonepb.ZoneConfig, error) {
// Build a zone config based on the RegionConfig information.
@@ -897,7 +898,7 @@
return zonepb.ZoneConfig{}, err
}

if err := validateZoneAttrsAndLocalities(ctx, regionProvider, execConfig, &dbZoneConfig); err != nil {
if err := validateZoneAttrsAndLocalities(ctx, regionProvider, execConfig, currentZoneConfig, &dbZoneConfig); err != nil {
// If we are validating localities this is fatal, otherwise let's log any
// errors as warnings.
if validateLocalities {
@@ -921,8 +922,17 @@ func ApplyZoneConfigFromDatabaseRegionConfig(
validateLocalities bool,
kvTrace bool,
) error {
currentZone := zonepb.NewZoneConfig()
if currentZoneConfigWithRaw, err := txn.Descriptors().GetZoneConfig(
ctx, txn.KV(), dbID,
); err != nil {
return err
} else if currentZoneConfigWithRaw != nil {
currentZone = currentZoneConfigWithRaw.ZoneConfigProto()
}

// Build a zone config based on the RegionConfig information.
dbZoneConfig, err := generateAndValidateZoneConfigForMultiRegionDatabase(ctx, txn.Regions(), execConfig, regionConfig, validateLocalities)
dbZoneConfig, err := generateAndValidateZoneConfigForMultiRegionDatabase(ctx, txn.Regions(), execConfig, regionConfig, currentZone, validateLocalities)
if err != nil {
return err
}
@@ -2623,10 +2633,21 @@ func (zv *zoneConfigValidator) ValidateDbZoneConfig(
if err != nil {
return err
}

currentZone := zonepb.NewZoneConfig()
if currentZoneConfigWithRaw, err := zv.descs.GetZoneConfig(
ctx, zv.txn, db.GetID(),
); err != nil {
return err
} else if currentZoneConfigWithRaw != nil {
currentZone = currentZoneConfigWithRaw.ZoneConfigProto()
}

_, err = generateAndValidateZoneConfigForMultiRegionDatabase(ctx,
zv.regionProvider,
zv.execCfg,
regionConfig,
currentZone,
true, /*validateLocalities*/
)
if err != nil {
95 changes: 61 additions & 34 deletions pkg/sql/set_zone_config.go
@@ -749,8 +749,17 @@ func (n *setZoneConfigNode) startExec(params runParams) error {
return err
}

currentZone := zonepb.NewZoneConfig()
if currentZoneConfigWithRaw, err := params.p.Descriptors().GetZoneConfig(
params.ctx, params.p.Txn(), targetID,
); err != nil {
return err
} else if currentZoneConfigWithRaw != nil {
currentZone = currentZoneConfigWithRaw.ZoneConfigProto()
}

if err := validateZoneAttrsAndLocalities(
params.ctx, params.p.InternalSQLTxn().Regions(), params.p.ExecCfg(), &newZone,
params.ctx, params.p.InternalSQLTxn().Regions(), params.p.ExecCfg(), currentZone, &newZone,
); err != nil {
return err
}
@@ -962,38 +971,54 @@ func validateNoRepeatKeysInConstraints(constraints []zonepb.Constraint) error {
return nil
}

// accumulateUniqueConstraints returns a list of unique constraints in the
// given zone config proto.
func accumulateUniqueConstraints(zone *zonepb.ZoneConfig) []zonepb.Constraint {
constraints := make([]zonepb.Constraint, 0)
// accumulateNewUniqueConstraints returns a list of unique constraints in the
// given newZone config proto that are not in the currentZone.
func accumulateNewUniqueConstraints(currentZone, newZone *zonepb.ZoneConfig) []zonepb.Constraint {
seenConstraints := make(map[zonepb.Constraint]struct{})
retConstraints := make([]zonepb.Constraint, 0)
addToValidate := func(c zonepb.Constraint) {
var alreadyInList bool
for _, val := range constraints {
if c == val {
alreadyInList = true
break
}
if _, ok := seenConstraints[c]; ok {
// Already in the list or in the current zone config, nothing to do.
return
}
retConstraints = append(retConstraints, c)
seenConstraints[c] = struct{}{}
}
// First scan all the current zone config constraints.
for _, constraints := range currentZone.Constraints {
for _, constraint := range constraints.Constraints {
seenConstraints[constraint] = struct{}{}
}
}
for _, constraints := range currentZone.VoterConstraints {
for _, constraint := range constraints.Constraints {
seenConstraints[constraint] = struct{}{}
}
if !alreadyInList {
constraints = append(constraints, c)
}
for _, leasePreferences := range currentZone.LeasePreferences {
for _, constraint := range leasePreferences.Constraints {
seenConstraints[constraint] = struct{}{}
}
}
for _, constraints := range zone.Constraints {

// Then scan all the new zone config constraints, adding the ones that
// were not seen already.
for _, constraints := range newZone.Constraints {
for _, constraint := range constraints.Constraints {
addToValidate(constraint)
}
}
for _, constraints := range zone.VoterConstraints {
for _, constraints := range newZone.VoterConstraints {
for _, constraint := range constraints.Constraints {
addToValidate(constraint)
}
}
for _, leasePreferences := range zone.LeasePreferences {
for _, leasePreferences := range newZone.LeasePreferences {
for _, constraint := range leasePreferences.Constraints {
addToValidate(constraint)
}
}
return constraints
return retConstraints
}
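
// Hypothetical worked example of the set difference above (values are
// illustrative, not part of the commit):
//
//	currentZone constraints:  [+region=east, +region=central]
//	newZone constraints:      [+region=east, +region=central, +region=venus]
//	result:                   [+region=venus]
//
// Only region=venus must match a live node locality; the stale east/central
// constraints are skipped, which is what prevents ALTER ... REGION from
// hanging after node localities change.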

// validateZoneAttrsAndLocalities ensures that all constraints/lease preferences
Expand All @@ -1009,26 +1034,27 @@ func validateZoneAttrsAndLocalities(
ctx context.Context,
regionProvider descs.RegionProvider,
execCfg *ExecutorConfig,
zone *zonepb.ZoneConfig,
currentZone, newZone *zonepb.ZoneConfig,
) error {
// Avoid RPCs to the Node/Region server if we don't have anything to validate.
if len(zone.Constraints) == 0 && len(zone.VoterConstraints) == 0 && len(zone.LeasePreferences) == 0 {
if len(newZone.Constraints) == 0 && len(newZone.VoterConstraints) == 0 && len(newZone.LeasePreferences) == 0 {
return nil
}
if execCfg.Codec.ForSystemTenant() {
ss, err := execCfg.NodesStatusServer.OptionalNodesStatusServer(MultitenancyZoneCfgIssueNo)
if err != nil {
return err
}
return validateZoneAttrsAndLocalitiesForSystemTenant(ctx, ss.ListNodesInternal, zone)
return validateZoneAttrsAndLocalitiesForSystemTenant(ctx, ss.ListNodesInternal, currentZone, newZone)
}
return validateZoneLocalitiesForSecondaryTenants(ctx, regionProvider.GetRegions, zone)
return validateZoneLocalitiesForSecondaryTenants(ctx, regionProvider.GetRegions, currentZone, newZone)
}

// validateZoneAttrsAndLocalitiesForSystemTenant performs all the constraint/
// lease preferences validation for the system tenant. The system tenant is
// allowed to reference both locality and non-locality attributes as it has
// access to node information via the NodeStatusServer.
// validateZoneAttrsAndLocalitiesForSystemTenant performs constraint/lease
// preferences validation for the system tenant. Only newly added constraints
// are validated. The system tenant is allowed to reference both locality and
// non-locality attributes as it has access to node information via the
// NodeStatusServer.
//
// For the system tenant, this only catches typos in required constraints. This
// is by design. We don't want to reject prohibited constraints whose
@@ -1038,14 +1064,14 @@ func validateZoneAttrsAndLocalities(
// the nodes before creating the constraints, data could be replicated there
// that shouldn't be.
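// (Hypothetical illustration, not part of the commit: with nodes only in
// region=east, a new required constraint +region=eats is rejected as a
// likely typo, while a new prohibited constraint -region=west is accepted
// even though no west node exists yet, since operators may want to fence
// data away from a region before its nodes are added.)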
func validateZoneAttrsAndLocalitiesForSystemTenant(
ctx context.Context, getNodes nodeGetter, zone *zonepb.ZoneConfig,
ctx context.Context, getNodes nodeGetter, currentZone, newZone *zonepb.ZoneConfig,
) error {
nodes, err := getNodes(ctx, &serverpb.NodesRequest{})
if err != nil {
return err
}

toValidate := accumulateUniqueConstraints(zone)
toValidate := accumulateNewUniqueConstraints(currentZone, newZone)

// Check that each constraint matches some store somewhere in the cluster.
for _, constraint := range toValidate {
@@ -1077,20 +1103,21 @@ func validateZoneAttrsAndLocalitiesForSystemTenant(
return nil
}

// validateZoneLocalitiesForSecondaryTenants performs all the constraint/lease
// preferences validation for secondary tenants. Secondary tenants are only
// allowed to reference locality attributes as they only have access to region
// information via the serverpb.TenantStatusServer. Even then, they're only
// allowed to reference the "region" and "zone" tiers.
// validateZoneLocalitiesForSecondaryTenants performs constraint/lease
// preferences validation for secondary tenants. Only newly added constraints
// are validated. Unless secondaryTenantsAllZoneConfigsEnabled is set to 'true',
// secondary tenants are only allowed to reference locality attributes as they
// only have access to region information via the serverpb.TenantStatusServer.
// In that case they're only allowed to reference the "region" and "zone" tiers.
//
// Unlike the system tenant, we also validate prohibited constraints. This is
// because secondary tenants must operate in the narrow view exposed via the
// serverpb.TenantStatusServer and are not allowed to configure arbitrary
// constraints (required or otherwise).
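// (Hypothetical illustration, not part of the commit: a secondary tenant may
// reference +region=us-east1 or +zone=us-east1-b if the TenantStatusServer
// reports them, but a node attribute such as +ssd, or a region the server
// does not report, is rejected whether required or prohibited.)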
func validateZoneLocalitiesForSecondaryTenants(
ctx context.Context, getRegions regionsGetter, zone *zonepb.ZoneConfig,
ctx context.Context, getRegions regionsGetter, currentZone, newZone *zonepb.ZoneConfig,
) error {
toValidate := accumulateUniqueConstraints(zone)
toValidate := accumulateNewUniqueConstraints(currentZone, newZone)
resp, err := getRegions(ctx)
if err != nil {
return err
4 changes: 2 additions & 2 deletions pkg/sql/set_zone_config_test.go
@@ -146,7 +146,7 @@ func TestValidateZoneAttrsAndLocalitiesForSecondaryTenants(t *testing.T) {
err := yaml.UnmarshalStrict([]byte(tc.cfg), &zone)
require.NoError(t, err)

err = validateZoneLocalitiesForSecondaryTenants(context.Background(), getRegions, &zone)
err = validateZoneLocalitiesForSecondaryTenants(context.Background(), getRegions, zonepb.NewZoneConfig(), &zone)
if tc.errRe == "" {
require.NoError(t, err)
} else {
@@ -301,7 +301,7 @@ func TestValidateZoneAttrsAndLocalitiesForSystemTenant(t *testing.T) {
t.Fatalf("#%d: expected parse err for %q; got success", i, tc.cfg)
}

err = validateZoneAttrsAndLocalitiesForSystemTenant(context.Background(), tc.nodes, &zone)
err = validateZoneAttrsAndLocalitiesForSystemTenant(context.Background(), tc.nodes, zonepb.NewZoneConfig(), &zone)
if err != nil && tc.expectErr == expectSuccess {
t.Errorf("#%d: expected success for %q; got %v", i, tc.cfg, err)
} else if err == nil && tc.expectErr == expectValidateErr {
