From 6d740e0ddaecb06bcbcc65148fb34ea48eb4eec1 Mon Sep 17 00:00:00 2001
From: Rafi Shamim
Date: Tue, 7 Nov 2023 12:48:41 -0500
Subject: [PATCH] sql: only validate new regions when adding/dropping

When we added validation logic to make sure every region corresponded
to a known node locality, we were too aggressive. The validation made
it possible to end up in a state where any ALTER ... REGION operation
could hang. This could happen in a few situations; for example:
- a node is restarted with a different locality flag.
- an MR cluster is restored into a non-MR cluster.
- c2c streaming is used with an MR source and a non-MR destination.

In all these cases, the problem was that the zone configuration could
reference a region that no longer had any nodes with the corresponding
locality, yet the validation would still check those regions that
already existed in the zone configuration. Now, only the newly added
region is validated.

Release note (bug fix): Fixed a bug that could cause ALTER DATABASE ...
ADD/DROP REGION to hang if node localities were changed after regions
were added.
---
 pkg/cmd/roachtest/tests/BUILD.bazel           |  1 +
 pkg/cmd/roachtest/tests/acceptance.go         |  6 ++
 .../roachtest/tests/mismatched_locality.go    | 86 +++++++++++++++++
 pkg/sql/alter_database.go                     | 13 ++-
 pkg/sql/region_util.go                        | 25 ++++-
 pkg/sql/set_zone_config.go                    | 95 ++++++++++++-------
 pkg/sql/set_zone_config_test.go               |  4 +-
 7 files changed, 190 insertions(+), 40 deletions(-)
 create mode 100644 pkg/cmd/roachtest/tests/mismatched_locality.go

diff --git a/pkg/cmd/roachtest/tests/BUILD.bazel b/pkg/cmd/roachtest/tests/BUILD.bazel
index d34404e356fa..57246c641235 100644
--- a/pkg/cmd/roachtest/tests/BUILD.bazel
+++ b/pkg/cmd/roachtest/tests/BUILD.bazel
@@ -90,6 +90,7 @@ go_library(
         "liquibase_blocklist.go",
         "loss_of_quorum_recovery.go",
         "many_splits.go",
+        "mismatched_locality.go",
         "mixed_version_backup.go",
         "mixed_version_cdc.go",
         "mixed_version_change_replicas.go",
diff --git a/pkg/cmd/roachtest/tests/acceptance.go b/pkg/cmd/roachtest/tests/acceptance.go
index 627dc2e5d77d..fa12e3ec118d 100644
--- a/pkg/cmd/roachtest/tests/acceptance.go
+++ b/pkg/cmd/roachtest/tests/acceptance.go
@@ -83,6 +83,12 @@ func registerAcceptance(r registry.Registry) {
 			fn:            runValidateSystemSchemaAfterVersionUpgrade,
 			timeout:       30 * time.Minute,
 			defaultLeases: true,
+			numNodes:      1,
+		},
+		{
+			name:     "mismatched-locality",
+			fn:       runMismatchedLocalityTest,
+			numNodes: 3,
 		},
 	},
 }
diff --git a/pkg/cmd/roachtest/tests/mismatched_locality.go b/pkg/cmd/roachtest/tests/mismatched_locality.go
new file mode 100644
index 000000000000..664de6f6d6fb
--- /dev/null
+++ b/pkg/cmd/roachtest/tests/mismatched_locality.go
@@ -0,0 +1,86 @@
+// Copyright 2023 The Cockroach Authors.
+//
+// Use of this software is governed by the Business Source License
+// included in the file licenses/BSL.txt.
+//
+// As of the Change Date specified in that file, in accordance with
+// the Business Source License, use of this software will be governed
+// by the Apache License, Version 2.0, included in the file
+// licenses/APL.txt.
+
+package tests
+
+import (
+	"context"
+
+	"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/cluster"
+	"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/option"
+	"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/test"
+	"github.com/cockroachdb/cockroach/pkg/roachprod/install"
+)
+
+// This test verifies that we can add and drop regions even if the locality
+// flags of a node no longer match the regions that were already added to the
+// database.
+func runMismatchedLocalityTest(ctx context.Context, t test.Test, c cluster.Cluster) {
+	// Start 3 nodes, each with a different locality.
+	c.Put(ctx, t.Cockroach(), "./cockroach", c.All())
+	startOpts := option.DefaultStartOpts()
+	startOpts.RoachprodOpts.ExtraArgs = []string{"--locality=region=east"}
+	c.Start(ctx, t.L(), startOpts, install.MakeClusterSettings(), c.Nodes(1))
+	startOpts.RoachprodOpts.ExtraArgs = []string{"--locality=region=central"}
+	c.Start(ctx, t.L(), startOpts, install.MakeClusterSettings(), c.Nodes(2))
+	startOpts.RoachprodOpts.ExtraArgs = []string{"--locality=region=west"}
+	c.Start(ctx, t.L(), startOpts, install.MakeClusterSettings(), c.Nodes(3))
+
+	// Add the 3 regions to the database.
+	db := c.Conn(ctx, t.L(), 1)
+	if _, err := db.Exec(`ALTER DATABASE defaultdb PRIMARY REGION "east";`); err != nil {
+		t.Fatal(err)
+	}
+	if _, err := db.Exec(`ALTER DATABASE defaultdb ADD REGION "central";`); err != nil {
+		t.Fatal(err)
+	}
+	if _, err := db.Exec(`ALTER DATABASE defaultdb ADD REGION "west";`); err != nil {
+		t.Fatal(err)
+	}
+
+	// Restart all the nodes with new localities.
+	c.Stop(ctx, t.L(), option.DefaultStopOpts(), c.Nodes(1))
+	startOpts.RoachprodOpts.ExtraArgs = []string{"--locality=region=mars"}
+	c.Start(ctx, t.L(), startOpts, install.MakeClusterSettings(), c.Nodes(1))
+	c.Stop(ctx, t.L(), option.DefaultStopOpts(), c.Nodes(2))
+	startOpts.RoachprodOpts.ExtraArgs = []string{"--locality=region=jupiter"}
+	c.Start(ctx, t.L(), startOpts, install.MakeClusterSettings(), c.Nodes(2))
+	c.Stop(ctx, t.L(), option.DefaultStopOpts(), c.Nodes(3))
+	startOpts.RoachprodOpts.ExtraArgs = []string{"--locality=region=venus"}
+	c.Start(ctx, t.L(), startOpts, install.MakeClusterSettings(), c.Nodes(3))
+
+	// Verify that we can add and drop regions for the database. There's no longer
+	// any node with the old localities, but that's fine.
+ db = c.Conn(ctx, t.L(), 3) + if err := WaitFor3XReplication(ctx, t, db); err != nil { + t.Fatal(err) + } + if _, err := db.Exec(`ALTER DATABASE defaultdb ADD REGION "venus";`); err != nil { + t.Fatal(err) + } + if _, err := db.Exec(`ALTER DATABASE defaultdb DROP REGION "central";`); err != nil { + t.Fatal(err) + } + if _, err := db.Exec(`ALTER DATABASE defaultdb ADD REGION "jupiter";`); err != nil { + t.Fatal(err) + } + if _, err := db.Exec(`ALTER DATABASE defaultdb ADD REGION "mars";`); err != nil { + t.Fatal(err) + } + if _, err := db.Exec(`ALTER DATABASE defaultdb SET PRIMARY REGION "mars";`); err != nil { + t.Fatal(err) + } + if _, err := db.Exec(`ALTER DATABASE defaultdb DROP REGION "west";`); err != nil { + t.Fatal(err) + } + if _, err := db.Exec(`ALTER DATABASE defaultdb DROP REGION "east";`); err != nil { + t.Fatal(err) + } +} diff --git a/pkg/sql/alter_database.go b/pkg/sql/alter_database.go index f085fef5ce03..2f4ff999e3c1 100644 --- a/pkg/sql/alter_database.go +++ b/pkg/sql/alter_database.go @@ -2326,6 +2326,15 @@ func (n *alterDatabaseSetZoneConfigExtensionNode) startExec(params runParams) er } } + currentZone := zonepb.NewZoneConfig() + if currentZoneConfigWithRaw, err := params.p.Descriptors().GetZoneConfig( + params.ctx, params.p.Txn(), n.desc.ID, + ); err != nil { + return err + } else if currentZoneConfigWithRaw != nil { + currentZone = currentZoneConfigWithRaw.ZoneConfigProto() + } + if deleteZone { switch n.n.LocalityLevel { case tree.LocalityLevelGlobal: @@ -2388,7 +2397,7 @@ func (n *alterDatabaseSetZoneConfigExtensionNode) startExec(params runParams) er } if err := validateZoneAttrsAndLocalities( - params.ctx, params.p.InternalSQLTxn().Regions(), params.p.ExecCfg(), newZone, + params.ctx, params.p.InternalSQLTxn().Regions(), params.p.ExecCfg(), currentZone, newZone, ); err != nil { return err } @@ -2433,7 +2442,7 @@ func (n *alterDatabaseSetZoneConfigExtensionNode) startExec(params runParams) er // Validate if the zone config extension is compatible with the database. dbZoneConfig, err := generateAndValidateZoneConfigForMultiRegionDatabase( - params.ctx, params.p.InternalSQLTxn().Regions(), params.ExecCfg(), updatedRegionConfig, true, /* validateLocalities */ + params.ctx, params.p.InternalSQLTxn().Regions(), params.ExecCfg(), updatedRegionConfig, currentZone, true, /* validateLocalities */ ) if err != nil { return err diff --git a/pkg/sql/region_util.go b/pkg/sql/region_util.go index ddb76a97f713..9b9ffefe0294 100644 --- a/pkg/sql/region_util.go +++ b/pkg/sql/region_util.go @@ -877,6 +877,7 @@ func generateAndValidateZoneConfigForMultiRegionDatabase( regionProvider descs.RegionProvider, execConfig *ExecutorConfig, regionConfig multiregion.RegionConfig, + currentZoneConfig *zonepb.ZoneConfig, validateLocalities bool, ) (zonepb.ZoneConfig, error) { // Build a zone config based on the RegionConfig information. @@ -897,7 +898,7 @@ func generateAndValidateZoneConfigForMultiRegionDatabase( return zonepb.ZoneConfig{}, err } - if err := validateZoneAttrsAndLocalities(ctx, regionProvider, execConfig, &dbZoneConfig); err != nil { + if err := validateZoneAttrsAndLocalities(ctx, regionProvider, execConfig, currentZoneConfig, &dbZoneConfig); err != nil { // If we are validating localities this is fatal, otherwise let's log any // errors as warnings. 
if validateLocalities { @@ -921,8 +922,17 @@ func ApplyZoneConfigFromDatabaseRegionConfig( validateLocalities bool, kvTrace bool, ) error { + currentZone := zonepb.NewZoneConfig() + if currentZoneConfigWithRaw, err := txn.Descriptors().GetZoneConfig( + ctx, txn.KV(), dbID, + ); err != nil { + return err + } else if currentZoneConfigWithRaw != nil { + currentZone = currentZoneConfigWithRaw.ZoneConfigProto() + } + // Build a zone config based on the RegionConfig information. - dbZoneConfig, err := generateAndValidateZoneConfigForMultiRegionDatabase(ctx, txn.Regions(), execConfig, regionConfig, validateLocalities) + dbZoneConfig, err := generateAndValidateZoneConfigForMultiRegionDatabase(ctx, txn.Regions(), execConfig, regionConfig, currentZone, validateLocalities) if err != nil { return err } @@ -2623,10 +2633,21 @@ func (zv *zoneConfigValidator) ValidateDbZoneConfig( if err != nil { return err } + + currentZone := zonepb.NewZoneConfig() + if currentZoneConfigWithRaw, err := zv.descs.GetZoneConfig( + ctx, zv.txn, db.GetID(), + ); err != nil { + return err + } else if currentZoneConfigWithRaw != nil { + currentZone = currentZoneConfigWithRaw.ZoneConfigProto() + } + _, err = generateAndValidateZoneConfigForMultiRegionDatabase(ctx, zv.regionProvider, zv.execCfg, regionConfig, + currentZone, true, /*validateLocalities*/ ) if err != nil { diff --git a/pkg/sql/set_zone_config.go b/pkg/sql/set_zone_config.go index 50e689de4f6b..e0174dcd4fc4 100644 --- a/pkg/sql/set_zone_config.go +++ b/pkg/sql/set_zone_config.go @@ -749,8 +749,17 @@ func (n *setZoneConfigNode) startExec(params runParams) error { return err } + currentZone := zonepb.NewZoneConfig() + if currentZoneConfigWithRaw, err := params.p.Descriptors().GetZoneConfig( + params.ctx, params.p.Txn(), targetID, + ); err != nil { + return err + } else if currentZoneConfigWithRaw != nil { + currentZone = currentZoneConfigWithRaw.ZoneConfigProto() + } + if err := validateZoneAttrsAndLocalities( - params.ctx, params.p.InternalSQLTxn().Regions(), params.p.ExecCfg(), &newZone, + params.ctx, params.p.InternalSQLTxn().Regions(), params.p.ExecCfg(), currentZone, &newZone, ); err != nil { return err } @@ -962,38 +971,54 @@ func validateNoRepeatKeysInConstraints(constraints []zonepb.Constraint) error { return nil } -// accumulateUniqueConstraints returns a list of unique constraints in the -// given zone config proto. -func accumulateUniqueConstraints(zone *zonepb.ZoneConfig) []zonepb.Constraint { - constraints := make([]zonepb.Constraint, 0) +// accumulateNewUniqueConstraints returns a list of unique constraints in the +// given newZone config proto that are not in the currentZone +func accumulateNewUniqueConstraints(currentZone, newZone *zonepb.ZoneConfig) []zonepb.Constraint { + seenConstraints := make(map[zonepb.Constraint]struct{}) + retConstraints := make([]zonepb.Constraint, 0) addToValidate := func(c zonepb.Constraint) { - var alreadyInList bool - for _, val := range constraints { - if c == val { - alreadyInList = true - break - } + if _, ok := seenConstraints[c]; ok { + // Already in the list or in the current zone config, nothing to do. + return + } + retConstraints = append(retConstraints, c) + seenConstraints[c] = struct{}{} + } + // First scan all the current zone config constraints. 
+ for _, constraints := range currentZone.Constraints { + for _, constraint := range constraints.Constraints { + seenConstraints[constraint] = struct{}{} + } + } + for _, constraints := range currentZone.VoterConstraints { + for _, constraint := range constraints.Constraints { + seenConstraints[constraint] = struct{}{} } - if !alreadyInList { - constraints = append(constraints, c) + } + for _, leasePreferences := range currentZone.LeasePreferences { + for _, constraint := range leasePreferences.Constraints { + seenConstraints[constraint] = struct{}{} } } - for _, constraints := range zone.Constraints { + + // Then scan all the new zone config constraints, adding the ones that + // were not seen already. + for _, constraints := range newZone.Constraints { for _, constraint := range constraints.Constraints { addToValidate(constraint) } } - for _, constraints := range zone.VoterConstraints { + for _, constraints := range newZone.VoterConstraints { for _, constraint := range constraints.Constraints { addToValidate(constraint) } } - for _, leasePreferences := range zone.LeasePreferences { + for _, leasePreferences := range newZone.LeasePreferences { for _, constraint := range leasePreferences.Constraints { addToValidate(constraint) } } - return constraints + return retConstraints } // validateZoneAttrsAndLocalities ensures that all constraints/lease preferences @@ -1009,10 +1034,10 @@ func validateZoneAttrsAndLocalities( ctx context.Context, regionProvider descs.RegionProvider, execCfg *ExecutorConfig, - zone *zonepb.ZoneConfig, + currentZone, newZone *zonepb.ZoneConfig, ) error { // Avoid RPCs to the Node/Region server if we don't have anything to validate. - if len(zone.Constraints) == 0 && len(zone.VoterConstraints) == 0 && len(zone.LeasePreferences) == 0 { + if len(newZone.Constraints) == 0 && len(newZone.VoterConstraints) == 0 && len(newZone.LeasePreferences) == 0 { return nil } if execCfg.Codec.ForSystemTenant() { @@ -1020,15 +1045,16 @@ func validateZoneAttrsAndLocalities( if err != nil { return err } - return validateZoneAttrsAndLocalitiesForSystemTenant(ctx, ss.ListNodesInternal, zone) + return validateZoneAttrsAndLocalitiesForSystemTenant(ctx, ss.ListNodesInternal, currentZone, newZone) } - return validateZoneLocalitiesForSecondaryTenants(ctx, regionProvider.GetRegions, zone) + return validateZoneLocalitiesForSecondaryTenants(ctx, regionProvider.GetRegions, currentZone, newZone) } -// validateZoneAttrsAndLocalitiesForSystemTenant performs all the constraint/ -// lease preferences validation for the system tenant. The system tenant is -// allowed to reference both locality and non-locality attributes as it has -// access to node information via the NodeStatusServer. +// validateZoneAttrsAndLocalitiesForSystemTenant performs constraint/ lease +// preferences validation for the system tenant. Only newly added constraints +// are validated. The system tenant is allowed to reference both locality and +// non-locality attributes as it has access to node information via the +// NodeStatusServer. // // For the system tenant, this only catches typos in required constraints. This // is by design. We don't want to reject prohibited constraints whose @@ -1038,14 +1064,14 @@ func validateZoneAttrsAndLocalities( // the nodes before creating the constraints, data could be replicated there // that shouldn't be. 
func validateZoneAttrsAndLocalitiesForSystemTenant( - ctx context.Context, getNodes nodeGetter, zone *zonepb.ZoneConfig, + ctx context.Context, getNodes nodeGetter, currentZone, newZone *zonepb.ZoneConfig, ) error { nodes, err := getNodes(ctx, &serverpb.NodesRequest{}) if err != nil { return err } - toValidate := accumulateUniqueConstraints(zone) + toValidate := accumulateNewUniqueConstraints(currentZone, newZone) // Check that each constraint matches some store somewhere in the cluster. for _, constraint := range toValidate { @@ -1077,20 +1103,21 @@ func validateZoneAttrsAndLocalitiesForSystemTenant( return nil } -// validateZoneLocalitiesForSecondaryTenants performs all the constraint/lease -// preferences validation for secondary tenants. Secondary tenants are only -// allowed to reference locality attributes as they only have access to region -// information via the serverpb.TenantStatusServer. Even then, they're only -// allowed to reference the "region" and "zone" tiers. +// validateZoneLocalitiesForSecondaryTenants performs constraint/lease +// preferences validation for secondary tenants. Only newly added constraints +// are validated. Unless secondaryTenantsAllZoneConfigsEnabled is set to 'true', +// secondary tenants are only allowed to reference locality attributes as they +// only have access to region information via the serverpb.TenantStatusServer. +// In that case they're only allowed to reference the "region" and "zone" tiers. // // Unlike the system tenant, we also validate prohibited constraints. This is // because secondary tenant must operate in the narrow view exposed via the // serverpb.TenantStatusServer and are not allowed to configure arbitrary // constraints (required or otherwise). func validateZoneLocalitiesForSecondaryTenants( - ctx context.Context, getRegions regionsGetter, zone *zonepb.ZoneConfig, + ctx context.Context, getRegions regionsGetter, currentZone, newZone *zonepb.ZoneConfig, ) error { - toValidate := accumulateUniqueConstraints(zone) + toValidate := accumulateNewUniqueConstraints(currentZone, newZone) resp, err := getRegions(ctx) if err != nil { return err diff --git a/pkg/sql/set_zone_config_test.go b/pkg/sql/set_zone_config_test.go index 1d67c96f589e..797265dace40 100644 --- a/pkg/sql/set_zone_config_test.go +++ b/pkg/sql/set_zone_config_test.go @@ -146,7 +146,7 @@ func TestValidateZoneAttrsAndLocalitiesForSecondaryTenants(t *testing.T) { err := yaml.UnmarshalStrict([]byte(tc.cfg), &zone) require.NoError(t, err) - err = validateZoneLocalitiesForSecondaryTenants(context.Background(), getRegions, &zone) + err = validateZoneLocalitiesForSecondaryTenants(context.Background(), getRegions, zonepb.NewZoneConfig(), &zone) if tc.errRe == "" { require.NoError(t, err) } else { @@ -301,7 +301,7 @@ func TestValidateZoneAttrsAndLocalitiesForSystemTenant(t *testing.T) { t.Fatalf("#%d: expected parse err for %q; got success", i, tc.cfg) } - err = validateZoneAttrsAndLocalitiesForSystemTenant(context.Background(), tc.nodes, &zone) + err = validateZoneAttrsAndLocalitiesForSystemTenant(context.Background(), tc.nodes, zonepb.NewZoneConfig(), &zone) if err != nil && tc.expectErr == expectSuccess { t.Errorf("#%d: expected success for %q; got %v", i, tc.cfg, err) } else if err == nil && tc.expectErr == expectValidateErr {