From 066c9db82e697054e9f8ec69920544b6449b8d31 Mon Sep 17 00:00:00 2001 From: Rebecca Taft Date: Wed, 19 Sep 2018 21:10:19 -0400 Subject: [PATCH 1/7] opt: fix panic when srf used with GROUP BY Instead of panicking, we now throw an appropriate error. Fixes #30412 Release note (bug fix): Fixed a panic that occurred when a generator function such as unnest was used in the SELECT list in the presence of GROUP BY. --- pkg/sql/opt/optbuilder/groupby.go | 7 +++++++ pkg/sql/opt/optbuilder/scalar.go | 11 +++++++---- pkg/sql/opt/optbuilder/testdata/srfs | 16 ++++++++++++++++ 3 files changed, 30 insertions(+), 4 deletions(-) diff --git a/pkg/sql/opt/optbuilder/groupby.go b/pkg/sql/opt/optbuilder/groupby.go index 24a1d9c678d3..00df30d00c2d 100644 --- a/pkg/sql/opt/optbuilder/groupby.go +++ b/pkg/sql/opt/optbuilder/groupby.go @@ -515,3 +515,10 @@ var aggOpLookup = map[string]opt.Operator{ "json_agg": opt.JsonAggOp, "jsonb_agg": opt.JsonbAggOp, } + +func newGroupingError(name *tree.Name) error { + return pgerror.NewErrorf(pgerror.CodeGroupingError, + "column \"%s\" must appear in the GROUP BY clause or be used in an aggregate function", + tree.ErrString(name), + ) +} diff --git a/pkg/sql/opt/optbuilder/scalar.go b/pkg/sql/opt/optbuilder/scalar.go index 131100f80edb..3498ce164c2f 100644 --- a/pkg/sql/opt/optbuilder/scalar.go +++ b/pkg/sql/opt/optbuilder/scalar.go @@ -141,10 +141,7 @@ func (b *Builder) buildScalar( // Non-grouping column was referenced. Note that a column that is part // of a larger grouping expression would have been detected by the // groupStrs checking code above. - panic(builderError{pgerror.NewErrorf(pgerror.CodeGroupingError, - "column \"%s\" must appear in the GROUP BY clause or be used in an aggregate function", - tree.ErrString(&t.name), - )}) + panic(builderError{newGroupingError(&t.name)}) } return b.finishBuildScalarRef(t, inScope, outScope, outCol, colRefs) @@ -407,6 +404,12 @@ func (b *Builder) buildScalar( case *srf: if len(t.cols) == 1 { + if inGroupingContext { + // Non-grouping column was referenced. Note that a column that is part + // of a larger grouping expression would have been detected by the + // groupStrs checking code above. + panic(builderError{newGroupingError(&t.cols[0].name)}) + } return b.finishBuildScalarRef(&t.cols[0], inScope, outScope, outCol, colRefs) } list := make([]memo.GroupID, len(t.cols)) diff --git a/pkg/sql/opt/optbuilder/testdata/srfs b/pkg/sql/opt/optbuilder/testdata/srfs index 0719359d3ab7..05bed7484c7e 100644 --- a/pkg/sql/opt/optbuilder/testdata/srfs +++ b/pkg/sql/opt/optbuilder/testdata/srfs @@ -908,3 +908,19 @@ project │ │ └── true [type=bool] │ └── const: 1000 [type=int] └── true [type=bool] + +# Regression test for #30412. +build +SELECT 0, unnest(ARRAY[0]) GROUP BY 1 +---- +error (42803): column "unnest" must appear in the GROUP BY clause or be used in an aggregate function + +build +SELECT 0, unnest(ARRAY[0]) GROUP BY 1, 2 +---- +error: unnest(): generator functions are not allowed in GROUP BY + +build +SELECT 0, information_schema._pg_expandarray(ARRAY[0]) GROUP BY 1 +---- +error (42803): column "x" must appear in the GROUP BY clause or be used in an aggregate function From 081eddd33aaee23d3ce64b31d8d0ee39d93c211a Mon Sep 17 00:00:00 2001 From: Tobias Schottdorf Date: Thu, 20 Sep 2018 10:42:25 +0200 Subject: [PATCH 2/7] roachtest: remove now-unnecessary hack Closes #27717. 
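The hack being removed here had the upgrade test start the cluster, sleep for
five seconds, then stop and restart it, working around #27639 in the old 2.0.x
binaries the test upgrades from. Roughly, the removed sequence (taken from the
diff below) was:

    start := func() {
        c.Start(ctx, c.Range(1, nodes), startArgsDontEncrypt)
    }
    start()
    time.Sleep(5 * time.Second)
    c.Stop(ctx, c.Range(1, nodes))
    start()
    time.Sleep(5 * time.Second)

Now that oldVersion is bumped to v2.0.5, which per the removed TODO should have
#27639 fixed, a single c.Start suffices.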
Release note: None --- pkg/cmd/roachtest/upgrade.go | 14 ++------------ 1 file changed, 2 insertions(+), 12 deletions(-) diff --git a/pkg/cmd/roachtest/upgrade.go b/pkg/cmd/roachtest/upgrade.go index 702afaae7a6c..d41da59115b7 100644 --- a/pkg/cmd/roachtest/upgrade.go +++ b/pkg/cmd/roachtest/upgrade.go @@ -48,17 +48,7 @@ func registerUpgrade(r *registry) { c.Put(ctx, b, "./cockroach", c.Range(1, nodes)) // Force disable encryption. // TODO(mberhault): allow it once oldVersion >= 2.1. - start := func() { - c.Start(ctx, c.Range(1, nodes), startArgsDontEncrypt) - } - start() - time.Sleep(5 * time.Second) - - // TODO(tschottdorf): this is a hack similar to the one in the mixed version - // test. Remove it when we have a 2.0.x binary that has #27639 fixed. - c.Stop(ctx, c.Range(1, nodes)) - start() - time.Sleep(5 * time.Second) + c.Start(ctx, c.Range(1, nodes), startArgsDontEncrypt) const stageDuration = 30 * time.Second const timeUntilStoreDead = 90 * time.Second @@ -266,7 +256,7 @@ func registerUpgrade(r *registry) { } } - const oldVersion = "v2.0.0" + const oldVersion = "v2.0.5" for _, n := range []int{5} { r.Add(testSpec{ Name: fmt.Sprintf("upgrade/oldVersion=%s/nodes=%d", oldVersion, n), From 8757765e64032c1488180bd970e855d4d96aebdc Mon Sep 17 00:00:00 2001 From: Tobias Schottdorf Date: Thu, 20 Sep 2018 10:51:26 +0200 Subject: [PATCH 3/7] storage: give TestReplicateRemovedNodeDisruptiveElection more time Perhaps: Fixes #27253. Release note: None --- pkg/storage/client_raft_test.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pkg/storage/client_raft_test.go b/pkg/storage/client_raft_test.go index 2890198bcac6..54a249b03c17 100644 --- a/pkg/storage/client_raft_test.go +++ b/pkg/storage/client_raft_test.go @@ -3193,7 +3193,7 @@ func TestReplicateRemovedNodeDisruptiveElection(t *testing.T) { default: t.Fatalf("unexpected error type %T: %s", pErr.GetDetail(), pErr) } - case <-time.After(5 * time.Second): + case <-time.After(45 * time.Second): t.Fatal("did not get expected error") } From db68ad943b2677a18ad8ab3ed8dc20d8c2790f7e Mon Sep 17 00:00:00 2001 From: Tobias Schottdorf Date: Thu, 20 Sep 2018 11:11:03 +0200 Subject: [PATCH 4/7] storage: de-flake TestReplicaIDChangePending setReplicaID refreshes the proposal and was thus synchronously writing to the commandProposed chan. This channel could have filled up due to an earlier reproposal already, deadlocking the test. Fixes #28132. Release note: None --- pkg/storage/replica_test.go | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/pkg/storage/replica_test.go b/pkg/storage/replica_test.go index 3159e7bc7cd9..09fe27af3a59 100644 --- a/pkg/storage/replica_test.go +++ b/pkg/storage/replica_test.go @@ -7861,7 +7861,10 @@ func TestReplicaIDChangePending(t *testing.T) { repl.mu.Lock() repl.mu.submitProposalFn = func(p *ProposalData) error { if p.Request.Timestamp == magicTS { - commandProposed <- struct{}{} + select { + case commandProposed <- struct{}{}: + default: + } } return nil } From 4b20a079c78cbc2ed13f1caa38e5cc6938ff08bc Mon Sep 17 00:00:00 2001 From: Tobias Schottdorf Date: Thu, 20 Sep 2018 12:30:41 +0200 Subject: [PATCH 5/7] testcluster: improve AddReplicas check AddReplicas was verifying that a replica had indeed been added, but there's no guarantee that the replicate queue wouldn't have removed it in the meantime. Attempt to work around this somewhat. The real solution is not to provide that guarantee, but some tests likely rely on it (and the failure is extremely rare, i.e. 
the new for loop basically never runs). Observed in #28368. Release note: None --- pkg/testutils/testcluster/testcluster.go | 60 +++++++++++++----------- 1 file changed, 33 insertions(+), 27 deletions(-) diff --git a/pkg/testutils/testcluster/testcluster.go b/pkg/testutils/testcluster/testcluster.go index 030301ae04c1..e217b06804c1 100644 --- a/pkg/testutils/testcluster/testcluster.go +++ b/pkg/testutils/testcluster/testcluster.go @@ -348,37 +348,43 @@ func (tc *TestCluster) AddReplicas( startKey roachpb.Key, targets ...roachpb.ReplicationTarget, ) (roachpb.RangeDescriptor, error) { rKey := keys.MustAddr(startKey) - rangeDesc, err := tc.changeReplicas( - roachpb.ADD_REPLICA, rKey, targets..., - ) - if err != nil { - return roachpb.RangeDescriptor{}, err - } + errRetry := errors.Errorf("target not found") + for { + rangeDesc, err := tc.changeReplicas( + roachpb.ADD_REPLICA, rKey, targets..., + ) + if err != nil { + return roachpb.RangeDescriptor{}, err + } - // Wait for the replication to complete on all destination nodes. - if err := retry.ForDuration(time.Second*5, func() error { - for _, target := range targets { - // Use LookupReplica(keys) instead of GetRange(rangeID) to ensure that the - // snapshot has been transferred and the descriptor initialized. - store, err := tc.findMemberStore(target.StoreID) - if err != nil { - log.Errorf(context.TODO(), "unexpected error: %s", err) - return err - } - repl := store.LookupReplica(rKey) - if repl == nil { - return errors.Errorf("range not found on store %d", target) - } - desc := repl.Desc() - if _, ok := desc.GetReplicaDescriptor(target.StoreID); !ok { - return errors.Errorf("target store %d not yet in range descriptor %v", target.StoreID, desc) + // Wait for the replication to complete on all destination nodes. + if err := retry.ForDuration(time.Second*25, func() error { + for _, target := range targets { + // Use LookupReplica(keys) instead of GetRange(rangeID) to ensure that the + // snapshot has been transferred and the descriptor initialized. + store, err := tc.findMemberStore(target.StoreID) + if err != nil { + log.Errorf(context.TODO(), "unexpected error: %s", err) + return err + } + repl := store.LookupReplica(rKey) + if repl == nil { + return errors.Wrapf(errRetry, "for target %s", target) + } + desc := repl.Desc() + if _, ok := desc.GetReplicaDescriptor(target.StoreID); !ok { + return errors.Errorf("target store %d not yet in range descriptor %v", target.StoreID, desc) + } } + return nil + }); errors.Cause(err) == errRetry { + log.Warningf(context.Background(), "target was likely downreplicated again; retrying after %s", err) + continue + } else if err != nil { + return roachpb.RangeDescriptor{}, err } - return nil - }); err != nil { - return roachpb.RangeDescriptor{}, err + return rangeDesc, nil } - return rangeDesc, nil } // RemoveReplicas is part of the TestServerInterface. From fe0d16bc4ce58755ab2a18600d983556eb622227 Mon Sep 17 00:00:00 2001 From: Tobias Schottdorf Date: Thu, 20 Sep 2018 12:50:06 +0200 Subject: [PATCH 6/7] storage: unskip TestClosedTimestampCanServe for non-race Fixes #28607. 
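The test is no longer skipped unconditionally; it now only bails out when the
race detector is enabled, since the aggressive closed-timestamp settings do not
leave transactions enough headroom under race without driving the test duration
up. The guard (shown in the diff below) is essentially:

    if util.RaceEnabled {
        // Limiting how long transactions can run does not work
        // well with race unless we're extremely lenient, which
        // drives up the test duration.
        t.Skip("skipping under race")
    }

The target duration and close fraction also become variables so that the
cluster-settings statement can be built with fmt.Sprintf.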
Release note: None --- pkg/storage/closed_timestamp_test.go | 21 ++++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) diff --git a/pkg/storage/closed_timestamp_test.go b/pkg/storage/closed_timestamp_test.go index dca2215f4f34..44b2ccca5c95 100644 --- a/pkg/storage/closed_timestamp_test.go +++ b/pkg/storage/closed_timestamp_test.go @@ -16,6 +16,7 @@ package storage_test import ( "context" + "fmt" "testing" "time" @@ -28,6 +29,7 @@ import ( "github.com/cockroachdb/cockroach/pkg/storage/engine/enginepb" "github.com/cockroachdb/cockroach/pkg/testutils" "github.com/cockroachdb/cockroach/pkg/testutils/serverutils" + "github.com/cockroachdb/cockroach/pkg/util" "github.com/cockroachdb/cockroach/pkg/util/hlc" "github.com/cockroachdb/cockroach/pkg/util/leaktest" "github.com/cockroachdb/cockroach/pkg/util/timeutil" @@ -35,7 +37,13 @@ import ( func TestClosedTimestampCanServe(t *testing.T) { defer leaktest.AfterTest(t)() - t.Skip("https://github.com/cockroachdb/cockroach/issues/28607") + + if util.RaceEnabled { + // Limiting how long transactions can run does not work + // well with race unless we're extremely lenient, which + // drives up the test duration. + t.Skip("skipping under race") + } ctx := context.Background() const numNodes = 3 @@ -47,13 +55,16 @@ func TestClosedTimestampCanServe(t *testing.T) { // Every 0.1s=100ms, try close out a timestamp ~300ms in the past. // We don't want to be more aggressive than that since it's also // a limit on how long transactions can run. - if _, err := db0.Exec(` -SET CLUSTER SETTING kv.closed_timestamp.target_duration = '300ms'; -SET CLUSTER SETTING kv.closed_timestamp.close_fraction = 0.1/0.3; + targetDuration := 300 * time.Millisecond + closeFraction := 0.3 + + if _, err := db0.Exec(fmt.Sprintf(` +SET CLUSTER SETTING kv.closed_timestamp.target_duration = '%s'; +SET CLUSTER SETTING kv.closed_timestamp.close_fraction = %.3f; SET CLUSTER SETTING kv.closed_timestamp.follower_reads_enabled = true; CREATE DATABASE cttest; CREATE TABLE cttest.kv (id INT PRIMARY KEY, value STRING); -`); err != nil { +`, targetDuration, closeFraction)); err != nil { t.Fatal(err) } From db1eb7d5374ac503eb3615fe8e2225475f2e4128 Mon Sep 17 00:00:00 2001 From: Tobias Schottdorf Date: Thu, 20 Sep 2018 16:21:21 +0200 Subject: [PATCH 7/7] roachtest: mark acceptance as stable all of its subtests are already stable, but in running a test locally I noticed that the top-level test was marked as passing as unstable. I'm not sure, but this might mean that the top-level test would actually not fail? Either way, better to mark it as stable explicitly. We should also spend some thought on how diverging notions of Stable in sub vs top level test are treated, not sure that this is well-defined. Release note: None --- pkg/cmd/roachtest/acceptance.go | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pkg/cmd/roachtest/acceptance.go b/pkg/cmd/roachtest/acceptance.go index 7dd4a21adddb..8ebff00081c9 100644 --- a/pkg/cmd/roachtest/acceptance.go +++ b/pkg/cmd/roachtest/acceptance.go @@ -25,8 +25,9 @@ func registerAcceptance(r *registry) { // local mode the acceptance tests should be configured to run within a // minute or so as these tests are run on every merge to master. spec := testSpec{ - Name: "acceptance", - Nodes: nodes(4), + Name: "acceptance", + Nodes: nodes(4), + Stable: true, // DO NOT COPY to new tests } testCases := []struct {