diff --git a/pkg/ccl/logictestccl/testdata/logic_test/partitioning b/pkg/ccl/logictestccl/testdata/logic_test/partitioning index 55b6a9eb132b..f00f24460521 100644 --- a/pkg/ccl/logictestccl/testdata/logic_test/partitioning +++ b/pkg/ccl/logictestccl/testdata/logic_test/partitioning @@ -428,9 +428,13 @@ TABLE ok1 └── PRIMARY INDEX primary ├── a int not null ├── b int not null - └── partition by list prefixes - ├── (1) - └── (2) + └── partitions + ├── p1 + │ └── partition by list prefixes + │ └── (1) + └── p2 + └── partition by list prefixes + └── (2) scan ok1 statement ok @@ -472,9 +476,13 @@ TABLE ok2 └── PRIMARY INDEX primary ├── a int not null ├── b int not null - └── partition by list prefixes - ├── (1) - └── (2) + └── partitions + ├── p1 + │ └── partition by list prefixes + │ └── (1) + └── p2 + └── partition by list prefixes + └── (2) scan ok2 statement ok @@ -516,8 +524,13 @@ TABLE ok3 └── PRIMARY INDEX primary ├── a int not null ├── b int not null - └── partition by list prefixes - └── (1) + └── partitions + ├── p1 + │ └── partition by list prefixes + │ └── (1) + └── p2 + └── partition by list prefixes + └── () scan ok3 statement ok @@ -563,10 +576,19 @@ TABLE ok4 └── PRIMARY INDEX primary ├── a int not null ├── b int not null - └── partition by list prefixes - ├── (1, 1) - ├── (1) - └── (2, 3) + └── partitions + ├── p1 + │ └── partition by list prefixes + │ └── (1, 1) + ├── p2 + │ └── partition by list prefixes + │ └── (1) + ├── p3 + │ └── partition by list prefixes + │ └── (2, 3) + └── p4 + └── partition by list prefixes + └── () scan ok4 statement ok @@ -603,9 +625,16 @@ TABLE ok5 └── PRIMARY INDEX primary ├── a int not null ├── b int not null - └── partition by list prefixes - ├── (1) - └── (2) + └── partitions + ├── p1 + │ └── partition by list prefixes + │ └── (1) + ├── p2 + │ └── partition by list prefixes + │ └── (2) + └── p3 + └── partition by list prefixes + └── () scan ok5 query TT @@ -897,9 +926,13 @@ TABLE ok11 ├── a int not null ├── b int not 
null ├── c int not null - └── partition by list prefixes - ├── (1) - └── (6) + └── partitions + ├── p1 + │ └── partition by list prefixes + │ └── (1) + └── p2 + └── partition by list prefixes + └── (6) scan ok11 statement ok @@ -943,10 +976,16 @@ TABLE ok12 └── PRIMARY INDEX primary ├── a int not null ├── b int not null - └── partition by list prefixes - ├── (NULL) - ├── (1) - └── (2) + └── partitions + ├── pu + │ └── partition by list prefixes + │ └── (NULL) + ├── p1 + │ └── partition by list prefixes + │ └── (1) + └── p2 + └── partition by list prefixes + └── (2) scan ok12 # Verify that creating a partition that includes NULL does not change the diff --git a/pkg/ccl/logictestccl/testdata/logic_test/partitioning_implicit b/pkg/ccl/logictestccl/testdata/logic_test/partitioning_implicit index c1cdf2d3a50e..e1f6f4951732 100644 --- a/pkg/ccl/logictestccl/testdata/logic_test/partitioning_implicit +++ b/pkg/ccl/logictestccl/testdata/logic_test/partitioning_implicit @@ -149,6 +149,73 @@ CREATE TABLE public.t ( ) -- Warning: Partitioned table with no zone configurations. 
+query T +EXPLAIN (OPT, CATALOG) SELECT * FROM t +---- +TABLE t + ├── pk int not null + ├── a int not null + ├── b int + ├── c int + ├── d int + ├── j jsonb + ├── crdb_internal_mvcc_timestamp decimal [hidden] [system] + ├── tableoid oid [hidden] [system] + ├── j_inverted_key bytes not null [virtual-inverted] + ├── FAMILY fam_0_pk_a_b_c_d_j (pk, a, b, c, d, j) + ├── PRIMARY INDEX primary + │ ├── a int not null (implicit) + │ ├── pk int not null + │ └── partitions + │ └── pk_implicit + │ └── partition by list prefixes + │ └── (1) + ├── INDEX t_b_idx + │ ├── a int not null (implicit) + │ ├── b int + │ ├── pk int not null + │ └── partitions + │ └── b_implicit + │ └── partition by list prefixes + │ └── (2) + ├── UNIQUE INDEX t_c_key + │ ├── a int not null (implicit) + │ ├── c int + │ ├── pk int not null (storing) + │ └── partitions + │ └── c_implicit + │ └── partition by list prefixes + │ └── (3) + ├── INDEX t_a_b_c_idx + │ ├── d int (implicit) + │ ├── a int not null + │ ├── b int + │ ├── c int + │ ├── pk int not null + │ └── partitions + │ └── a_b_c_implicit + │ └── partition by list prefixes + │ └── (4) + ├── INVERTED INDEX t_j_idx + │ ├── a int not null (implicit) + │ ├── j_inverted_key bytes not null [virtual-inverted] + │ ├── pk int not null + │ └── partitions + │ └── j_implicit + │ └── partition by list prefixes + │ └── (5) + ├── INDEX new_idx + │ ├── a int not null (implicit) + │ ├── d int + │ ├── pk int not null + │ └── partitions + │ └── d_implicit + │ └── partition by list prefixes + │ └── (1) + ├── UNIQUE WITHOUT INDEX (pk) + └── UNIQUE WITHOUT INDEX (c) +scan t + statement ok INSERT INTO t VALUES (1, 2, 3, 4, 5) diff --git a/pkg/ccl/logictestccl/testdata/logic_test/regional_by_row b/pkg/ccl/logictestccl/testdata/logic_test/regional_by_row index 0155bef99665..e26f53629c79 100644 --- a/pkg/ccl/logictestccl/testdata/logic_test/regional_by_row +++ b/pkg/ccl/logictestccl/testdata/logic_test/regional_by_row @@ -151,6 +151,194 @@ ALTER PARTITION "us-east-1" OF 
INDEX multi_region_test_db.public.regional_by_row voter_constraints = '{+region=us-east-1: 2}', lease_preferences = '[[+region=us-east-1]]' +query T +EXPLAIN (OPT, CATALOG) SELECT * FROM regional_by_row_table +---- +TABLE regional_by_row_table + ├── pk int not null + ├── pk2 int not null + ├── a int not null + ├── b int not null + ├── j jsonb + ├── crdb_region crdb_internal_region not null default (default_to_database_primary_region(gateway_region())::@100054) [hidden] + ├── crdb_internal_mvcc_timestamp decimal [hidden] [system] + ├── tableoid oid [hidden] [system] + ├── j_inverted_key bytes not null [virtual-inverted] + ├── FAMILY fam_0_pk_pk2_a_b_j_crdb_region (pk, pk2, a, b, j, crdb_region) + ├── CHECK (crdb_region IN (b'\x40':::@100054, b'\x80':::@100054, b'\xc0':::@100054)) + ├── PRIMARY INDEX primary + │ ├── crdb_region crdb_internal_region not null default (default_to_database_primary_region(gateway_region())::@100054) [hidden] (implicit) + │ ├── pk int not null + │ ├── ZONE + │ │ ├── replica constraints + │ │ │ ├── 1 replicas: [+region=ap-southeast-2] + │ │ │ ├── 1 replicas: [+region=ca-central-1] + │ │ │ ├── 1 replicas: [+region=us-east-1] + │ │ │ └── voter constraints: [+region=ca-central-1] + │ │ └── lease preference: [+region=ca-central-1] + │ └── partitions + │ ├── ap-southeast-2 + │ │ ├── partition by list prefixes + │ │ │ └── ('ap-southeast-2') + │ │ └── ZONE + │ │ ├── replica constraints + │ │ │ ├── 1 replicas: [+region=ap-southeast-2] + │ │ │ ├── 1 replicas: [+region=ca-central-1] + │ │ │ ├── 1 replicas: [+region=us-east-1] + │ │ │ └── voter constraints: [+region=ap-southeast-2] + │ │ └── lease preference: [+region=ap-southeast-2] + │ ├── ca-central-1 + │ │ ├── partition by list prefixes + │ │ │ └── ('ca-central-1') + │ │ └── ZONE + │ │ ├── replica constraints + │ │ │ ├── 1 replicas: [+region=ap-southeast-2] + │ │ │ ├── 1 replicas: [+region=ca-central-1] + │ │ │ ├── 1 replicas: [+region=us-east-1] + │ │ │ └── voter constraints: 
[+region=ca-central-1] + │ │ └── lease preference: [+region=ca-central-1] + │ └── us-east-1 + │ ├── partition by list prefixes + │ │ └── ('us-east-1') + │ └── ZONE + │ ├── replica constraints + │ │ ├── 1 replicas: [+region=ap-southeast-2] + │ │ ├── 1 replicas: [+region=ca-central-1] + │ │ ├── 1 replicas: [+region=us-east-1] + │ │ └── voter constraints: [+region=us-east-1] + │ └── lease preference: [+region=us-east-1] + ├── INDEX regional_by_row_table_a_idx + │ ├── crdb_region crdb_internal_region not null default (default_to_database_primary_region(gateway_region())::@100054) [hidden] (implicit) + │ ├── a int not null + │ ├── pk int not null + │ ├── ZONE + │ │ ├── replica constraints + │ │ │ ├── 1 replicas: [+region=ap-southeast-2] + │ │ │ ├── 1 replicas: [+region=ca-central-1] + │ │ │ ├── 1 replicas: [+region=us-east-1] + │ │ │ └── voter constraints: [+region=ca-central-1] + │ │ └── lease preference: [+region=ca-central-1] + │ └── partitions + │ ├── ap-southeast-2 + │ │ ├── partition by list prefixes + │ │ │ └── ('ap-southeast-2') + │ │ └── ZONE + │ │ ├── replica constraints + │ │ │ ├── 1 replicas: [+region=ap-southeast-2] + │ │ │ ├── 1 replicas: [+region=ca-central-1] + │ │ │ ├── 1 replicas: [+region=us-east-1] + │ │ │ └── voter constraints: [+region=ap-southeast-2] + │ │ └── lease preference: [+region=ap-southeast-2] + │ ├── ca-central-1 + │ │ ├── partition by list prefixes + │ │ │ └── ('ca-central-1') + │ │ └── ZONE + │ │ ├── replica constraints + │ │ │ ├── 1 replicas: [+region=ap-southeast-2] + │ │ │ ├── 1 replicas: [+region=ca-central-1] + │ │ │ ├── 1 replicas: [+region=us-east-1] + │ │ │ └── voter constraints: [+region=ca-central-1] + │ │ └── lease preference: [+region=ca-central-1] + │ └── us-east-1 + │ ├── partition by list prefixes + │ │ └── ('us-east-1') + │ └── ZONE + │ ├── replica constraints + │ │ ├── 1 replicas: [+region=ap-southeast-2] + │ │ ├── 1 replicas: [+region=ca-central-1] + │ │ ├── 1 replicas: [+region=us-east-1] + │ │ └── voter constraints: 
[+region=us-east-1] + │ └── lease preference: [+region=us-east-1] + ├── UNIQUE INDEX regional_by_row_table_b_key + │ ├── crdb_region crdb_internal_region not null default (default_to_database_primary_region(gateway_region())::@100054) [hidden] (implicit) + │ ├── b int not null + │ ├── pk int not null (storing) + │ ├── ZONE + │ │ ├── replica constraints + │ │ │ ├── 1 replicas: [+region=ap-southeast-2] + │ │ │ ├── 1 replicas: [+region=ca-central-1] + │ │ │ ├── 1 replicas: [+region=us-east-1] + │ │ │ └── voter constraints: [+region=ca-central-1] + │ │ └── lease preference: [+region=ca-central-1] + │ └── partitions + │ ├── ap-southeast-2 + │ │ ├── partition by list prefixes + │ │ │ └── ('ap-southeast-2') + │ │ └── ZONE + │ │ ├── replica constraints + │ │ │ ├── 1 replicas: [+region=ap-southeast-2] + │ │ │ ├── 1 replicas: [+region=ca-central-1] + │ │ │ ├── 1 replicas: [+region=us-east-1] + │ │ │ └── voter constraints: [+region=ap-southeast-2] + │ │ └── lease preference: [+region=ap-southeast-2] + │ ├── ca-central-1 + │ │ ├── partition by list prefixes + │ │ │ └── ('ca-central-1') + │ │ └── ZONE + │ │ ├── replica constraints + │ │ │ ├── 1 replicas: [+region=ap-southeast-2] + │ │ │ ├── 1 replicas: [+region=ca-central-1] + │ │ │ ├── 1 replicas: [+region=us-east-1] + │ │ │ └── voter constraints: [+region=ca-central-1] + │ │ └── lease preference: [+region=ca-central-1] + │ └── us-east-1 + │ ├── partition by list prefixes + │ │ └── ('us-east-1') + │ └── ZONE + │ ├── replica constraints + │ │ ├── 1 replicas: [+region=ap-southeast-2] + │ │ ├── 1 replicas: [+region=ca-central-1] + │ │ ├── 1 replicas: [+region=us-east-1] + │ │ └── voter constraints: [+region=us-east-1] + │ └── lease preference: [+region=us-east-1] + ├── INVERTED INDEX regional_by_row_table_j_idx + │ ├── crdb_region crdb_internal_region not null default (default_to_database_primary_region(gateway_region())::@100054) [hidden] (implicit) + │ ├── j_inverted_key bytes not null [virtual-inverted] + │ ├── pk int not null 
+ │ ├── ZONE + │ │ ├── replica constraints + │ │ │ ├── 1 replicas: [+region=ap-southeast-2] + │ │ │ ├── 1 replicas: [+region=ca-central-1] + │ │ │ ├── 1 replicas: [+region=us-east-1] + │ │ │ └── voter constraints: [+region=ca-central-1] + │ │ └── lease preference: [+region=ca-central-1] + │ └── partitions + │ ├── ap-southeast-2 + │ │ ├── partition by list prefixes + │ │ │ └── ('ap-southeast-2') + │ │ └── ZONE + │ │ ├── replica constraints + │ │ │ ├── 1 replicas: [+region=ap-southeast-2] + │ │ │ ├── 1 replicas: [+region=ca-central-1] + │ │ │ ├── 1 replicas: [+region=us-east-1] + │ │ │ └── voter constraints: [+region=ap-southeast-2] + │ │ └── lease preference: [+region=ap-southeast-2] + │ ├── ca-central-1 + │ │ ├── partition by list prefixes + │ │ │ └── ('ca-central-1') + │ │ └── ZONE + │ │ ├── replica constraints + │ │ │ ├── 1 replicas: [+region=ap-southeast-2] + │ │ │ ├── 1 replicas: [+region=ca-central-1] + │ │ │ ├── 1 replicas: [+region=us-east-1] + │ │ │ └── voter constraints: [+region=ca-central-1] + │ │ └── lease preference: [+region=ca-central-1] + │ └── us-east-1 + │ ├── partition by list prefixes + │ │ └── ('us-east-1') + │ └── ZONE + │ ├── replica constraints + │ │ ├── 1 replicas: [+region=ap-southeast-2] + │ │ ├── 1 replicas: [+region=ca-central-1] + │ │ ├── 1 replicas: [+region=us-east-1] + │ │ └── voter constraints: [+region=us-east-1] + │ └── lease preference: [+region=us-east-1] + ├── UNIQUE WITHOUT INDEX (pk) + └── UNIQUE WITHOUT INDEX (b) +scan regional_by_row_table + └── check constraint expressions + └── crdb_region IN ('ap-southeast-2', 'ca-central-1', 'us-east-1') + query TTB colnames SELECT index_name, column_name, implicit FROM crdb_internal.index_columns WHERE descriptor_name = 'regional_by_row_table' AND column_type = 'key' diff --git a/pkg/config/zonepb/zone.go b/pkg/config/zonepb/zone.go index 98c45fae0ed6..4d8e60533166 100644 --- a/pkg/config/zonepb/zone.go +++ b/pkg/config/zonepb/zone.go @@ -784,6 +784,16 @@ func (z *ZoneConfig) 
ReplicaConstraints(i int) cat.ReplicaConstraints { return &z.Constraints[i] } +// VoterConstraintsCount is part of the cat.Zone interface. +func (z *ZoneConfig) VoterConstraintsCount() int { + return len(z.VoterConstraints) +} + +// VoterConstraint is part of the cat.Zone interface. +func (z *ZoneConfig) VoterConstraint(i int) cat.ReplicaConstraints { + return &z.VoterConstraints[i] +} + // LeasePreferenceCount is part of the cat.Zone interface. func (z *ZoneConfig) LeasePreferenceCount() int { return len(z.LeasePreferences) diff --git a/pkg/sql/opt/cat/index.go b/pkg/sql/opt/cat/index.go index fce64a26a8cf..0c167c436170 100644 --- a/pkg/sql/opt/cat/index.go +++ b/pkg/sql/opt/cat/index.go @@ -155,40 +155,6 @@ type Index interface { // Span returns the KV span associated with the index. Span() roachpb.Span - // PartitionByListPrefixes returns values that correspond to PARTITION BY LIST - // values. Specifically, it returns a list of tuples where each tuple contains - // values for a prefix of index columns (indicating a region of the index). - // Each tuple corresponds to a configured partition or subpartition. - // - // Note: this function decodes and allocates datums; use sparingly. - // - // Example: - // - // CREATE INDEX idx ON t(region,subregion,val) PARTITION BY LIST (region,subregion) ( - // PARTITION westcoast VALUES IN (('us', 'seattle'), ('us', 'cali')), - // PARTITION us VALUES IN (('us', DEFAULT)), - // PARTITION eu VALUES IN (('eu', DEFAULT)), - // PARTITION default VALUES IN (DEFAULT) - // ); - // - // PartitionByListPrefixes() returns - // ('us', 'seattle'), - // ('us', 'cali'), - // ('us'), - // ('eu'). - // - // The intended use of this function is for index skip scans. Each tuple - // corresponds to a region of the index that we can constrain further. 
In the - // example above: if we have a val=1 filter, instead of a full index scan we - // can skip most of the data under /us/cali and /us/seattle by scanning spans: - // [ - /us/cali ) - // [ /us/cali/1 - /us/cali/1 ] - // [ /us/cali\x00 - /us/seattle ) - // [ /us/seattle/1 - /us/seattle/1 ] - // [ /us/seattle\x00 - ] - // - PartitionByListPrefixes() []tree.Datums - // ImplicitPartitioningColumnCount returns the number of implicit partitioning // columns at the front of the index. For example, consider the following // table: @@ -257,6 +223,14 @@ type Index interface { // Version returns the IndexDescriptorVersion of the index. Version() descpb.IndexDescriptorVersion + + // PartitionCount returns the number of PARTITION BY LIST partitions defined + // on this index. + PartitionCount() int + + // Partition returns the ith PARTITION BY LIST partition within the index + // definition, where i < PartitionCount. + Partition(i int) Partition } // IndexColumn describes a single column that is part of an index definition. @@ -275,3 +249,61 @@ type IndexColumn struct { func IsMutationIndex(table Table, ord IndexOrdinal) bool { return ord >= table.IndexCount() } + +// Partition is an interface to a PARTITION BY LIST partition of an index. The +// intended use is to support planning of scans or lookup joins that will use +// locality optimized search. Locality optimized search can be planned when the +// maximum number of rows returned by a scan or lookup join is known, but the +// specific region in which the rows are located is unknown. In this case, the +// optimizer will plan a scan or lookup join in which local nodes (i.e., nodes +// in the gateway region) are searched for matching rows before remote nodes, in +// the hope that the execution engine can avoid visiting remote nodes. +type Partition interface { + // Name is the name of this partition. + Name() string + + // Zone returns the zone which constrains placement of this partition's + // replicas. 
If this partition does not have an associated zone, the returned + // zone is empty, but non-nil. + Zone() Zone + + // PartitionByListPrefixes returns the values of this partition. Specifically, + // it returns a list of tuples where each tuple contains values for a prefix + // of index columns (indicating the region of the index covered by this + // partition). + // + // Example: + // + // CREATE INDEX idx ON t(region,subregion,val) PARTITION BY LIST (region,subregion) ( + // PARTITION westcoast VALUES IN (('us', 'seattle'), ('us', 'cali')), + // PARTITION us VALUES IN (('us', DEFAULT)), + // PARTITION eu VALUES IN (('eu', DEFAULT)), + // PARTITION default VALUES IN (DEFAULT) + // ); + // + // If this is the westcoast partition, PartitionByListPrefixes() returns + // ('us', 'seattle'), + // ('us', 'cali') + // + // If this is the us partition, PartitionByListPrefixes() cuts off the DEFAULT + // value and just returns + // ('us') + // + // Finally, if this is the default partition, PartitionByListPrefixes() + // returns an empty slice. + // + // In addition to supporting locality optimized search as described above, + // this function can be used to support index skip scans. To support index + // skip scans, we collect the PartitionByListPrefixes for all partitions in + // the index. Each tuple corresponds to a region of the index that we can + // constrain further. 
In the example above: if we have a val=1 filter, instead + // of a full index scan we can skip most of the data under /us/cali and + // /us/seattle by scanning spans: + // [ - /us/cali ) + // [ /us/cali/1 - /us/cali/1 ] + // [ /us/cali\x00 - /us/seattle ) + // [ /us/seattle/1 - /us/seattle/1 ] + // [ /us/seattle\x00 - ] + // + PartitionByListPrefixes() []tree.Datums +} diff --git a/pkg/sql/opt/cat/utils.go b/pkg/sql/opt/cat/utils.go index de8daed70b0a..bdef6921a8d0 100644 --- a/pkg/sql/opt/cat/utils.go +++ b/pkg/sql/opt/cat/utils.go @@ -218,16 +218,25 @@ func formatCatalogIndex(tab Table, ord int, tp treeprinter.Node) { fmt.Fprintf(&buf, " (storing)") } + if i < idx.ImplicitPartitioningColumnCount() { + fmt.Fprintf(&buf, " (implicit)") + } + child.Child(buf.String()) } FormatZone(idx.Zone(), child) - partPrefixes := idx.PartitionByListPrefixes() - if len(partPrefixes) != 0 { - c := child.Child("partition by list prefixes") - for i := range partPrefixes { - c.Child(partPrefixes[i].String()) + if n := idx.PartitionCount(); n > 0 { + c := child.Child("partitions") + for i := 0; i < n; i++ { + p := idx.Partition(i) + part := c.Child(p.Name()) + prefixes := part.Child("partition by list prefixes") + for _, datums := range p.PartitionByListPrefixes() { + prefixes.Child(datums.String()) + } + FormatZone(p.Zone(), part) } } if n := idx.InterleaveAncestorCount(); n > 0 { diff --git a/pkg/sql/opt/cat/zone.go b/pkg/sql/opt/cat/zone.go index 06d4f3e2a829..95fcfab40924 100644 --- a/pkg/sql/opt/cat/zone.go +++ b/pkg/sql/opt/cat/zone.go @@ -24,14 +24,22 @@ type Zone interface { // ReplicaConstraintsCount returns the number of replica constraint sets that // are part of this zone. // - // TODO(aayush): Go through the callers of the methods here and decide the - // right semantics for handling the new `voter_constraints` attribute. 
+ // TODO(aayush/rytaft): Go through the callers of the methods here and decide + // the right semantics for handling the new `voter_constraints` attribute. ReplicaConstraintsCount() int // ReplicaConstraints returns the ith set of replica constraints in the zone, // where i < ReplicaConstraintsCount. ReplicaConstraints(i int) ReplicaConstraints + // VoterConstraintsCount returns the number of voter replica constraint sets + // that are part of this zone. + VoterConstraintsCount() int + + // VoterConstraint returns the ith set of voter replica constraints in the + // zone, where i < VoterConstraintsCount. + VoterConstraint(i int) ReplicaConstraints + // LeasePreferenceCount returns the number of lease preferences that are part // of this zone. LeasePreferenceCount() int @@ -90,7 +98,8 @@ type Constraint interface { // FormatZone nicely formats a catalog zone using a treeprinter for debugging // and testing. func FormatZone(zone Zone, tp treeprinter.Node) { - if zone.ReplicaConstraintsCount() == 0 && zone.LeasePreferenceCount() == 0 { + if zone.ReplicaConstraintsCount() == 0 && zone.VoterConstraintsCount() == 0 && + zone.LeasePreferenceCount() == 0 { return } zoneChild := tp.Childf("ZONE") @@ -110,6 +119,23 @@ func FormatZone(zone Zone, tp treeprinter.Node) { } } + // Voter constraints get their own node, or hang directly off ZONE when there + // is a single set; they must not be attached to the replica constraints node. + voterChild := zoneChild + if zone.VoterConstraintsCount() > 1 { + voterChild = voterChild.Childf("voter replica constraints") + } + for i, n := 0, zone.VoterConstraintsCount(); i < n; i++ { + voterConstraint := zone.VoterConstraint(i) + constraintStr := formatConstraintSet(voterConstraint) + if zone.VoterConstraintsCount() > 1 { + numReplicas := voterConstraint.ReplicaCount() + voterChild.Childf("%d voter replicas: %s", numReplicas, constraintStr) + } else { + voterChild.Childf("voter constraints: %s", constraintStr) + } + } + leaseChild := zoneChild if zone.LeasePreferenceCount() > 1 { leaseChild = leaseChild.Childf("lease preferences") diff --git a/pkg/sql/opt/testutils/testcat/create_table.go 
b/pkg/sql/opt/testutils/testcat/create_table.go index 1953df5b2e21..9d0c07c39af1 100644 --- a/pkg/sql/opt/testutils/testcat/create_table.go +++ b/pkg/sql/opt/testutils/testcat/create_table.go @@ -12,6 +12,7 @@ package testcat import ( "bytes" + "context" "fmt" "reflect" "sort" @@ -19,6 +20,7 @@ import ( "github.com/cockroachdb/cockroach/pkg/config/zonepb" "github.com/cockroachdb/cockroach/pkg/geo/geoindex" + "github.com/cockroachdb/cockroach/pkg/settings/cluster" "github.com/cockroachdb/cockroach/pkg/sql/catalog/colinfo" "github.com/cockroachdb/cockroach/pkg/sql/catalog/descpb" "github.com/cockroachdb/cockroach/pkg/sql/opt/cat" @@ -153,6 +155,11 @@ func (tc *Catalog) CreateTable(stmt *tree.CreateTable) *Table { ) tab.Columns = append(tab.Columns, mvcc) + // Cache the partitioning statement for the primary index. + if stmt.PartitionByTable != nil { + tab.partitionBy = stmt.PartitionByTable.PartitionBy + } + // Add the primary index. if hasPrimaryIndex { for _, def := range stmt.Defs { @@ -172,9 +179,6 @@ func (tc *Catalog) CreateTable(stmt *tree.CreateTable) *Table { } else { tab.addPrimaryColumnIndex("rowid") } - if stmt.PartitionByTable != nil { - tab.Indexes[0].partitionBy = stmt.PartitionByTable.PartitionBy - } // Add check constraints. for _, def := range stmt.Defs { @@ -603,10 +607,6 @@ func (tt *Table) addIndexWithVersion( version: version, } - if def.PartitionByIndex != nil { - idx.partitionBy = def.PartitionByIndex.PartitionBy - } - // Look for name suffixes indicating this is a mutation index. if name, ok := extractWriteOnlyIndex(def); ok { idx.IdxName = name @@ -663,6 +663,44 @@ func (tt *Table) addIndexWithVersion( } } + // Add partitions. 
+ var partitionBy *tree.PartitionBy + if def.PartitionByIndex != nil { + partitionBy = def.PartitionByIndex.PartitionBy + } else if typ == primaryIndex { + partitionBy = tt.partitionBy + } + if partitionBy != nil { + ctx := context.Background() + semaCtx := tree.MakeSemaContext() + evalCtx := tree.MakeTestingEvalContext(cluster.MakeTestingClusterSettings()) + + if len(partitionBy.List) > 0 { + idx.partitions = make([]Partition, len(partitionBy.List)) + for i := range partitionBy.Fields { + if i >= len(idx.Columns) || partitionBy.Fields[i] != idx.Columns[i].ColName() { + panic("partition by columns must be a prefix of the index columns") + } + } + for i := range partitionBy.List { + p := &partitionBy.List[i] + idx.partitions[i] = Partition{ + name: string(p.Name), + zone: &zonepb.ZoneConfig{}, + datums: make([]tree.Datums, 0, len(p.Exprs)), + } + + // Get the partition values. + for _, e := range p.Exprs { + d := idx.partitionByListExprToDatums(ctx, &evalCtx, &semaCtx, e) + if d != nil { + idx.partitions[i].datums = append(idx.partitions[i].datums, d) + } + } + } + } + } + if typ == primaryIndex { var pkOrdinals util.FastIntSet for _, c := range idx.Columns { @@ -946,6 +984,50 @@ func (tt *Table) addPrimaryColumnIndex(colName string) { tt.addIndex(&def, primaryIndex) } +// partitionByListExprToDatums converts an expression from a PARTITION BY LIST +// clause to a list of datums. +func (ti *Index) partitionByListExprToDatums( + ctx context.Context, evalCtx *tree.EvalContext, semaCtx *tree.SemaContext, e tree.Expr, +) tree.Datums { + var vals []tree.Expr + switch t := e.(type) { + case *tree.Tuple: + vals = t.Exprs + default: + vals = []tree.Expr{e} + } + + // Cut off at the first DEFAULT, if present. Stop scanning once truncated: the + // range bounds were fixed before the truncation, so indexing vals past its + // new length would panic, e.g. for (DEFAULT, DEFAULT). 
+ for i := range vals { + if _, ok := vals[i].(tree.DefaultVal); ok { + vals = vals[:i] + break + } + } + if len(vals) == 0 { + return nil + } + d := make(tree.Datums, len(vals)) + for i := range vals { + c := tree.CastExpr{Expr: vals[i], Type: ti.Columns[i].DatumType()} + cTyped, err := c.TypeCheck(ctx, semaCtx, types.Any) + if err != nil { + panic(err) + } + d[i], err = cTyped.Eval(evalCtx) + if err != nil { + panic(err) + } + } + + // TODO(radu): split into multiple prefixes if Subpartition is also by list. + // Note that this functionality should be kept in sync with the real catalog + // implementation (opt_catalog.go). + return d +} + func extractInaccessibleColumn(def *tree.ColumnTableDef) (name tree.Name, ok bool) { if !strings.HasSuffix(string(def.Name), ":inaccessible") { return "", false diff --git a/pkg/sql/opt/testutils/testcat/set_zone_config.go b/pkg/sql/opt/testutils/testcat/set_zone_config.go index 986e59cbb859..adccbf28cb34 100644 --- a/pkg/sql/opt/testutils/testcat/set_zone_config.go +++ b/pkg/sql/opt/testutils/testcat/set_zone_config.go @@ -26,6 +26,37 @@ func (tc *Catalog) SetZoneConfig(stmt *tree.SetZoneConfig) *zonepb.ZoneConfig { tc.qualifyTableName(&tabName) tab := tc.Table(&tabName) + // Handle the case of a zone config targeting a partition. + if stmt.TargetsPartition() { + partitionName := string(stmt.Partition) + var index *Index + if stmt.TableOrIndex.Index == "" { + // This partition is in the primary index. + index = tab.Indexes[0] + } else { + // This partition is in a secondary index. 
+ for _, idx := range tab.Indexes { + if idx.IdxName == string(stmt.TableOrIndex.Index) { + index = idx + break + } + } + } + if index == nil { + panic(fmt.Errorf("\"%q\" is not an index", stmt.TableOrIndex.Index)) + } + + for i := range index.partitions { + if index.partitions[i].name == partitionName { + index.partitions[i].zone = makeZoneConfig(stmt.Options) + return index.partitions[i].zone + } + } + panic(fmt.Errorf("\"%q\" is not a partition", stmt.Partition)) + } + + // The zone config must target an entire index. + // Handle special case of primary index. if stmt.TableOrIndex.Index == "" { tab.Indexes[0].IdxZone = makeZoneConfig(stmt.Options) @@ -55,6 +86,14 @@ func makeZoneConfig(options tree.KVOptions) *zonepb.ZoneConfig { } zone.Constraints = constraintsList.Constraints + case "voter_constraints": + constraintsList := &zonepb.ConstraintsList{} + value := options[i].Value.(*tree.StrVal).RawString() + if err := yaml.UnmarshalStrict([]byte(value), constraintsList); err != nil { + panic(err) + } + zone.VoterConstraints = constraintsList.Constraints + case "lease_preferences": value := options[i].Value.(*tree.StrVal).RawString() if err := yaml.UnmarshalStrict([]byte(value), &zone.LeasePreferences); err != nil { diff --git a/pkg/sql/opt/testutils/testcat/test_catalog.go b/pkg/sql/opt/testutils/testcat/test_catalog.go index 84d53fb65e3a..1ae7acd7f8a5 100644 --- a/pkg/sql/opt/testutils/testcat/test_catalog.go +++ b/pkg/sql/opt/testutils/testcat/test_catalog.go @@ -597,6 +597,10 @@ type Table struct { inboundFKs []ForeignKeyConstraint uniqueConstraints []UniqueConstraint + + // partitionBy is the partitioning clause that corresponds to the primary + // index. Used to initialize the partitioning for the primary index. + partitionBy *tree.PartitionBy } var _ cat.Table = &Table{} @@ -781,9 +785,9 @@ type Index struct { // table is a back reference to the table this index is on. table *Table - // partitionBy is the partitioning clause that corresponds to this index. 
Used - // to implement PartitionByListPrefixes. - partitionBy *tree.PartitionBy + // partitions stores zone information and datums for PARTITION BY LIST + // partitions. + partitions []Partition // predicate is the partial index predicate expression, if it exists. predicate string @@ -883,67 +887,6 @@ func (ti *Index) Predicate() (string, bool) { return ti.predicate, ti.predicate != "" } -// PartitionByListPrefixes is part of the cat.Index interface. -func (ti *Index) PartitionByListPrefixes() []tree.Datums { - ctx := context.Background() - p := ti.partitionBy - if p == nil { - return nil - } - if len(p.List) == 0 { - return nil - } - var res []tree.Datums - semaCtx := tree.MakeSemaContext() - evalCtx := tree.MakeTestingEvalContext(cluster.MakeTestingClusterSettings()) - for i := range p.Fields { - if i >= len(ti.Columns) || p.Fields[i] != ti.Columns[i].ColName() { - panic("partition by columns must be a prefix of the index columns") - } - } - for i := range p.List { - // Exprs contains a list of values. - for _, e := range p.List[i].Exprs { - var vals []tree.Expr - switch t := e.(type) { - case *tree.Tuple: - vals = t.Exprs - default: - vals = []tree.Expr{e} - } - - // Cut off at DEFAULT, if present. - for i := range vals { - if _, ok := vals[i].(tree.DefaultVal); ok { - vals = vals[:i] - } - } - if len(vals) == 0 { - continue - } - d := make(tree.Datums, len(vals)) - for i := range vals { - c := tree.CastExpr{Expr: vals[i], Type: ti.Columns[i].DatumType()} - cTyped, err := c.TypeCheck(ctx, &semaCtx, types.Any) - if err != nil { - panic(err) - } - d[i], err = cTyped.Eval(&evalCtx) - if err != nil { - panic(err) - } - } - - // TODO(radu): split into multiple prefixes if Subpartition is also by list. - // Note that this functionality should be kept in sync with the real catalog - // implementation (opt_catalog.go). - - res = append(res, d) - } - } - return res -} - // ImplicitPartitioningColumnCount is part of the cat.Index interface. 
func (ti *Index) ImplicitPartitioningColumnCount() int { return 0 @@ -979,6 +922,40 @@ func (ti *Index) Version() descpb.IndexDescriptorVersion { return ti.version } +// PartitionCount is part of the cat.Index interface. +func (ti *Index) PartitionCount() int { + return len(ti.partitions) +} + +// Partition is part of the cat.Index interface. +func (ti *Index) Partition(i int) cat.Partition { + return &ti.partitions[i] +} + +// Partition implements the cat.Partition interface for testing purposes. +type Partition struct { + name string + zone *zonepb.ZoneConfig + datums []tree.Datums +} + +var _ cat.Partition = &Partition{} + +// Name is part of the cat.Partition interface. +func (p *Partition) Name() string { + return p.name +} + +// Zone is part of the cat.Partition interface. +func (p *Partition) Zone() cat.Zone { + return p.zone +} + +// PartitionByListPrefixes is part of the cat.Partition interface. +func (p *Partition) PartitionByListPrefixes() []tree.Datums { + return p.datums +} + // TableStat implements the cat.TableStatistic interface for testing purposes. 
type TableStat struct { js stats.JSONStatistic diff --git a/pkg/sql/opt/testutils/testcat/testdata/table b/pkg/sql/opt/testutils/testcat/testdata/table index 58a93711d6ad..c5e4c609789b 100644 --- a/pkg/sql/opt/testutils/testcat/testdata/table +++ b/pkg/sql/opt/testutils/testcat/testdata/table @@ -110,11 +110,17 @@ TABLE part1 ├── crdb_internal_mvcc_timestamp decimal [hidden] [system] └── PRIMARY INDEX primary ├── a int not null - └── partition by list prefixes - ├── (1) - ├── (3) - ├── (4) - └── (5) + └── partitions + ├── p1 + │ └── partition by list prefixes + │ └── (1) + ├── p2 + │ └── partition by list prefixes + │ ├── (3) + │ ├── (4) + │ └── (5) + └── p3 + └── partition by list prefixes exec-ddl CREATE TABLE part2 ( @@ -145,19 +151,29 @@ TABLE part2 │ ├── a string not null │ ├── b string not null │ ├── c int not null - │ └── partition by list prefixes - │ ├── ('foo', 'bar') - │ ├── ('foo', 'baz') - │ ├── ('qux', 'qux') - │ └── ('waldo') + │ └── partitions + │ ├── p1 + │ │ └── partition by list prefixes + │ │ ├── ('foo', 'bar') + │ │ ├── ('foo', 'baz') + │ │ └── ('qux', 'qux') + │ ├── p2 + │ │ └── partition by list prefixes + │ │ └── ('waldo') + │ └── p3 + │ └── partition by list prefixes └── INDEX secondary ├── c int not null ├── a string not null ├── b string not null - └── partition by list prefixes - ├── (1) - ├── (3) - └── (4) + └── partitions + ├── pi1 + │ └── partition by list prefixes + │ └── (1) + └── pi2 + └── partition by list prefixes + ├── (3) + └── (4) exec-ddl CREATE TABLE inv ( diff --git a/pkg/sql/opt/testutils/testcat/testdata/zone b/pkg/sql/opt/testutils/testcat/testdata/zone index 4015e32d9ef5..3315e06176aa 100644 --- a/pkg/sql/opt/testutils/testcat/testdata/zone +++ b/pkg/sql/opt/testutils/testcat/testdata/zone @@ -175,3 +175,78 @@ TABLE abc ├── a int not null (storing) └── ZONE └── constraints: [+dc=west] + +exec-ddl +CREATE TABLE abc_part ( + r STRING NOT NULL CHECK (r IN ('east', 'west')), + a INT PRIMARY KEY, + b INT, + c STRING, + 
UNIQUE WITHOUT INDEX (b, c), + UNIQUE INDEX bc_idx (r, b, c) PARTITION BY LIST (r) ( + PARTITION east VALUES IN (('east')), + PARTITION west VALUES IN (('west')) + ), + INDEX b_idx (r, b) PARTITION BY LIST (r) ( + PARTITION east VALUES IN (('east')), + PARTITION west VALUES IN (('west')) + ) +) +---- + +exec-ddl +ALTER PARTITION "east" OF INDEX abc_part@bc_idx CONFIGURE ZONE USING + num_voters = 5, + voter_constraints = '{+region=east: 2}', + lease_preferences = '[[+region=east]]' +---- + +exec-ddl +ALTER PARTITION "west" OF INDEX abc_part@bc_idx CONFIGURE ZONE USING + num_voters = 5, + voter_constraints = '{+region=west: 2}', + lease_preferences = '[[+region=west]]'; +---- + +exec-ddl +SHOW CREATE abc_part +---- +TABLE abc_part + ├── r string not null + ├── a int not null + ├── b int + ├── c string + ├── crdb_internal_mvcc_timestamp decimal [hidden] [system] + ├── CHECK (r IN ('east', 'west')) + ├── PRIMARY INDEX primary + │ └── a int not null + ├── UNIQUE INDEX bc_idx + │ ├── r string not null + │ ├── b int + │ ├── c string + │ ├── a int not null (storing) + │ └── partitions + │ ├── east + │ │ ├── partition by list prefixes + │ │ │ └── ('east') + │ │ └── ZONE + │ │ ├── voter constraints: [+region=east] + │ │ └── lease preference: [+region=east] + │ └── west + │ ├── partition by list prefixes + │ │ └── ('west') + │ └── ZONE + │ ├── voter constraints: [+region=west] + │ └── lease preference: [+region=west] + ├── INDEX b_idx + │ ├── r string not null + │ ├── b int + │ ├── a int not null + │ └── partitions + │ ├── east + │ │ └── partition by list prefixes + │ │ └── ('east') + │ └── west + │ └── partition by list prefixes + │ └── ('west') + └── UNIQUE WITHOUT INDEX (b, c) diff --git a/pkg/sql/opt/xform/select_funcs.go b/pkg/sql/opt/xform/select_funcs.go index 93904be665b9..3619932aec84 100644 --- a/pkg/sql/opt/xform/select_funcs.go +++ b/pkg/sql/opt/xform/select_funcs.go @@ -636,7 +636,10 @@ func (c *CustomFuncs) partitionValuesFilters( ) (partitionFilter, 
inBetweenFilter memo.FiltersExpr) { // Find all the partition values - partitionValues := index.PartitionByListPrefixes() + partitionValues := make([]tree.Datums, 0, index.PartitionCount()) + for i, n := 0, index.PartitionCount(); i < n; i++ { + partitionValues = append(partitionValues, index.Partition(i).PartitionByListPrefixes()...) + } if len(partitionValues) == 0 { return partitionFilter, inBetweenFilter } diff --git a/pkg/sql/opt_catalog.go b/pkg/sql/opt_catalog.go index 9c10354fbfb2..ce2848793681 100644 --- a/pkg/sql/opt_catalog.go +++ b/pkg/sql/opt_catalog.go @@ -738,16 +738,25 @@ func newOptTable( idxDesc = secondaryIndexes[i-1].IndexDesc() } - // If there is a subzone that applies to the entire index, use that, - // else use the table zone. Skip subzones that apply to partitions, - // since they apply only to a subset of the index. + // If there is a subzone that applies to the entire index, use that, else + // use the table zone. Save subzones that apply to partitions, since we will + // use those later when initializing partitions in the index. idxZone := tblZone + partZones := make(map[string]*zonepb.ZoneConfig) for j := range tblZone.Subzones { subzone := &tblZone.Subzones[j] - if subzone.IndexID == uint32(idxDesc.ID) && subzone.PartitionName == "" { - copyZone := subzone.Config - copyZone.InheritFromParent(tblZone) - idxZone = &copyZone + if subzone.IndexID == uint32(idxDesc.ID) { + if subzone.PartitionName == "" { + // Subzone applies to the whole index. + copyZone := subzone.Config + copyZone.InheritFromParent(tblZone) + idxZone = &copyZone + } else { + // Subzone applies to a partition.
+ copyZone := subzone.Config + copyZone.InheritFromParent(tblZone) + partZones[subzone.PartitionName] = &copyZone + } } } if idxDesc.Type == descpb.IndexDescriptor_INVERTED { @@ -770,9 +779,9 @@ func newOptTable( false, /* nullable */ invertedSourceColOrdinal, ) - ot.indexes[i].init(ot, i, idxDesc, idxZone, virtualColOrd) + ot.indexes[i].init(ot, i, idxDesc, idxZone, partZones, virtualColOrd) } else { - ot.indexes[i].init(ot, i, idxDesc, idxZone, -1 /* virtualColOrd */) + ot.indexes[i].init(ot, i, idxDesc, idxZone, partZones, -1 /* virtualColOrd */) } // Add unique constraints for implicitly partitioned unique indexes. @@ -1108,6 +1117,10 @@ type optIndex struct { numKeyCols int numLaxKeyCols int + // partitions stores zone information and datums for PARTITION BY LIST + // partitions. + partitions []optPartition + // invertedVirtualColOrd is used if this is an inverted index; it stores the // ordinal of the virtual column created to refer to the key of this index. // It is -1 if this is not an inverted index. @@ -1123,6 +1136,7 @@ func (oi *optIndex) init( indexOrdinal int, desc *descpb.IndexDescriptor, zone *zonepb.ZoneConfig, + partZones map[string]*zonepb.ZoneConfig, invertedVirtualColOrd int, ) { oi.tab = tab @@ -1152,6 +1166,38 @@ func (oi *optIndex) init( oi.numCols = len(desc.ColumnIDs) + len(desc.ExtraColumnIDs) + len(desc.StoreColumnIDs) } + // Collect information about the partitions. + oi.partitions = make([]optPartition, len(desc.Partitioning.List)) + for i := range desc.Partitioning.List { + p := &desc.Partitioning.List[i] + oi.partitions[i] = optPartition{ + name: p.Name, + zone: &zonepb.ZoneConfig{}, + datums: make([]tree.Datums, 0, len(p.Values)), + } + + // Get the zone. + if zone, ok := partZones[p.Name]; ok { + oi.partitions[i].zone = zone + } + + // Get the partition values.
+ var a rowenc.DatumAlloc + for _, valueEncBuf := range p.Values { + t, _, err := rowenc.DecodePartitionTuple( + &a, oi.tab.codec, oi.tab.desc, oi.desc, &oi.desc.Partitioning, + valueEncBuf, nil, /* prefixDatums */ + ) + if err != nil { + panic(errors.NewAssertionErrorWithWrappedErrf(err, "while decoding partition tuple")) + } + oi.partitions[i].datums = append(oi.partitions[i].datums, t.Datums) + // TODO(radu): split into multiple prefixes if Subpartition is also by list. + // Note that this functionality should be kept in sync with the test catalog + // implementation (test_catalog.go). + } + } + if desc.Unique { notNull := true for _, id := range desc.ColumnIDs { @@ -1296,35 +1342,6 @@ func (oi *optIndex) Ordinal() int { return oi.indexOrdinal } -// PartitionByListPrefixes is part of the cat.Index interface. -func (oi *optIndex) PartitionByListPrefixes() []tree.Datums { - list := oi.desc.Partitioning.List - if len(list) == 0 { - return nil - } - res := make([]tree.Datums, 0, len(list)) - var a rowenc.DatumAlloc - for i := range list { - for _, valueEncBuf := range list[i].Values { - t, _, err := rowenc.DecodePartitionTuple( - &a, oi.tab.codec, oi.tab.desc, oi.desc, &oi.desc.Partitioning, - valueEncBuf, nil, /* prefixDatums */ - ) - if err != nil { - panic(errors.NewAssertionErrorWithWrappedErrf(err, "while decoding partition tuple")) - } - // Ignore the DEFAULT case, where there is nothing to return. - if len(t.Datums) > 0 { - res = append(res, t.Datums) - } - // TODO(radu): split into multiple prefixes if Subpartition is also by list. - // Note that this functionality should be kept in sync with the test catalog - // implementation (test_catalog.go). - } - } - return res -} - // ImplicitPartitioningColumnCount is part of the cat.Index interface. 
func (oi *optIndex) ImplicitPartitioningColumnCount() int { return int(oi.desc.Partitioning.NumImplicitColumns) @@ -1362,6 +1379,41 @@ func (oi *optIndex) Version() descpb.IndexDescriptorVersion { return oi.desc.Version } +// PartitionCount is part of the cat.Index interface. +func (oi *optIndex) PartitionCount() int { + return len(oi.partitions) +} + +// Partition is part of the cat.Index interface. +func (oi *optIndex) Partition(i int) cat.Partition { + return &oi.partitions[i] +} + +// optPartition implements cat.Partition and represents a PARTITION BY LIST +// partition of an index. +type optPartition struct { + name string + zone *zonepb.ZoneConfig + datums []tree.Datums +} + +var _ cat.Partition = &optPartition{} + +// Name is part of the cat.Partition interface. +func (op *optPartition) Name() string { + return op.name +} + +// Zone is part of the cat.Partition interface. +func (op *optPartition) Zone() cat.Zone { + return op.zone +} + +// PartitionByListPrefixes is part of the cat.Partition interface. +func (op *optPartition) PartitionByListPrefixes() []tree.Datums { + return op.datums +} + type optTableStat struct { stat *stats.TableStatistic columnOrdinals []int @@ -2042,11 +2094,6 @@ func (oi *optVirtualIndex) Ordinal() int { return oi.indexOrdinal } -// PartitionByListPrefixes is part of the cat.Index interface. -func (oi *optVirtualIndex) PartitionByListPrefixes() []tree.Datums { - return nil -} - // ImplicitPartitioningColumnCount is part of the cat.Index interface. func (oi *optVirtualIndex) ImplicitPartitioningColumnCount() int { return 0 @@ -2082,6 +2129,16 @@ func (oi *optVirtualIndex) Version() descpb.IndexDescriptorVersion { return 0 } +// PartitionCount is part of the cat.Index interface. +func (oi *optVirtualIndex) PartitionCount() int { + return 0 +} + +// Partition is part of the cat.Index interface. 
+func (oi *optVirtualIndex) Partition(i int) cat.Partition { + return nil +} + // optVirtualFamily is a dummy implementation of cat.Family for the only family // reported by a virtual table. type optVirtualFamily struct {