Skip to content

Commit 414f226

Browse files
committed
sql: support partial stats at extremes without specifying columns
This commit adds support for collecting partial statistics at extremes without specifying a column, for example: `CREATE STATISTICS my_stat FROM t USING EXTREMES`. This will collect partial stats on a default set of columns, which are all single columns that are prefixes of forward indexes, excluding partial, sharded, and implicitly partitioned indexes. This commit also modifies create stats job progress reporting to work when multiple indexes will be scanned as part of the stat collection. The i'th index scan (counting from 1) will only report progress between the fractions `(i-1)/numIndexes` and `i/numIndexes` of the total. For example, if we expect to scan 4 indexes, the index scans will report progress in the ranges 0%-25%, 25%-50%, 50%-75%, and 75%-100% respectively. It also changes the `rowsExpected` count used by the SampleAggregator and sample size computation to be the estimated number of stale rows, computed as `estimated row count * fraction of stale rows`, for partial stat collections. Closes: #127832 Release note (sql): Partial stats at extremes can now be collected on all valid columns of a table using the `CREATE STATISTICS <stat_name> FROM <table_name> USING EXTREMES` syntax, without an `ON <col_name>` clause. Valid columns are all single-column prefixes of a forward index, excluding partial, sharded, and implicitly partitioned indexes.
1 parent 0541689 commit 414f226

File tree

8 files changed

+376
-58
lines changed

8 files changed

+376
-58
lines changed

pkg/ccl/logictestccl/testdata/logic_test/multi_region_stats

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -125,3 +125,38 @@ WHERE stat->>'columns' = '["region"]'
125125
# Make sure we can still use the histogram in statistics_builder.
126126
statement ok
127127
INSERT INTO t124181 (region, a) VALUES ('ca-central-1', 2)
128+
129+
# Verify that we don't collect partial stats on LOCALITY REGIONAL BY ROW table
130+
# indexes when columns are unspecified since they are implicitly partitioned
131+
# by the region column.
132+
statement ok
133+
CREATE TABLE multiregion_pstats (a INT PRIMARY KEY, b INT) LOCALITY REGIONAL BY ROW
134+
135+
statement ok
136+
INSERT INTO multiregion_pstats (a, b, crdb_region) VALUES (1, 1, 'ap-southeast-2'), (2, 2, 'ca-central-1'), (3, 3, 'us-east-1')
137+
138+
statement ok
139+
CREATE STATISTICS multiregion_full FROM multiregion_pstats
140+
141+
statement ok
142+
INSERT INTO multiregion_pstats (a, b, crdb_region) VALUES (4, 4, 'ap-southeast-2'), (5, 5, 'ca-central-1')
143+
144+
statement ok
145+
CREATE STATISTICS multiregion_partial FROM multiregion_pstats USING EXTREMES
146+
147+
query TTIII colnames
148+
SELECT
149+
statistics_name,
150+
column_names,
151+
row_count,
152+
distinct_count,
153+
null_count
154+
FROM
155+
[SHOW STATISTICS FOR TABLE multiregion_pstats]
156+
ORDER BY statistics_name, column_names::STRING
157+
----
158+
statistics_name column_names row_count distinct_count null_count
159+
multiregion_full {a,crdb_region} 3 3 0
160+
multiregion_full {a} 3 3 0
161+
multiregion_full {b} 3 3 0
162+
multiregion_full {crdb_region} 3 3 0

pkg/sql/create_stats.go

Lines changed: 62 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,7 @@ import (
3838
"github.com/cockroachdb/cockroach/pkg/util/hlc"
3939
"github.com/cockroachdb/cockroach/pkg/util/log"
4040
"github.com/cockroachdb/cockroach/pkg/util/log/eventpb"
41+
"github.com/cockroachdb/cockroach/pkg/util/protoutil"
4142
"github.com/cockroachdb/errors"
4243
"github.com/cockroachdb/redact"
4344
)
@@ -77,7 +78,7 @@ func StubTableStats(
7778
) ([]*stats.TableStatisticProto, error) {
7879
colStats, err := createStatsDefaultColumns(
7980
context.Background(), desc, false /* virtColEnabled */, false, /* multiColEnabled */
80-
nonIndexColHistogramBuckets, nil, /* evalCtx */
81+
false /* partialStats */, nonIndexColHistogramBuckets, nil, /* evalCtx */
8182
)
8283
if err != nil {
8384
return nil, err
@@ -247,7 +248,13 @@ func (n *createStatsNode) makeJobRecord(ctx context.Context) (*jobs.Record, erro
247248
}
248249
defaultHistogramBuckets := stats.GetDefaultHistogramBuckets(n.p.ExecCfg().SV(), tableDesc)
249250
if colStats, err = createStatsDefaultColumns(
250-
ctx, tableDesc, virtColEnabled, multiColEnabled, defaultHistogramBuckets, n.p.EvalContext(),
251+
ctx,
252+
tableDesc,
253+
virtColEnabled,
254+
multiColEnabled,
255+
n.Options.UsingExtremes,
256+
defaultHistogramBuckets,
257+
n.p.EvalContext(),
251258
); err != nil {
252259
return nil, err
253260
}
@@ -364,13 +371,18 @@ const maxNonIndexCols = 100
364371
// predicate expressions are also likely to appear in query filters, so stats
365372
// are collected for those columns as well.
366373
//
374+
// If partialStats is true, we only collect statistics on single columns that
375+
// are prefixes of forward indexes, and skip over partial, sharded, and
376+
// implicitly partitioned indexes. Partial statistic creation only supports
377+
// these columns.
378+
//
367379
// In addition to the index columns, we collect stats on up to maxNonIndexCols
368380
// other columns from the table. We only collect histograms for index columns,
369381
// plus any other boolean or enum columns (where the "histogram" is tiny).
370382
func createStatsDefaultColumns(
371383
ctx context.Context,
372384
desc catalog.TableDescriptor,
373-
virtColEnabled, multiColEnabled bool,
385+
virtColEnabled, multiColEnabled, partialStats bool,
374386
defaultHistogramBuckets uint32,
375387
evalCtx *eval.Context,
376388
) ([]jobspb.CreateStatsDetails_ColStat, error) {
@@ -470,6 +482,27 @@ func createStatsDefaultColumns(
470482
return nil
471483
}
472484

485+
// Only collect statistics on single columns that are prefixes of forward
486+
// indexes for partial statistics, and skip over partial, sharded, and
487+
// implicitly partitioned indexes.
488+
if partialStats {
489+
for _, idx := range desc.ActiveIndexes() {
490+
if idx.GetType() != descpb.IndexDescriptor_FORWARD ||
491+
idx.IsPartial() ||
492+
idx.IsSharded() ||
493+
idx.ImplicitPartitioningColumnCount() > 0 {
494+
continue
495+
}
496+
if idx.NumKeyColumns() != 0 {
497+
colID := idx.GetKeyColumnID(0)
498+
if err := addIndexColumnStatsIfNotExists(colID, false /* isInverted */); err != nil {
499+
return nil, err
500+
}
501+
}
502+
}
503+
return colStats, nil
504+
}
505+
473506
// Add column stats for the primary key.
474507
primaryIdx := desc.GetPrimaryIndex()
475508
for i := 0; i < primaryIdx.NumKeyColumns(); i++ {
@@ -688,13 +721,35 @@ func (r *createStatsResumer) Resume(ctx context.Context, execCtx interface{}) er
688721
}
689722

690723
dsp := innerP.DistSQLPlanner()
691-
planCtx := dsp.NewPlanningCtx(ctx, innerEvalCtx, innerP, txn.KV(), FullDistribution)
692724
// CREATE STATS flow doesn't produce any rows and only emits the
693725
// metadata, so we can use a nil rowContainerHelper.
694726
resultWriter := NewRowResultWriter(nil /* rowContainer */)
695-
if err := dsp.planAndRunCreateStats(
696-
ctx, innerEvalCtx, planCtx, innerP.SemaCtx(), txn.KV(), r.job, resultWriter,
697-
); err != nil {
727+
728+
var err error
729+
if details.UsingExtremes {
730+
for i, colStat := range details.ColumnStats {
731+
// Plan and run partial stats on multiple columns separately since each
732+
// partial stat collection will use a different index and have different
733+
// plans.
734+
singleColDetails := protoutil.Clone(&details).(*jobspb.CreateStatsDetails)
735+
singleColDetails.ColumnStats = []jobspb.CreateStatsDetails_ColStat{colStat}
736+
planCtx := dsp.NewPlanningCtx(ctx, innerEvalCtx, innerP, txn.KV(), FullDistribution)
737+
if err = dsp.planAndRunCreateStats(
738+
ctx, innerEvalCtx, planCtx, innerP.SemaCtx(), txn.KV(), resultWriter, r.job.ID(), *singleColDetails,
739+
len(details.ColumnStats), i,
740+
); err != nil {
741+
break
742+
}
743+
}
744+
} else {
745+
planCtx := dsp.NewPlanningCtx(ctx, innerEvalCtx, innerP, txn.KV(), FullDistribution)
746+
err = dsp.planAndRunCreateStats(
747+
ctx, innerEvalCtx, planCtx, innerP.SemaCtx(), txn.KV(), resultWriter, r.job.ID(), details,
748+
1 /* numIndexes */, 0, /* curIndex */
749+
)
750+
}
751+
752+
if err != nil {
698753
// Check if this was a context canceled error and restart if it was.
699754
if grpcutil.IsContextCanceled(err) {
700755
return jobs.MarkAsRetryJobError(err)

pkg/sql/distsql_physical_planner.go

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3792,8 +3792,16 @@ func (dsp *DistSQLPlanner) createPhysPlanForPlanNode(
37923792
if err != nil {
37933793
return nil, err
37943794
}
3795+
details := record.Details.(jobspb.CreateStatsDetails)
3796+
3797+
numIndexes := 1
3798+
if details.UsingExtremes {
3799+
// Partial stats collections scan a different index for each column.
3800+
numIndexes = len(details.ColumnStats)
3801+
}
37953802
plan, err = dsp.createPlanForCreateStats(
3796-
ctx, planCtx, planCtx.planner.SemaCtx(), 0 /* jobID */, record.Details.(jobspb.CreateStatsDetails),
3803+
ctx, planCtx, planCtx.planner.SemaCtx(), 0 /* jobID */, details,
3804+
numIndexes, 0, /* curIndex */
37973805
)
37983806
}
37993807

pkg/sql/distsql_plan_stats.go

Lines changed: 38 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,6 @@ import (
1515
"math"
1616
"time"
1717

18-
"github.com/cockroachdb/cockroach/pkg/jobs"
1918
"github.com/cockroachdb/cockroach/pkg/jobs/jobspb"
2019
"github.com/cockroachdb/cockroach/pkg/kv"
2120
"github.com/cockroachdb/cockroach/pkg/settings"
@@ -160,6 +159,8 @@ func (dsp *DistSQLPlanner) createAndAttachSamplers(
160159
jobID jobspb.JobID,
161160
reqStats []requestedStat,
162161
sketchSpec, invSketchSpec []execinfrapb.SketchSpec,
162+
numIndexes int,
163+
curIndex int,
163164
) *PhysicalPlan {
164165
// Estimate the expected number of rows based on existing stats in the cache.
165166
var rowsExpected uint64
@@ -169,11 +170,20 @@ func (dsp *DistSQLPlanner) createAndAttachSamplers(
169170
overhead = autoStatsFractionStaleRowsForTable
170171
}
171172
// Convert to a signed integer first to make the linter happy.
172-
rowsExpected = uint64(int64(
173-
// The total expected number of rows is the same number that was measured
174-
// most recently, plus some overhead for possible insertions.
175-
float64(tableStats[0].RowCount) * (1 + overhead),
176-
))
173+
if details.UsingExtremes {
174+
rowsExpected = uint64(int64(
175+
// The total expected number of rows is the estimated number of stale
176+
// rows since we're only collecting stats on rows outside the bounds of
177+
// the most recent statistic.
178+
float64(tableStats[0].RowCount) * overhead,
179+
))
180+
} else {
181+
rowsExpected = uint64(int64(
182+
// The total expected number of rows is the same number that was measured
183+
// most recently, plus some overhead for possible insertions.
184+
float64(tableStats[0].RowCount) * (1 + overhead),
185+
))
186+
}
177187
}
178188

179189
// Set up the samplers.
@@ -249,6 +259,8 @@ func (dsp *DistSQLPlanner) createAndAttachSamplers(
249259
JobID: jobID,
250260
RowsExpected: rowsExpected,
251261
DeleteOtherStats: details.DeleteOtherStats,
262+
NumIndexes: uint64(numIndexes),
263+
CurIndex: uint64(curIndex),
252264
}
253265
// Plan the SampleAggregator on the gateway, unless we have a single Sampler.
254266
node := dsp.gatewaySQLInstanceID
@@ -273,14 +285,13 @@ func (dsp *DistSQLPlanner) createPartialStatsPlan(
273285
reqStats []requestedStat,
274286
jobID jobspb.JobID,
275287
details jobspb.CreateStatsDetails,
288+
numIndexes int,
289+
curIndex int,
276290
) (*PhysicalPlan, error) {
277-
278-
// Currently, we limit the number of requests for partial statistics
279-
// stats at a given point in time to 1.
280-
// TODO (faizaanmadhani): Add support for multiple distinct requested
281-
// partial stats in one job.
291+
// Partial stats collections on multiple columns create different plans,
292+
// so we only support one requested stat at a time here.
282293
if len(reqStats) > 1 {
283-
return nil, pgerror.Newf(pgcode.FeatureNotSupported, "cannot process multiple partial statistics at once")
294+
return nil, errors.AssertionFailedf("only one partial statistic can be requested at a time")
284295
}
285296

286297
reqStat := reqStats[0]
@@ -445,7 +456,8 @@ func (dsp *DistSQLPlanner) createPartialStatsPlan(
445456
sampledColumnIDs,
446457
jobID,
447458
reqStats,
448-
sketchSpec, invSketchSpec), nil
459+
sketchSpec, invSketchSpec,
460+
numIndexes, curIndex), nil
449461
}
450462

451463
func (dsp *DistSQLPlanner) createStatsPlan(
@@ -456,6 +468,8 @@ func (dsp *DistSQLPlanner) createStatsPlan(
456468
reqStats []requestedStat,
457469
jobID jobspb.JobID,
458470
details jobspb.CreateStatsDetails,
471+
numIndexes int,
472+
curIndex int,
459473
) (*PhysicalPlan, error) {
460474
if len(reqStats) == 0 {
461475
return nil, errors.New("no stats requested")
@@ -681,7 +695,8 @@ func (dsp *DistSQLPlanner) createStatsPlan(
681695
sampledColumnIDs,
682696
jobID,
683697
reqStats,
684-
sketchSpecs, invSketchSpecs), nil
698+
sketchSpecs, invSketchSpecs,
699+
numIndexes, curIndex), nil
685700
}
686701

687702
func (dsp *DistSQLPlanner) createPlanForCreateStats(
@@ -690,6 +705,8 @@ func (dsp *DistSQLPlanner) createPlanForCreateStats(
690705
semaCtx *tree.SemaContext,
691706
jobID jobspb.JobID,
692707
details jobspb.CreateStatsDetails,
708+
numIndexes int,
709+
curIndex int,
693710
) (*PhysicalPlan, error) {
694711
reqStats := make([]requestedStat, len(details.ColumnStats))
695712
histogramCollectionEnabled := stats.HistogramClusterMode.Get(&dsp.st.SV)
@@ -718,9 +735,9 @@ func (dsp *DistSQLPlanner) createPlanForCreateStats(
718735
}
719736

720737
if details.UsingExtremes {
721-
return dsp.createPartialStatsPlan(ctx, planCtx, tableDesc, reqStats, jobID, details)
738+
return dsp.createPartialStatsPlan(ctx, planCtx, tableDesc, reqStats, jobID, details, numIndexes, curIndex)
722739
}
723-
return dsp.createStatsPlan(ctx, planCtx, semaCtx, tableDesc, reqStats, jobID, details)
740+
return dsp.createStatsPlan(ctx, planCtx, semaCtx, tableDesc, reqStats, jobID, details, numIndexes, curIndex)
724741
}
725742

726743
func (dsp *DistSQLPlanner) planAndRunCreateStats(
@@ -729,13 +746,15 @@ func (dsp *DistSQLPlanner) planAndRunCreateStats(
729746
planCtx *PlanningCtx,
730747
semaCtx *tree.SemaContext,
731748
txn *kv.Txn,
732-
job *jobs.Job,
733749
resultWriter *RowResultWriter,
750+
jobId jobspb.JobID,
751+
details jobspb.CreateStatsDetails,
752+
numIndexes int,
753+
curIndex int,
734754
) error {
735755
ctx = logtags.AddTag(ctx, "create-stats-distsql", nil)
736756

737-
details := job.Details().(jobspb.CreateStatsDetails)
738-
physPlan, err := dsp.createPlanForCreateStats(ctx, planCtx, semaCtx, job.ID(), details)
757+
physPlan, err := dsp.createPlanForCreateStats(ctx, planCtx, semaCtx, jobId, details, numIndexes, curIndex)
739758
if err != nil {
740759
return err
741760
}

pkg/sql/execinfrapb/processors_table_stats.proto

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -194,4 +194,12 @@ message SampleAggregatorSpec {
194194
// If true, calculate partial table statistics on the extreme values of
195195
// the previous full table stat.
196196
optional bool using_extremes = 11 [(gogoproto.nullable) = false];
197+
198+
// The number of indexes expected to be scanned as part of this stats
199+
// collection. Used for progress reporting.
200+
optional uint64 num_indexes = 12 [(gogoproto.nullable) = false];
201+
202+
// The number of indexes that have been scanned so far. Used for progress
203+
// reporting.
204+
optional uint64 cur_index = 13 [(gogoproto.nullable) = false];
197205
}

0 commit comments

Comments
 (0)