diff --git a/pkg/sql/logictest/testdata/logic_test/stats b/pkg/sql/logictest/testdata/logic_test/stats new file mode 100644 index 000000000000..6d1fb6385b3b --- /dev/null +++ b/pkg/sql/logictest/testdata/logic_test/stats @@ -0,0 +1,39 @@ +# LogicTest: !fakedist-disk + +# Note that we disable the "forced disk spilling" config because the histograms +# are dropped if the stats collection reaches the memory budget limit. + +# Regression test for using values outside of the range supported by the column +# type for the histogram buckets (#76887). +statement ok +CREATE TABLE t (c INT2); + +# Insert many values so that the boundary values are likely to not be sampled. +# Splitting the INSERT statement into two such that negative values are inserted +# later for some reason makes it more likely that "outer" histogram buckets will +# be needed. +statement ok +INSERT INTO t SELECT generate_series(1, 10000); +INSERT INTO t SELECT generate_series(-10000, 0); + +statement ok +ANALYZE t; + +# Get the histogram ID for column 'c'. +let $histogram_id +WITH h(columns, id) AS + (SELECT column_names, histogram_id from [SHOW STATISTICS FOR TABLE t]) +SELECT id FROM h WHERE columns = ARRAY['c']; + +# Run a query that verifies that minimum and maximum values of the histogram +# buckets are exactly the boundaries of the INT2 supported range (unless -10000 +# and 10000 values were sampled). +query B +SELECT CASE + WHEN (SELECT count(*) FROM [SHOW HISTOGRAM $histogram_id]) = 2 + THEN true -- if the sampling picked the boundary values, we're happy + ELSE + (SELECT min(upper_bound::INT) = -32768 AND max(upper_bound::INT) = 32767 FROM [SHOW HISTOGRAM $histogram_id]) + END +---- +true diff --git a/pkg/sql/stats/histogram.go b/pkg/sql/stats/histogram.go index 65b52eefb28c..da0f91492b57 100644 --- a/pkg/sql/stats/histogram.go +++ b/pkg/sql/stats/histogram.go @@ -159,7 +159,7 @@ func EquiDepthHistogram( lowerBound = getNextLowerBound(evalCtx, upper) } - h.adjustCounts(evalCtx, float64(numRows), float64(distinctCount)) + h.adjustCounts(evalCtx, colType, float64(numRows), float64(distinctCount)) histogramData, err := h.toHistogramData(colType) return histogramData, h.buckets, err } @@ -171,7 +171,7 @@ type histogram struct { // adjustCounts adjusts the row count and number of distinct values per bucket // based on the total row count and estimated distinct count. func (h *histogram) adjustCounts( - evalCtx *tree.EvalContext, rowCountTotal, distinctCountTotal float64, + evalCtx *tree.EvalContext, colType *types.T, rowCountTotal, distinctCountTotal float64, ) { // Calculate the current state of the histogram so we can adjust it as needed. // The number of rows and distinct values represented by the histogram should @@ -274,7 +274,7 @@ func (h *histogram) adjustCounts( remDistinctCount = distinctCountTotal - distinctCountRange - distinctCountEq if remDistinctCount > 0 { h.addOuterBuckets( - evalCtx, remDistinctCount, &rowCountEq, &distinctCountEq, &rowCountRange, &distinctCountRange, + evalCtx, colType, remDistinctCount, &rowCountEq, &distinctCountEq, &rowCountRange, &distinctCountRange, ) } @@ -291,39 +291,102 @@ func (h *histogram) adjustCounts( } } +// getMinVal returns the minimum value for the minimum "outer" bucket if the +// value exists. The boolean indicates whether it exists and the bucket needs to +// be created. +func getMinVal(upperBound tree.Datum, t *types.T, evalCtx *tree.EvalContext) (tree.Datum, bool) { + if t.Family() == types.IntFamily { + // INT2 and INT4 require special handling. + // TODO(yuzefovich): other types might need it too, but it's less + // pressing to fix that. + bound, ok := upperBound.(*tree.DInt) + if !ok { + // This shouldn't happen, but we want to be defensive. + return nil, false + } + i := int64(*bound) + switch t.Width() { + case 16: + if i <= math.MinInt16 { // use inequality to be conservative + return nil, false + } + return tree.NewDInt(tree.DInt(math.MinInt16)), true + case 32: + if i <= math.MinInt32 { // use inequality to be conservative + return nil, false + } + return tree.NewDInt(tree.DInt(math.MinInt32)), true + } + } + if upperBound.IsMin(evalCtx) { + return nil, false + } + return upperBound.Min(evalCtx) +} + +// getMaxVal returns the maximum value for the maximum "outer" bucket if the +// value exists. The boolean indicates whether it exists and the bucket needs to +// be created. +func getMaxVal(upperBound tree.Datum, t *types.T, evalCtx *tree.EvalContext) (tree.Datum, bool) { + if t.Family() == types.IntFamily { + // INT2 and INT4 require special handling. + // TODO(yuzefovich): other types might need it too, but it's less + // pressing to fix that. + bound, ok := upperBound.(*tree.DInt) + if !ok { + // This shouldn't happen, but we want to be defensive. + return nil, false + } + i := int64(*bound) + switch t.Width() { + case 16: + if i >= math.MaxInt16 { // use inequality to be conservative + return nil, false + } + return tree.NewDInt(tree.DInt(math.MaxInt16)), true + case 32: + if i >= math.MaxInt32 { // use inequality to be conservative + return nil, false + } + return tree.NewDInt(tree.DInt(math.MaxInt32)), true + } + } + if upperBound.IsMax(evalCtx) { + return nil, false + } + return upperBound.Max(evalCtx) +} + // addOuterBuckets adds buckets above and below the existing buckets in the // histogram to include the remaining distinct values in remDistinctCount. It // also increments the counters rowCountEq, distinctCountEq, rowCountRange, and // distinctCountRange as needed. func (h *histogram) addOuterBuckets( evalCtx *tree.EvalContext, + colType *types.T, remDistinctCount float64, rowCountEq, distinctCountEq, rowCountRange, distinctCountRange *float64, ) { var maxDistinctCountExtraBuckets float64 var addedMin, addedMax bool var newBuckets int - if !h.buckets[0].UpperBound.IsMin(evalCtx) { - if minVal, ok := h.buckets[0].UpperBound.Min(evalCtx); ok { - lowerBound := minVal - upperBound := h.buckets[0].UpperBound - maxDistRange, _ := maxDistinctRange(evalCtx, lowerBound, upperBound) - maxDistinctCountExtraBuckets += maxDistRange - h.buckets = append([]cat.HistogramBucket{{UpperBound: minVal}}, h.buckets...) - addedMin = true - newBuckets++ - } - } - if !h.buckets[len(h.buckets)-1].UpperBound.IsMax(evalCtx) { - if maxVal, ok := h.buckets[len(h.buckets)-1].UpperBound.Max(evalCtx); ok { - lowerBound := h.buckets[len(h.buckets)-1].UpperBound - upperBound := maxVal - maxDistRange, _ := maxDistinctRange(evalCtx, lowerBound, upperBound) - maxDistinctCountExtraBuckets += maxDistRange - h.buckets = append(h.buckets, cat.HistogramBucket{UpperBound: maxVal}) - addedMax = true - newBuckets++ - } + if minVal, ok := getMinVal(h.buckets[0].UpperBound, colType, evalCtx); ok { + lowerBound := minVal + upperBound := h.buckets[0].UpperBound + maxDistRange, _ := maxDistinctRange(evalCtx, lowerBound, upperBound) + maxDistinctCountExtraBuckets += maxDistRange + h.buckets = append([]cat.HistogramBucket{{UpperBound: minVal}}, h.buckets...) + addedMin = true + newBuckets++ + } + if maxVal, ok := getMaxVal(h.buckets[len(h.buckets)-1].UpperBound, colType, evalCtx); ok { + lowerBound := h.buckets[len(h.buckets)-1].UpperBound + upperBound := maxVal + maxDistRange, _ := maxDistinctRange(evalCtx, lowerBound, upperBound) + maxDistinctCountExtraBuckets += maxDistRange + h.buckets = append(h.buckets, cat.HistogramBucket{UpperBound: maxVal}) + addedMax = true + newBuckets++ } if newBuckets == 0 { @@ -333,8 +396,7 @@ func (h *histogram) addOuterBuckets( // If this is an enum or bool histogram, increment numEq for the upper // bounds. - if typFam := h.buckets[0].UpperBound.ResolvedType().Family(); typFam == types.EnumFamily || - typFam == types.BoolFamily { + if typFam := colType.Family(); typFam == types.EnumFamily || typFam == types.BoolFamily { if addedMin { h.buckets[0].NumEq++ } @@ -367,7 +429,7 @@ func (h *histogram) addOuterBuckets( maxDistRange, countable := maxDistinctRange(evalCtx, lowerBound, upperBound) inc := avgRemPerBucket - if countable && h.buckets[0].UpperBound.ResolvedType().Family() == types.EnumFamily { + if countable && colType.Family() == types.EnumFamily { // Set the increment proportional to the remaining number of // distinct values in the bucket. This only really matters for // enums. diff --git a/pkg/sql/stats/histogram_test.go b/pkg/sql/stats/histogram_test.go index f9b83331874a..48dda440a8e7 100644 --- a/pkg/sql/stats/histogram_test.go +++ b/pkg/sql/stats/histogram_test.go @@ -531,7 +531,11 @@ func TestAdjustCounts(t *testing.T) { t.Run(fmt.Sprintf("%d", i), func(t *testing.T) { actual := histogram{buckets: make([]cat.HistogramBucket, len(tc.h))} copy(actual.buckets, tc.h) - actual.adjustCounts(&evalCtx, tc.rowCount, tc.distinctCount) + colType := types.Int + if len(tc.h) > 0 { + colType = tc.h[0].UpperBound.ResolvedType() + } + actual.adjustCounts(&evalCtx, colType, tc.rowCount, tc.distinctCount) roundHistogram(&actual) if !reflect.DeepEqual(actual.buckets, tc.expected) { t.Fatalf("expected %v but found %v", tc.expected, actual.buckets) @@ -541,7 +545,7 @@ func TestAdjustCounts(t *testing.T) { t.Run("random", func(t *testing.T) { // randHist returns a random histogram with anywhere from 1-200 buckets. - randHist := func() histogram { + randHist := func() (histogram, *types.T) { numBuckets := rand.Intn(200) + 1 buckets := make([]cat.HistogramBucket, numBuckets) ub := rand.Intn(100000000) @@ -549,6 +553,7 @@ func TestAdjustCounts(t *testing.T) { if rand.Intn(2) == 0 { ub = -ub } + colType := types.Int buckets[0].UpperBound = tree.NewDInt(tree.DInt(ub)) buckets[0].NumEq = float64(rand.Intn(1000)) + 1 for i := 1; i < len(buckets); i++ { @@ -561,17 +566,18 @@ func TestAdjustCounts(t *testing.T) { } // Half the time, use floats instead of ints. if rand.Intn(2) == 0 { + colType = types.Float for i := range buckets { buckets[i].UpperBound = tree.NewDFloat(tree.DFloat(*buckets[i].UpperBound.(*tree.DInt))) } } - return histogram{buckets: buckets} + return histogram{buckets: buckets}, colType } // Create 100 random histograms, and check that we can correctly adjust the // counts to match a random row count and distinct count. for trial := 0; trial < 100; trial++ { - h := randHist() + h, colType := randHist() rowCount := rand.Intn(1000000) distinctCount := rand.Intn(rowCount + 1) @@ -581,7 +587,7 @@ func TestAdjustCounts(t *testing.T) { distinctCount = max(distinctCount, len(h.buckets)) // Adjust the counts in the histogram to match the provided counts. - h.adjustCounts(&evalCtx, float64(rowCount), float64(distinctCount)) + h.adjustCounts(&evalCtx, colType, float64(rowCount), float64(distinctCount)) // Check that the resulting histogram is valid. if h.buckets[0].NumRange > 0 || h.buckets[0].DistinctRange > 0 {