Merge pull request #88244 from yuzefovich/backport22.1-88083

release-22.1: stats: fix buckets for INT2 and INT4
cockroachdb · Sep 21, 2022 · 7d2106a · 7d2106a
2 parents 91701fd + 159bd62
commit 7d2106a
Show file tree

Hide file tree

Showing 3 changed files with 139 additions and 32 deletions.
diff --git a/pkg/sql/logictest/testdata/logic_test/stats b/pkg/sql/logictest/testdata/logic_test/stats
@@ -0,0 +1,39 @@
+# LogicTest: !fakedist-disk
+
+# Note that we disable the "forced disk spilling" config because the histograms
+# are dropped if the stats collection reaches the memory budget limit.
+
+# Regression test for using values outside of the range supported by the column
+# type for the histogram buckets (#76887).
+statement ok
+CREATE TABLE t (c INT2);
+
+# Insert many values so that the boundary values are likely to not be sampled.
+# Splitting the INSERT statement into two such that negative values are inserted
+# later for some reason makes it more likely that "outer" histogram buckets will
+# be needed.
+statement ok
+INSERT INTO t SELECT generate_series(1, 10000);
+INSERT INTO t SELECT generate_series(-10000, 0);
+
+statement ok
+ANALYZE t;
+
+# Get the histogram ID for column 'c'.
+let $histogram_id
+WITH h(columns, id) AS
+ (SELECT column_names, histogram_id from [SHOW STATISTICS FOR TABLE t])
+SELECT id FROM h WHERE columns = ARRAY['c'];
+
+# Run a query that verifies that minimum and maximum values of the histogram
+# buckets are exactly the boundaries of the INT2 supported range (unless -10000
+# and 10000 values were sampled).
+query B
+SELECT CASE
+ WHEN (SELECT count(*) FROM [SHOW HISTOGRAM $histogram_id]) = 2
+ THEN true -- if the sampling picked the boundary values, we're happy
+ ELSE
+ (SELECT min(upper_bound::INT) = -32768 AND max(upper_bound::INT) = 32767 FROM [SHOW HISTOGRAM $histogram_id])
+ END
+----
+true
diff --git a/pkg/sql/stats/histogram.go b/pkg/sql/stats/histogram.go
@@ -159,7 +159,7 @@ func EquiDepthHistogram(
  lowerBound = getNextLowerBound(evalCtx, upper)
  }
 
- h.adjustCounts(evalCtx, float64(numRows), float64(distinctCount))
+ h.adjustCounts(evalCtx, colType, float64(numRows), float64(distinctCount))
  histogramData, err := h.toHistogramData(colType)
  return histogramData, h.buckets, err
 }
@@ -171,7 +171,7 @@ type histogram struct {
 // adjustCounts adjusts the row count and number of distinct values per bucket
 // based on the total row count and estimated distinct count.
 func (h *histogram) adjustCounts(
- evalCtx *tree.EvalContext, rowCountTotal, distinctCountTotal float64,
+ evalCtx *tree.EvalContext, colType *types.T, rowCountTotal, distinctCountTotal float64,
 ) {
  // Calculate the current state of the histogram so we can adjust it as needed.
  // The number of rows and distinct values represented by the histogram should
@@ -274,7 +274,7 @@ func (h *histogram) adjustCounts(
  remDistinctCount = distinctCountTotal - distinctCountRange - distinctCountEq
  if remDistinctCount > 0 {
  h.addOuterBuckets(
- evalCtx, remDistinctCount, &rowCountEq, &distinctCountEq, &rowCountRange, &distinctCountRange,
+ evalCtx, colType, remDistinctCount, &rowCountEq, &distinctCountEq, &rowCountRange, &distinctCountRange,
  )
  }
 
@@ -291,39 +291,102 @@ func (h *histogram) adjustCounts(
  }
 }
 
+// getMinVal returns the value to use as the bound of a new minimum "outer"
+// bucket placed below upperBound, for a column of type t. The boolean is false
+// when no such value is available and the bucket must not be created.
+func getMinVal(upperBound tree.Datum, t *types.T, evalCtx *tree.EvalContext) (tree.Datum, bool) {
+ if t.Family() == types.IntFamily {
+ // INT2/INT4 need special handling: the datum-based path below ignores t.
+ // TODO(yuzefovich): other types might need it too, but it's less
+ // pressing to fix that.
+ bound, ok := upperBound.(*tree.DInt)
+ if !ok {
+ // Not a DInt despite IntFamily; shouldn't happen, so be defensive.
+ return nil, false
+ }
+ i := int64(*bound)
+ switch t.Width() { // widths other than 16 and 32 fall through to the generic path
+ case 16:
+ if i <= math.MinInt16 { // <= rather than ==: also skip if already at or below the INT2 minimum
+ return nil, false
+ }
+ return tree.NewDInt(tree.DInt(math.MinInt16)), true
+ case 32:
+ if i <= math.MinInt32 { // <= rather than ==: also skip if already at or below the INT4 minimum
+ return nil, false
+ }
+ return tree.NewDInt(tree.DInt(math.MinInt32)), true
+ }
+ }
+ if upperBound.IsMin(evalCtx) { // generic path: defer to the datum's own IsMin/Min
+ return nil, false
+ }
+ return upperBound.Min(evalCtx)
+}
+
+// getMaxVal returns the value to use as the bound of a new maximum "outer"
+// bucket placed above upperBound, for a column of type t. The boolean is false
+// when no such value is available and the bucket must not be created.
+func getMaxVal(upperBound tree.Datum, t *types.T, evalCtx *tree.EvalContext) (tree.Datum, bool) {
+ if t.Family() == types.IntFamily {
+ // INT2/INT4 need special handling: the datum-based path below ignores t.
+ // TODO(yuzefovich): other types might need it too, but it's less
+ // pressing to fix that.
+ bound, ok := upperBound.(*tree.DInt)
+ if !ok {
+ // Not a DInt despite IntFamily; shouldn't happen, so be defensive.
+ return nil, false
+ }
+ i := int64(*bound)
+ switch t.Width() { // widths other than 16 and 32 fall through to the generic path
+ case 16:
+ if i >= math.MaxInt16 { // >= rather than ==: also skip if already at or above the INT2 maximum
+ return nil, false
+ }
+ return tree.NewDInt(tree.DInt(math.MaxInt16)), true
+ case 32:
+ if i >= math.MaxInt32 { // >= rather than ==: also skip if already at or above the INT4 maximum
+ return nil, false
+ }
+ return tree.NewDInt(tree.DInt(math.MaxInt32)), true
+ }
+ }
+ if upperBound.IsMax(evalCtx) { // generic path: defer to the datum's own IsMax/Max
+ return nil, false
+ }
+ return upperBound.Max(evalCtx)
+}
+
 // addOuterBuckets adds buckets above and below the existing buckets in the
 // histogram to include the remaining distinct values in remDistinctCount. It
 // also increments the counters rowCountEq, distinctCountEq, rowCountRange, and
 // distinctCountRange as needed.
 func (h *histogram) addOuterBuckets(
  evalCtx *tree.EvalContext,
+ colType *types.T,
  remDistinctCount float64,
  rowCountEq, distinctCountEq, rowCountRange, distinctCountRange *float64,
 ) {
  var maxDistinctCountExtraBuckets float64
  var addedMin, addedMax bool
  var newBuckets int
- if !h.buckets[0].UpperBound.IsMin(evalCtx) {
- if minVal, ok := h.buckets[0].UpperBound.Min(evalCtx); ok {
- lowerBound := minVal
- upperBound := h.buckets[0].UpperBound
- maxDistRange, _ := maxDistinctRange(evalCtx, lowerBound, upperBound)
- maxDistinctCountExtraBuckets += maxDistRange
- h.buckets = append([]cat.HistogramBucket{{UpperBound: minVal}}, h.buckets...)
- addedMin = true
- newBuckets++
- }
- }
- if !h.buckets[len(h.buckets)-1].UpperBound.IsMax(evalCtx) {
- if maxVal, ok := h.buckets[len(h.buckets)-1].UpperBound.Max(evalCtx); ok {
- lowerBound := h.buckets[len(h.buckets)-1].UpperBound
- upperBound := maxVal
- maxDistRange, _ := maxDistinctRange(evalCtx, lowerBound, upperBound)
- maxDistinctCountExtraBuckets += maxDistRange
- h.buckets = append(h.buckets, cat.HistogramBucket{UpperBound: maxVal})
- addedMax = true
- newBuckets++
- }
+ if minVal, ok := getMinVal(h.buckets[0].UpperBound, colType, evalCtx); ok {
+ lowerBound := minVal
+ upperBound := h.buckets[0].UpperBound
+ maxDistRange, _ := maxDistinctRange(evalCtx, lowerBound, upperBound)
+ maxDistinctCountExtraBuckets += maxDistRange
+ h.buckets = append([]cat.HistogramBucket{{UpperBound: minVal}}, h.buckets...)
+ addedMin = true
+ newBuckets++
+ }
+ if maxVal, ok := getMaxVal(h.buckets[len(h.buckets)-1].UpperBound, colType, evalCtx); ok {
+ lowerBound := h.buckets[len(h.buckets)-1].UpperBound
+ upperBound := maxVal
+ maxDistRange, _ := maxDistinctRange(evalCtx, lowerBound, upperBound)
+ maxDistinctCountExtraBuckets += maxDistRange
+ h.buckets = append(h.buckets, cat.HistogramBucket{UpperBound: maxVal})
+ addedMax = true
+ newBuckets++
  }
 
  if newBuckets == 0 {
@@ -333,8 +396,7 @@ func (h *histogram) addOuterBuckets(
 
  // If this is an enum or bool histogram, increment numEq for the upper
  // bounds.
- if typFam := h.buckets[0].UpperBound.ResolvedType().Family(); typFam == types.EnumFamily ||
- typFam == types.BoolFamily {
+ if typFam := colType.Family(); typFam == types.EnumFamily || typFam == types.BoolFamily {
  if addedMin {
  h.buckets[0].NumEq++
  }
@@ -367,7 +429,7 @@ func (h *histogram) addOuterBuckets(
  maxDistRange, countable := maxDistinctRange(evalCtx, lowerBound, upperBound)
 
  inc := avgRemPerBucket
- if countable && h.buckets[0].UpperBound.ResolvedType().Family() == types.EnumFamily {
+ if countable && colType.Family() == types.EnumFamily {
  // Set the increment proportional to the remaining number of
  // distinct values in the bucket. This only really matters for
  // enums.

diff --git a/pkg/sql/stats/histogram_test.go b/pkg/sql/stats/histogram_test.go
@@ -531,7 +531,11 @@ func TestAdjustCounts(t *testing.T) {
  t.Run(fmt.Sprintf("%d", i), func(t *testing.T) {
  actual := histogram{buckets: make([]cat.HistogramBucket, len(tc.h))}
  copy(actual.buckets, tc.h)
- actual.adjustCounts(&evalCtx, tc.rowCount, tc.distinctCount)
+ colType := types.Int
+ if len(tc.h) > 0 {
+ colType = tc.h[0].UpperBound.ResolvedType()
+ }
+ actual.adjustCounts(&evalCtx, colType, tc.rowCount, tc.distinctCount)
  roundHistogram(&actual)
  if !reflect.DeepEqual(actual.buckets, tc.expected) {
  t.Fatalf("expected %v but found %v", tc.expected, actual.buckets)
@@ -541,14 +545,15 @@ func TestAdjustCounts(t *testing.T) {
 
  t.Run("random", func(t *testing.T) {
  // randHist returns a random histogram with anywhere from 1-200 buckets.
- randHist := func() histogram {
+ randHist := func() (histogram, *types.T) {
  numBuckets := rand.Intn(200) + 1
  buckets := make([]cat.HistogramBucket, numBuckets)
  ub := rand.Intn(100000000)
  // Half the time, make it negative.
  if rand.Intn(2) == 0 {
  ub = -ub
  }
+ colType := types.Int
  buckets[0].UpperBound = tree.NewDInt(tree.DInt(ub))
  buckets[0].NumEq = float64(rand.Intn(1000)) + 1
  for i := 1; i < len(buckets); i++ {
@@ -561,17 +566,18 @@ func TestAdjustCounts(t *testing.T) {
  }
  // Half the time, use floats instead of ints.
  if rand.Intn(2) == 0 {
+ colType = types.Float
  for i := range buckets {
  buckets[i].UpperBound = tree.NewDFloat(tree.DFloat(*buckets[i].UpperBound.(*tree.DInt)))
  }
  }
- return histogram{buckets: buckets}
+ return histogram{buckets: buckets}, colType
  }
 
  // Create 100 random histograms, and check that we can correctly adjust the
  // counts to match a random row count and distinct count.
  for trial := 0; trial < 100; trial++ {
- h := randHist()
+ h, colType := randHist()
  rowCount := rand.Intn(1000000)
  distinctCount := rand.Intn(rowCount + 1)
 
@@ -581,7 +587,7 @@ func TestAdjustCounts(t *testing.T) {
  distinctCount = max(distinctCount, len(h.buckets))
 
  // Adjust the counts in the histogram to match the provided counts.
- h.adjustCounts(&evalCtx, float64(rowCount), float64(distinctCount))
+ h.adjustCounts(&evalCtx, colType, float64(rowCount), float64(distinctCount))
 
  // Check that the resulting histogram is valid.
  if h.buckets[0].NumRange > 0 || h.buckets[0].DistinctRange > 0 {