Skip to content

Commit

Permalink
Merge pull request #88244 from yuzefovich/backport22.1-88083
Browse files Browse the repository at this point in the history
release-22.1: stats: fix buckets for INT2 and INT4
  • Loading branch information
yuzefovich authored Sep 21, 2022
2 parents 91701fd + 159bd62 commit 7d2106a
Show file tree
Hide file tree
Showing 3 changed files with 139 additions and 32 deletions.
39 changes: 39 additions & 0 deletions pkg/sql/logictest/testdata/logic_test/stats
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
# LogicTest: !fakedist-disk

# Note that we disable the "forced disk spilling" config because the histograms
# are dropped if the stats collection reaches the memory budget limit.

# Regression test for using values outside of the range supported by the column
# type for the histogram buckets (#76887).
statement ok
CREATE TABLE t (c INT2);

# Insert many values so that the boundary values are likely to not be sampled.
# For some reason, splitting the INSERT into two statements so that the negative
# values are inserted later makes it more likely that "outer" histogram buckets
# will be needed.
statement ok
INSERT INTO t SELECT generate_series(1, 10000);
INSERT INTO t SELECT generate_series(-10000, 0);

statement ok
ANALYZE t;

# Get the histogram ID for column 'c'.
let $histogram_id
WITH h(columns, id) AS
(SELECT column_names, histogram_id from [SHOW STATISTICS FOR TABLE t])
SELECT id FROM h WHERE columns = ARRAY['c'];

# Run a query that verifies that minimum and maximum values of the histogram
# buckets are exactly the boundaries of the INT2 supported range (unless -10000
# and 10000 values were sampled).
query B
SELECT CASE
WHEN (SELECT count(*) FROM [SHOW HISTOGRAM $histogram_id]) = 2
THEN true -- if the sampling picked the boundary values, we're happy
ELSE
(SELECT min(upper_bound::INT) = -32768 AND max(upper_bound::INT) = 32767 FROM [SHOW HISTOGRAM $histogram_id])
END
----
true
116 changes: 89 additions & 27 deletions pkg/sql/stats/histogram.go
Original file line number Diff line number Diff line change
Expand Up @@ -159,7 +159,7 @@ func EquiDepthHistogram(
lowerBound = getNextLowerBound(evalCtx, upper)
}

h.adjustCounts(evalCtx, float64(numRows), float64(distinctCount))
h.adjustCounts(evalCtx, colType, float64(numRows), float64(distinctCount))
histogramData, err := h.toHistogramData(colType)
return histogramData, h.buckets, err
}
Expand All @@ -171,7 +171,7 @@ type histogram struct {
// adjustCounts adjusts the row count and number of distinct values per bucket
// based on the total row count and estimated distinct count.
func (h *histogram) adjustCounts(
evalCtx *tree.EvalContext, rowCountTotal, distinctCountTotal float64,
evalCtx *tree.EvalContext, colType *types.T, rowCountTotal, distinctCountTotal float64,
) {
// Calculate the current state of the histogram so we can adjust it as needed.
// The number of rows and distinct values represented by the histogram should
Expand Down Expand Up @@ -274,7 +274,7 @@ func (h *histogram) adjustCounts(
remDistinctCount = distinctCountTotal - distinctCountRange - distinctCountEq
if remDistinctCount > 0 {
h.addOuterBuckets(
evalCtx, remDistinctCount, &rowCountEq, &distinctCountEq, &rowCountRange, &distinctCountRange,
evalCtx, colType, remDistinctCount, &rowCountEq, &distinctCountEq, &rowCountRange, &distinctCountRange,
)
}

Expand All @@ -291,39 +291,102 @@ func (h *histogram) adjustCounts(
}
}

// getMinVal picks the upper bound to use for a new minimum "outer" bucket that
// would be placed below upperBound (the upper bound of the current first
// bucket). The returned boolean is false when no such value exists, in which
// case no bucket should be added.
//
// For integer columns of width 16 or 32 the result is the smallest value
// representable in that width, so the new bucket never falls outside of the
// range supported by the column type. All other types (including integers of
// any other width) defer to the datum's own IsMin/Min methods.
func getMinVal(upperBound tree.Datum, t *types.T, evalCtx *tree.EvalContext) (tree.Datum, bool) {
	if t.Family() == types.IntFamily {
		// TODO(yuzefovich): other types might need width-aware handling too,
		// but it's less pressing to fix that.
		intBound, isInt := upperBound.(*tree.DInt)
		if !isInt {
			// An integer column is not expected to carry a non-integer bound;
			// be defensive and skip the bucket.
			return nil, false
		}
		current := int64(*intBound)

		// Determine the lower limit of the narrow integer types. Widths other
		// than 16 and 32 fall through to the generic handling below.
		var limit int64
		narrow := true
		switch t.Width() {
		case 16:
			limit = math.MinInt16
		case 32:
			limit = math.MinInt32
		default:
			narrow = false
		}
		if narrow {
			// Use an inequality (rather than equality) to be conservative: a
			// bound at or below the limit leaves no room for another bucket.
			if current <= limit {
				return nil, false
			}
			return tree.NewDInt(tree.DInt(limit)), true
		}
	}

	// Generic path: there is nothing below a bound that is already the minimum.
	if upperBound.IsMin(evalCtx) {
		return nil, false
	}
	return upperBound.Min(evalCtx)
}

// getMaxVal picks the upper bound to use for a new maximum "outer" bucket that
// would be placed above upperBound (the upper bound of the current last
// bucket). The returned boolean is false when no such value exists, in which
// case no bucket should be added.
//
// For integer columns of width 16 or 32 the result is the largest value
// representable in that width, so the new bucket never falls outside of the
// range supported by the column type. All other types (including integers of
// any other width) defer to the datum's own IsMax/Max methods.
func getMaxVal(upperBound tree.Datum, t *types.T, evalCtx *tree.EvalContext) (tree.Datum, bool) {
	if t.Family() == types.IntFamily {
		// TODO(yuzefovich): other types might need width-aware handling too,
		// but it's less pressing to fix that.
		intBound, isInt := upperBound.(*tree.DInt)
		if !isInt {
			// An integer column is not expected to carry a non-integer bound;
			// be defensive and skip the bucket.
			return nil, false
		}
		current := int64(*intBound)

		// Determine the upper limit of the narrow integer types. Widths other
		// than 16 and 32 fall through to the generic handling below.
		var limit int64
		narrow := true
		switch t.Width() {
		case 16:
			limit = math.MaxInt16
		case 32:
			limit = math.MaxInt32
		default:
			narrow = false
		}
		if narrow {
			// Use an inequality (rather than equality) to be conservative: a
			// bound at or above the limit leaves no room for another bucket.
			if current >= limit {
				return nil, false
			}
			return tree.NewDInt(tree.DInt(limit)), true
		}
	}

	// Generic path: there is nothing above a bound that is already the maximum.
	if upperBound.IsMax(evalCtx) {
		return nil, false
	}
	return upperBound.Max(evalCtx)
}

// addOuterBuckets adds buckets above and below the existing buckets in the
// histogram to include the remaining distinct values in remDistinctCount. It
// also increments the counters rowCountEq, distinctCountEq, rowCountRange, and
// distinctCountRange as needed.
func (h *histogram) addOuterBuckets(
evalCtx *tree.EvalContext,
colType *types.T,
remDistinctCount float64,
rowCountEq, distinctCountEq, rowCountRange, distinctCountRange *float64,
) {
var maxDistinctCountExtraBuckets float64
var addedMin, addedMax bool
var newBuckets int
if !h.buckets[0].UpperBound.IsMin(evalCtx) {
if minVal, ok := h.buckets[0].UpperBound.Min(evalCtx); ok {
lowerBound := minVal
upperBound := h.buckets[0].UpperBound
maxDistRange, _ := maxDistinctRange(evalCtx, lowerBound, upperBound)
maxDistinctCountExtraBuckets += maxDistRange
h.buckets = append([]cat.HistogramBucket{{UpperBound: minVal}}, h.buckets...)
addedMin = true
newBuckets++
}
}
if !h.buckets[len(h.buckets)-1].UpperBound.IsMax(evalCtx) {
if maxVal, ok := h.buckets[len(h.buckets)-1].UpperBound.Max(evalCtx); ok {
lowerBound := h.buckets[len(h.buckets)-1].UpperBound
upperBound := maxVal
maxDistRange, _ := maxDistinctRange(evalCtx, lowerBound, upperBound)
maxDistinctCountExtraBuckets += maxDistRange
h.buckets = append(h.buckets, cat.HistogramBucket{UpperBound: maxVal})
addedMax = true
newBuckets++
}
if minVal, ok := getMinVal(h.buckets[0].UpperBound, colType, evalCtx); ok {
lowerBound := minVal
upperBound := h.buckets[0].UpperBound
maxDistRange, _ := maxDistinctRange(evalCtx, lowerBound, upperBound)
maxDistinctCountExtraBuckets += maxDistRange
h.buckets = append([]cat.HistogramBucket{{UpperBound: minVal}}, h.buckets...)
addedMin = true
newBuckets++
}
if maxVal, ok := getMaxVal(h.buckets[len(h.buckets)-1].UpperBound, colType, evalCtx); ok {
lowerBound := h.buckets[len(h.buckets)-1].UpperBound
upperBound := maxVal
maxDistRange, _ := maxDistinctRange(evalCtx, lowerBound, upperBound)
maxDistinctCountExtraBuckets += maxDistRange
h.buckets = append(h.buckets, cat.HistogramBucket{UpperBound: maxVal})
addedMax = true
newBuckets++
}

if newBuckets == 0 {
Expand All @@ -333,8 +396,7 @@ func (h *histogram) addOuterBuckets(

// If this is an enum or bool histogram, increment numEq for the upper
// bounds.
if typFam := h.buckets[0].UpperBound.ResolvedType().Family(); typFam == types.EnumFamily ||
typFam == types.BoolFamily {
if typFam := colType.Family(); typFam == types.EnumFamily || typFam == types.BoolFamily {
if addedMin {
h.buckets[0].NumEq++
}
Expand Down Expand Up @@ -367,7 +429,7 @@ func (h *histogram) addOuterBuckets(
maxDistRange, countable := maxDistinctRange(evalCtx, lowerBound, upperBound)

inc := avgRemPerBucket
if countable && h.buckets[0].UpperBound.ResolvedType().Family() == types.EnumFamily {
if countable && colType.Family() == types.EnumFamily {
// Set the increment proportional to the remaining number of
// distinct values in the bucket. This only really matters for
// enums.
Expand Down
16 changes: 11 additions & 5 deletions pkg/sql/stats/histogram_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -531,7 +531,11 @@ func TestAdjustCounts(t *testing.T) {
t.Run(fmt.Sprintf("%d", i), func(t *testing.T) {
actual := histogram{buckets: make([]cat.HistogramBucket, len(tc.h))}
copy(actual.buckets, tc.h)
actual.adjustCounts(&evalCtx, tc.rowCount, tc.distinctCount)
colType := types.Int
if len(tc.h) > 0 {
colType = tc.h[0].UpperBound.ResolvedType()
}
actual.adjustCounts(&evalCtx, colType, tc.rowCount, tc.distinctCount)
roundHistogram(&actual)
if !reflect.DeepEqual(actual.buckets, tc.expected) {
t.Fatalf("expected %v but found %v", tc.expected, actual.buckets)
Expand All @@ -541,14 +545,15 @@ func TestAdjustCounts(t *testing.T) {

t.Run("random", func(t *testing.T) {
// randHist returns a random histogram with anywhere from 1-200 buckets.
randHist := func() histogram {
randHist := func() (histogram, *types.T) {
numBuckets := rand.Intn(200) + 1
buckets := make([]cat.HistogramBucket, numBuckets)
ub := rand.Intn(100000000)
// Half the time, make it negative.
if rand.Intn(2) == 0 {
ub = -ub
}
colType := types.Int
buckets[0].UpperBound = tree.NewDInt(tree.DInt(ub))
buckets[0].NumEq = float64(rand.Intn(1000)) + 1
for i := 1; i < len(buckets); i++ {
Expand All @@ -561,17 +566,18 @@ func TestAdjustCounts(t *testing.T) {
}
// Half the time, use floats instead of ints.
if rand.Intn(2) == 0 {
colType = types.Float
for i := range buckets {
buckets[i].UpperBound = tree.NewDFloat(tree.DFloat(*buckets[i].UpperBound.(*tree.DInt)))
}
}
return histogram{buckets: buckets}
return histogram{buckets: buckets}, colType
}

// Create 100 random histograms, and check that we can correctly adjust the
// counts to match a random row count and distinct count.
for trial := 0; trial < 100; trial++ {
h := randHist()
h, colType := randHist()
rowCount := rand.Intn(1000000)
distinctCount := rand.Intn(rowCount + 1)

Expand All @@ -581,7 +587,7 @@ func TestAdjustCounts(t *testing.T) {
distinctCount = max(distinctCount, len(h.buckets))

// Adjust the counts in the histogram to match the provided counts.
h.adjustCounts(&evalCtx, float64(rowCount), float64(distinctCount))
h.adjustCounts(&evalCtx, colType, float64(rowCount), float64(distinctCount))

// Check that the resulting histogram is valid.
if h.buckets[0].NumRange > 0 || h.buckets[0].DistinctRange > 0 {
Expand Down

0 comments on commit 7d2106a

Please sign in to comment.