diff --git a/pkg/planner/cardinality/testdata/cardinality_suite_out.json b/pkg/planner/cardinality/testdata/cardinality_suite_out.json index 86b8d2a56aa50..f4b64ebd7b1cd 100644 --- a/pkg/planner/cardinality/testdata/cardinality_suite_out.json +++ b/pkg/planner/cardinality/testdata/cardinality_suite_out.json @@ -24,7 +24,7 @@ { "Start": 800, "End": 900, - "Count": 735.504166655054 + "Count": 771.504166655054 }, { "Start": 900, @@ -79,7 +79,7 @@ { "Start": 800, "End": 1000, - "Count": 1193.696869573942 + "Count": 1229.696869573942 }, { "Start": 900, @@ -104,7 +104,7 @@ { "Start": 200, "End": 400, - "Count": 1237.5288209899081 + "Count": 1226.2788209899081 }, { "Start": 200, @@ -2535,6 +2535,23 @@ } ] }, + { + "github.com/pingcap/tidb/pkg/statistics.(*Histogram).EqualRowCount": [ + { + "Locate value in buckets": { + "BucketIdx": 0, + "Exceed": false, + "InBucket": false, + "MatchLastValue": false, + "Value": "KindMinNotNull " + } + }, + { + "Count": 0, + "Matched": false + } + ] + }, { "github.com/pingcap/tidb/pkg/statistics.(*Histogram).OutOfRangeRowCount": [ { @@ -2816,6 +2833,23 @@ } ] }, + { + "github.com/pingcap/tidb/pkg/statistics.(*Histogram).EqualRowCount": [ + { + "Locate value in buckets": { + "BucketIdx": 0, + "Exceed": false, + "InBucket": false, + "MatchLastValue": false, + "Value": "KindBytes \\x01" + } + }, + { + "Count": 0, + "Matched": false + } + ] + }, { "github.com/pingcap/tidb/pkg/statistics.(*Histogram).OutOfRangeRowCount": [ { @@ -3526,6 +3560,23 @@ } ] }, + { + "github.com/pingcap/tidb/pkg/statistics.(*Histogram).EqualRowCount": [ + { + "Locate value in buckets": { + "BucketIdx": 0, + "Exceed": false, + "InBucket": false, + "MatchLastValue": false, + "Value": "KindMinNotNull " + } + }, + { + "Count": 0, + "Matched": false + } + ] + }, { "github.com/pingcap/tidb/pkg/statistics.(*TopN).BetweenCount": { "Result": 0 @@ -3666,6 +3717,32 @@ } ] }, + { + "github.com/pingcap/tidb/pkg/statistics.(*Histogram).EqualRowCount": [ + { + "Locate value in 
buckets": { + "BucketIdx": 111, + "Exceed": false, + "InBucket": true, + "MatchLastValue": false, + "Value": "KindInt64 400" + } + }, + { + "Related Buckets in Histogram": [ + { + "Count": 896, + "Index": 111, + "Repeat": 1 + } + ] + }, + { + "Count": 0.99, + "Matched": false + } + ] + }, { "github.com/pingcap/tidb/pkg/statistics.(*TopN).BetweenCount": [ { @@ -3813,6 +3890,23 @@ } ] }, + { + "github.com/pingcap/tidb/pkg/statistics.(*Histogram).EqualRowCount": [ + { + "Locate value in buckets": { + "BucketIdx": 0, + "Exceed": false, + "InBucket": false, + "MatchLastValue": false, + "Value": "KindBytes \\x01" + } + }, + { + "Count": 0, + "Matched": false + } + ] + }, { "github.com/pingcap/tidb/pkg/statistics.(*TopN).BetweenCount": { "Result": 0 @@ -3942,6 +4036,32 @@ } ] }, + { + "github.com/pingcap/tidb/pkg/statistics.(*Histogram).EqualRowCount": [ + { + "Locate value in buckets": { + "BucketIdx": 111, + "Exceed": false, + "InBucket": true, + "MatchLastValue": false, + "Value": "KindBytes \\x03\\x80\\x00\\x00\\x00\\x00\\x00\\x01\\x91" + } + }, + { + "Related Buckets in Histogram": [ + { + "Count": 896, + "Index": 111, + "Repeat": 1 + } + ] + }, + { + "Count": 0.99, + "Matched": false + } + ] + }, { "github.com/pingcap/tidb/pkg/statistics.(*TopN).BetweenCount": { "Result": 0 diff --git a/pkg/statistics/histogram.go b/pkg/statistics/histogram.go index 000dd80c5be2c..7154b5ec7707c 100644 --- a/pkg/statistics/histogram.go +++ b/pkg/statistics/histogram.go @@ -593,14 +593,17 @@ func (hg *Histogram) LessRowCount(sctx context.PlanContext, value types.Datum) f func (hg *Histogram) BetweenRowCount(sctx context.PlanContext, a, b types.Datum) float64 { lessCountA := hg.LessRowCount(sctx, a) lessCountB := hg.LessRowCount(sctx, b) - // If lessCountA is not less than lessCountB, it may be that they fall to the same bucket and we cannot estimate - // the fraction, so we use `totalCount / NDV` to estimate the row count, but the result should not greater than - // lessCountB or 
notNullCount-lessCountA. - if lessCountA >= lessCountB && hg.NDV > 0 { + rangeEst := lessCountB - lessCountA + lowEqual, _ := hg.EqualRowCount(sctx, a, false) + ndvAvg := hg.NotNullCount() / float64(hg.NDV) + // If both values fall in the same bucket, the fractional result may be an underestimate. So estimate the low value (a) as an equality + // lookup, and the high value as the default NotNullCount/NDV (because the input high value may be "larger" than the true high value). If the + // range estimate is below the larger of these two, return their sum instead, capped by lessCountB and by NotNullCount minus lessCountA. + if rangeEst < math.Max(lowEqual, ndvAvg) && hg.NDV > 0 { result := math.Min(lessCountB, hg.NotNullCount()-lessCountA) - return math.Min(result, hg.NotNullCount()/float64(hg.NDV)) + return math.Min(result, lowEqual+ndvAvg) } - return lessCountB - lessCountA + return rangeEst } // TotalRowCount returns the total count of this histogram.