From 562c585ee332ac8c8c126d87a4c9e0fbac9a527a Mon Sep 17 00:00:00 2001 From: tpp Date: Tue, 30 Jul 2024 08:02:55 -0500 Subject: [PATCH 1/4] Estimate to recognize modifyCount when all TopN collected --- pkg/planner/cardinality/row_count_column.go | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/pkg/planner/cardinality/row_count_column.go b/pkg/planner/cardinality/row_count_column.go index fad100d82c358..7d627ee80aa75 100644 --- a/pkg/planner/cardinality/row_count_column.go +++ b/pkg/planner/cardinality/row_count_column.go @@ -124,7 +124,7 @@ func GetRowCountByIntColumnRanges(sctx context.PlanContext, coll *statistics.His } // equalRowCountOnColumn estimates the row count by a slice of Range and a Datum. -func equalRowCountOnColumn(sctx context.PlanContext, c *statistics.Column, val types.Datum, encodedVal []byte, realtimeRowCount int64) (result float64, err error) { +func equalRowCountOnColumn(sctx context.PlanContext, c *statistics.Column, val types.Datum, encodedVal []byte, realtimeRowCount int64, modifyCount int64) (result float64, err error) { if sctx.GetSessionVars().StmtCtx.EnableOptimizerDebugTrace { debugtrace.EnterContextCommon(sctx) debugtrace.RecordAnyValuesWithNames(sctx, "Value", val.String(), "Encoded", encodedVal) @@ -172,7 +172,10 @@ func equalRowCountOnColumn(sctx context.PlanContext, c *statistics.Column, val t // 3. use uniform distribution assumption for the rest (even when this value is not covered by the range of stats) histNDV := float64(c.Histogram.NDV - int64(c.TopN.Num())) if histNDV <= 0 { - return 0, nil + if modifyCount == 0 { + return 0, nil + } + return 1, nil } return c.Histogram.NotNullCount() / histNDV, nil } @@ -224,7 +227,7 @@ func GetColumnRowCount(sctx context.PlanContext, c *statistics.Column, ranges [] continue } var cnt float64 - cnt, err = equalRowCountOnColumn(sctx, c, lowVal, lowEncoded, realtimeRowCount) + cnt, err = equalRowCountOnColumn(sctx, c, lowVal, lowEncoded, realtimeRowCount, modifyCount) if err != nil { return 0, errors.Trace(err) } @@ -245,7 +248,7 @@ func GetColumnRowCount(sctx context.PlanContext, c *statistics.Column, ranges [] // case 2: it's a small range && using ver1 stats if rangeVals != nil { for _, val := range rangeVals { - cnt, err := equalRowCountOnColumn(sctx, c, val, lowEncoded, realtimeRowCount) + cnt, err := equalRowCountOnColumn(sctx, c, val, lowEncoded, realtimeRowCount, modifyCount) if err != nil { return 0, err } @@ -269,7 +272,7 @@ func GetColumnRowCount(sctx context.PlanContext, c *statistics.Column, ranges [] // And because we use (2, MaxValue] to represent expressions like a > 2 and use [MinNotNull, 3) to represent // expressions like b < 3, we need to exclude the special values. if rg.LowExclude && !lowVal.IsNull() && lowVal.Kind() != types.KindMaxValue && lowVal.Kind() != types.KindMinNotNull { - lowCnt, err := equalRowCountOnColumn(sctx, c, lowVal, lowEncoded, realtimeRowCount) + lowCnt, err := equalRowCountOnColumn(sctx, c, lowVal, lowEncoded, realtimeRowCount, modifyCount) if err != nil { return 0, errors.Trace(err) } @@ -280,7 +283,7 @@ func GetColumnRowCount(sctx context.PlanContext, c *statistics.Column, ranges [] cnt += float64(c.NullCount) } if !rg.HighExclude && highVal.Kind() != types.KindMaxValue && highVal.Kind() != types.KindMinNotNull { - highCnt, err := equalRowCountOnColumn(sctx, c, highVal, highEncoded, realtimeRowCount) + highCnt, err := equalRowCountOnColumn(sctx, c, highVal, highEncoded, realtimeRowCount, modifyCount) if err != nil { return 0, errors.Trace(err) } @@ -376,7 +379,7 @@ func ColumnEqualRowCount(sctx context.PlanContext, t *statistics.Table, value ty if err != nil { return 0, err } - result, err := equalRowCountOnColumn(sctx, c, value, encodedVal, t.ModifyCount) + result, err := equalRowCountOnColumn(sctx, c, value, encodedVal, t.RealtimeCount, t.ModifyCount) result *= c.GetIncreaseFactor(t.RealtimeCount) return result, errors.Trace(err) } From 4fa22d113a446bb7fb2a1a4c8b34d0a94fd2825f Mon Sep 17 00:00:00 2001 From: tpp Date: Tue, 30 Jul 2024 10:28:57 -0500 Subject: [PATCH 2/4] unit testing --- pkg/planner/cardinality/row_count_column.go | 1 + pkg/planner/cardinality/row_count_index.go | 11 ++++++++--- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/pkg/planner/cardinality/row_count_column.go b/pkg/planner/cardinality/row_count_column.go index 7d627ee80aa75..6b39112913bad 100644 --- a/pkg/planner/cardinality/row_count_column.go +++ b/pkg/planner/cardinality/row_count_column.go @@ -172,6 +172,7 @@ func equalRowCountOnColumn(sctx context.PlanContext, c *statistics.Column, val t // 3. use uniform distribution assumption for the rest (even when this value is not covered by the range of stats) histNDV := float64(c.Histogram.NDV - int64(c.TopN.Num())) if histNDV <= 0 { + // If the table hasn't been modified, it's safe to return 0. Otherwise, the TopN could be stale - return 1. if modifyCount == 0 { return 0, nil } diff --git a/pkg/planner/cardinality/row_count_index.go b/pkg/planner/cardinality/row_count_index.go index ab4ff96cf84ee..2ac71e129eab7 100644 --- a/pkg/planner/cardinality/row_count_index.go +++ b/pkg/planner/cardinality/row_count_index.go @@ -263,7 +263,7 @@ func getIndexRowCountForStatsV2(sctx context.PlanContext, idx *statistics.Index, } continue } - count = equalRowCountOnIndex(sctx, idx, lb, realtimeRowCount) + count = equalRowCountOnIndex(sctx, idx, lb, realtimeRowCount, modifyCount) // If the current table row count has changed, we should scale the row count accordingly. count *= idx.GetIncreaseFactor(realtimeRowCount) if debugTrace { @@ -363,7 +363,7 @@ func getIndexRowCountForStatsV2(sctx context.PlanContext, idx *statistics.Index, var nullKeyBytes, _ = codec.EncodeKey(time.UTC, nil, types.NewDatum(nil)) -func equalRowCountOnIndex(sctx context.PlanContext, idx *statistics.Index, b []byte, realtimeRowCount int64) (result float64) { +func equalRowCountOnIndex(sctx context.PlanContext, idx *statistics.Index, b []byte, realtimeRowCount, modifyCount int64) (result float64) { if sctx.GetSessionVars().StmtCtx.EnableOptimizerDebugTrace { debugtrace.EnterContextCommon(sctx) debugtrace.RecordAnyValuesWithNames(sctx, "Encoded Value", b) @@ -404,7 +404,12 @@ func equalRowCountOnIndex(sctx context.PlanContext, idx *statistics.Index, b []b // 3. use uniform distribution assumption for the rest (even when this value is not covered by the range of stats) histNDV := float64(idx.Histogram.NDV - int64(idx.TopN.Num())) if histNDV <= 0 { - return 0 + // If the table hasn't been modified, it's safe to return 0. Otherwise, the TopN could be stale - return 1. + if modifyCount == 0 { + return 0 + } else { + return 1 + } } return idx.Histogram.NotNullCount() / histNDV } From adbb55fd72322dbf4e7934d01ae9a26d35add462 Mon Sep 17 00:00:00 2001 From: tpp Date: Tue, 30 Jul 2024 10:33:36 -0500 Subject: [PATCH 3/4] cleanup --- pkg/planner/cardinality/row_count_column.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pkg/planner/cardinality/row_count_column.go b/pkg/planner/cardinality/row_count_column.go index 6b39112913bad..dd8fba0dd2ff3 100644 --- a/pkg/planner/cardinality/row_count_column.go +++ b/pkg/planner/cardinality/row_count_column.go @@ -124,7 +124,7 @@ func GetRowCountByIntColumnRanges(sctx context.PlanContext, coll *statistics.His } // equalRowCountOnColumn estimates the row count by a slice of Range and a Datum. -func equalRowCountOnColumn(sctx context.PlanContext, c *statistics.Column, val types.Datum, encodedVal []byte, realtimeRowCount int64, modifyCount int64) (result float64, err error) { +func equalRowCountOnColumn(sctx context.PlanContext, c *statistics.Column, val types.Datum, encodedVal []byte, realtimeRowCount, modifyCount int64) (result float64, err error) { if sctx.GetSessionVars().StmtCtx.EnableOptimizerDebugTrace { debugtrace.EnterContextCommon(sctx) debugtrace.RecordAnyValuesWithNames(sctx, "Value", val.String(), "Encoded", encodedVal) From 285d6eed5e74793597203dfa74ad1b6beaa6fc06 Mon Sep 17 00:00:00 2001 From: tpp Date: Wed, 31 Jul 2024 09:20:29 -0500 Subject: [PATCH 4/4] testcase failures --- pkg/planner/cardinality/row_count_index.go | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pkg/planner/cardinality/row_count_index.go b/pkg/planner/cardinality/row_count_index.go index 2ac71e129eab7..7f6aec9764dc6 100644 --- a/pkg/planner/cardinality/row_count_index.go +++ b/pkg/planner/cardinality/row_count_index.go @@ -407,9 +407,8 @@ func equalRowCountOnIndex(sctx context.PlanContext, idx *statistics.Index, b []b // If the table hasn't been modified, it's safe to return 0. Otherwise, the TopN could be stale - return 1. if modifyCount == 0 { return 0 - } else { - return 1 } + return 1 } return idx.Histogram.NotNullCount() / histNDV }