From 5bca03700e3e2f295a409f9905a3e3a27627154d Mon Sep 17 00:00:00 2001 From: Ti Chi Robot Date: Mon, 25 Nov 2024 23:34:41 +0800 Subject: [PATCH] planner: Use realtimeRowCount when all topN collected (#56848) (#57689) close pingcap/tidb#47400 --- pkg/planner/cardinality/BUILD.bazel | 2 +- pkg/planner/cardinality/row_count_column.go | 21 ++++++++- pkg/planner/cardinality/row_count_index.go | 19 +++++++- pkg/planner/cardinality/selectivity_test.go | 50 +++++++++++++++++++++ 4 files changed, 87 insertions(+), 5 deletions(-) diff --git a/pkg/planner/cardinality/BUILD.bazel b/pkg/planner/cardinality/BUILD.bazel index 0c5f0dc481ea8..36657805f4bce 100644 --- a/pkg/planner/cardinality/BUILD.bazel +++ b/pkg/planner/cardinality/BUILD.bazel @@ -59,7 +59,7 @@ go_test( data = glob(["testdata/**"]), embed = [":cardinality"], flaky = True, - shard_count = 28, + shard_count = 29, deps = [ "//pkg/config", "//pkg/domain", diff --git a/pkg/planner/cardinality/row_count_column.go b/pkg/planner/cardinality/row_count_column.go index 5e6d75bf19a74..fa1d8364ed6e3 100644 --- a/pkg/planner/cardinality/row_count_column.go +++ b/pkg/planner/cardinality/row_count_column.go @@ -15,6 +15,8 @@ package cardinality import ( + "math" + "github.com/pingcap/errors" "github.com/pingcap/tidb/pkg/planner/planctx" "github.com/pingcap/tidb/pkg/planner/util/debugtrace" @@ -173,12 +175,27 @@ func equalRowCountOnColumn(sctx planctx.PlanContext, c *statistics.Column, val t // 3. use uniform distribution assumption for the rest (even when this value is not covered by the range of stats) histNDV := float64(c.Histogram.NDV - int64(c.TopN.Num())) if histNDV <= 0 { - // If the table hasn't been modified, it's safe to return 0. Otherwise, the TopN could be stale - return 1. + // If histNDV is zero - we have all NDV's in TopN - and no histograms. This function uses + // c.NotNullCount rather than c.Histogram.NotNullCount() since the histograms are empty. 
+ // + // If the table hasn't been modified, it's safe to return 0. if modifyCount == 0 { return 0, nil } - return 1, nil + // ELSE calculate an approximate estimate based upon newly inserted rows. + // + // Reset to the original NDV, or if no NDV - derive an NDV using sqrt + if c.Histogram.NDV > 0 { + histNDV = float64(c.Histogram.NDV) + } else { + histNDV = math.Sqrt(max(c.NotNullCount(), float64(realtimeRowCount))) + } + // As a conservative estimate - take the smaller of the original totalRows or the additions. + // "realtimeRowCount - original count" is a better measure of inserts than modifyCount + totalRowCount := min(c.NotNullCount(), float64(realtimeRowCount)-c.NotNullCount()) + return max(1, totalRowCount/histNDV), nil } + // return the average histogram rows (which excludes topN) and NDV that excluded topN return c.Histogram.NotNullCount() / histNDV, nil } diff --git a/pkg/planner/cardinality/row_count_index.go b/pkg/planner/cardinality/row_count_index.go index c89b6e290c736..928395ae76eef 100644 --- a/pkg/planner/cardinality/row_count_index.go +++ b/pkg/planner/cardinality/row_count_index.go @@ -415,12 +415,27 @@ func equalRowCountOnIndex(sctx planctx.PlanContext, idx *statistics.Index, b []b // 3. use uniform distribution assumption for the rest (even when this value is not covered by the range of stats) histNDV := float64(idx.Histogram.NDV - int64(idx.TopN.Num())) if histNDV <= 0 { - // If the table hasn't been modified, it's safe to return 0. Otherwise, the TopN could be stale - return 1. + // If histNDV is zero - we have all NDV's in TopN - and no histograms. This function uses + // idx.TotalRowCount rather than idx.Histogram.NotNullCount() since the histograms are empty. + // + // If the table hasn't been modified, it's safe to return 0. if modifyCount == 0 { return 0 } - return 1 + // ELSE calculate an approximate estimate based upon newly inserted rows. 
+ // + // Reset to the original NDV, or if no NDV - derive an NDV using sqrt + if idx.Histogram.NDV > 0 { + histNDV = float64(idx.Histogram.NDV) + } else { + histNDV = math.Sqrt(max(idx.TotalRowCount(), float64(realtimeRowCount))) + } + // As a conservative estimate - take the smaller of the original totalRows or the additions. + // "realtimeRowCount - original count" is a better measure of inserts than modifyCount + totalRowCount := min(idx.TotalRowCount(), float64(realtimeRowCount)-idx.TotalRowCount()) + return max(1, totalRowCount/histNDV) } + // return the average histogram rows (which excludes topN) and NDV that excluded topN return idx.Histogram.NotNullCount() / histNDV } diff --git a/pkg/planner/cardinality/selectivity_test.go b/pkg/planner/cardinality/selectivity_test.go index 40d336f9b7850..898e932eed31e 100644 --- a/pkg/planner/cardinality/selectivity_test.go +++ b/pkg/planner/cardinality/selectivity_test.go @@ -289,6 +289,56 @@ func TestEstimationForUnknownValues(t *testing.T) { require.Equal(t, 0.0, count) } +func TestEstimationForUnknownValuesAfterModify(t *testing.T) { + store, dom := testkit.CreateMockStoreAndDomain(t) + testKit := testkit.NewTestKit(t, store) + testKit.MustExec("use test") + testKit.MustExec("drop table if exists t") + testKit.MustExec("create table t(a int, key idx(a))") + testKit.MustExec("set @@tidb_analyze_version=2") + testKit.MustExec("set @@global.tidb_enable_auto_analyze='OFF'") + for i := 1; i <= 10; i++ { + testKit.MustExec(fmt.Sprintf("insert into t values (%d)", i)) + testKit.MustExec(fmt.Sprintf("insert into t values (%d)", i)) + testKit.MustExec(fmt.Sprintf("insert into t values (%d)", i)) + testKit.MustExec(fmt.Sprintf("insert into t values (%d)", i)) + testKit.MustExec(fmt.Sprintf("insert into t values (%d)", i)) + testKit.MustExec(fmt.Sprintf("insert into t select a from t where a = %d", i)) + } + testKit.MustExec("analyze table t") + h := dom.StatsHandle() + require.Nil(t, h.DumpStatsDeltaToKV(true)) + + table, err 
:= dom.InfoSchema().TableByName(context.Background(), pmodel.NewCIStr("test"), pmodel.NewCIStr("t")) + require.NoError(t, err) + statsTbl := h.GetTableStats(table.Meta()) + + // Search for a found value == 10.0 + sctx := mock.NewContext() + col := statsTbl.GetCol(table.Meta().Columns[0].ID) + count, err := cardinality.GetColumnRowCount(sctx, col, getRange(5, 5), statsTbl.RealtimeCount, statsTbl.ModifyCount, false) + require.NoError(t, err) + require.Equal(t, 10.0, count) + + // Search for a not found value with zero modifyCount. Defaults to count == 1.0 + count, err = cardinality.GetColumnRowCount(sctx, col, getRange(11, 11), statsTbl.RealtimeCount, statsTbl.ModifyCount, false) + require.NoError(t, err) + require.Equal(t, 1.0, count) + + // Add another 200 rows to the table + testKit.MustExec("insert into t select a+10 from t") + testKit.MustExec("insert into t select a+10 from t where a <= 10") + require.Nil(t, h.DumpStatsDeltaToKV(true)) + require.Nil(t, h.Update(context.Background(), dom.InfoSchema())) + statsTblnew := h.GetTableStats(table.Meta()) + + // Search for a not found value based upon statistics - count should be >= 10 and <=40 + count, err = cardinality.GetColumnRowCount(sctx, col, getRange(15, 15), statsTblnew.RealtimeCount, statsTblnew.ModifyCount, false) + require.NoError(t, err) + require.Truef(t, count < 41, "expected: between 10 to 40, got: %v", count) + require.Truef(t, count > 9, "expected: between 10 to 40, got: %v", count) +} + func TestEstimationUniqueKeyEqualConds(t *testing.T) { store, dom := testkit.CreateMockStoreAndDomain(t) testKit := testkit.NewTestKit(t, store)