From 5bca03700e3e2f295a409f9905a3e3a27627154d Mon Sep 17 00:00:00 2001
From: Ti Chi Robot <ti-community-prow-bot@tidb.io>
Date: Mon, 25 Nov 2024 23:34:41 +0800
Subject: [PATCH] planner: Use realtimeRowCount when all topN collected
 (#56848) (#57689)

close pingcap/tidb#47400
---
 pkg/planner/cardinality/BUILD.bazel         |  2 +-
 pkg/planner/cardinality/row_count_column.go | 21 ++++++++-
 pkg/planner/cardinality/row_count_index.go  | 19 +++++++-
 pkg/planner/cardinality/selectivity_test.go | 50 +++++++++++++++++++++
 4 files changed, 87 insertions(+), 5 deletions(-)

diff --git a/pkg/planner/cardinality/BUILD.bazel b/pkg/planner/cardinality/BUILD.bazel
index 0c5f0dc481ea8..36657805f4bce 100644
--- a/pkg/planner/cardinality/BUILD.bazel
+++ b/pkg/planner/cardinality/BUILD.bazel
@@ -59,7 +59,7 @@ go_test(
     data = glob(["testdata/**"]),
     embed = [":cardinality"],
     flaky = True,
-    shard_count = 28,
+    shard_count = 29,
     deps = [
         "//pkg/config",
         "//pkg/domain",
diff --git a/pkg/planner/cardinality/row_count_column.go b/pkg/planner/cardinality/row_count_column.go
index 5e6d75bf19a74..fa1d8364ed6e3 100644
--- a/pkg/planner/cardinality/row_count_column.go
+++ b/pkg/planner/cardinality/row_count_column.go
@@ -15,6 +15,8 @@
 package cardinality
 
 import (
+	"math"
+
 	"github.com/pingcap/errors"
 	"github.com/pingcap/tidb/pkg/planner/planctx"
 	"github.com/pingcap/tidb/pkg/planner/util/debugtrace"
@@ -173,12 +175,27 @@ func equalRowCountOnColumn(sctx planctx.PlanContext, c *statistics.Column, val t
 	// 3. use uniform distribution assumption for the rest (even when this value is not covered by the range of stats)
 	histNDV := float64(c.Histogram.NDV - int64(c.TopN.Num()))
 	if histNDV <= 0 {
-		// If the table hasn't been modified, it's safe to return 0. Otherwise, the TopN could be stale - return 1.
+		// If histNDV is zero - we have all NDV's in TopN - and no histograms. This function uses
+		// c.NotNullCount rather than c.Histogram.NotNullCount() since the histograms are empty.
+		//
+		// If the table hasn't been modified, it's safe to return 0.
 		if modifyCount == 0 {
 			return 0, nil
 		}
-		return 1, nil
+		// ELSE calculate an approximate estimate based upon newly inserted rows.
+		//
+		// Reset to the original NDV, or if no NDV - derive an NDV using sqrt
+		if c.Histogram.NDV > 0 {
+			histNDV = float64(c.Histogram.NDV)
+		} else {
+			histNDV = math.Sqrt(max(c.NotNullCount(), float64(realtimeRowCount)))
+		}
+		// As a conservative estimate - take the smaller of the orignal totalRows or the additions.
+		// "realtimeRowCount - original count" is a better measure of inserts than modifyCount
+		totalRowCount := min(c.NotNullCount(), float64(realtimeRowCount)-c.NotNullCount())
+		return max(1, totalRowCount/histNDV), nil
 	}
+	// return the average histogram rows (which excludes topN) and NDV that excluded topN
 	return c.Histogram.NotNullCount() / histNDV, nil
 }
 
diff --git a/pkg/planner/cardinality/row_count_index.go b/pkg/planner/cardinality/row_count_index.go
index c89b6e290c736..928395ae76eef 100644
--- a/pkg/planner/cardinality/row_count_index.go
+++ b/pkg/planner/cardinality/row_count_index.go
@@ -415,12 +415,27 @@ func equalRowCountOnIndex(sctx planctx.PlanContext, idx *statistics.Index, b []b
 	// 3. use uniform distribution assumption for the rest (even when this value is not covered by the range of stats)
 	histNDV := float64(idx.Histogram.NDV - int64(idx.TopN.Num()))
 	if histNDV <= 0 {
-		// If the table hasn't been modified, it's safe to return 0. Otherwise, the TopN could be stale - return 1.
+		// If histNDV is zero - we have all NDV's in TopN - and no histograms. This function uses
+		// idx.TotalRowCount rather than idx.Histogram.NotNullCount() since the histograms are empty.
+		//
+		// If the table hasn't been modified, it's safe to return 0.
 		if modifyCount == 0 {
 			return 0
 		}
-		return 1
+		// ELSE calculate an approximate estimate based upon newly inserted rows.
+		//
+		// Reset to the original NDV, or if no NDV - derive an NDV using sqrt
+		if idx.Histogram.NDV > 0 {
+			histNDV = float64(idx.Histogram.NDV)
+		} else {
+			histNDV = math.Sqrt(max(idx.TotalRowCount(), float64(realtimeRowCount)))
+		}
+		// As a conservative estimate - take the smaller of the orignal totalRows or the additions.
+		// "realtimeRowCount - original count" is a better measure of inserts than modifyCount
+		totalRowCount := min(idx.TotalRowCount(), float64(realtimeRowCount)-idx.TotalRowCount())
+		return max(1, totalRowCount/histNDV)
 	}
+	// return the average histogram rows (which excludes topN) and NDV that excluded topN
 	return idx.Histogram.NotNullCount() / histNDV
 }
 
diff --git a/pkg/planner/cardinality/selectivity_test.go b/pkg/planner/cardinality/selectivity_test.go
index 40d336f9b7850..898e932eed31e 100644
--- a/pkg/planner/cardinality/selectivity_test.go
+++ b/pkg/planner/cardinality/selectivity_test.go
@@ -289,6 +289,56 @@ func TestEstimationForUnknownValues(t *testing.T) {
 	require.Equal(t, 0.0, count)
 }
 
+func TestEstimationForUnknownValuesAfterModify(t *testing.T) {
+	store, dom := testkit.CreateMockStoreAndDomain(t)
+	testKit := testkit.NewTestKit(t, store)
+	testKit.MustExec("use test")
+	testKit.MustExec("drop table if exists t")
+	testKit.MustExec("create table t(a int, key idx(a))")
+	testKit.MustExec("set @@tidb_analyze_version=2")
+	testKit.MustExec("set @@global.tidb_enable_auto_analyze='OFF'")
+	for i := 1; i <= 10; i++ {
+		testKit.MustExec(fmt.Sprintf("insert into t values (%d)", i))
+		testKit.MustExec(fmt.Sprintf("insert into t values (%d)", i))
+		testKit.MustExec(fmt.Sprintf("insert into t values (%d)", i))
+		testKit.MustExec(fmt.Sprintf("insert into t values (%d)", i))
+		testKit.MustExec(fmt.Sprintf("insert into t values (%d)", i))
+		testKit.MustExec(fmt.Sprintf("insert into t select a from t where a = %d", i))
+	}
+	testKit.MustExec("analyze table t")
+	h := dom.StatsHandle()
+	require.Nil(t, h.DumpStatsDeltaToKV(true))
+
+	table, err := dom.InfoSchema().TableByName(context.Background(), pmodel.NewCIStr("test"), pmodel.NewCIStr("t"))
+	require.NoError(t, err)
+	statsTbl := h.GetTableStats(table.Meta())
+
+	// Search for a found value == 10.0
+	sctx := mock.NewContext()
+	col := statsTbl.GetCol(table.Meta().Columns[0].ID)
+	count, err := cardinality.GetColumnRowCount(sctx, col, getRange(5, 5), statsTbl.RealtimeCount, statsTbl.ModifyCount, false)
+	require.NoError(t, err)
+	require.Equal(t, 10.0, count)
+
+	// Search for a not found value with zero modifyCount. Defaults to count == 1.0
+	count, err = cardinality.GetColumnRowCount(sctx, col, getRange(11, 11), statsTbl.RealtimeCount, statsTbl.ModifyCount, false)
+	require.NoError(t, err)
+	require.Equal(t, 1.0, count)
+
+	// Add another 200 rows to the table
+	testKit.MustExec("insert into t select a+10 from t")
+	testKit.MustExec("insert into t select a+10 from t where a <= 10")
+	require.Nil(t, h.DumpStatsDeltaToKV(true))
+	require.Nil(t, h.Update(context.Background(), dom.InfoSchema()))
+	statsTblnew := h.GetTableStats(table.Meta())
+
+	// Search for a not found value based upon statistics - count should be >= 10 and <=40
+	count, err = cardinality.GetColumnRowCount(sctx, col, getRange(15, 15), statsTblnew.RealtimeCount, statsTblnew.ModifyCount, false)
+	require.NoError(t, err)
+	require.Truef(t, count < 41, "expected: between 10 to 40, got: %v", count)
+	require.Truef(t, count > 9, "expected: between 10 to 40, got: %v", count)
+}
+
 func TestEstimationUniqueKeyEqualConds(t *testing.T) {
 	store, dom := testkit.CreateMockStoreAndDomain(t)
 	testKit := testkit.NewTestKit(t, store)