util/ranger: support use like to build range for new collation columns (pingcap#48522)

close pingcap#48181, close pingcap#49138
time-and-fate · Feb 20, 2024 · f8d69f6 · f8d69f6
1 parent f7fae4b
commit f8d69f6
Show file tree

Hide file tree

Showing 13 changed files with 763 additions and 495 deletions.
diff --git a/pkg/planner/core/integration_test.go b/pkg/planner/core/integration_test.go
@@ -2230,14 +2230,14 @@ func TestPlanCacheForIndexJoinRangeFallback(t *testing.T) {
  tk.MustExec("drop table if exists t1, t2")
  tk.MustExec("create table t1(a int, b varchar(10), c varchar(10), index idx_a_b(a, b))")
  tk.MustExec("create table t2(d int)")
- tk.MustExec("set @@tidb_opt_range_max_size=1275")
- // 1275 is enough for [? a,? a], [? b,? b], [? c,? c] but is not enough for [? aaaaaa,? aaaaaa], [? bbbbbb,? bbbbbb], [? cccccc,? cccccc].
+ tk.MustExec("set @@tidb_opt_range_max_size=1260")
+ // 1260 is enough for [? a,? a], [? b,? b], [? c,? c] but is not enough for [? aaaaaa,? aaaaaa], [? bbbbbb,? bbbbbb], [? cccccc,? cccccc].
  rows := tk.MustQuery("explain format='brief' select /*+ inl_join(t1) */ * from t1 join t2 on t1.a = t2.d where t1.b in ('a', 'b', 'c')").Rows()
  require.True(t, strings.Contains(rows[6][4].(string), "range: decided by [eq(test.t1.a, test.t2.d) in(test.t1.b, a, b, c)]"))
  tk.MustQuery("show warnings").Check(testkit.Rows())
  rows = tk.MustQuery("explain format='brief' select /*+ inl_join(t1) */ * from t1 join t2 on t1.a = t2.d where t1.b in ('aaaaaa', 'bbbbbb', 'cccccc');").Rows()
- require.True(t, strings.Contains(rows[6][4].(string), "range: decided by [eq(test.t1.a, test.t2.d)]"))
- tk.MustQuery("show warnings").Check(testkit.Rows("Warning 1105 Memory capacity of 1275 bytes for 'tidb_opt_range_max_size' exceeded when building ranges. Less accurate ranges such as full range are chosen"))
+ require.Contains(t, rows[6][4].(string), "range: decided by [eq(test.t1.a, test.t2.d)]")
+ tk.MustQuery("show warnings").Check(testkit.Rows("Warning 1105 Memory capacity of 1260 bytes for 'tidb_opt_range_max_size' exceeded when building ranges. Less accurate ranges such as full range are chosen"))
 
  tk.MustExec("prepare stmt1 from 'select /*+ inl_join(t1) */ * from t1 join t2 on t1.a = t2.d where t1.b in (?, ?, ?)'")
  tk.MustExec("set @a='a', @b='b', @c='c'")
@@ -2252,13 +2252,13 @@ func TestPlanCacheForIndexJoinRangeFallback(t *testing.T) {
  tk.Session().SetSessionManager(&testkit.MockSessionManager{PS: ps})
  rows = tk.MustQuery(fmt.Sprintf("explain for connection %d", tkProcess.ID)).Rows()
  // We don't limit range mem usage when rebuilding index join ranges for the cached plan. So [? aaaaaa,? aaaaaa], [? bbbbbb,? bbbbbb], [? cccccc,? cccccc] can be built.
- require.True(t, strings.Contains(rows[6][4].(string), "range: decided by [eq(test.t1.a, test.t2.d) in(test.t1.b, aaaaaa, bbbbbb, cccccc)]"))
+ require.Contains(t, rows[6][4].(string), "range: decided by [eq(test.t1.a, test.t2.d) in(test.t1.b, aaaaaa, bbbbbb, cccccc)]")
 
  // Test the plan with range fallback would not be put into cache.
  tk.MustExec("prepare stmt2 from 'select /*+ inl_join(t1) */ * from t1 join t2 on t1.a = t2.d where t1.b in (?, ?, ?, ?, ?)'")
  tk.MustExec("set @a='a', @b='b', @c='c', @d='d', @e='e'")
  tk.MustExec("execute stmt2 using @a, @b, @c, @d, @e")
- tk.MustQuery("show warnings").Sort().Check(testkit.Rows("Warning 1105 Memory capacity of 1275 bytes for 'tidb_opt_range_max_size' exceeded when building ranges. Less accurate ranges such as full range are chosen",
+ tk.MustQuery("show warnings").Sort().Check(testkit.Rows("Warning 1105 Memory capacity of 1260 bytes for 'tidb_opt_range_max_size' exceeded when building ranges. Less accurate ranges such as full range are chosen",
  "Warning 1105 skip prepared plan-cache: in-list is too long"))
  tk.MustExec("execute stmt2 using @a, @b, @c, @d, @e")
  tk.MustQuery("select @@last_plan_from_cache").Check(testkit.Rows("0"))

diff --git a/pkg/planner/core/testdata/index_merge_suite_out.json b/pkg/planner/core/testdata/index_merge_suite_out.json
@@ -131,8 +131,8 @@
  "IndexMerge 0.00 root type: intersection",
  "├─IndexRangeScan(Build) 10.00 cop[tikv] table:t5, index:is1(s1) range:[\"Abc\",\"Abc\"], keep order:false, stats:pseudo",
  "├─IndexRangeScan(Build) 3333.33 cop[tikv] table:t5, index:is2(s2) range:(\"zzz\",+inf], keep order:false, stats:pseudo",
- "├─IndexRangeScan(Build) 3323.33 cop[tikv] table:t5, index:is3(s3) range:[-inf,\"B啊a\"), keep order:false, stats:pseudo",
- "├─IndexRangeScan(Build) 10.00 cop[tikv] table:t5, index:is4(s4) range:[\"CcC\",\"CcC\"], keep order:false, stats:pseudo",
+ "├─IndexRangeScan(Build) 3323.33 cop[tikv] table:t5, index:is3(s3) range:[-inf,\"\\x0eJ\\xfb@\\xd5J\\x0e3\"), keep order:false, stats:pseudo",
+ "├─IndexRangeScan(Build) 10.00 cop[tikv] table:t5, index:is4(s4) range:[\"CCC\",\"CCC\"], keep order:false, stats:pseudo",
  "└─TableRowIDScan(Probe) 0.00 cop[tikv] table:t5 keep order:false, stats:pseudo"
  ],
  "Result": [
@@ -144,7 +144,7 @@
  "Plan": [
  "IndexMerge 0.03 root type: intersection",
  "├─IndexRangeScan(Build) 33.33 cop[tikv] table:t6, index:PRIMARY(s1, s2) range:(\"Abc\" \"zzz\",\"Abc\" +inf], keep order:false, stats:pseudo",
- "├─IndexRangeScan(Build) 10.00 cop[tikv] table:t6, index:is3(s3) range:[\"A啊a\",\"A啊a\"], keep order:false, stats:pseudo",
+ "├─IndexRangeScan(Build) 10.00 cop[tikv] table:t6, index:is3(s3) range:[\"\\x0e3\\xfb@\\xd5J\\x0e3\",\"\\x0e3\\xfb@\\xd5J\\x0e3\"], keep order:false, stats:pseudo",
  "└─Selection(Probe) 0.03 cop[tikv] gt(test.t6.s2, \"zzz\"), not(like(test.t6.s4, \"Cd_\", 92))",
  " └─TableRowIDScan 0.03 cop[tikv] table:t6 keep order:false, stats:pseudo"
  ],
@@ -172,13 +172,14 @@
  {
  "SQL": "select /*+ use_index_merge(t8, primary,is2,is3,is4,is5) */ * from t8 where s1 like '啊A%' and s2 > 'abc' and s3 > 'cba' and s4 in ('aA', '??') and s5 = 'test,2'",
  "Plan": [
- "Selection 1.42 root eq(test.t8.s5, \"test,2\")",
- "└─IndexMerge 0.59 root type: intersection",
+ "Selection 0.04 root eq(test.t8.s5, \"test,2\")",
+ "└─IndexMerge 0.06 root type: intersection",
+ " ├─IndexRangeScan(Build) 250.00 cop[tikv] table:t8, index:PRIMARY(s1) range:[\"UJ\\x00A\",\"UJ\\x00B\"), keep order:false, stats:pseudo",
  " ├─IndexRangeScan(Build) 3333.33 cop[tikv] table:t8, index:is2(s2) range:(\"abc\",+inf], keep order:false, stats:pseudo",
  " ├─IndexRangeScan(Build) 3333.33 cop[tikv] table:t8, index:is3(s3) range:(\"cba\",+inf], keep order:false, stats:pseudo",
  " ├─IndexRangeScan(Build) 20.00 cop[tikv] table:t8, index:is4(s4) range:[\"aA\",\"aA\"], [\"??\",\"??\"], keep order:false, stats:pseudo",
- " └─Selection(Probe) 0.59 cop[tikv] gt(test.t8.s3, \"cba\"), like(test.t8.s1, \"啊A%\", 92)",
- " └─TableRowIDScan 2.22 cop[tikv] table:t8 keep order:false, stats:pseudo"
+ " └─Selection(Probe) 0.06 cop[tikv] gt(test.t8.s3, \"cba\"), like(test.t8.s1, \"啊A%\", 92)",
+ " └─TableRowIDScan 0.06 cop[tikv] table:t8 keep order:false, stats:pseudo"
  ],
  "Result": [
  "啊aabbccdd abcc cccc aA tEsT,2"

diff --git a/pkg/util/ranger/BUILD.bazel b/pkg/util/ranger/BUILD.bazel
@@ -30,7 +30,7 @@ go_library(
  "//pkg/util/codec",
  "//pkg/util/collate",
  "//pkg/util/dbterror",
- "//pkg/util/mathutil",
+ "//pkg/util/hack",
  "@com_github_pingcap_errors//:errors",
  ],
 )

diff --git a/pkg/util/ranger/checker.go b/pkg/util/ranger/checker.go
@@ -141,16 +141,6 @@ func (c *conditionChecker) checkScalarFunction(scalar *expression.ScalarFunction
 
 func (c *conditionChecker) checkLikeFunc(scalar *expression.ScalarFunction) (isAccessCond, shouldReserve bool) {
  _, collation := scalar.CharsetAndCollation()
- if collate.NewCollationEnabled() && !collate.IsBinCollation(collation) {
- // The algorithm constructs the range in byte-level: for example, ab% is mapped to [ab, ac] by adding 1 to the last byte.
- // However, this is incorrect for non-binary collation strings because the sort key order is not the same as byte order.
- // For example, "`%" is mapped to the range [`, a](where ` is 0x60 and a is 0x61).
- // Because the collation utf8_general_ci is case-insensitive, a and A have the same sort key.
- // Finally, the range comes to be [`, A], which is actually an empty range.
- // See https://github.com/pingcap/tidb/issues/31174 for more details.
- // In short, when the column type is non-binary collation string, we cannot use `like` expressions to generate the range.
- return false, true
- }
  if !collate.CompatibleCollate(scalar.GetArgs()[0].GetType().GetCollate(), collation) {
  return false, true
  }

diff --git a/pkg/util/ranger/detacher.go b/pkg/util/ranger/detacher.go
@@ -242,7 +242,7 @@ func compareCNFItemRangeResult(curResult, bestResult *cnfItemRangeResult) (curIs
 // e.g, for input CNF expressions ((a,b) in ((1,1),(2,2))) and a > 1 and ((a,b,c) in (1,1,1),(2,2,2))
 // ((a,b,c) in (1,1,1),(2,2,2)) would be extracted.
 func extractBestCNFItemRanges(sctx sessionctx.Context, conds []expression.Expression, cols []*expression.Column,
- lengths []int, rangeMaxSize int64) (*cnfItemRangeResult, []*valueInfo, error) {
+ lengths []int, rangeMaxSize int64, convertToSortKey bool) (*cnfItemRangeResult, []*valueInfo, error) {
  if len(conds) < 2 {
  return nil, nil, nil
  }
@@ -261,7 +261,7 @@ func extractBestCNFItemRanges(sctx sessionctx.Context, conds []expression.Expres
  // We build ranges for `(a,b) in ((1,1),(1,2))` and get `[1 1, 1 1] [1 2, 1 2]`, which are point ranges and we can
  // append `c = 1` to the point ranges. However, if we choose to merge consecutive ranges here, we get `[1 1, 1 2]`,
  // which are not point ranges, and we cannot append `c = 1` anymore.
- res, err := detachCondAndBuildRangeWithoutMerging(sctx, tmpConds, cols, lengths, rangeMaxSize)
+ res, err := detachCondAndBuildRangeWithoutMerging(sctx, tmpConds, cols, lengths, rangeMaxSize, convertToSortKey)
  if err != nil {
  return nil, nil, err
  }
@@ -377,7 +377,7 @@ func (d *rangeDetacher) detachCNFCondAndBuildRangeForIndex(conditions []expressi
  optPrefixIndexSingleScan: d.sctx.GetSessionVars().OptPrefixIndexSingleScan,
  }
  if considerDNF {
- bestCNFItemRes, columnValues, err := extractBestCNFItemRanges(d.sctx, conditions, d.cols, d.lengths, d.rangeMaxSize)
+ bestCNFItemRes, columnValues, err := extractBestCNFItemRanges(d.sctx, conditions, d.cols, d.lengths, d.rangeMaxSize, d.convertToSortKey)
  if err != nil {
  return nil, err
  }
@@ -628,12 +628,16 @@ func ExtractEqAndInCondition(sctx sessionctx.Context, conditions []expression.Ex
  }
  // Multiple Eq/In conditions for one column in CNF, apply intersection on them
  // Lazily compute the points for the previously visited Eq/In
+ newTp := newFieldType(cols[offset].GetType())
  collator := collate.GetCollator(cols[offset].GetType().GetCollate())
  if mergedAccesses[offset] == nil {
  mergedAccesses[offset] = accesses[offset]
- points[offset] = rb.build(accesses[offset], collator, lengths[offset])
+ // Note that this is a relatively special usage of build(). We will restore the points back to Expression for
+ // later use and may build the Expression to points again.
+ // We need to keep the original value here, which means we neither cut prefix nor convert to sort key.
+ points[offset] = rb.build(accesses[offset], newTp, types.UnspecifiedLength, false)
  }
- points[offset] = rb.intersection(points[offset], rb.build(cond, collator, lengths[offset]), collator)
+ points[offset] = rb.intersection(points[offset], rb.build(cond, newTp, types.UnspecifiedLength, false), collator)
  if len(points[offset]) == 0 { // Early termination if false expression found
  if expression.MaybeOverOptimized4PlanCache(sctx, conditions) {
  // `a>@x and a<@y` --> `invalid-range if @x>=@y`
@@ -774,9 +778,10 @@ func (d *rangeDetacher) detachDNFCondAndBuildRangeForIndex(condition *expression
  if shouldReserve {
  hasResidual = true
  }
- points := rb.build(item, collate.GetCollator(newTpSlice[0].GetCollate()), d.lengths[0])
+ points := rb.build(item, newTpSlice[0], d.lengths[0], d.convertToSortKey)
+ tmpNewTp := convertStringFTToBinaryCollate(newTpSlice[0])
  // TODO: restrict the mem usage of ranges
- ranges, rangeFallback, err := points2Ranges(d.sctx, points, newTpSlice[0], d.rangeMaxSize)
+ ranges, rangeFallback, err := points2Ranges(d.sctx, points, tmpNewTp, d.rangeMaxSize)
  if err != nil {
  return nil, nil, nil, false, errors.Trace(err)
  }
@@ -868,6 +873,7 @@ func DetachCondAndBuildRangeForIndex(sctx sessionctx.Context, conditions []expre
  cols: cols,
  lengths: lengths,
  mergeConsecutive: true,
+ convertToSortKey: true,
  rangeMaxSize: rangeMaxSize,
  }
  return d.detachCondAndBuildRangeForCols()
@@ -876,13 +882,14 @@ func DetachCondAndBuildRangeForIndex(sctx sessionctx.Context, conditions []expre
 // detachCondAndBuildRangeWithoutMerging detaches the index filters from table filters and uses them to build ranges.
 // When building ranges, it doesn't merge consecutive ranges.
 func detachCondAndBuildRangeWithoutMerging(sctx sessionctx.Context, conditions []expression.Expression, cols []*expression.Column,
- lengths []int, rangeMaxSize int64) (*DetachRangeResult, error) {
+ lengths []int, rangeMaxSize int64, convertToSortKey bool) (*DetachRangeResult, error) {
  d := &rangeDetacher{
  sctx: sctx,
  allConds: conditions,
  cols: cols,
  lengths: lengths,
  mergeConsecutive: false,
+ convertToSortKey: convertToSortKey,
  rangeMaxSize: rangeMaxSize,
  }
  return d.detachCondAndBuildRangeForCols()
@@ -894,7 +901,7 @@ func detachCondAndBuildRangeWithoutMerging(sctx sessionctx.Context, conditions [
 // The returned values are encapsulated into a struct DetachRangeResult, see its comments for explanation.
 func DetachCondAndBuildRangeForPartition(sctx sessionctx.Context, conditions []expression.Expression, cols []*expression.Column,
  lengths []int, rangeMaxSize int64) (*DetachRangeResult, error) {
- return detachCondAndBuildRangeWithoutMerging(sctx, conditions, cols, lengths, rangeMaxSize)
+ return detachCondAndBuildRangeWithoutMerging(sctx, conditions, cols, lengths, rangeMaxSize, false)
 }
 
 type rangeDetacher struct {
@@ -903,6 +910,7 @@ type rangeDetacher struct {
  cols []*expression.Column
  lengths []int
  mergeConsecutive bool
+ convertToSortKey bool
  rangeMaxSize int64
 }
 
@@ -949,6 +957,7 @@ func DetachSimpleCondAndBuildRangeForIndex(sctx sessionctx.Context, conditions [
  cols: cols,
  lengths: lengths,
  mergeConsecutive: true,
+ convertToSortKey: true,
  rangeMaxSize: rangeMaxSize,
  }
  res, err := d.detachCNFCondAndBuildRangeForIndex(conditions, newTpSlice, false)