From e8f85e18153679ca6a1bca8ec3fb4e2175319ca6 Mon Sep 17 00:00:00 2001 From: time-and-fate <25057648+time-and-fate@users.noreply.github.com> Date: Tue, 28 Nov 2023 22:38:53 +0800 Subject: [PATCH 1/3] add --- pkg/util/ranger/checker.go | 5 +---- pkg/util/ranger/points.go | 21 ++++++++++++++++--- .../core/issuetest/planner_issue.result | 12 ++++++++++- .../planner/core/issuetest/planner_issue.test | 5 ++++- 4 files changed, 34 insertions(+), 9 deletions(-) diff --git a/pkg/util/ranger/checker.go b/pkg/util/ranger/checker.go index 2c7812b6cdb3f..d343d27aea397 100644 --- a/pkg/util/ranger/checker.go +++ b/pkg/util/ranger/checker.go @@ -17,7 +17,6 @@ package ranger import ( "github.com/pingcap/tidb/pkg/expression" "github.com/pingcap/tidb/pkg/parser/ast" - "github.com/pingcap/tidb/pkg/parser/charset" "github.com/pingcap/tidb/pkg/parser/mysql" "github.com/pingcap/tidb/pkg/sessionctx" "github.com/pingcap/tidb/pkg/types" @@ -175,9 +174,7 @@ func (c *conditionChecker) checkLikeFunc(scalar *expression.ScalarFunction) (isA // In tidb's implementation, for PAD SPACE collations, the trailing spaces are removed in the index key. So we are // unable to distinguish 'xxx' from 'xxx ' by a single index range scan, and we may read more data than needed by // the `like` function. Therefore, a Selection is needed to filter the data. - // Since all collations, except for binary, implemented in tidb are PAD SPACE collations for now, we use a simple - // collation != binary check here. - if collation != charset.CollationBin { + if isPadSpaceCollation(collation) { likeFuncReserve = true } diff --git a/pkg/util/ranger/points.go b/pkg/util/ranger/points.go index a6b4b796fe561..439afedd68311 100644 --- a/pkg/util/ranger/points.go +++ b/pkg/util/ranger/points.go @@ -24,6 +24,7 @@ import ( "github.com/pingcap/tidb/pkg/errno" "github.com/pingcap/tidb/pkg/expression" "github.com/pingcap/tidb/pkg/parser/ast" + "github.com/pingcap/tidb/pkg/parser/charset" "github.com/pingcap/tidb/pkg/parser/mysql" "github.com/pingcap/tidb/pkg/sessionctx" "github.com/pingcap/tidb/pkg/types" @@ -694,9 +695,15 @@ func (r *builder) newBuildFromPatternLike(expr *expression.ScalarFunction, prefi break } else if pattern[i] == '_' { // Get the prefix, but exclude the prefix. - // e.g., "abc_x", the start point exclude "abc", - // because the string length is more than 3. - exclude = true + // e.g., "abc_x", the start point excludes "abc" because the string length is more than 3. + // + // However, like the similar check in (*conditionChecker).checkLikeFunc(), in tidb's implementation, for + // PAD SPACE collations, the trailing spaces are removed in the index key. So we are unable to distinguish + // 'xxx' from 'xxx ' by a single index range scan. If we exclude the start point for PAD SPACE collation, + // we will actually miss 'xxx ', which will cause wrong results. + if !isPadSpaceCollation(collation) { + exclude = true + } isExactMatch = false break } @@ -736,6 +743,14 @@ func (r *builder) newBuildFromPatternLike(expr *expression.ScalarFunction, prefi return []*point{startPoint, endPoint} } +// isPadSpaceCollation returns whether the collation is a PAD SPACE collation. +// Since all collations, except for binary, implemented in tidb are PAD SPACE collations for now, we use a simple +// collation != binary check here. We may also move it to collation related packages when NO PAD collations are +// implemented in the future. +func isPadSpaceCollation(collation string) bool { + return collation != charset.CollationBin +} + func (r *builder) buildFromNot(expr *expression.ScalarFunction, prefixLen int) []*point { switch n := expr.FuncName.L; n { case ast.IsTruthWithoutNull: diff --git a/tests/integrationtest/r/planner/core/issuetest/planner_issue.result b/tests/integrationtest/r/planner/core/issuetest/planner_issue.result index 920b962398a76..5b5aae65d4adb 100644 --- a/tests/integrationtest/r/planner/core/issuetest/planner_issue.result +++ b/tests/integrationtest/r/planner/core/issuetest/planner_issue.result @@ -260,7 +260,7 @@ a set @@tidb_max_chunk_size = default; drop table if exists t1, t2; create table t1(a varchar(20) collate utf8mb4_bin, index ia(a)); -insert into t1 value('测试'),('测试 '); +insert into t1 value('测试'),('测试 '),('xxx '); explain format = brief select *,length(a) from t1 where a like '测试 %'; id estRows task access object operator info Projection 250.00 root planner__core__issuetest__planner_issue.t1.a, length(planner__core__issuetest__planner_issue.t1.a)->Column#3 @@ -281,6 +281,16 @@ a length(a) select *,length(a) from t1 where a like '测试'; a length(a) 测试 6 +explain format = brief select * from t1 use index (ia) where a like 'xxx_'; +id estRows task access object operator info +Projection 250.00 root planner__core__issuetest__planner_issue.t1.a +└─UnionScan 250.00 root like(planner__core__issuetest__planner_issue.t1.a, "xxx_", 92) + └─IndexReader 250.00 root index:Selection + └─Selection 250.00 cop[tikv] like(planner__core__issuetest__planner_issue.t1.a, "xxx_", 92) + └─IndexRangeScan 250.00 cop[tikv] table:t1, index:ia(a) range:["xxx","xxy"), keep order:false, stats:pseudo +select * from t1 use index (ia) where a like 'xxx_'; +a +xxx create table t2(a varchar(20) collate gbk_chinese_ci, index ia(a)); insert into t2 value('测试'),('测试 '); explain format = brief select *,length(a) from t2 where a like '测试 %'; diff --git a/tests/integrationtest/t/planner/core/issuetest/planner_issue.test b/tests/integrationtest/t/planner/core/issuetest/planner_issue.test index 45d79ba30b5fa..bb04737801aeb 100644 --- a/tests/integrationtest/t/planner/core/issuetest/planner_issue.test +++ b/tests/integrationtest/t/planner/core/issuetest/planner_issue.test @@ -153,13 +153,16 @@ select a from (select 100 as a, 100 as b union all select * from t) t where b != set @@tidb_max_chunk_size = default; # https://github.com/pingcap/tidb/issues/48821 +# https://github.com/pingcap/tidb/issues/48983 drop table if exists t1, t2; create table t1(a varchar(20) collate utf8mb4_bin, index ia(a)); -insert into t1 value('测试'),('测试 '); +insert into t1 value('测试'),('测试 '),('xxx '); explain format = brief select *,length(a) from t1 where a like '测试 %'; explain format = brief select *,length(a) from t1 where a like '测试'; select *,length(a) from t1 where a like '测试 %'; select *,length(a) from t1 where a like '测试'; +explain format = brief select * from t1 use index (ia) where a like 'xxx_'; +select * from t1 use index (ia) where a like 'xxx_'; create table t2(a varchar(20) collate gbk_chinese_ci, index ia(a)); insert into t2 value('测试'),('测试 '); explain format = brief select *,length(a) from t2 where a like '测试 %'; From bb1f180c8d725fb395bd3e10497a54f2512fbe89 Mon Sep 17 00:00:00 2001 From: time-and-fate <25057648+time-and-fate@users.noreply.github.com> Date: Tue, 28 Nov 2023 22:57:06 +0800 Subject: [PATCH 2/3] update test result --- .../casetest/physicalplantest/testdata/plan_suite_out.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pkg/planner/core/casetest/physicalplantest/testdata/plan_suite_out.json b/pkg/planner/core/casetest/physicalplantest/testdata/plan_suite_out.json index 234706233bd4e..8a7fc8f06327e 100644 --- a/pkg/planner/core/casetest/physicalplantest/testdata/plan_suite_out.json +++ b/pkg/planner/core/casetest/physicalplantest/testdata/plan_suite_out.json @@ -2195,7 +2195,7 @@ }, { "SQL": "select a from t where c_str like 'abc_'", - "Best": "IndexReader(Index(t.c_d_e_str)[(\"abc\",\"abd\")]->Sel([like(test.t.c_str, abc_, 92)]))->Projection" + "Best": "IndexReader(Index(t.c_d_e_str)[[\"abc\",\"abd\")]->Sel([like(test.t.c_str, abc_, 92)]))->Projection" }, { "SQL": "select a from t where c_str like 'abc%af'", @@ -2223,7 +2223,7 @@ }, { "SQL": "select a from t where c_str like 'abc\\__'", - "Best": "IndexReader(Index(t.c_d_e_str)[(\"abc_\",\"abc`\")]->Sel([like(test.t.c_str, abc\\__, 92)]))->Projection" + "Best": "IndexReader(Index(t.c_d_e_str)[[\"abc_\",\"abc`\")]->Sel([like(test.t.c_str, abc\\__, 92)]))->Projection" }, { "SQL": "select a from t where c_str like 123", From 261466baba2400dac041a6a044a76f5d9eb7e681 Mon Sep 17 00:00:00 2001 From: time-and-fate <25057648+time-and-fate@users.noreply.github.com> Date: Tue, 28 Nov 2023 23:06:48 +0800 Subject: [PATCH 3/3] update test result --- pkg/util/ranger/ranger_test.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pkg/util/ranger/ranger_test.go b/pkg/util/ranger/ranger_test.go index aeb2b8a1dc31b..49c073ed9e570 100644 --- a/pkg/util/ranger/ranger_test.go +++ b/pkg/util/ranger/ranger_test.go @@ -1114,7 +1114,7 @@ create table t( exprStr: "a LIKE 'abc_'", accessConds: "[like(test.t.a, abc_, 92)]", filterConds: "[like(test.t.a, abc_, 92)]", - resultStr: "[(\"abc\",\"abd\")]", + resultStr: "[[\"abc\",\"abd\")]", }, { indexPos: 0,