Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

collation: fix tidb panic when compare string with collation #23760

Closed
wants to merge 7 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 21 additions & 0 deletions expression/integration_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -8154,6 +8154,27 @@ func (s *testIntegrationSerialSuite) TestIssue20161(c *C) {
Check(testkit.Rows("<nil>", "<nil>", "<nil>"))
}

func (s *testIntegrationSerialSuite) TestIssue23506(c *C) {
collate.SetNewCollationEnabledForTest(true)
defer collate.SetNewCollationEnabledForTest(false)

tk := testkit.NewTestKit(c, s.store)
tk.MustExec(`use test;`)
tk.MustExec("drop table if exists t;")
tk.MustExec(`create table t(a char(10) collate utf8mb4_general_ci, b char(10) collate utf8mb4_unicode_ci);`)
tk.MustExec(`insert into t values ('a', 'a')`)
tk.MustExec(`insert into t values ('一', '一')`)
tk.MustQuery(`select * from t where a > 0x80;`).Check(testkit.Rows("一 一"))
tk.MustQuery(`select * from t where b > 0x80;`).Check(testkit.Rows("a a", "一 一"))

// uncomment this when #23759 fix
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

#23759 has been fixed, so please uncomment these test.

//tk.MustQuery(`select concat(a, 0x80) = concat(a, 0x81) collate utf8mb4_unicode_ci from t;`).Check(testkit.Rows("1 1"))
//tk.MustQuery(`select hex(weight_string(concat(a, 0x80))) from t`).Check(testkit.Rows("0041 4E00"))
//tk.MustQuery(`select hex(weight_string(concat(b, 0x80))) from t`).Check(testkit.Rows("0E33 FB40CE00"))
//tk.MustQuery(`select collation(concat(a, 0x80)) from t`).Check(testkit.Rows("utf8mb4_general_ci", "utf8mb4_general_ci"))
//tk.MustQuery(`select collation(concat(b, 0x80)) from t`).Check(testkit.Rows("utf8mb4_unicode_ci", "utf8mb4_unicode_ci"))
}

func (s *testIntegrationSuite) TestIssue10462(c *C) {
tk := testkit.NewTestKit(c, s.store)
tk.MustExec("use test")
Expand Down
33 changes: 0 additions & 33 deletions util/collate/collate.go
Original file line number Diff line number Diff line change
Expand Up @@ -47,14 +47,6 @@ var (
const (
// DefaultLen is set for datum if the string datum don't know its length.
DefaultLen = 0
// first byte of a 2-byte encoding starts 110 and carries 5 bits of data
b2Mask = 0x1F // 0001 1111
// first byte of a 3-byte encoding starts 1110 and carries 4 bits of data
b3Mask = 0x0F // 0000 1111
// first byte of a 4-byte encoding starts 11110 and carries 3 bits of data
b4Mask = 0x07 // 0000 0111
// non-first bytes start 10 and carry 6 bits of data
mbMask = 0x3F // 0011 1111
)

// Collator provides functionality for comparing strings for a given
Expand Down Expand Up @@ -247,31 +239,6 @@ func sign(i int) int {
return 0
}

// decode rune by hand
func decodeRune(s string, si int) (r rune, newIndex int) {
switch b := s[si]; {
case b < 0x80:
r = rune(b)
newIndex = si + 1
case b < 0xE0:
r = rune(b&b2Mask)<<6 |
rune(s[1+si]&mbMask)
newIndex = si + 2
case b < 0xF0:
r = rune(b&b3Mask)<<12 |
rune(s[si+1]&mbMask)<<6 |
rune(s[si+2]&mbMask)
newIndex = si + 3
default:
r = rune(b&b4Mask)<<18 |
rune(s[si+1]&mbMask)<<12 |
rune(s[si+2]&mbMask)<<6 |
rune(s[si+3]&mbMask)
newIndex = si + 4
}
return
}

// IsCICollation returns if the collation is case-sensitive
func IsCICollation(collate string) bool {
return collate == "utf8_general_ci" || collate == "utf8mb4_general_ci" ||
Expand Down
4 changes: 4 additions & 0 deletions util/collate/collate_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -118,6 +118,7 @@ func (s *testCollateSuite) TestGeneralCICollator(c *C) {
{"😜", "😃", 0},
{"a ", "a ", 0},
{"a\t", "a", 1},
{"\x80", "a", 1},
}
keyTable := []keyTable{
{"a", []byte{0x0, 0x41}},
Expand All @@ -128,6 +129,7 @@ func (s *testCollateSuite) TestGeneralCICollator(c *C) {
0x3, 0x0, 0x20, 0x0, 0x51, 0x0, 0x55, 0x0, 0x58}},
{"a ", []byte{0x0, 0x41}},
{"a", []byte{0x0, 0x41}},
{"\x80\x81", []byte{}},
}
testCompareTable(compareTable, "utf8mb4_general_ci", c)
testKeyTable(keyTable, "utf8mb4_general_ci", c)
Expand All @@ -149,6 +151,7 @@ func (s *testCollateSuite) TestUnicodeCICollator(c *C) {
{"a\t", "a", 1},
{"ß", "s", 1},
{"ß", "ss", 0},
{"\x80", "a", -1},
}
keyTable := []keyTable{
{"a", []byte{0x0E, 0x33}},
Expand All @@ -160,6 +163,7 @@ func (s *testCollateSuite) TestUnicodeCICollator(c *C) {
0xB4, 0x10, 0x1F, 0x10, 0x5A}},
{"a ", []byte{0x0E, 0x33}},
{"ﷻ", []byte{0x13, 0x5E, 0x13, 0xAB, 0x02, 0x09, 0x13, 0x5E, 0x13, 0xAB, 0x13, 0x50, 0x13, 0xAB, 0x13, 0xB7}},
{"\x80\x81", []byte{}},
}

testCompareTable(compareTable, "utf8mb4_unicode_ci", c)
Expand Down
30 changes: 20 additions & 10 deletions util/collate/general_ci.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,9 @@
package collate

import (
"strings"
"unicode/utf8"

"github.com/pingcap/tidb/util/stringutil"
)

Expand All @@ -24,30 +27,37 @@ type generalCICollator struct {
func (gc *generalCICollator) Compare(a, b string) int {
a = truncateTailingSpace(a)
b = truncateTailingSpace(b)
r1, r2 := rune(0), rune(0)
ai, bi := 0, 0
for ai < len(a) && bi < len(b) {
r1, ai = decodeRune(a, ai)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Well, is it possible to keep decodeRune with some additional validation checks?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

that will be the same with the golang library, it's faster because decodeRune has no validation check.

r2, bi = decodeRune(b, bi)
for len(a) > 0 && len(b) > 0 {
r1, asize := utf8.DecodeRuneInString(a)
r2, bsize := utf8.DecodeRuneInString(b)

// Incorrect string, compare bytewise
if (r1 == utf8.RuneError && asize == 1) || (r2 == utf8.RuneError && bsize == 1) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why need check size == 1?

return strings.Compare(a, b)
}

cmp := int(convertRuneGeneralCI(r1)) - int(convertRuneGeneralCI(r2))
if cmp != 0 {
return sign(cmp)
}
a = a[asize:]
b = b[bsize:]
}
return sign((len(a) - ai) - (len(b) - bi))
return sign(len(a) - len(b))
}

// Key implements Collator interface.
func (gc *generalCICollator) Key(str string) []byte {
str = truncateTailingSpace(str)
buf := make([]byte, 0, len(str))
i := 0
r := rune(0)
for i < len(str) {
r, i = decodeRune(str, i)
for len(str) > 0 {
r, size := utf8.DecodeRuneInString(str)
if r == utf8.RuneError && size == 1 {
return buf
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Any test to cover the change? Or the path is never be taken? I doubt the correctness of returning buf directly...

}
u16 := convertRuneGeneralCI(r)
buf = append(buf, byte(u16>>8), byte(u16))
str = str[size:]
}
return buf
}
Expand Down
32 changes: 23 additions & 9 deletions util/collate/unicode_ci.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,8 @@
package collate

import (
"unicode/utf8"

"github.com/pingcap/tidb/util/stringutil"
)

Expand All @@ -36,12 +38,17 @@ func (uc *unicodeCICollator) Compare(a, b string) int {
// rune of a, b
ar, br := rune(0), rune(0)
// decode index of a, b
ai, bi := 0, 0
asize, bsize := 0, 0
for {
if an == 0 {
if as == 0 {
for an == 0 && ai < len(a) {
ar, ai = decodeRune(a, ai)
for an == 0 && len(a) > 0 {
ar, asize = utf8.DecodeRuneInString(a)
if ar == utf8.RuneError && asize == 1 {
an = 0
break
}
a = a[asize:]
an, as = convertRuneUnicodeCI(ar)
}
} else {
Expand All @@ -52,8 +59,13 @@ func (uc *unicodeCICollator) Compare(a, b string) int {

if bn == 0 {
if bs == 0 {
for bn == 0 && bi < len(b) {
br, bi = decodeRune(b, bi)
for bn == 0 && len(b) > 0 {
br, bsize = utf8.DecodeRuneInString(b)
if br == utf8.RuneError && bsize == 1 {
bn = 0
break
}
b = b[bsize:]
bn, bs = convertRuneUnicodeCI(br)
}
} else {
Expand Down Expand Up @@ -86,12 +98,13 @@ func (uc *unicodeCICollator) Compare(a, b string) int {
func (uc *unicodeCICollator) Key(str string) []byte {
str = truncateTailingSpace(str)
buf := make([]byte, 0, len(str)*2)
r := rune(0)
si := 0 // decode index of s
sn, ss := uint64(0), uint64(0) // weight of str. weight in unicode_ci may has 8 uint16s. sn indicate first 4 u16s, ss indicate last 4 u16s

for si < len(str) {
r, si = decodeRune(str, si)
for len(str) > 0 {
r, size := utf8.DecodeRuneInString(str)
if r == utf8.RuneError && size == 1 {
return buf
}
sn, ss = convertRuneUnicodeCI(r)
for sn != 0 {
buf = append(buf, byte((sn&0xFF00)>>8), byte(sn))
Expand All @@ -101,6 +114,7 @@ func (uc *unicodeCICollator) Key(str string) []byte {
buf = append(buf, byte((ss&0xFF00)>>8), byte(ss))
ss >>= 16
}
str = str[size:]
}
return buf
}
Expand Down