Skip to content

Commit

Permalink
parser: add custom_gbk encoding to deal with 0x80 (#31123)
Browse files Browse the repository at this point in the history
close #30581
  • Loading branch information
Defined2014 authored Dec 30, 2021
1 parent a8ce4af commit 6c0c442
Show file tree
Hide file tree
Showing 3 changed files with 81 additions and 10 deletions.
16 changes: 7 additions & 9 deletions executor/simple_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -994,19 +994,17 @@ func (s *testSuiteWithCliBaseCharset) TestUserWithSetNames(c *C) {
tk.MustExec("use test;")
tk.MustExec("set names gbk;")

gbkString := string([]byte{0xD2, 0xBB})
tk.MustExec("drop user if exists '\xd2\xbb'@'localhost';")
tk.MustExec("create user '\xd2\xbb'@'localhost' IDENTIFIED BY '\xd2\xbb';")

tk.MustExec("drop user if exists '一'@'localhost';")
tk.MustExec("create user '一'@'localhost' IDENTIFIED BY '" + gbkString + "';")

result := tk.MustQuery(`SELECT authentication_string FROM mysql.User WHERE User="一" and Host="localhost";`)
result := tk.MustQuery("SELECT authentication_string FROM mysql.User WHERE User='\xd2\xbb' and Host='localhost';")
result.Check(testkit.Rows(auth.EncodePassword("一")))

tk.MustExec(`ALTER USER ''@'localhost' IDENTIFIED BY '` + gbkString + gbkString + `';`)
result = tk.MustQuery(`SELECT authentication_string FROM mysql.User WHERE User="一" and Host="localhost";`)
tk.MustExec("ALTER USER '\xd2\xbb'@'localhost' IDENTIFIED BY '\xd2\xbb\xd2\xbb';")
result = tk.MustQuery("SELECT authentication_string FROM mysql.User WHERE User='\xd2\xbb' and Host='localhost';")
result.Check(testkit.Rows(auth.EncodePassword("一一")))

tk.MustExec(`RENAME USER ''@'localhost' to '一'`)
tk.MustExec("RENAME USER '\xd2\xbb'@'localhost' to '\xd2\xbb'")

tk.MustExec("drop user '';")
tk.MustExec("drop user '\xd2\xbb';")
}
65 changes: 64 additions & 1 deletion parser/charset/encoding_gbk.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,14 +14,17 @@
package charset

import (
"bytes"
"strings"
"unicode"
"unicode/utf8"

"golang.org/x/text/encoding"
"golang.org/x/text/encoding/simplifiedchinese"
)

// EncodingGBKImpl is the instance of encodingGBK
var EncodingGBKImpl = &encodingGBK{encodingBase{enc: simplifiedchinese.GBK}}
var EncodingGBKImpl = &encodingGBK{encodingBase{enc: customGBK{}}}

func init() {
EncodingGBKImpl.self = EncodingGBKImpl
Expand Down Expand Up @@ -105,3 +108,63 @@ var GBKCase = unicode.SpecialCase{
unicode.CaseRange{Lo: 0x01DC, Hi: 0x01DC, Delta: [unicode.MaxCase]rune{0, 0, 0}},
unicode.CaseRange{Lo: 0x216A, Hi: 0x216B, Delta: [unicode.MaxCase]rune{0, 0, 0}},
}

// customGBK wraps simplifiedchinese.GBK. Its decoder and encoder delegate to
// the stock GBK implementation except for the byte 0x80 (on decode) and the
// character '€' (on encode), which receive special treatment.
// See https://github.com/pingcap/tidb/issues/30581 for details.
type customGBK struct{}

// NewDecoder returns a decoder whose Transformer is a customGBKDecoder
// wrapping a fresh simplifiedchinese.GBK decoder. It is not the stock GBK
// decoder itself: the wrapper intercepts a leading 0x80 byte before
// delegating.
func (c customGBK) NewDecoder() *encoding.Decoder {
	inner := simplifiedchinese.GBK.NewDecoder()
	wrapped := customGBKDecoder{gbkDecoder: inner}
	return &encoding.Decoder{Transformer: wrapped}
}

// customGBKDecoder is the Transformer used by customGBK.NewDecoder. It holds
// no state of its own beyond the wrapped decoder.
type customGBKDecoder struct {
// gbkDecoder is the underlying simplifiedchinese.GBK decoder that handles
// every input not intercepted by Transform.
gbkDecoder *encoding.Decoder
}

// Transform decodes GBK bytes from src into UTF-8 in dst, with special
// handling when src starts with the byte 0x80; see
// https://github.com/pingcap/tidb/issues/30581 for details.
//
// Behavior:
//   - empty src: nothing is written or consumed and err is nil;
//   - src[0] == 0x80: the UTF-8 encoding of utf8.RuneError is written to dst
//     and exactly one source byte is consumed;
//   - otherwise: the call is forwarded unchanged to the wrapped GBK decoder.
//
// Only the first byte of src is inspected here; a 0x80 later in src is seen
// by the wrapped decoder (presumably callers feed one character at a time —
// TODO confirm against encodingBase).
//
// NOTE(review): utf8.EncodeRune needs room in dst for the 3-byte encoding of
// U+FFFD; this appears to assume callers always pass a large enough dst —
// confirm.
func (c customGBKDecoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
	switch {
	case len(src) == 0:
		return 0, 0, nil
	case src[0] == 0x80:
		written := utf8.EncodeRune(dst, utf8.RuneError)
		return written, 1, nil
	default:
		return c.gbkDecoder.Transform(dst, src, atEOF)
	}
}

// Reset resets the wrapped GBK decoder. customGBKDecoder keeps no additional
// state, so nothing else needs to be cleared.
func (c customGBKDecoder) Reset() {
c.gbkDecoder.Reset()
}

// customGBKEncoder is the Transformer used by customGBK.NewEncoder. It holds
// no state of its own beyond the wrapped encoder.
type customGBKEncoder struct {
// gbkEncoder is the underlying simplifiedchinese.GBK encoder that handles
// every input not rejected by Transform.
gbkEncoder *encoding.Encoder
}

// NewEncoder returns an encoder whose Transformer is a customGBKEncoder
// wrapping a fresh simplifiedchinese.GBK encoder. It is not the stock GBK
// encoder itself: the wrapper rejects input starting with '€' before
// delegating.
func (c customGBK) NewEncoder() *encoding.Encoder {
	inner := simplifiedchinese.GBK.NewEncoder()
	wrapped := customGBKEncoder{gbkEncoder: inner}
	return &encoding.Encoder{Transformer: wrapped}
}

// Transform encodes UTF-8 bytes from src into GBK in dst, with special
// handling when src starts with '€'; see
// https://github.com/pingcap/tidb/issues/30581 for details.
//
// If src begins with the UTF-8 bytes of '€' (0xe2 0x82 0xac), nothing is
// written or consumed and errInvalidCharacterString is returned. Any other
// input is forwarded unchanged to the wrapped GBK encoder.
//
// Only the prefix of src is examined here. Presumably the rejection keeps
// '€' from being encoded to the single byte 0x80, mirroring the decoder's
// treatment of that byte — confirm against the linked issue.
func (c customGBKEncoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
	// UTF-8 encoding of '€' (U+20AC).
	euroSign := []byte{0xe2, 0x82, 0xac}
	if !bytes.HasPrefix(src, euroSign) {
		return c.gbkEncoder.Transform(dst, src, atEOF)
	}
	return 0, 0, errInvalidCharacterString
}

// Reset resets the wrapped GBK encoder. customGBKEncoder keeps no additional
// state, so nothing else needs to be cleared.
func (c customGBKEncoder) Reset() {
c.gbkEncoder.Reset()
}
10 changes: 10 additions & 0 deletions parser/charset/encoding_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,12 @@ func TestEncoding(t *testing.T) {
{"移維緯胃萎衣謂違", "绉荤董绶?儍钀庤。璎傞仌", false},
{"仆仂仗仞仭仟价伉佚估", "浠嗕粋浠椾粸浠?粺浠蜂級浣氫及", false},
{"佝佗佇佶侈侏侘佻佩佰侑佯", "浣濅綏浣囦蕉渚堜緩渚樹交浣╀桨渚戜蒋", true},
{"\x80", "?", false},
{"\x80a", "?", false},
{"\x80aa", "?a", false},
{"aa\x80ab", "aa?b", false},
{"a你好\x80a测试", "a浣犲ソ?娴嬭瘯", false},
{"aa\x80", "aa?", false},
}
for _, tc := range GBKCases {
cmt := fmt.Sprintf("%v", tc)
Expand All @@ -75,6 +81,10 @@ func TestEncoding(t *testing.T) {
{"一二三", \xb6\xfe\xc8\xfd", true},
{"🀁", "?", false},
{"valid_string_🀁", "valid_string_?", false},
{"€", "?", false},
{"€a", "?a", false},
{"a€aa", "a?aa", false},
{"aaa€", "aaa?", false},
}
for _, tc := range utf8Cases {
cmt := fmt.Sprintf("%v", tc)
Expand Down

0 comments on commit 6c0c442

Please sign in to comment.