From b7eeb41c318f3e2dd0cf44faded0bde5d2edd90d Mon Sep 17 00:00:00 2001 From: Hangjie Mo Date: Mon, 30 May 2022 22:04:26 +0800 Subject: [PATCH] parser: revert `latin1` as an alias for `utf8mb4` (#35025) ref pingcap/tidb#34008 --- expression/integration_test.go | 2 +- parser/charset/encoding_latin1.go | 25 ++++++++++++++++++++++++- parser/mysql/charset.go | 4 ++-- 3 files changed, 27 insertions(+), 4 deletions(-) diff --git a/expression/integration_test.go b/expression/integration_test.go index 83fa1791e955c..c3734a1a5c184 100644 --- a/expression/integration_test.go +++ b/expression/integration_test.go @@ -653,7 +653,7 @@ func TestStringBuiltin(t *testing.T) { result = tk.MustQuery("select ord('123'), ord(123), ord(''), ord('你好'), ord(NULL), ord('👍')") result.Check(testkit.Rows("49 49 0 14990752 <nil> 4036989325")) result = tk.MustQuery("select ord(X''), ord(X'6161'), ord(X'e4bd'), ord(X'e4bda0'), ord(_ascii'你'), ord(_latin1'你')") - result.Check(testkit.Rows("0 97 228 228 228 14990752")) + result.Check(testkit.Rows("0 97 228 228 228 228")) // for space result = tk.MustQuery(`select space(0), space(2), space(-1), space(1.1), space(1.9)`) diff --git a/parser/charset/encoding_latin1.go b/parser/charset/encoding_latin1.go index f1893484b9a6b..db7b66ed101af 100644 --- a/parser/charset/encoding_latin1.go +++ b/parser/charset/encoding_latin1.go @@ -14,11 +14,12 @@ package charset import ( + "bytes" "golang.org/x/text/encoding" ) // EncodingLatin1Impl is the instance of encodingLatin1. -// In TiDB, latin1 is an alias for utf8, so uses utf8 implementation for latin1. +// TiDB uses the utf8 implementation for the latin1 charset for backward compatibility. var EncodingLatin1Impl = &encodingLatin1{encodingUTF8{encodingBase{enc: encoding.Nop}}} func init() { @@ -34,3 +35,25 @@ type encodingLatin1 struct { func (e *encodingLatin1) Name() string { return CharsetLatin1 } + +// Peek implements Encoding interface.
+func (e *encodingLatin1) Peek(src []byte) []byte { + if len(src) == 0 { + return src + } + return src[:1] +} + +// IsValid implements Encoding interface. +func (e *encodingLatin1) IsValid(src []byte) bool { + return true +} + +// Tp implements Encoding interface. +func (e *encodingLatin1) Tp() EncodingTp { + return EncodingTpLatin1 +} + +func (e *encodingLatin1) Transform(dest *bytes.Buffer, src []byte, op Op) ([]byte, error) { + return src, nil +} diff --git a/parser/mysql/charset.go b/parser/mysql/charset.go index cb3666bb21b31..75ee0563ae5f3 100644 --- a/parser/mysql/charset.go +++ b/parser/mysql/charset.go @@ -593,9 +593,9 @@ const ( MaxBytesOfCharacter = 4 ) -// IsUTF8Charset checks if charset is utf8, utf8mb4 or latin1. +// IsUTF8Charset checks if charset is utf8 or utf8mb4. func IsUTF8Charset(charset string) bool { - return charset == UTF8Charset || charset == UTF8MB4Charset || charset == Latin1Charset + return charset == UTF8Charset || charset == UTF8MB4Charset } // RangeGraph defines valid unicode characters to use in column names. It strictly follows MySQL's definition.