From 5e14c964c62d4b4f4309845e6df69151cd2f54db Mon Sep 17 00:00:00 2001 From: Takashi Kokubun Date: Sat, 13 Aug 2022 01:29:28 -0700 Subject: [PATCH] Support parsing CHARSET=utf8mb3 --- cmd/explaintest/r/collation_misc_disabled.result | 4 ++++ cmd/explaintest/r/collation_misc_enabled.result | 2 ++ parser/charset/charset.go | 7 +++++++ parser/charset/charset_test.go | 1 + 4 files changed, 14 insertions(+) diff --git a/cmd/explaintest/r/collation_misc_disabled.result b/cmd/explaintest/r/collation_misc_disabled.result index a9ee8ac04631f..0398454acfbba 100644 --- a/cmd/explaintest/r/collation_misc_disabled.result +++ b/cmd/explaintest/r/collation_misc_disabled.result @@ -91,6 +91,7 @@ binary 1 gbk_bin 2 latin1_bin 1 utf8_bin 3 +utf8mb3_general_ci 3 utf8mb4_bin 4 SELECT character_set_name, id, sortlen FROM information_schema.collations ORDER BY collation_name, id; character_set_name id sortlen @@ -99,6 +100,7 @@ binary 63 1 gbk 87 1 latin1 47 1 utf8 83 1 +utf8mb3 33 1 utf8mb4 46 1 select * from information_schema.COLLATION_CHARACTER_SET_APPLICABILITY where COLLATION_NAME='utf8mb4_bin'; COLLATION_NAME CHARACTER_SET_NAME @@ -110,9 +112,11 @@ binary binary binary 1 gbk Chinese Internal Code Specification gbk_bin 2 latin1 Latin1 latin1_bin 1 utf8 UTF-8 Unicode utf8_bin 3 +utf8mb3 UTF-8 Unicode utf8mb3_general_ci 3 utf8mb4 UTF-8 Unicode utf8mb4_bin 4 show collation; Collation Charset Id Default Compiled Sortlen +utf8mb3_general_ci utf8mb3 33 Yes Yes 1 utf8mb4_bin utf8mb4 46 Yes Yes 1 latin1_bin latin1 47 Yes Yes 1 binary binary 63 Yes Yes 1 diff --git a/cmd/explaintest/r/collation_misc_enabled.result b/cmd/explaintest/r/collation_misc_enabled.result index 38161ba4ca6a6..0c647e9554bcf 100644 --- a/cmd/explaintest/r/collation_misc_enabled.result +++ b/cmd/explaintest/r/collation_misc_enabled.result @@ -94,6 +94,7 @@ binary 1 gbk_chinese_ci 2 latin1_bin 1 utf8_bin 3 +utf8mb3_general_ci 3 utf8mb4_bin 4 SELECT character_set_name, id, sortlen FROM information_schema.collations 
ORDER BY collation_name, id; character_set_name id sortlen @@ -118,6 +119,7 @@ binary binary binary 1 gbk Chinese Internal Code Specification gbk_chinese_ci 2 latin1 Latin1 latin1_bin 1 utf8 UTF-8 Unicode utf8_bin 3 +utf8mb3 UTF-8 Unicode utf8mb3_general_ci 3 utf8mb4 UTF-8 Unicode utf8mb4_bin 4 show collation; Collation Charset Id Default Compiled Sortlen diff --git a/parser/charset/charset.go b/parser/charset/charset.go index 6067e4f623424..d241ee33dce1d 100644 --- a/parser/charset/charset.go +++ b/parser/charset/charset.go @@ -57,6 +57,7 @@ var supportedCollations = make([]*Collation, 0, len(supportedCollationNames)) // CharacterSetInfos contains all the supported charsets. var CharacterSetInfos = map[string]*Charset{ CharsetUTF8: {CharsetUTF8, CollationUTF8, make(map[string]*Collation), "UTF-8 Unicode", 3}, + CharsetUTF8MB3: {CharsetUTF8MB3, CollationUTF8MB3, make(map[string]*Collation), "UTF-8 Unicode", 3}, CharsetUTF8MB4: {CharsetUTF8MB4, CollationUTF8MB4, make(map[string]*Collation), "UTF-8 Unicode", 4}, CharsetASCII: {CharsetASCII, CollationASCII, make(map[string]*Collation), "US ASCII", 1}, CharsetLatin1: {CharsetLatin1, CollationLatin1, make(map[string]*Collation), "Latin1", 1}, @@ -67,6 +68,7 @@ var CharacterSetInfos = map[string]*Charset{ // All the names supported collations should be in the following table. var supportedCollationNames = map[string]struct{}{ CollationUTF8: {}, + CollationUTF8MB3: {}, CollationUTF8MB4: {}, CollationASCII: {}, CollationLatin1: {}, @@ -204,6 +206,8 @@ const ( CollationBin = "binary" // CollationUTF8 is the default collation for CharsetUTF8. CollationUTF8 = "utf8_bin" + // CollationUTF8MB3 is the default collation for CharsetUTF8MB3. + CollationUTF8MB3 = "utf8mb3_general_ci" // CollationUTF8MB4 is the default collation for CharsetUTF8MB4. CollationUTF8MB4 = "utf8mb4_bin" // CollationASCII is the default collation for CharsetACSII. 
@@ -225,6 +229,8 @@ const ( CharsetLatin1 = "latin1" // CharsetUTF8 is the default charset for string types. CharsetUTF8 = "utf8" + // CharsetUTF8MB3 is another name for CharsetUTF8. + CharsetUTF8MB3 = "utf8mb3" // CharsetUTF8MB4 represents 4 bytes utf8, which works the same way as utf8 in Go. CharsetUTF8MB4 = "utf8mb4" //revive:disable:exported @@ -344,6 +350,7 @@ var collations = []*Collation{ {31, "latin1", "latin1_german2_ci", false}, {32, "armscii8", "armscii8_general_ci", true}, {33, "utf8", "utf8_general_ci", false}, + {33, "utf8mb3", "utf8mb3_general_ci", true}, {34, "cp1250", "cp1250_czech_cs", false}, {35, "ucs2", "ucs2_general_ci", true}, {36, "cp866", "cp866_general_ci", true}, diff --git a/parser/charset/charset_test.go b/parser/charset/charset_test.go index 6de594c68c54d..62102fe1d4da5 100644 --- a/parser/charset/charset_test.go +++ b/parser/charset/charset_test.go @@ -67,6 +67,7 @@ func TestGetDefaultCollation(t *testing.T) { }{ {"utf8", "utf8_bin", true}, {"UTF8", "utf8_bin", true}, + {"utf8mb3", "utf8mb3_general_ci", true}, {"utf8mb4", "utf8mb4_bin", true}, {"ascii", "ascii_bin", true}, {"binary", "binary", true},