Skip to content

Commit

Permalink
charset: alias utf8 and utf8mb3 (#44655)
Browse files Browse the repository at this point in the history
close #26226
  • Loading branch information
dveeden authored Jun 14, 2023
1 parent 14ca3ce commit 0c49f18
Show file tree
Hide file tree
Showing 3 changed files with 55 additions and 5 deletions.
2 changes: 1 addition & 1 deletion parser/charset/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ go_test(
],
embed = [":charset"],
flaky = True,
shard_count = 7,
shard_count = 8,
deps = [
"@com_github_stretchr_testify//require",
"@org_golang_x_text//transform",
Expand Down
30 changes: 26 additions & 4 deletions parser/charset/charset.go
Original file line number Diff line number Diff line change
Expand Up @@ -106,8 +106,8 @@ func GetSupportedCollations() []*Collation {
// and returns a boolean.
func ValidCharsetAndCollation(cs string, co string) bool {
// We will use utf8 as a default charset.
if cs == "" {
cs = "utf8"
if cs == "" || cs == CharsetUTF8MB3 {
cs = CharsetUTF8
}
chs, err := GetCharsetInfo(cs)
if err != nil {
Expand All @@ -117,14 +117,16 @@ func ValidCharsetAndCollation(cs string, co string) bool {
if co == "" {
return true
}
co = strings.ToLower(co)
co = utf8Alias(strings.ToLower(co))
_, ok := chs.Collations[co]
return ok
}

// GetDefaultCollationLegacy is compatible with the charset support in the old version of the parser.
func GetDefaultCollationLegacy(charset string) (string, error) {
switch strings.ToLower(charset) {
case CharsetUTF8MB3:
return GetDefaultCollation(CharsetUTF8)
case CharsetUTF8, CharsetUTF8MB4, CharsetASCII, CharsetLatin1, CharsetBin:
return GetDefaultCollation(charset)
default:
Expand All @@ -148,6 +150,10 @@ func GetDefaultCharsetAndCollate() (defaultCharset string, defaultCollationName

// GetCharsetInfo returns the Charset (including its collations) registered under the name cs.
func GetCharsetInfo(cs string) (*Charset, error) {
if strings.ToLower(cs) == CharsetUTF8MB3 {
cs = CharsetUTF8
}

if c, ok := CharacterSetInfos[strings.ToLower(cs)]; ok {
return c, nil
}
Expand Down Expand Up @@ -180,9 +186,23 @@ func GetCollations() []*Collation {
return collations
}

// utf8Alias translates the utf8mb3 spelling of a collation name into the
// equivalent utf8 spelling. Only utf8mb3_bin, utf8mb3_unicode_ci and
// utf8mb3_general_ci are recognized; every other input is returned unchanged.
// The comparison is exact (case-sensitive), so callers are expected to
// lowercase the name before calling.
func utf8Alias(csname string) string {
	switch csname {
	case "utf8mb3_bin":
		return "utf8_bin"
	case "utf8mb3_unicode_ci":
		return "utf8_unicode_ci"
	case "utf8mb3_general_ci":
		return "utf8_general_ci"
	}
	// Not a known utf8mb3 alias: hand the name back untouched.
	return csname
}

// GetCollationByName returns the collation by name.
func GetCollationByName(name string) (*Collation, error) {
collation, ok := collationsNameMap[strings.ToLower(name)]
csname := utf8Alias(strings.ToLower(name))
collation, ok := collationsNameMap[csname]
if !ok {
return nil, ErrUnknownCollation.GenWithStackByArgs(name)
}
Expand Down Expand Up @@ -225,6 +245,8 @@ const (
CharsetLatin1 = "latin1"
// CharsetUTF8 is the default charset for string types.
CharsetUTF8 = "utf8"
// CharsetUTF8MB3 is 3 bytes utf8, a MySQL legacy encoding. "utf8" and "utf8mb3" are aliases.
CharsetUTF8MB3 = "utf8mb3"
// CharsetUTF8MB4 represents 4 bytes utf8, which works the same way as utf8 in Go.
CharsetUTF8MB4 = "utf8mb4"
//revive:disable:exported
Expand Down
28 changes: 28 additions & 0 deletions parser/charset/charset_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,10 @@ func TestValidCharset(t *testing.T) {
{"UTF8MB4", "UTF8MB4_bin", true},
{"UTF8MB4", "UTF8MB4_general_ci", true},
{"Utf8", "uTf8_bIN", true},
{"utf8mb3", "", true},
{"utf8mb3", "utf8mb3_bin", true},
{"utf8mb3", "utf8mb3_general_ci", true},
{"utf8mb3", "utf8mb3_unicode_ci", true},
}
for _, tt := range tests {
testValidCharset(t, tt.cs, tt.co, tt.succ)
Expand Down Expand Up @@ -145,6 +149,30 @@ func TestValidCustomCharset(t *testing.T) {
}
}

// TestUTF8MB3 checks that "utf8mb3" and its collation names are accepted as
// aliases of "utf8": lookups made with the utf8mb3 spelling must succeed and
// resolve to the utf8-named charset and collations.
func TestUTF8MB3(t *testing.T) {
	// The legacy default-collation lookup resolves the alias through utf8.
	colname, err := GetDefaultCollationLegacy("utf8mb3")
	require.NoError(t, err)
	// testify's Equal takes (expected, actual); keeping that order makes
	// failure output label the values correctly.
	require.Equal(t, "utf8_bin", colname)

	// Charset lookup by the alias returns the utf8 charset entry.
	csinfo, err := GetCharsetInfo("utf8mb3")
	require.NoError(t, err)
	require.Equal(t, "utf8", csinfo.Name)

	// Each utf8mb3 collation name maps to the collation with the utf8 name.
	tests := []struct {
		cs    string
		alias string
	}{
		{"utf8mb3_bin", "utf8_bin"},
		{"utf8mb3_general_ci", "utf8_general_ci"},
		{"utf8mb3_unicode_ci", "utf8_unicode_ci"},
	}
	for _, tt := range tests {
		col, err := GetCollationByName(tt.cs)
		require.NoError(t, err)
		require.Equal(t, tt.alias, col.Name)
	}
}

func BenchmarkGetCharsetDesc(b *testing.B) {
b.ResetTimer()
charsets := []string{CharsetUTF8, CharsetUTF8MB4, CharsetASCII, CharsetLatin1, CharsetBin}
Expand Down

0 comments on commit 0c49f18

Please sign in to comment.