charset,collation: implement GB18030 and 2 collations | tidb-test=pr/2087 #55792

Open · wants to merge 2 commits into master
6 changes: 5 additions & 1 deletion pkg/expression/builtin_string_test.go
@@ -150,7 +150,9 @@ func TestASCII(t *testing.T) {
{"你好", "gbk", 196},
{"你好", "", 228},
{"世界", "gbk", 202},
{"世界", "", 228},
{"abc", "gb18030", 97},
{"你好", "gb18030", 196},
{"世界", "gb18030", 202},
}

for _, c := range tbl {
@@ -1255,6 +1257,7 @@ func TestHexFunc(t *testing.T) {
{0x12, false, false, "12"},
{nil, true, false, ""},
{errors.New("must err"), false, true, ""},
{"🀁", false, false, "F09F8081"},
}
for _, c := range cases {
f, err := newFunctionForTest(ctx, ast.Hex, primitiveValsToConstants(ctx, []any{c.arg})...)
@@ -1282,6 +1285,7 @@ func TestHexFunc(t *testing.T) {
{"你好", "gbk", "C4E3BAC3", 0},
{"一忒(๑•ㅂ•)و✧", "", "E4B880E5BF9228E0B991E280A2E38582E280A229D988E29CA7", 0},
{"一忒(๑•ㅂ•)و✧", "gbk", "", errno.ErrInvalidCharacterString},
{"🀁", "gb18030", "9438E131", 0},
}
for _, c := range strCases {
err := ctx.GetSessionVars().SetSystemVarWithoutValidation(variable.CharacterSetConnection, c.chs)
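The new HEX case for "🀁" (U+1F001) follows directly from GB18030's four-byte linear mapping for code points above U+FFFF, which starts at bytes 90 30 81 30 for U+10000. A minimal standalone sketch, not part of this PR, that reproduces the expected value:

package main

import "fmt"

func main() {
	// Offset of U+1F001 from U+10000, spread across the four digit ranges:
	// byte1 0x90..0xE3, byte2 0x30..0x39, byte3 0x81..0xFE, byte4 0x30..0x39.
	idx := int('🀁') - 0x10000
	b4 := byte(idx%10) + 0x30
	idx /= 10
	b3 := byte(idx%126) + 0x81
	idx /= 126
	b2 := byte(idx%10) + 0x30
	idx /= 10
	b1 := byte(idx) + 0x90
	fmt.Printf("%X%X%X%X\n", b1, b2, b3, b4) // prints 9438E131, matching the test expectation
}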
1 change: 1 addition & 0 deletions pkg/lightning/mydump/BUILD.bazel
@@ -27,6 +27,7 @@ go_library(
"//pkg/lightning/worker",
"//pkg/parser",
"//pkg/parser/ast",
"//pkg/parser/charset",
"//pkg/parser/format",
"//pkg/parser/model",
"//pkg/parser/mysql",
5 changes: 3 additions & 2 deletions pkg/lightning/mydump/charset_convertor.go
@@ -20,6 +20,7 @@ import (

"github.com/pingcap/errors"
"github.com/pingcap/tidb/pkg/lightning/config"
"github.com/pingcap/tidb/pkg/parser/charset"
"golang.org/x/text/encoding"
"golang.org/x/text/encoding/charmap"
"golang.org/x/text/encoding/simplifiedchinese"
@@ -64,7 +65,7 @@ func (cc *CharsetConvertor) initDecoder() error {
case config.Binary, config.UTF8MB4, config.ASCII:
return nil
case config.GB18030:
cc.decoder = simplifiedchinese.GB18030.NewDecoder()
cc.decoder = charset.NewCustomGB18030Decoder()
return nil
case config.GBK:
cc.decoder = simplifiedchinese.GBK.NewDecoder()
@@ -83,7 +84,7 @@ func (cc *CharsetConvertor) initEncoder() error {
case config.Binary, config.UTF8MB4, config.ASCII:
return nil
case config.GB18030:
cc.encoder = simplifiedchinese.GB18030.NewEncoder()
cc.encoder = charset.NewCustomGB18030Encoder()
return nil
case config.GBK:
cc.encoder = simplifiedchinese.GBK.NewEncoder()
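Lightning's convertor now routes gb18030 through the charset package's custom encoder/decoder instead of golang.org/x/text's simplifiedchinese.GB18030, presumably because a few mappings (for example ḿ and 龴 in the test data below) differ between the x/text tables and the mapping this PR implements. A rough usage sketch, assuming NewCustomGB18030Encoder and NewCustomGB18030Decoder return ordinary *encoding.Encoder and *encoding.Decoder values, as the assignments above suggest:

package main

import (
	"fmt"

	"github.com/pingcap/tidb/pkg/parser/charset"
)

func main() {
	enc := charset.NewCustomGB18030Encoder()
	dec := charset.NewCustomGB18030Decoder()

	gb, err := enc.Bytes([]byte("你好,世界!ḿ€龴")) // UTF-8 in, gb18030 out
	if err != nil {
		panic(err)
	}
	back, err := dec.Bytes(gb) // gb18030 in, UTF-8 back out
	if err != nil {
		panic(err)
	}
	fmt.Printf("%X -> %s\n", gb, back)
}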
13 changes: 9 additions & 4 deletions pkg/lightning/mydump/charset_convertor_test.go
@@ -29,18 +29,22 @@ const (
)

var (
normalCharUTF8MB4 = []byte{0xE4, 0xBD, 0xA0, 0xE5, 0xA5, 0xBD, 0xEF, 0xBC, 0x8C, 0xE4, 0xB8, 0x96, 0xE7, 0x95, 0x8C, 0xEF, 0xBC, 0x81} // “你好,世界!” in utf8mb4
normalCharGB18030 = []byte{0xC4, 0xE3, 0xBA, 0xC3, 0xA3, 0xAC, 0xCA, 0xC0, 0xBD, 0xE7, 0xA3, 0xA1} // “你好,世界!” in gb18030
invalidChar = []byte{0xff} // Invalid gb18030 char
normalCharUTF8MB4 = []byte{0xE4, 0xBD, 0xA0, 0xE5, 0xA5, 0xBD, 0xEF, 0xBC, 0x8C, 0xE4, 0xB8, 0x96, 0xE7, 0x95, 0x8C, 0xEF, 0xBC, 0x81,
0xE1, 0xB8, 0xBF, 0xE2, 0x82, 0xAC, 0xE9, 0xBE, 0xB4} // “你好,世界!ḿ€龴” in utf8mb4
normalCharGB18030 = []byte{0xC4, 0xE3, 0xBA, 0xC3, 0xA3, 0xAC, 0xCA, 0xC0, 0xBD, 0xE7, 0xA3, 0xA1,
0xA8, 0xBC, 0xA2, 0xE3, 0xFE, 0x59} // “你好,世界!ḿ€龴” in gb18030
invalidChar = []byte{0xff} // Invalid gb18030 char
)

func TestCharsetConvertor(t *testing.T) {
utf8Reader, err := os.Open(testUTF8DataFile)
require.NoError(t, err)
defer utf8Reader.Close()
utf8Data, err := io.ReadAll(utf8Reader)
require.NoError(t, err)
gbkReader, err := os.Open(testGBKDataFile)
require.NoError(t, err)
defer gbkReader.Close()
gbkData, err := io.ReadAll(gbkReader)
require.NoError(t, err)

@@ -52,7 +56,7 @@ func TestCharsetConvertor(t *testing.T) {

utf8ToGBKData, err := cc.Encode(string(normalCharUTF8MB4))
require.NoError(t, err)
require.Equal(t, string(normalCharGB18030), utf8ToGBKData)
require.Equal(t, string(normalCharGB18030), utf8ToGBKData, "%x, %x", normalCharGB18030, []byte(utf8ToGBKData))
}

func TestInvalidCharReplace(t *testing.T) {
@@ -70,6 +74,7 @@ func TestInvalidCharReplace(t *testing.T) {

gbkReader, err := os.Open(testTempDataFile)
require.NoError(t, err)
defer gbkReader.Close()
gbkData, err := io.ReadAll(gbkReader)
require.NoError(t, err)
cc, err := NewCharsetConvertor("gb18030", dataInvalidCharReplace)
4 changes: 2 additions & 2 deletions pkg/lightning/mydump/reader.go
@@ -26,10 +26,10 @@ import (
"github.com/pingcap/tidb/br/pkg/storage"
"github.com/pingcap/tidb/pkg/lightning/log"
"github.com/pingcap/tidb/pkg/lightning/worker"
"github.com/pingcap/tidb/pkg/parser/charset"
"github.com/spkg/bom"
"go.uber.org/zap"
"golang.org/x/text/encoding/charmap"
"golang.org/x/text/encoding/simplifiedchinese"
)

var (
@@ -54,7 +54,7 @@ func decodeCharacterSet(data []byte, characterSet string) ([]byte, error) {
// perform `chardet` first.
fallthrough
case "gb18030":
decoded, err := simplifiedchinese.GB18030.NewDecoder().Bytes(data)
decoded, err := charset.EncodingGB18030Impl.Transform(nil, data, charset.OpDecodeReplace)
if err != nil {
return nil, errors.Trace(err)
}
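ExportStatement now decodes gb18030 schema files through the shared charset implementation instead of calling x/text directly. A small sketch of the call in isolation; the nil first argument mirrors the hunk above and appears to be an optional destination buffer, and OpDecodeReplace, going by its name, substitutes invalid sequences rather than failing on them:

package main

import (
	"fmt"

	"github.com/pingcap/tidb/pkg/parser/charset"
)

func main() {
	raw := []byte{0xC4, 0xE3, 0xBA, 0xC3} // "你好" in gb18030
	decoded, err := charset.EncodingGB18030Impl.Transform(nil, raw, charset.OpDecodeReplace)
	if err != nil {
		panic(err)
	}
	fmt.Println(string(decoded)) // 你好
}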
2 changes: 1 addition & 1 deletion pkg/lightning/mydump/reader_test.go
@@ -142,7 +142,7 @@ func TestExportStatementGibberishError(t *testing.T) {
f := FileInfo{FileMeta: SourceFileMeta{Path: stat.Name(), FileSize: stat.Size()}}
data, err := ExportStatement(context.TODO(), store, f, "auto")
require.Len(t, data, 0)
require.Regexp(t, `failed to decode \w* as auto: invalid schema encoding`, err.Error())
require.Error(t, err)
}

type AlwaysErrorReadSeekCloser struct{}
4 changes: 3 additions & 1 deletion pkg/parser/charset/BUILD.bazel
@@ -8,6 +8,8 @@ go_library(
"encoding_ascii.go",
"encoding_base.go",
"encoding_bin.go",
"encoding_gb18030.go",
"encoding_gb18030_data.go",
"encoding_gbk.go",
"encoding_latin1.go",
"encoding_table.go",
@@ -41,7 +43,7 @@ go_test(
],
embed = [":charset"],
flaky = True,
shard_count = 8,
shard_count = 9,
deps = [
"@com_github_stretchr_testify//require",
"@org_golang_x_text//transform",
63 changes: 35 additions & 28 deletions pkg/parser/charset/charset.go
@@ -71,16 +71,18 @@ var CharacterSetInfos = map[string]*Charset{
CharsetLatin1: {CharsetLatin1, CollationLatin1, make(map[string]*Collation), "Latin1", 1},
CharsetBin: {CharsetBin, CollationBin, make(map[string]*Collation), "binary", 1},
CharsetGBK: {CharsetGBK, CollationGBKBin, make(map[string]*Collation), "Chinese Internal Code Specification", 2},
CharsetGB18030: {CharsetGB18030, CollationGB18030Bin, make(map[string]*Collation), "China National Standard GB18030", 4},
}

// All the supported collation names should be in the following table.
var supportedCollationNames = map[string]struct{}{
CollationUTF8: {},
CollationUTF8MB4: {},
CollationASCII: {},
CollationLatin1: {},
CollationBin: {},
CollationGBKBin: {},
CollationUTF8: {},
CollationUTF8MB4: {},
CollationASCII: {},
CollationLatin1: {},
CollationBin: {},
CollationGBKBin: {},
CollationGB18030Bin: {},
}

// TiFlashSupportedCharsets is a map which contains the charsets supported by TiFlash.
@@ -238,6 +240,10 @@ const (
CollationGBKBin = "gbk_bin"
// CollationGBKChineseCI is the default collation for CharsetGBK when new collation is enabled.
CollationGBKChineseCI = "gbk_chinese_ci"
// CollationGB18030Bin is the default collation for CharsetGB18030 when new collation is disabled.
CollationGB18030Bin = "gb18030_bin"
// CollationGB18030ChineseCI is the default collation for CharsetGB18030 when new collation is enabled.
CollationGB18030ChineseCI = "gb18030_chinese_ci"
)

const (
@@ -253,6 +259,8 @@ const (
CharsetUTF8MB3 = "utf8mb3"
// CharsetUTF8MB4 represents 4 bytes utf8, which works the same way as utf8 in Go.
CharsetUTF8MB4 = "utf8mb4"
// CharsetGB18030 represents 4 bytes gb18030.
CharsetGB18030 = "gb18030"
//revive:disable:exported
CharsetARMSCII8 = "armscii8"
CharsetBig5 = "big5"
@@ -267,7 +275,6 @@ const (
CharsetDEC8 = "dec8"
CharsetEUCJPMS = "eucjpms"
CharsetEUCKR = "euckr"
CharsetGB18030 = "gb18030"
CharsetGB2312 = "gb2312"
CharsetGBK = "gbk"
CharsetGEOSTD8 = "geostd8"
@@ -298,28 +305,28 @@ var charsets = map[string]*Charset{
CharsetASCII: {Name: CharsetASCII, Maxlen: 1, DefaultCollation: "ascii_general_ci", Desc: "US ASCII", Collations: make(map[string]*Collation)},
CharsetBig5: {Name: CharsetBig5, Maxlen: 2, DefaultCollation: "big5_chinese_ci", Desc: "Big5 Traditional Chinese", Collations: make(map[string]*Collation)},
CharsetBin: {Name: CharsetBin, Maxlen: 1, DefaultCollation: "binary", Desc: "Binary pseudo charset", Collations: make(map[string]*Collation)},
CharsetLatin1: {Name: CharsetLatin1, Maxlen: 1, DefaultCollation: "cp1250_general_ci", Desc: "Windows Central European", Collations: make(map[string]*Collation)},
CharsetCP1250: {Name: CharsetCP1250, Maxlen: 1, DefaultCollation: "cp1251_general_ci", Desc: "Windows Cyrillic", Collations: make(map[string]*Collation)},
CharsetCP1251: {Name: CharsetCP1251, Maxlen: 1, DefaultCollation: "cp1256_general_ci", Desc: "Windows Arabic", Collations: make(map[string]*Collation)},
CharsetCP1256: {Name: CharsetCP1256, Maxlen: 1, DefaultCollation: "cp1257_general_ci", Desc: "Windows Baltic", Collations: make(map[string]*Collation)},
CharsetCP1257: {Name: CharsetCP1257, Maxlen: 1, DefaultCollation: "cp850_general_ci", Desc: "DOS West European", Collations: make(map[string]*Collation)},
CharsetCP850: {Name: CharsetCP850, Maxlen: 1, DefaultCollation: "cp852_general_ci", Desc: "DOS Central European", Collations: make(map[string]*Collation)},
CharsetCP852: {Name: CharsetCP852, Maxlen: 1, DefaultCollation: "cp866_general_ci", Desc: "DOS Russian", Collations: make(map[string]*Collation)},
CharsetCP866: {Name: CharsetCP866, Maxlen: 1, DefaultCollation: "cp932_japanese_ci", Desc: "SJIS for Windows Japanese", Collations: make(map[string]*Collation)},
CharsetCP932: {Name: CharsetCP932, Maxlen: 2, DefaultCollation: "dec8_swedish_ci", Desc: "DEC West European", Collations: make(map[string]*Collation)},
CharsetDEC8: {Name: CharsetDEC8, Maxlen: 1, DefaultCollation: "eucjpms_japanese_ci", Desc: "UJIS for Windows Japanese", Collations: make(map[string]*Collation)},
CharsetEUCJPMS: {Name: CharsetEUCJPMS, Maxlen: 3, DefaultCollation: "euckr_korean_ci", Desc: "EUC-KR Korean", Collations: make(map[string]*Collation)},
CharsetEUCKR: {Name: CharsetEUCKR, Maxlen: 2, DefaultCollation: "gb18030_chinese_ci", Desc: "China National Standard GB18030", Collations: make(map[string]*Collation)},
CharsetGB18030: {Name: CharsetGB18030, Maxlen: 4, DefaultCollation: "gb2312_chinese_ci", Desc: "GB2312 Simplified Chinese", Collations: make(map[string]*Collation)},
CharsetGB2312: {Name: CharsetGB2312, Maxlen: 2, DefaultCollation: "gbk_chinese_ci", Desc: "GBK Simplified Chinese", Collations: make(map[string]*Collation)},
CharsetGBK: {Name: CharsetGBK, Maxlen: 2, DefaultCollation: "geostd8_general_ci", Desc: "GEOSTD8 Georgian", Collations: make(map[string]*Collation)},
CharsetGEOSTD8: {Name: CharsetGEOSTD8, Maxlen: 1, DefaultCollation: "greek_general_ci", Desc: "ISO 8859-7 Greek", Collations: make(map[string]*Collation)},
CharsetGreek: {Name: CharsetGreek, Maxlen: 1, DefaultCollation: "hebrew_general_ci", Desc: "ISO 8859-8 Hebrew", Collations: make(map[string]*Collation)},
CharsetHebrew: {Name: CharsetHebrew, Maxlen: 1, DefaultCollation: "hp8_english_ci", Desc: "HP West European", Collations: make(map[string]*Collation)},
CharsetHP8: {Name: CharsetHP8, Maxlen: 1, DefaultCollation: "keybcs2_general_ci", Desc: "DOS Kamenicky Czech-Slovak", Collations: make(map[string]*Collation)},
CharsetKEYBCS2: {Name: CharsetKEYBCS2, Maxlen: 1, DefaultCollation: "koi8r_general_ci", Desc: "KOI8-R Relcom Russian", Collations: make(map[string]*Collation)},
CharsetCP1250: {Name: CharsetCP1250, Maxlen: 1, DefaultCollation: "cp1250_general_ci", Desc: "Windows Central European", Collations: make(map[string]*Collation)},
CharsetCP1251: {Name: CharsetCP1251, Maxlen: 1, DefaultCollation: "cp1251_general_ci", Desc: "Windows Cyrillic", Collations: make(map[string]*Collation)},
CharsetCP1256: {Name: CharsetCP1256, Maxlen: 1, DefaultCollation: "cp1256_general_ci", Desc: "Windows Arabic", Collations: make(map[string]*Collation)},
CharsetCP1257: {Name: CharsetCP1257, Maxlen: 1, DefaultCollation: "cp1257_general_ci", Desc: "Windows Baltic", Collations: make(map[string]*Collation)},
CharsetCP850: {Name: CharsetCP850, Maxlen: 1, DefaultCollation: "cp850_general_ci", Desc: "DOS West European", Collations: make(map[string]*Collation)},
CharsetCP852: {Name: CharsetCP852, Maxlen: 1, DefaultCollation: "cp852_general_ci", Desc: "DOS Central European", Collations: make(map[string]*Collation)},
CharsetCP866: {Name: CharsetCP866, Maxlen: 1, DefaultCollation: "cp866_general_ci", Desc: "DOS Russian", Collations: make(map[string]*Collation)},
CharsetCP932: {Name: CharsetCP932, Maxlen: 2, DefaultCollation: "cp932_japanese_ci", Desc: "SJIS for Windows Japanese", Collations: make(map[string]*Collation)},
CharsetDEC8: {Name: CharsetDEC8, Maxlen: 1, DefaultCollation: "dec8_swedish_ci", Desc: "DEC West European", Collations: make(map[string]*Collation)},
CharsetEUCJPMS: {Name: CharsetEUCJPMS, Maxlen: 3, DefaultCollation: "eucjpms_japanese_ci", Desc: "UJIS for Windows Japanese", Collations: make(map[string]*Collation)},
CharsetEUCKR: {Name: CharsetEUCKR, Maxlen: 2, DefaultCollation: "euckr_korean_ci", Desc: "EUC-KR Korean", Collations: make(map[string]*Collation)},
CharsetGB18030: {Name: CharsetGB18030, Maxlen: 4, DefaultCollation: "gb18030_chinese_ci", Desc: "China National Standard GB18030", Collations: make(map[string]*Collation)},
CharsetGB2312: {Name: CharsetGB2312, Maxlen: 2, DefaultCollation: "gb2312_chinese_ci", Desc: "GB2312 Simplified Chinese", Collations: make(map[string]*Collation)},
CharsetGBK: {Name: CharsetGBK, Maxlen: 2, DefaultCollation: "gbk_chinese_ci", Desc: "GBK Simplified Chinese", Collations: make(map[string]*Collation)},
CharsetGEOSTD8: {Name: CharsetGEOSTD8, Maxlen: 1, DefaultCollation: "geostd8_general_ci", Desc: "GEOSTD8 Georgian", Collations: make(map[string]*Collation)},
CharsetGreek: {Name: CharsetGreek, Maxlen: 1, DefaultCollation: "greek_general_ci", Desc: "ISO 8859-7 Greek", Collations: make(map[string]*Collation)},
CharsetHebrew: {Name: CharsetHebrew, Maxlen: 1, DefaultCollation: "hebrew_general_ci", Desc: "ISO 8859-8 Hebrew", Collations: make(map[string]*Collation)},
CharsetHP8: {Name: CharsetHP8, Maxlen: 1, DefaultCollation: "hp8_english_ci", Desc: "HP West European", Collations: make(map[string]*Collation)},
CharsetKEYBCS2: {Name: CharsetKEYBCS2, Maxlen: 1, DefaultCollation: "keybcs2_general_ci", Desc: "DOS Kamenicky Czech-Slovak", Collations: make(map[string]*Collation)},
CharsetKOI8R: {Name: CharsetKOI8R, Maxlen: 1, DefaultCollation: "koi8u_general_ci", Desc: "KOI8-U Ukrainian", Collations: make(map[string]*Collation)},
CharsetKOI8U: {Name: CharsetKOI8U, Maxlen: 1, DefaultCollation: "latin1_swedish_ci", Desc: "cp1252 West European", Collations: make(map[string]*Collation)},
CharsetKOI8U: {Name: CharsetKOI8U, Maxlen: 1, DefaultCollation: "koi8r_general_ci", Desc: "KOI8-R Relcom Russian", Collations: make(map[string]*Collation)},
CharsetLatin1: {Name: CharsetLatin1, Maxlen: 1, DefaultCollation: "latin1_swedish_ci", Desc: "cp1252 West European", Collations: make(map[string]*Collation)},
CharsetLatin2: {Name: CharsetLatin2, Maxlen: 1, DefaultCollation: "latin2_general_ci", Desc: "ISO 8859-2 Central European", Collations: make(map[string]*Collation)},
CharsetLatin5: {Name: CharsetLatin5, Maxlen: 1, DefaultCollation: "latin5_turkish_ci", Desc: "ISO 8859-9 Turkish", Collations: make(map[string]*Collation)},
CharsetLatin7: {Name: CharsetLatin7, Maxlen: 1, DefaultCollation: "latin7_general_ci", Desc: "ISO 8859-13 Baltic", Collations: make(map[string]*Collation)},
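Besides registering gb18030 (Maxlen 4, default collation gb18030_chinese_ci in the charsets map), the rewritten block also realigns the DefaultCollation and Desc values that were shifted by one entry in the old map (latin1, for instance, previously listed cp1250_general_ci and "Windows Central European"). A quick lookup sketch; GetCharsetInfo, GetCollationByName, and the Collation field names come from the existing package API, not from this hunk, so treat them as assumptions:

package main

import (
	"fmt"

	"github.com/pingcap/tidb/pkg/parser/charset"
)

func main() {
	cs, err := charset.GetCharsetInfo("gb18030")
	if err != nil {
		panic(err)
	}
	// Per the maps above: Maxlen 4; gb18030_bin is the default collation when
	// new collation is disabled, gb18030_chinese_ci when it is enabled.
	fmt.Println(cs.Name, cs.Maxlen, cs.DefaultCollation)

	coll, err := charset.GetCollationByName("gb18030_bin")
	if err != nil {
		panic(err)
	}
	fmt.Println(coll.Name)
}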
3 changes: 3 additions & 0 deletions pkg/parser/charset/encoding.go
@@ -23,6 +23,7 @@ var (
_ Encoding = &encodingLatin1{}
_ Encoding = &encodingBin{}
_ Encoding = &encodingGBK{}
_ Encoding = &encodingGB18030{}
)

// IsSupportedEncoding checks if the charset is fully supported.
@@ -60,6 +61,7 @@ var encodingMap = map[string]Encoding{
CharsetLatin1: EncodingLatin1Impl,
CharsetBin: EncodingBinImpl,
CharsetASCII: EncodingASCIIImpl,
CharsetGB18030: EncodingGB18030Impl,
}

// Encoding provide encode/decode functions for a string with a specific charset.
@@ -100,6 +102,7 @@ const (
EncodingTpLatin1
EncodingTpBin
EncodingTpGBK
EncodingTpGB18030
)

//revive:enable
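With encodingGB18030 registered in encodingMap and given its own EncodingTp value, callers can resolve it by charset name just as they do for gbk. A rough round-trip sketch; FindEncoding, OpEncode, and OpDecode come from the existing charset package API rather than this hunk, so treat them as assumptions:

package main

import (
	"fmt"

	"github.com/pingcap/tidb/pkg/parser/charset"
)

func main() {
	if !charset.IsSupportedEncoding("gb18030") {
		panic("gb18030 should be fully supported after this change")
	}
	enc := charset.FindEncoding("gb18030")
	gb, err := enc.Transform(nil, []byte("世界"), charset.OpEncode)
	if err != nil {
		panic(err)
	}
	back, err := enc.Transform(nil, gb, charset.OpDecode)
	if err != nil {
		panic(err)
	}
	fmt.Printf("%X -> %s\n", gb, back) // CAC0BDE7 -> 世界
}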