From ede6f8cf16a16da62609cd88296666eb8c42d10a Mon Sep 17 00:00:00 2001 From: Hangjie Mo Date: Tue, 10 May 2022 13:00:35 +0800 Subject: [PATCH] *: Support modify table/column charset from latin1 to utf8/utf8mb4 (#34386) ref pingcap/tidb#34008 --- .../r/collation_misc_disabled.result | 46 +++++++++++++++++-- .../r/collation_misc_enabled.result | 46 +++++++++++++++++-- cmd/explaintest/t/collation_misc.test | 37 +++++++++++++-- ddl/db_integration_test.go | 24 ++++++++-- ddl/ddl_api.go | 6 ++- expression/integration_test.go | 2 +- parser/charset/encoding_latin1.go | 26 +---------- parser/mysql/charset.go | 5 +- util/collate/collate.go | 2 +- 9 files changed, 148 insertions(+), 46 deletions(-) diff --git a/cmd/explaintest/r/collation_misc_disabled.result b/cmd/explaintest/r/collation_misc_disabled.result index 33f0ebe26f701..20b3d9db10a9e 100644 --- a/cmd/explaintest/r/collation_misc_disabled.result +++ b/cmd/explaintest/r/collation_misc_disabled.result @@ -14,19 +14,57 @@ select * from t; a t_value alter table t modify column a varchar(20) charset utf8; -Error 8200: Unsupported modify charset from latin1 to utf8 +admin check table t; + +select * from t; +a +t_value +drop table t; +create table t(a varchar(20) charset latin1); +insert into t values ("t_value"); alter table t modify column a varchar(20) charset utf8mb4; -Error 8200: Unsupported modify charset from latin1 to utf8mb4 +admin check table t; + +select * from t; +a +t_value +drop table t; +create table t(a varchar(20) charset latin1); +insert into t values ("t_value"); alter table t modify column a varchar(20) charset utf8 collate utf8_bin; -Error 8200: Unsupported modify charset from latin1 to utf8 +admin check table t; + +select * from t; +a +t_value +drop table t; +create table t(a varchar(20) charset latin1); +insert into t values ("t_value"); alter table t modify column a varchar(20) charset utf8mb4 collate utf8mb4_general_ci; -Error 8200: Unsupported modify charset from latin1 to utf8mb4 alter table t modify column a varchar(20) charset utf8mb4 collate utf8bin; [ddl:1273]Unknown collation: 'utf8bin' alter table t collate LATIN1_GENERAL_CI charset utf8 collate utf8_bin; Error 1302: Conflicting declarations: 'CHARACTER SET latin1' and 'CHARACTER SET utf8' alter table t collate LATIN1_GENERAL_CI collate UTF8MB4_UNICODE_ci collate utf8_bin; Error 1253: COLLATION 'utf8mb4_unicode_ci' is not valid for CHARACTER SET 'latin1' +drop table t; +create table t(a varchar(20) charset latin1); +insert into t values ("t_value"); +alter table t modify column a varchar(19) charset utf8mb4; +admin check table t; + +select * from t; +a +t_value +drop table t; +create table t(a varchar(20) charset latin1); +insert into t values ("t_value"); +alter table t modify column a varchar(19) charset utf8 collate utf8_bin; +admin check table t; + +select * from t; +a +t_value create database if not exists cd_test_utf8 CHARACTER SET utf8 COLLATE utf8_bin; create database if not exists cd_test_latin1 CHARACTER SET latin1 COLLATE latin1_swedish_ci; use cd_test_utf8; diff --git a/cmd/explaintest/r/collation_misc_enabled.result b/cmd/explaintest/r/collation_misc_enabled.result index 8f75d4e18d151..687ea0486e046 100644 --- a/cmd/explaintest/r/collation_misc_enabled.result +++ b/cmd/explaintest/r/collation_misc_enabled.result @@ -14,19 +14,57 @@ select * from t; a t_value alter table t modify column a varchar(20) charset utf8; -Error 8200: Unsupported modify charset from latin1 to utf8 +admin check table t; + +select * from t; +a +t_value +drop table t; +create table t(a varchar(20) charset latin1); +insert into t values ("t_value"); alter table t modify column a varchar(20) charset utf8mb4; -Error 8200: Unsupported modify charset from latin1 to utf8mb4 +admin check table t; + +select * from t; +a +t_value +drop table t; +create table t(a varchar(20) charset latin1); +insert into t values ("t_value"); alter table t modify column a varchar(20) charset utf8 collate utf8_bin; -Error 8200: Unsupported modify charset from latin1 to utf8 +admin check table t; + +select * from t; +a +t_value +drop table t; +create table t(a varchar(20) charset latin1); +insert into t values ("t_value"); alter table t modify column a varchar(20) charset utf8mb4 collate utf8mb4_general_ci; -Error 8200: Unsupported modify charset from latin1 to utf8mb4 alter table t modify column a varchar(20) charset utf8mb4 collate utf8bin; [ddl:1273]Unknown collation: 'utf8bin' alter table t collate LATIN1_GENERAL_CI charset utf8 collate utf8_bin; Error 1273: Unsupported collation when new collation is enabled: 'latin1_general_ci' alter table t collate LATIN1_GENERAL_CI collate UTF8MB4_UNICODE_ci collate utf8_bin; Error 1273: Unsupported collation when new collation is enabled: 'latin1_general_ci' +drop table t; +create table t(a varchar(20) charset latin1); +insert into t values ("t_value"); +alter table t modify column a varchar(19) charset utf8mb4; +admin check table t; + +select * from t; +a +t_value +drop table t; +create table t(a varchar(20) charset latin1); +insert into t values ("t_value"); +alter table t modify column a varchar(19) charset utf8 collate utf8_bin; +admin check table t; + +select * from t; +a +t_value create database if not exists cd_test_utf8 CHARACTER SET utf8 COLLATE utf8_bin; create database if not exists cd_test_latin1 CHARACTER SET latin1 COLLATE latin1_swedish_ci; Error 1273: Unsupported collation when new collation is enabled: 'latin1_swedish_ci' diff --git a/cmd/explaintest/t/collation_misc.test b/cmd/explaintest/t/collation_misc.test index 443c6c4106cc0..e77fea4d783fd 100644 --- a/cmd/explaintest/t/collation_misc.test +++ b/cmd/explaintest/t/collation_misc.test @@ -12,12 +12,28 @@ create table t(a varchar(20) charset latin1); insert into t values ("t_value"); alter table t modify column a varchar(20) charset latin1; select * from t; ---error 8200 + alter table t modify column a varchar(20) charset utf8; ---error 8200 +admin check table t; +select * from t; + +drop table t; +create table t(a varchar(20) charset latin1); +insert into t values ("t_value"); alter table t modify column a varchar(20) charset utf8mb4; ---error 8200 +admin check table t; +select * from t; + +drop table t; +create table t(a varchar(20) charset latin1); +insert into t values ("t_value"); alter table t modify column a varchar(20) charset utf8 collate utf8_bin; +admin check table t; +select * from t; + +drop table t; +create table t(a varchar(20) charset latin1); +insert into t values ("t_value"); --error 8200 alter table t modify column a varchar(20) charset utf8mb4 collate utf8mb4_general_ci; --error 1273 @@ -27,6 +43,21 @@ alter table t collate LATIN1_GENERAL_CI charset utf8 collate utf8_bin; --error 1253, 1273 alter table t collate LATIN1_GENERAL_CI collate UTF8MB4_UNICODE_ci collate utf8_bin; +# ChangingCharsetToUtf8 with reorg +drop table t; +create table t(a varchar(20) charset latin1); +insert into t values ("t_value"); +alter table t modify column a varchar(19) charset utf8mb4; +admin check table t; +select * from t; + +drop table t; +create table t(a varchar(20) charset latin1); +insert into t values ("t_value"); +alter table t modify column a varchar(19) charset utf8 collate utf8_bin; +admin check table t; +select * from t; + # TestCharsetDatabase create database if not exists cd_test_utf8 CHARACTER SET utf8 COLLATE utf8_bin; --error 1273 diff --git a/ddl/db_integration_test.go b/ddl/db_integration_test.go index f50260cb80358..74d3621d8ab07 100644 --- a/ddl/db_integration_test.go +++ b/ddl/db_integration_test.go @@ -760,7 +760,7 @@ func TestChangingTableCharset(t *testing.T) { tk := testkit.NewTestKit(t, store) tk.MustExec("USE test") - tk.MustExec("create table t(a char(10)) charset latin1 collate latin1_bin") + tk.MustExec("create table t(a char(10), index i(a)) charset latin1 collate latin1_bin") tk.MustGetErrCode("alter table t charset gbk", errno.ErrUnsupportedDDLOperation) tk.MustGetErrCode("alter table t charset ''", errno.ErrUnknownCharacterSet) @@ -771,9 +771,18 @@ func TestChangingTableCharset(t *testing.T) { tk.MustGetErrCode("alter table t charset utf8 collate utf8mb4_bin;", errno.ErrCollationCharsetMismatch) tk.MustGetErrCode("alter table t charset utf8 collate utf8_bin collate utf8mb4_bin collate utf8_bin;", errno.ErrCollationCharsetMismatch) - tk.MustGetErrCode("alter table t charset utf8", errno.ErrUnsupportedDDLOperation) - tk.MustGetErrCode("alter table t charset utf8mb4", errno.ErrUnsupportedDDLOperation) - tk.MustGetErrCode("alter table t charset utf8mb4 collate utf8mb4_bin", errno.ErrUnsupportedDDLOperation) + tk.MustExec("alter table t charset utf8") + tk.MustExec("admin check table t") + + tk.MustExec("drop table if exists t") + tk.MustExec("create table t(a char(10), index i(a)) charset latin1 collate latin1_bin") + tk.MustExec("alter table t charset utf8mb4") + tk.MustExec("admin check table t") + + tk.MustExec("drop table if exists t") + tk.MustExec("create table t(a char(10), index i(a)) charset latin1 collate latin1_bin") + tk.MustExec("alter table t charset utf8mb4 collate utf8mb4_bin") + tk.MustExec("admin check table t") tk.MustGetErrCode("alter table t charset latin1 charset utf8 charset utf8mb4 collate utf8_bin;", errno.ErrConflictingDeclarations) @@ -793,6 +802,13 @@ func TestChangingTableCharset(t *testing.T) { } checkCharset(charset.CharsetUTF8MB4, charset.CollationUTF8MB4) + tk.MustExec("drop table if exists t") + tk.MustExec("create table t(a varchar(20), key i(a)) charset=latin1") + tk.MustGetErrCode("alter table t convert to charset utf8 collate utf8_unicode_ci", errno.ErrUnsupportedDDLOperation) + tk.MustGetErrCode("alter table t convert to charset utf8 collate utf8_general_ci", errno.ErrUnsupportedDDLOperation) + tk.MustExec("alter table t convert to charset utf8 collate utf8_bin") + tk.MustGetErrCode("alter table t convert to charset latin1", errno.ErrUnsupportedDDLOperation) + // Test when column charset can not convert to the target charset. tk.MustExec("drop table t;") tk.MustExec("create table t(a varchar(10) character set ascii) charset utf8mb4") diff --git a/ddl/ddl_api.go b/ddl/ddl_api.go index a7ffbdddd7143..e4ded50b7d794 100644 --- a/ddl/ddl_api.go +++ b/ddl/ddl_api.go @@ -4177,8 +4177,10 @@ func checkModifyCharsetAndCollation(toCharset, toCollate, origCharset, origColla if (origCharset == charset.CharsetUTF8 && toCharset == charset.CharsetUTF8MB4) || (origCharset == charset.CharsetUTF8 && toCharset == charset.CharsetUTF8) || - (origCharset == charset.CharsetUTF8MB4 && toCharset == charset.CharsetUTF8MB4) { - // TiDB only allow utf8 to be changed to utf8mb4, or changing the collation when the charset is utf8/utf8mb4. + (origCharset == charset.CharsetUTF8MB4 && toCharset == charset.CharsetUTF8MB4) || + (origCharset == charset.CharsetLatin1 && toCharset == charset.CharsetUTF8) || + (origCharset == charset.CharsetLatin1 && toCharset == charset.CharsetUTF8MB4) { + // TiDB only allow utf8/latin1 to be changed to utf8mb4, or changing the collation when the charset is utf8/utf8mb4/latin1. return nil } diff --git a/expression/integration_test.go b/expression/integration_test.go index 37290e447b6e2..ae7bfbf906357 100644 --- a/expression/integration_test.go +++ b/expression/integration_test.go @@ -648,7 +648,7 @@ func TestStringBuiltin(t *testing.T) { result = tk.MustQuery("select ord('123'), ord(123), ord(''), ord('你好'), ord(NULL), ord('👍')") result.Check(testkit.Rows("49 49 0 14990752 4036989325")) result = tk.MustQuery("select ord(X''), ord(X'6161'), ord(X'e4bd'), ord(X'e4bda0'), ord(_ascii'你'), ord(_latin1'你')") - result.Check(testkit.Rows("0 97 228 228 228 228")) + result.Check(testkit.Rows("0 97 228 228 228 14990752")) // for space result = tk.MustQuery(`select space(0), space(2), space(-1), space(1.1), space(1.9)`) diff --git a/parser/charset/encoding_latin1.go b/parser/charset/encoding_latin1.go index 38f9bb601ac4e..f1893484b9a6b 100644 --- a/parser/charset/encoding_latin1.go +++ b/parser/charset/encoding_latin1.go @@ -14,13 +14,11 @@ package charset import ( - "bytes" - "golang.org/x/text/encoding" ) // EncodingLatin1Impl is the instance of encodingLatin1. -// TiDB uses utf8 implementation for latin1 charset because of the backward compatibility. +// In TiDB, latin1 is an alias for utf8, so uses utf8 implementation for latin1. var EncodingLatin1Impl = &encodingLatin1{encodingUTF8{encodingBase{enc: encoding.Nop}}} func init() { @@ -36,25 +34,3 @@ type encodingLatin1 struct { func (e *encodingLatin1) Name() string { return CharsetLatin1 } - -// Peek implements Encoding interface. -func (e *encodingLatin1) Peek(src []byte) []byte { - if len(src) == 0 { - return src - } - return src[:1] -} - -// IsValid implements Encoding interface. -func (e *encodingLatin1) IsValid(src []byte) bool { - return true -} - -// Tp implements Encoding interface. -func (e *encodingLatin1) Tp() EncodingTp { - return EncodingTpLatin1 -} - -func (e *encodingLatin1) Transform(dest *bytes.Buffer, src []byte, op Op) ([]byte, error) { - return src, nil -} diff --git a/parser/mysql/charset.go b/parser/mysql/charset.go index d3115df457cb8..cb3666bb21b31 100644 --- a/parser/mysql/charset.go +++ b/parser/mysql/charset.go @@ -574,6 +574,7 @@ var CollationNames = map[string]uint8{ const ( UTF8Charset = "utf8" UTF8MB4Charset = "utf8mb4" + Latin1Charset = "latin1" DefaultCharset = UTF8MB4Charset // DefaultCollationID is utf8mb4_bin(46) DefaultCollationID = 46 @@ -592,9 +593,9 @@ const ( MaxBytesOfCharacter = 4 ) -// IsUTF8Charset checks if charset is utf8 or utf8mb4 +// IsUTF8Charset checks if charset is utf8, utf8mb4 or latin1. func IsUTF8Charset(charset string) bool { - return charset == UTF8Charset || charset == UTF8MB4Charset + return charset == UTF8Charset || charset == UTF8MB4Charset || charset == Latin1Charset } // RangeGraph defines valid unicode characters to use in column names. It strictly follows MySQL's definition. diff --git a/util/collate/collate.go b/util/collate/collate.go index 171f7f7cfe491..e018215d92886 100644 --- a/util/collate/collate.go +++ b/util/collate/collate.go @@ -98,7 +98,7 @@ func NewCollationEnabled() bool { func CompatibleCollate(collate1, collate2 string) bool { if (collate1 == "utf8mb4_general_ci" || collate1 == "utf8_general_ci") && (collate2 == "utf8mb4_general_ci" || collate2 == "utf8_general_ci") { return true - } else if (collate1 == "utf8mb4_bin" || collate1 == "utf8_bin") && (collate2 == "utf8mb4_bin" || collate2 == "utf8_bin") { + } else if (collate1 == "utf8mb4_bin" || collate1 == "utf8_bin" || collate1 == "latin1_bin") && (collate2 == "utf8mb4_bin" || collate2 == "utf8_bin") { return true } else if (collate1 == "utf8mb4_unicode_ci" || collate1 == "utf8_unicode_ci") && (collate2 == "utf8mb4_unicode_ci" || collate2 == "utf8_unicode_ci") { return true