Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

*: Support modify table/column charset from latin1 to utf8/utf8mb4 #34386

Merged
merged 10 commits into from
May 10, 2022
Merged
Show file tree
Hide file tree
Changes from 7 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
46 changes: 42 additions & 4 deletions cmd/explaintest/r/collation_misc_disabled.result
Original file line number Diff line number Diff line change
Expand Up @@ -14,19 +14,57 @@ select * from t;
a
t_value
alter table t modify column a varchar(20) charset utf8;
Error 8200: Unsupported modify charset from latin1 to utf8
admin check table t;

select * from t;
a
t_value
drop table t;
create table t(a varchar(20) charset latin1);
insert into t values ("t_value");
alter table t modify column a varchar(20) charset utf8mb4;
Error 8200: Unsupported modify charset from latin1 to utf8mb4
admin check table t;

select * from t;
a
t_value
drop table t;
create table t(a varchar(20) charset latin1);
insert into t values ("t_value");
alter table t modify column a varchar(20) charset utf8 collate utf8_bin;
Error 8200: Unsupported modify charset from latin1 to utf8
admin check table t;

select * from t;
a
t_value
drop table t;
create table t(a varchar(20) charset latin1);
insert into t values ("t_value");
alter table t modify column a varchar(20) charset utf8mb4 collate utf8mb4_general_ci;
Error 8200: Unsupported modify charset from latin1 to utf8mb4
alter table t modify column a varchar(20) charset utf8mb4 collate utf8bin;
[ddl:1273]Unknown collation: 'utf8bin'
alter table t collate LATIN1_GENERAL_CI charset utf8 collate utf8_bin;
Error 1302: Conflicting declarations: 'CHARACTER SET latin1' and 'CHARACTER SET utf8'
alter table t collate LATIN1_GENERAL_CI collate UTF8MB4_UNICODE_ci collate utf8_bin;
Error 1253: COLLATION 'utf8mb4_unicode_ci' is not valid for CHARACTER SET 'latin1'
drop table t;
create table t(a varchar(20) charset latin1);
insert into t values ("t_value");
alter table t modify column a varchar(19) charset utf8mb4;
admin check table t;

select * from t;
a
t_value
drop table t;
create table t(a varchar(20) charset latin1);
insert into t values ("t_value");
alter table t modify column a varchar(19) charset utf8 collate utf8_bin;
admin check table t;

select * from t;
a
t_value
create database if not exists cd_test_utf8 CHARACTER SET utf8 COLLATE utf8_bin;
create database if not exists cd_test_latin1 CHARACTER SET latin1 COLLATE latin1_swedish_ci;
use cd_test_utf8;
Expand Down
46 changes: 42 additions & 4 deletions cmd/explaintest/r/collation_misc_enabled.result
Original file line number Diff line number Diff line change
Expand Up @@ -14,19 +14,57 @@ select * from t;
a
t_value
alter table t modify column a varchar(20) charset utf8;
Error 8200: Unsupported modify charset from latin1 to utf8
admin check table t;

select * from t;
a
t_value
drop table t;
create table t(a varchar(20) charset latin1);
insert into t values ("t_value");
alter table t modify column a varchar(20) charset utf8mb4;
Error 8200: Unsupported modify charset from latin1 to utf8mb4
admin check table t;

select * from t;
a
t_value
drop table t;
create table t(a varchar(20) charset latin1);
insert into t values ("t_value");
alter table t modify column a varchar(20) charset utf8 collate utf8_bin;
Error 8200: Unsupported modify charset from latin1 to utf8
admin check table t;

select * from t;
a
t_value
drop table t;
create table t(a varchar(20) charset latin1);
insert into t values ("t_value");
alter table t modify column a varchar(20) charset utf8mb4 collate utf8mb4_general_ci;
Error 8200: Unsupported modify charset from latin1 to utf8mb4
alter table t modify column a varchar(20) charset utf8mb4 collate utf8bin;
[ddl:1273]Unknown collation: 'utf8bin'
alter table t collate LATIN1_GENERAL_CI charset utf8 collate utf8_bin;
Error 1273: Unsupported collation when new collation is enabled: 'latin1_general_ci'
alter table t collate LATIN1_GENERAL_CI collate UTF8MB4_UNICODE_ci collate utf8_bin;
Error 1273: Unsupported collation when new collation is enabled: 'latin1_general_ci'
drop table t;
create table t(a varchar(20) charset latin1);
insert into t values ("t_value");
alter table t modify column a varchar(19) charset utf8mb4;
admin check table t;

select * from t;
a
t_value
drop table t;
create table t(a varchar(20) charset latin1);
insert into t values ("t_value");
alter table t modify column a varchar(19) charset utf8 collate utf8_bin;
admin check table t;

select * from t;
a
t_value
create database if not exists cd_test_utf8 CHARACTER SET utf8 COLLATE utf8_bin;
create database if not exists cd_test_latin1 CHARACTER SET latin1 COLLATE latin1_swedish_ci;
Error 1273: Unsupported collation when new collation is enabled: 'latin1_swedish_ci'
Expand Down
37 changes: 34 additions & 3 deletions cmd/explaintest/t/collation_misc.test
Original file line number Diff line number Diff line change
Expand Up @@ -12,12 +12,28 @@ create table t(a varchar(20) charset latin1);
insert into t values ("t_value");
alter table t modify column a varchar(20) charset latin1;
select * from t;
--error 8200

alter table t modify column a varchar(20) charset utf8;
Defined2014 marked this conversation as resolved.
Show resolved Hide resolved
--error 8200
admin check table t;
select * from t;

drop table t;
create table t(a varchar(20) charset latin1);
insert into t values ("t_value");
alter table t modify column a varchar(20) charset utf8mb4;
Defined2014 marked this conversation as resolved.
Show resolved Hide resolved
--error 8200
admin check table t;
select * from t;

drop table t;
create table t(a varchar(20) charset latin1);
insert into t values ("t_value");
alter table t modify column a varchar(20) charset utf8 collate utf8_bin;
admin check table t;
select * from t;

drop table t;
create table t(a varchar(20) charset latin1);
insert into t values ("t_value");
--error 8200
alter table t modify column a varchar(20) charset utf8mb4 collate utf8mb4_general_ci;
--error 1273
Expand All @@ -27,6 +43,21 @@ alter table t collate LATIN1_GENERAL_CI charset utf8 collate utf8_bin;
--error 1253, 1273
alter table t collate LATIN1_GENERAL_CI collate UTF8MB4_UNICODE_ci collate utf8_bin;

# ChangingCharsetToUtf8 with reorg
drop table t;
create table t(a varchar(20) charset latin1);
insert into t values ("t_value");
alter table t modify column a varchar(19) charset utf8mb4;
admin check table t;
select * from t;

drop table t;
create table t(a varchar(20) charset latin1);
insert into t values ("t_value");
alter table t modify column a varchar(19) charset utf8 collate utf8_bin;
admin check table t;
select * from t;

# TestCharsetDatabase
create database if not exists cd_test_utf8 CHARACTER SET utf8 COLLATE utf8_bin;
--error 1273
Expand Down
24 changes: 20 additions & 4 deletions ddl/db_integration_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -758,7 +758,7 @@ func TestChangingTableCharset(t *testing.T) {
tk := testkit.NewTestKit(t, store)

tk.MustExec("USE test")
tk.MustExec("create table t(a char(10)) charset latin1 collate latin1_bin")
tk.MustExec("create table t(a char(10), index i(a)) charset latin1 collate latin1_bin")

tk.MustGetErrCode("alter table t charset gbk", errno.ErrUnsupportedDDLOperation)
tk.MustGetErrCode("alter table t charset ''", errno.ErrUnknownCharacterSet)
Expand All @@ -769,9 +769,18 @@ func TestChangingTableCharset(t *testing.T) {
tk.MustGetErrCode("alter table t charset utf8 collate utf8mb4_bin;", errno.ErrCollationCharsetMismatch)
tk.MustGetErrCode("alter table t charset utf8 collate utf8_bin collate utf8mb4_bin collate utf8_bin;", errno.ErrCollationCharsetMismatch)

tk.MustGetErrCode("alter table t charset utf8", errno.ErrUnsupportedDDLOperation)
tk.MustGetErrCode("alter table t charset utf8mb4", errno.ErrUnsupportedDDLOperation)
tk.MustGetErrCode("alter table t charset utf8mb4 collate utf8mb4_bin", errno.ErrUnsupportedDDLOperation)
tk.MustExec("alter table t charset utf8")
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please add the check of admin check table t.
And do we need to add some records?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done. I think with data is covered by collation_misc.test

tk.MustExec("admin check table t")

tk.MustExec("drop table if exists t")
tk.MustExec("create table t(a char(10), index i(a)) charset latin1 collate latin1_bin")
tk.MustExec("alter table t charset utf8mb4")
tk.MustExec("admin check table t")

tk.MustExec("drop table if exists t")
tk.MustExec("create table t(a char(10), index i(a)) charset latin1 collate latin1_bin")
tk.MustExec("alter table t charset utf8mb4 collate utf8mb4_bin")
tk.MustExec("admin check table t")

tk.MustGetErrCode("alter table t charset latin1 charset utf8 charset utf8mb4 collate utf8_bin;", errno.ErrConflictingDeclarations)

Expand All @@ -791,6 +800,13 @@ func TestChangingTableCharset(t *testing.T) {
}
checkCharset(charset.CharsetUTF8MB4, charset.CollationUTF8MB4)

tk.MustExec("drop table if exists t")
tk.MustExec("create table t(a varchar(20), key i(a)) charset=latin1")
tk.MustGetErrCode("alter table t convert to charset utf8 collate utf8_unicode_ci", errno.ErrUnsupportedDDLOperation)
tk.MustGetErrCode("alter table t convert to charset utf8 collate utf8_general_ci", errno.ErrUnsupportedDDLOperation)
tk.MustExec("alter table t convert to charset utf8 collate utf8_bin")
tk.MustGetErrCode("alter table t convert to charset latin1", errno.ErrUnsupportedDDLOperation)

// Test when column charset can not convert to the target charset.
tk.MustExec("drop table t;")
tk.MustExec("create table t(a varchar(10) character set ascii) charset utf8mb4")
Expand Down
6 changes: 4 additions & 2 deletions ddl/ddl_api.go
Original file line number Diff line number Diff line change
Expand Up @@ -4177,8 +4177,10 @@ func checkModifyCharsetAndCollation(toCharset, toCollate, origCharset, origColla

if (origCharset == charset.CharsetUTF8 && toCharset == charset.CharsetUTF8MB4) ||
(origCharset == charset.CharsetUTF8 && toCharset == charset.CharsetUTF8) ||
(origCharset == charset.CharsetUTF8MB4 && toCharset == charset.CharsetUTF8MB4) {
// TiDB only allow utf8 to be changed to utf8mb4, or changing the collation when the charset is utf8/utf8mb4.
(origCharset == charset.CharsetUTF8MB4 && toCharset == charset.CharsetUTF8MB4) ||
(origCharset == charset.CharsetLatin1 && toCharset == charset.CharsetUTF8) ||
(origCharset == charset.CharsetLatin1 && toCharset == charset.CharsetUTF8MB4) {
// TiDB only allow utf8/latin1 to be changed to utf8mb4, or changing the collation when the charset is utf8/utf8mb4/latin1.
return nil
}

Expand Down
2 changes: 1 addition & 1 deletion expression/integration_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -648,7 +648,7 @@ func TestStringBuiltin(t *testing.T) {
result = tk.MustQuery("select ord('123'), ord(123), ord(''), ord('你好'), ord(NULL), ord('👍')")
result.Check(testkit.Rows("49 49 0 14990752 <nil> 4036989325"))
result = tk.MustQuery("select ord(X''), ord(X'6161'), ord(X'e4bd'), ord(X'e4bda0'), ord(_ascii'你'), ord(_latin1'你')")
result.Check(testkit.Rows("0 97 228 228 228 228"))
result.Check(testkit.Rows("0 97 228 228 228 14990752"))

// for space
result = tk.MustQuery(`select space(0), space(2), space(-1), space(1.1), space(1.9)`)
Expand Down
26 changes: 1 addition & 25 deletions parser/charset/encoding_latin1.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,13 +14,11 @@
package charset

import (
"bytes"

"golang.org/x/text/encoding"
)

// EncodingLatin1Impl is the instance of encodingLatin1.
// TiDB uses utf8 implementation for latin1 charset because of the backward compatibility.
// In TiDB, latin1 is an alias for utf8, so uses utf8 implementation for latin1.
var EncodingLatin1Impl = &encodingLatin1{encodingUTF8{encodingBase{enc: encoding.Nop}}}

func init() {
Expand All @@ -36,25 +34,3 @@ type encodingLatin1 struct {
func (e *encodingLatin1) Name() string {
return CharsetLatin1
}

// Peek implements Encoding interface.
func (e *encodingLatin1) Peek(src []byte) []byte {
if len(src) == 0 {
return src
}
return src[:1]
}

// IsValid implements Encoding interface.
func (e *encodingLatin1) IsValid(src []byte) bool {
return true
}

// Tp implements Encoding interface.
func (e *encodingLatin1) Tp() EncodingTp {
return EncodingTpLatin1
}

func (e *encodingLatin1) Transform(dest *bytes.Buffer, src []byte, op Op) ([]byte, error) {
return src, nil
}
5 changes: 3 additions & 2 deletions parser/mysql/charset.go
Original file line number Diff line number Diff line change
Expand Up @@ -574,6 +574,7 @@ var CollationNames = map[string]uint8{
const (
UTF8Charset = "utf8"
UTF8MB4Charset = "utf8mb4"
Latin1Charset = "latin1"
DefaultCharset = UTF8MB4Charset
// DefaultCollationID is utf8mb4_bin(46)
DefaultCollationID = 46
Expand All @@ -592,9 +593,9 @@ const (
MaxBytesOfCharacter = 4
)

// IsUTF8Charset checks if charset is utf8 or utf8mb4
// IsUTF8Charset checks if charset is utf8, utf8mb4 or latin1.
func IsUTF8Charset(charset string) bool {
return charset == UTF8Charset || charset == UTF8MB4Charset
return charset == UTF8Charset || charset == UTF8MB4Charset || charset == Latin1Charset
}

// RangeGraph defines valid unicode characters to use in column names. It strictly follows MySQL's definition.
Expand Down
2 changes: 1 addition & 1 deletion util/collate/collate.go
Original file line number Diff line number Diff line change
Expand Up @@ -98,7 +98,7 @@ func NewCollationEnabled() bool {
func CompatibleCollate(collate1, collate2 string) bool {
if (collate1 == "utf8mb4_general_ci" || collate1 == "utf8_general_ci") && (collate2 == "utf8mb4_general_ci" || collate2 == "utf8_general_ci") {
return true
} else if (collate1 == "utf8mb4_bin" || collate1 == "utf8_bin") && (collate2 == "utf8mb4_bin" || collate2 == "utf8_bin") {
} else if (collate1 == "utf8mb4_bin" || collate1 == "utf8_bin" || collate1 == "latin1_bin") && (collate2 == "utf8mb4_bin" || collate2 == "utf8_bin") {
return true
} else if (collate1 == "utf8mb4_unicode_ci" || collate1 == "utf8_unicode_ci") && (collate2 == "utf8mb4_unicode_ci" || collate2 == "utf8_unicode_ci") {
return true
Expand Down