Skip to content

Commit

Permalink
*: Support modify table/column charset from latin1 to utf8/utf8mb4 (#…
Browse files Browse the repository at this point in the history
  • Loading branch information
Defined2014 committed May 10, 2022
1 parent e9e1e53 commit ede6f8c
Show file tree
Hide file tree
Showing 9 changed files with 148 additions and 46 deletions.
46 changes: 42 additions & 4 deletions cmd/explaintest/r/collation_misc_disabled.result
Original file line number Diff line number Diff line change
Expand Up @@ -14,19 +14,57 @@ select * from t;
a
t_value
alter table t modify column a varchar(20) charset utf8;
Error 8200: Unsupported modify charset from latin1 to utf8
admin check table t;

select * from t;
a
t_value
drop table t;
create table t(a varchar(20) charset latin1);
insert into t values ("t_value");
alter table t modify column a varchar(20) charset utf8mb4;
Error 8200: Unsupported modify charset from latin1 to utf8mb4
admin check table t;

select * from t;
a
t_value
drop table t;
create table t(a varchar(20) charset latin1);
insert into t values ("t_value");
alter table t modify column a varchar(20) charset utf8 collate utf8_bin;
Error 8200: Unsupported modify charset from latin1 to utf8
admin check table t;

select * from t;
a
t_value
drop table t;
create table t(a varchar(20) charset latin1);
insert into t values ("t_value");
alter table t modify column a varchar(20) charset utf8mb4 collate utf8mb4_general_ci;
Error 8200: Unsupported modify charset from latin1 to utf8mb4
alter table t modify column a varchar(20) charset utf8mb4 collate utf8bin;
[ddl:1273]Unknown collation: 'utf8bin'
alter table t collate LATIN1_GENERAL_CI charset utf8 collate utf8_bin;
Error 1302: Conflicting declarations: 'CHARACTER SET latin1' and 'CHARACTER SET utf8'
alter table t collate LATIN1_GENERAL_CI collate UTF8MB4_UNICODE_ci collate utf8_bin;
Error 1253: COLLATION 'utf8mb4_unicode_ci' is not valid for CHARACTER SET 'latin1'
drop table t;
create table t(a varchar(20) charset latin1);
insert into t values ("t_value");
alter table t modify column a varchar(19) charset utf8mb4;
admin check table t;

select * from t;
a
t_value
drop table t;
create table t(a varchar(20) charset latin1);
insert into t values ("t_value");
alter table t modify column a varchar(19) charset utf8 collate utf8_bin;
admin check table t;

select * from t;
a
t_value
create database if not exists cd_test_utf8 CHARACTER SET utf8 COLLATE utf8_bin;
create database if not exists cd_test_latin1 CHARACTER SET latin1 COLLATE latin1_swedish_ci;
use cd_test_utf8;
Expand Down
46 changes: 42 additions & 4 deletions cmd/explaintest/r/collation_misc_enabled.result
Original file line number Diff line number Diff line change
Expand Up @@ -14,19 +14,57 @@ select * from t;
a
t_value
alter table t modify column a varchar(20) charset utf8;
Error 8200: Unsupported modify charset from latin1 to utf8
admin check table t;

select * from t;
a
t_value
drop table t;
create table t(a varchar(20) charset latin1);
insert into t values ("t_value");
alter table t modify column a varchar(20) charset utf8mb4;
Error 8200: Unsupported modify charset from latin1 to utf8mb4
admin check table t;

select * from t;
a
t_value
drop table t;
create table t(a varchar(20) charset latin1);
insert into t values ("t_value");
alter table t modify column a varchar(20) charset utf8 collate utf8_bin;
Error 8200: Unsupported modify charset from latin1 to utf8
admin check table t;

select * from t;
a
t_value
drop table t;
create table t(a varchar(20) charset latin1);
insert into t values ("t_value");
alter table t modify column a varchar(20) charset utf8mb4 collate utf8mb4_general_ci;
Error 8200: Unsupported modify charset from latin1 to utf8mb4
alter table t modify column a varchar(20) charset utf8mb4 collate utf8bin;
[ddl:1273]Unknown collation: 'utf8bin'
alter table t collate LATIN1_GENERAL_CI charset utf8 collate utf8_bin;
Error 1273: Unsupported collation when new collation is enabled: 'latin1_general_ci'
alter table t collate LATIN1_GENERAL_CI collate UTF8MB4_UNICODE_ci collate utf8_bin;
Error 1273: Unsupported collation when new collation is enabled: 'latin1_general_ci'
drop table t;
create table t(a varchar(20) charset latin1);
insert into t values ("t_value");
alter table t modify column a varchar(19) charset utf8mb4;
admin check table t;

select * from t;
a
t_value
drop table t;
create table t(a varchar(20) charset latin1);
insert into t values ("t_value");
alter table t modify column a varchar(19) charset utf8 collate utf8_bin;
admin check table t;

select * from t;
a
t_value
create database if not exists cd_test_utf8 CHARACTER SET utf8 COLLATE utf8_bin;
create database if not exists cd_test_latin1 CHARACTER SET latin1 COLLATE latin1_swedish_ci;
Error 1273: Unsupported collation when new collation is enabled: 'latin1_swedish_ci'
Expand Down
37 changes: 34 additions & 3 deletions cmd/explaintest/t/collation_misc.test
Original file line number Diff line number Diff line change
Expand Up @@ -12,12 +12,28 @@ create table t(a varchar(20) charset latin1);
insert into t values ("t_value");
alter table t modify column a varchar(20) charset latin1;
select * from t;
--error 8200

alter table t modify column a varchar(20) charset utf8;
--error 8200
admin check table t;
select * from t;

drop table t;
create table t(a varchar(20) charset latin1);
insert into t values ("t_value");
alter table t modify column a varchar(20) charset utf8mb4;
--error 8200
admin check table t;
select * from t;

drop table t;
create table t(a varchar(20) charset latin1);
insert into t values ("t_value");
alter table t modify column a varchar(20) charset utf8 collate utf8_bin;
admin check table t;
select * from t;

drop table t;
create table t(a varchar(20) charset latin1);
insert into t values ("t_value");
--error 8200
alter table t modify column a varchar(20) charset utf8mb4 collate utf8mb4_general_ci;
--error 1273
Expand All @@ -27,6 +43,21 @@ alter table t collate LATIN1_GENERAL_CI charset utf8 collate utf8_bin;
--error 1253, 1273
alter table t collate LATIN1_GENERAL_CI collate UTF8MB4_UNICODE_ci collate utf8_bin;

# ChangingCharsetToUtf8 with reorg
drop table t;
create table t(a varchar(20) charset latin1);
insert into t values ("t_value");
alter table t modify column a varchar(19) charset utf8mb4;
admin check table t;
select * from t;

drop table t;
create table t(a varchar(20) charset latin1);
insert into t values ("t_value");
alter table t modify column a varchar(19) charset utf8 collate utf8_bin;
admin check table t;
select * from t;

# TestCharsetDatabase
create database if not exists cd_test_utf8 CHARACTER SET utf8 COLLATE utf8_bin;
--error 1273
Expand Down
24 changes: 20 additions & 4 deletions ddl/db_integration_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -760,7 +760,7 @@ func TestChangingTableCharset(t *testing.T) {
tk := testkit.NewTestKit(t, store)

tk.MustExec("USE test")
tk.MustExec("create table t(a char(10)) charset latin1 collate latin1_bin")
tk.MustExec("create table t(a char(10), index i(a)) charset latin1 collate latin1_bin")

tk.MustGetErrCode("alter table t charset gbk", errno.ErrUnsupportedDDLOperation)
tk.MustGetErrCode("alter table t charset ''", errno.ErrUnknownCharacterSet)
Expand All @@ -771,9 +771,18 @@ func TestChangingTableCharset(t *testing.T) {
tk.MustGetErrCode("alter table t charset utf8 collate utf8mb4_bin;", errno.ErrCollationCharsetMismatch)
tk.MustGetErrCode("alter table t charset utf8 collate utf8_bin collate utf8mb4_bin collate utf8_bin;", errno.ErrCollationCharsetMismatch)

tk.MustGetErrCode("alter table t charset utf8", errno.ErrUnsupportedDDLOperation)
tk.MustGetErrCode("alter table t charset utf8mb4", errno.ErrUnsupportedDDLOperation)
tk.MustGetErrCode("alter table t charset utf8mb4 collate utf8mb4_bin", errno.ErrUnsupportedDDLOperation)
tk.MustExec("alter table t charset utf8")
tk.MustExec("admin check table t")

tk.MustExec("drop table if exists t")
tk.MustExec("create table t(a char(10), index i(a)) charset latin1 collate latin1_bin")
tk.MustExec("alter table t charset utf8mb4")
tk.MustExec("admin check table t")

tk.MustExec("drop table if exists t")
tk.MustExec("create table t(a char(10), index i(a)) charset latin1 collate latin1_bin")
tk.MustExec("alter table t charset utf8mb4 collate utf8mb4_bin")
tk.MustExec("admin check table t")

tk.MustGetErrCode("alter table t charset latin1 charset utf8 charset utf8mb4 collate utf8_bin;", errno.ErrConflictingDeclarations)

Expand All @@ -793,6 +802,13 @@ func TestChangingTableCharset(t *testing.T) {
}
checkCharset(charset.CharsetUTF8MB4, charset.CollationUTF8MB4)

tk.MustExec("drop table if exists t")
tk.MustExec("create table t(a varchar(20), key i(a)) charset=latin1")
tk.MustGetErrCode("alter table t convert to charset utf8 collate utf8_unicode_ci", errno.ErrUnsupportedDDLOperation)
tk.MustGetErrCode("alter table t convert to charset utf8 collate utf8_general_ci", errno.ErrUnsupportedDDLOperation)
tk.MustExec("alter table t convert to charset utf8 collate utf8_bin")
tk.MustGetErrCode("alter table t convert to charset latin1", errno.ErrUnsupportedDDLOperation)

// Test when column charset can not convert to the target charset.
tk.MustExec("drop table t;")
tk.MustExec("create table t(a varchar(10) character set ascii) charset utf8mb4")
Expand Down
6 changes: 4 additions & 2 deletions ddl/ddl_api.go
Original file line number Diff line number Diff line change
Expand Up @@ -4177,8 +4177,10 @@ func checkModifyCharsetAndCollation(toCharset, toCollate, origCharset, origColla

if (origCharset == charset.CharsetUTF8 && toCharset == charset.CharsetUTF8MB4) ||
(origCharset == charset.CharsetUTF8 && toCharset == charset.CharsetUTF8) ||
(origCharset == charset.CharsetUTF8MB4 && toCharset == charset.CharsetUTF8MB4) {
// TiDB only allow utf8 to be changed to utf8mb4, or changing the collation when the charset is utf8/utf8mb4.
(origCharset == charset.CharsetUTF8MB4 && toCharset == charset.CharsetUTF8MB4) ||
(origCharset == charset.CharsetLatin1 && toCharset == charset.CharsetUTF8) ||
(origCharset == charset.CharsetLatin1 && toCharset == charset.CharsetUTF8MB4) {
// TiDB only allows utf8 to be changed to utf8mb4, latin1 to be changed to utf8 or utf8mb4, or the collation to be changed while the charset stays utf8 or utf8mb4.
return nil
}

Expand Down
2 changes: 1 addition & 1 deletion expression/integration_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -648,7 +648,7 @@ func TestStringBuiltin(t *testing.T) {
result = tk.MustQuery("select ord('123'), ord(123), ord(''), ord('你好'), ord(NULL), ord('👍')")
result.Check(testkit.Rows("49 49 0 14990752 <nil> 4036989325"))
result = tk.MustQuery("select ord(X''), ord(X'6161'), ord(X'e4bd'), ord(X'e4bda0'), ord(_ascii'你'), ord(_latin1'你')")
result.Check(testkit.Rows("0 97 228 228 228 228"))
result.Check(testkit.Rows("0 97 228 228 228 14990752"))

// for space
result = tk.MustQuery(`select space(0), space(2), space(-1), space(1.1), space(1.9)`)
Expand Down
26 changes: 1 addition & 25 deletions parser/charset/encoding_latin1.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,13 +14,11 @@
package charset

import (
"bytes"

"golang.org/x/text/encoding"
)

// EncodingLatin1Impl is the instance of encodingLatin1.
// TiDB uses utf8 implementation for latin1 charset because of the backward compatibility.
// In TiDB, latin1 is an alias for utf8, so it uses the utf8 implementation for latin1.
var EncodingLatin1Impl = &encodingLatin1{encodingUTF8{encodingBase{enc: encoding.Nop}}}

func init() {
Expand All @@ -36,25 +34,3 @@ type encodingLatin1 struct {
func (e *encodingLatin1) Name() string {
// Report latin1 as the charset name even though the instance is built on top of encodingUTF8 (see EncodingLatin1Impl).
return CharsetLatin1
}

// Peek implements Encoding interface.
// It yields the leading byte of src as a one-byte slice; when src holds no
// bytes, src itself is handed back untouched.
func (e *encodingLatin1) Peek(src []byte) []byte {
	if len(src) > 0 {
		// Only the first byte is exposed, sharing src's backing array.
		return src[:1]
	}
	return src
}

// IsValid implements Encoding interface.
// It accepts every input: the result is always true, regardless of the contents of src.
func (e *encodingLatin1) IsValid(src []byte) bool {
return true
}

// Tp implements Encoding interface.
// It identifies this encoding as EncodingTpLatin1.
func (e *encodingLatin1) Tp() EncodingTp {
return EncodingTpLatin1
}

// Transform returns src unchanged together with a nil error; dest and op are ignored.
// NOTE(review): presumably implements the Encoding interface like the sibling methods — confirm.
func (e *encodingLatin1) Transform(dest *bytes.Buffer, src []byte, op Op) ([]byte, error) {
return src, nil
}
5 changes: 3 additions & 2 deletions parser/mysql/charset.go
Original file line number Diff line number Diff line change
Expand Up @@ -574,6 +574,7 @@ var CollationNames = map[string]uint8{
const (
UTF8Charset = "utf8"
UTF8MB4Charset = "utf8mb4"
Latin1Charset = "latin1"
DefaultCharset = UTF8MB4Charset
// DefaultCollationID is utf8mb4_bin(46)
DefaultCollationID = 46
Expand All @@ -592,9 +593,9 @@ const (
MaxBytesOfCharacter = 4
)

// IsUTF8Charset checks if charset is utf8, utf8mb4 or latin1.
// It returns true only for an exact match against UTF8Charset,
// UTF8MB4Charset or Latin1Charset; any other name (including the empty
// string or a differently-cased spelling) yields false.
func IsUTF8Charset(charset string) bool {
	switch charset {
	case UTF8Charset, UTF8MB4Charset, Latin1Charset:
		return true
	}
	return false
}

// RangeGraph defines valid unicode characters to use in column names. It strictly follows MySQL's definition.
Expand Down
2 changes: 1 addition & 1 deletion util/collate/collate.go
Original file line number Diff line number Diff line change
Expand Up @@ -98,7 +98,7 @@ func NewCollationEnabled() bool {
func CompatibleCollate(collate1, collate2 string) bool {
if (collate1 == "utf8mb4_general_ci" || collate1 == "utf8_general_ci") && (collate2 == "utf8mb4_general_ci" || collate2 == "utf8_general_ci") {
return true
} else if (collate1 == "utf8mb4_bin" || collate1 == "utf8_bin") && (collate2 == "utf8mb4_bin" || collate2 == "utf8_bin") {
} else if (collate1 == "utf8mb4_bin" || collate1 == "utf8_bin" || collate1 == "latin1_bin") && (collate2 == "utf8mb4_bin" || collate2 == "utf8_bin") {
return true
} else if (collate1 == "utf8mb4_unicode_ci" || collate1 == "utf8_unicode_ci") && (collate2 == "utf8mb4_unicode_ci" || collate2 == "utf8_unicode_ci") {
return true
Expand Down

0 comments on commit ede6f8c

Please sign in to comment.