Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

*: LOAD DATA support GBK character set #42644

Merged
merged 8 commits into from
Mar 29, 2023
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 26 additions & 2 deletions executor/load_data.go
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ import (
"strings"
"sync/atomic"
"time"
"unicode/utf8"

"github.com/pingcap/errors"
"github.com/pingcap/failpoint"
Expand All @@ -39,6 +40,7 @@ import (
"github.com/pingcap/tidb/parser/terror"
plannercore "github.com/pingcap/tidb/planner/core"
"github.com/pingcap/tidb/sessionctx"
"github.com/pingcap/tidb/sessionctx/variable"
"github.com/pingcap/tidb/sessiontxn"
"github.com/pingcap/tidb/table"
"github.com/pingcap/tidb/types"
Expand Down Expand Up @@ -263,6 +265,7 @@ type LoadDataWorker struct {
controller *importer.LoadDataController

table table.Table
charset *string
row []types.Datum
rows [][]types.Datum
commitTaskQueue chan commitTask
Expand Down Expand Up @@ -337,11 +340,23 @@ func NewLoadDataWorker(
sctx.GetSessionVars().StmtCtx.TruncateAsWarning = true
sctx.GetSessionVars().StmtCtx.BadNullAsWarning = true
}
charset := plan.Charset
if charset == nil {
// https://dev.mysql.com/doc/refman/8.0/en/load-data.html#load-data-character-set
d, err2 := userSctx.GetSessionVars().GetSessionOrGlobalSystemVar(
context.Background(), variable.CharsetDatabase)
if err2 != nil {
logutil.BgLogger().Error("LOAD DATA get charset failed", zap.Error(err2))
} else {
charset = &d
}
}
loadDataWorker := &LoadDataWorker{
row: make([]types.Datum, 0, len(insertVal.insertColumns)),
commitTaskQueue: make(chan commitTask, taskQueueSize),
InsertValues: insertVal,
table: tbl,
charset: charset,
controller: controller,
Ctx: sctx,
restrictive: restrictive,
Expand Down Expand Up @@ -586,6 +601,16 @@ func (e *LoadDataWorker) buildParser(
) (parser mydump.Parser, err error) {
switch e.controller.Format {
case importer.LoadDataFormatDelimitedData:
var charsetConvertor *mydump.CharsetConvertor
if e.charset != nil {
charsetConvertor, err = mydump.NewCharsetConvertor(*e.charset, string(utf8.RuneError))
if err != nil {
return nil, err
}
}
if err != nil {
return nil, err
}
// CSV-like
parser, err = mydump.NewCSVParser(
ctx,
Expand All @@ -594,8 +619,7 @@ func (e *LoadDataWorker) buildParser(
importer.LoadDataReadBlockSize,
nil,
false,
// TODO: support charset conversion
nil)
charsetConvertor)
case importer.LoadDataFormatSQLDump:
parser = mydump.NewChunkParser(
ctx,
Expand Down
55 changes: 55 additions & 0 deletions executor/loadremotetest/one_csv_test.go
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could you also add a test for an invalid charset?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Original file line number Diff line number Diff line change
Expand Up @@ -294,3 +294,58 @@ func (s *mockGCSSuite) TestMultiValueIndex() {
"2 [2, 3, 4]",
))
}

// TestGBK checks that LOAD DATA can import a GBK-encoded file, both when the
// statement carries an explicit CHARACTER SET clause and when the charset is
// taken from the session's character_set_database variable.
func (s *mockGCSSuite) TestGBK() {
// Start from a fresh database holding one GBK table and one utf8mb4 table,
// so the same file can be loaded into targets with different table charsets.
s.tk.MustExec("DROP DATABASE IF EXISTS load_charset;")
s.tk.MustExec("CREATE DATABASE load_charset;")
s.tk.MustExec(`CREATE TABLE load_charset.gbk (
i INT, j VARCHAR(255)
) CHARACTER SET gbk;`)
s.tk.MustExec(`CREATE TABLE load_charset.utf8mb4 (
i INT, j VARCHAR(255)
) CHARACTER SET utf8mb4;`)

// Upload the fixture to the fake storage server. The content is raw bytes:
// two rows whose fields are separated by a tab (0x09); the first row ends
// with a line feed (0x0a) and the second row has no trailing terminator.
s.server.CreateObject(fakestorage.Object{
ObjectAttrs: fakestorage.ObjectAttrs{
BucketName: "test-load",
Name: "gbk.tsv",
},
Content: []byte{
// 1 一丁丂七丄丅丆万丈三上下丌不与丏
0x31, 0x09, 0xd2, 0xbb, 0xb6, 0xa1, 0x81, 0x40, 0xc6, 0xdf, 0x81,
0x41, 0x81, 0x42, 0x81, 0x43, 0xcd, 0xf2, 0xd5, 0xc9, 0xc8, 0xfd,
0xc9, 0xcf, 0xcf, 0xc2, 0xd8, 0xa2, 0xb2, 0xbb, 0xd3, 0xeb, 0x81,
0x44, 0x0a,
// 2 丐丑丒专且丕世丗丘丙业丛东丝丞丢
0x32, 0x09, 0xd8, 0xa4, 0xb3, 0xf3, 0x81, 0x45, 0xd7, 0xa8, 0xc7,
0xd2, 0xd8, 0xa7, 0xca, 0xc0, 0x81, 0x46, 0xc7, 0xf0, 0xb1, 0xfb,
0xd2, 0xb5, 0xb4, 0xd4, 0xb6, 0xab, 0xcb, 0xbf, 0xd8, 0xa9, 0xb6,
0xaa,
},
})

// Case 1: explicit CHARACTER SET gbk, loading into the GBK table.
sql := fmt.Sprintf(`LOAD DATA INFILE 'gs://test-load/gbk.tsv?endpoint=%s'
INTO TABLE load_charset.gbk CHARACTER SET gbk`, gcsEndpoint)
s.tk.MustExec(sql)
s.tk.MustQuery("SELECT * FROM load_charset.gbk;").Check(testkit.Rows(
"1 一丁丂七丄丅丆万丈三上下丌不与丏",
"2 丐丑丒专且丕世丗丘丙业丛东丝丞丢",
))
// Case 2: explicit CHARACTER SET gbk, loading into the utf8mb4 table; the
// same rows are expected back.
sql = fmt.Sprintf(`LOAD DATA INFILE 'gs://test-load/gbk.tsv?endpoint=%s'
INTO TABLE load_charset.utf8mb4 CHARACTER SET gbk`, gcsEndpoint)
s.tk.MustExec(sql)
s.tk.MustQuery("SELECT * FROM load_charset.utf8mb4;").Check(testkit.Rows(
"1 一丁丂七丄丅丆万丈三上下丌不与丏",
"2 丐丑丒专且丕世丗丘丙业丛东丝丞丢",
))

// Case 3: no CHARACTER SET clause. The table is emptied first so the check
// only sees rows from this load, and character_set_database is set to gbk
// so the statement has to pick the file charset up from the session.
s.tk.MustExec("TRUNCATE TABLE load_charset.utf8mb4;")
s.tk.MustExec("SET SESSION character_set_database = 'gbk';")
sql = fmt.Sprintf(`LOAD DATA INFILE 'gs://test-load/gbk.tsv?endpoint=%s'
INTO TABLE load_charset.utf8mb4;`, gcsEndpoint)
s.tk.MustExec(sql)
s.tk.MustQuery("SELECT * FROM load_charset.utf8mb4;").Check(testkit.Rows(
"1 一丁丂七丄丅丆万丈三上下丌不与丏",
"2 丐丑丒专且丕世丗丘丙业丛东丝丞丢",
))
}
5 changes: 5 additions & 0 deletions parser/ast/dml.go
Original file line number Diff line number Diff line change
Expand Up @@ -1824,6 +1824,7 @@ type LoadDataStmt struct {
Format *string
OnDuplicate OnDuplicateKeyHandlingType
Table *TableName
Charset *string
Columns []*ColumnName
FieldsInfo *FieldsClause
LinesInfo *LinesClause
Expand Down Expand Up @@ -1857,6 +1858,10 @@ func (n *LoadDataStmt) Restore(ctx *format.RestoreCtx) error {
if err := n.Table.Restore(ctx); err != nil {
return errors.Annotate(err, "An error occurred while restore LoadDataStmt.Table")
}
if n.Charset != nil {
ctx.WriteKeyWord(" CHARACTER SET ")
ctx.WritePlain(*n.Charset)
}
if n.FieldsInfo != nil {
n.FieldsInfo.Restore(ctx)
}
Expand Down
8 changes: 8 additions & 0 deletions parser/ast/dml_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -416,6 +416,14 @@ func TestLoadDataRestore(t *testing.T) {
sourceSQL: "load data infile '/a.csv' format 'sql file' into table `t`",
expectSQL: "LOAD DATA INFILE '/a.csv' FORMAT 'sql file' INTO TABLE `t`",
},
{
sourceSQL: "load data infile '/a.csv' format 'sql file' into table `t` character set utf8mb4",
expectSQL: "LOAD DATA INFILE '/a.csv' FORMAT 'sql file' INTO TABLE `t` CHARACTER SET utf8mb4",
},
{
sourceSQL: "load data infile '/a.csv' format 'sql file' into table `t` character set gbk",
expectSQL: "LOAD DATA INFILE '/a.csv' FORMAT 'sql file' INTO TABLE `t` CHARACTER SET gbk",
},
// ignore N lines
{
sourceSQL: "load data infile '/a.csv' into table `t` ignore 0 lines",
Expand Down
10 changes: 10 additions & 0 deletions parser/parser.go
Original file line number Diff line number Diff line change
Expand Up @@ -21933,6 +21933,7 @@ yynewstate:
Format: yyS[yypt-11].item.(*string),
OnDuplicate: yyS[yypt-10].item.(ast.OnDuplicateKeyHandlingType),
Table: yyS[yypt-7].item.(*ast.TableName),
Charset: yyS[yypt-6].item.(*string),
FieldsInfo: yyS[yypt-5].item.(*ast.FieldsClause),
LinesInfo: yyS[yypt-4].item.(*ast.LinesClause),
IgnoreLines: yyS[yypt-3].item.(*uint64),
Expand Down Expand Up @@ -21976,6 +21977,15 @@ yynewstate:
v := getUint64FromNUM(yyS[yypt-1].item)
parser.yyVAL.item = &v
}
case 2491:
{
parser.yyVAL.item = (*string)(nil)
}
case 2492:
{
v := yyS[yypt-0].ident
parser.yyVAL.item = &v
}
case 2493:
{
parser.yyVAL.item = nil
Expand Down
11 changes: 9 additions & 2 deletions parser/parser.y
Original file line number Diff line number Diff line change
Expand Up @@ -1014,6 +1014,7 @@ import (
Boolean "Boolean (0, 1, false, true)"
OptionalBraces "optional braces"
CastType "Cast function target type"
CharsetOpt "CHARACTER SET option in LOAD DATA"
ColumnDef "table column definition"
ColumnDefList "table column definition list"
ColumnName "column name"
Expand Down Expand Up @@ -13723,6 +13724,7 @@ LoadDataStmt:
Format: $6.(*string),
OnDuplicate: $7.(ast.OnDuplicateKeyHandlingType),
Table: $10.(*ast.TableName),
Charset: $11.(*string),
FieldsInfo: $12.(*ast.FieldsClause),
LinesInfo: $13.(*ast.LinesClause),
IgnoreLines: $14.(*uint64),
Expand Down Expand Up @@ -13770,8 +13772,14 @@ IgnoreLines:
}

/*
 * CharsetOpt is the optional CHARACTER SET clause of LOAD DATA. It yields a
 * *string: nil when the clause is omitted, otherwise a pointer to the name
 * produced by CharsetName.
 *
 * NOTE(review): the bare `{}` directly below looks like the pre-change action
 * retained by the diff view rather than part of the final grammar - confirm
 * against the actual parser.y.
 */
CharsetOpt:
{}
{
// Yield a typed nil so the `.(*string)` assertion in the LoadDataStmt
// action still succeeds when no clause is given.
$$ = (*string)(nil)
}
| "CHARACTER" "SET" CharsetName
{
// Copy the name into a local so its address can be stored.
v := $3
$$ = &v
}

LocalOpt:
{
Expand Down Expand Up @@ -14673,5 +14681,4 @@ CalibrateResourceStmt:
{
$$ = &ast.CalibrateResourceStmt{}
}

%%
8 changes: 4 additions & 4 deletions parser/parser_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -609,7 +609,7 @@ func TestDMLStmt(t *testing.T) {
// load data
{"load data local infile '/tmp/t.csv' into table t1 fields terminated by ',' optionally enclosed by '\"' ignore 1 lines", true, "LOAD DATA LOCAL INFILE '/tmp/t.csv' IGNORE INTO TABLE `t1` FIELDS TERMINATED BY ',' OPTIONALLY ENCLOSED BY '\"' IGNORE 1 LINES"},
{"load data infile '/tmp/t.csv' into table t", true, "LOAD DATA INFILE '/tmp/t.csv' INTO TABLE `t`"},
{"load data infile '/tmp/t.csv' into table t character set utf8", true, "LOAD DATA INFILE '/tmp/t.csv' INTO TABLE `t`"},
{"load data infile '/tmp/t.csv' into table t character set utf8", true, "LOAD DATA INFILE '/tmp/t.csv' INTO TABLE `t` CHARACTER SET utf8"},
{"load data infile '/tmp/t.csv' into table t fields terminated by 'ab'", true, "LOAD DATA INFILE '/tmp/t.csv' INTO TABLE `t` FIELDS TERMINATED BY 'ab'"},
{"load data infile '/tmp/t.csv' into table t columns terminated by 'ab'", true, "LOAD DATA INFILE '/tmp/t.csv' INTO TABLE `t` FIELDS TERMINATED BY 'ab'"},
{"load data infile '/tmp/t.csv' into table t fields terminated by 'ab' enclosed by 'b'", true, "LOAD DATA INFILE '/tmp/t.csv' INTO TABLE `t` FIELDS TERMINATED BY 'ab' ENCLOSED BY 'b'"},
Expand All @@ -623,7 +623,7 @@ func TestDMLStmt(t *testing.T) {
{"load data local infile '/tmp/t.csv' into table t columns terminated by 'ab'", true, "LOAD DATA LOCAL INFILE '/tmp/t.csv' IGNORE INTO TABLE `t` FIELDS TERMINATED BY 'ab'"},
{"load data local infile '/tmp/t.csv' into table t fields terminated by 'ab' enclosed by 'b'", true, "LOAD DATA LOCAL INFILE '/tmp/t.csv' IGNORE INTO TABLE `t` FIELDS TERMINATED BY 'ab' ENCLOSED BY 'b'"},
{"load data local infile '/tmp/t.csv' into table t fields terminated by 'ab' enclosed by 'b' escaped by '*'", true, "LOAD DATA LOCAL INFILE '/tmp/t.csv' IGNORE INTO TABLE `t` FIELDS TERMINATED BY 'ab' ENCLOSED BY 'b' ESCAPED BY '*'"},
{"load data local infile '/tmp/t.csv' into table t character set utf8 fields terminated by 'ab' enclosed by 'b' escaped by '*'", true, "LOAD DATA LOCAL INFILE '/tmp/t.csv' IGNORE INTO TABLE `t` FIELDS TERMINATED BY 'ab' ENCLOSED BY 'b' ESCAPED BY '*'"},
{"load data local infile '/tmp/t.csv' into table t character set utf8 fields terminated by 'ab' enclosed by 'b' escaped by '*'", true, "LOAD DATA LOCAL INFILE '/tmp/t.csv' IGNORE INTO TABLE `t` CHARACTER SET utf8 FIELDS TERMINATED BY 'ab' ENCLOSED BY 'b' ESCAPED BY '*'"},
{"load data local infile '/tmp/t.csv' into table t lines starting by 'ab'", true, "LOAD DATA LOCAL INFILE '/tmp/t.csv' IGNORE INTO TABLE `t` LINES STARTING BY 'ab'"},
{"load data local infile '/tmp/t.csv' into table t lines starting by 'ab' terminated by 'xy'", true, "LOAD DATA LOCAL INFILE '/tmp/t.csv' IGNORE INTO TABLE `t` LINES STARTING BY 'ab' TERMINATED BY 'xy'"},
{"load data local infile '/tmp/t.csv' into table t fields terminated by 'ab' lines terminated by 'xy'", true, "LOAD DATA LOCAL INFILE '/tmp/t.csv' IGNORE INTO TABLE `t` FIELDS TERMINATED BY 'ab' LINES TERMINATED BY 'xy'"},
Expand All @@ -634,10 +634,10 @@ func TestDMLStmt(t *testing.T) {
{"load data local infile '/tmp/t.csv' into table t columns terminated by 'ab' (a,b)", true, "LOAD DATA LOCAL INFILE '/tmp/t.csv' IGNORE INTO TABLE `t` FIELDS TERMINATED BY 'ab' (`a`,`b`)"},
{"load data local infile '/tmp/t.csv' into table t fields terminated by 'ab' enclosed by 'b' (a,b)", true, "LOAD DATA LOCAL INFILE '/tmp/t.csv' IGNORE INTO TABLE `t` FIELDS TERMINATED BY 'ab' ENCLOSED BY 'b' (`a`,`b`)"},
{"load data local infile '/tmp/t.csv' into table t fields terminated by 'ab' enclosed by 'b' escaped by '*' (a,b)", true, "LOAD DATA LOCAL INFILE '/tmp/t.csv' IGNORE INTO TABLE `t` FIELDS TERMINATED BY 'ab' ENCLOSED BY 'b' ESCAPED BY '*' (`a`,`b`)"},
{"load data local infile '/tmp/t.csv' into table t character set utf8 fields terminated by 'ab' enclosed by 'b' escaped by '*' (a,b)", true, "LOAD DATA LOCAL INFILE '/tmp/t.csv' IGNORE INTO TABLE `t` FIELDS TERMINATED BY 'ab' ENCLOSED BY 'b' ESCAPED BY '*' (`a`,`b`)"},
{"load data local infile '/tmp/t.csv' into table t character set utf8 fields terminated by 'ab' enclosed by 'b' escaped by '*' (a,b)", true, "LOAD DATA LOCAL INFILE '/tmp/t.csv' IGNORE INTO TABLE `t` CHARACTER SET utf8 FIELDS TERMINATED BY 'ab' ENCLOSED BY 'b' ESCAPED BY '*' (`a`,`b`)"},
{"load data local infile '/tmp/t.csv' into table t lines starting by 'ab' (a,b)", true, "LOAD DATA LOCAL INFILE '/tmp/t.csv' IGNORE INTO TABLE `t` LINES STARTING BY 'ab' (`a`,`b`)"},
{"load data local infile '/tmp/t.csv' into table t lines starting by 'ab' terminated by 'xy' (a,b)", true, "LOAD DATA LOCAL INFILE '/tmp/t.csv' IGNORE INTO TABLE `t` LINES STARTING BY 'ab' TERMINATED BY 'xy' (`a`,`b`)"},
{"load data local infile '/tmp/t.csv' into table t character set utf8 fields terminated by 'ab' lines terminated by 'xy' (a,b)", true, "LOAD DATA LOCAL INFILE '/tmp/t.csv' IGNORE INTO TABLE `t` FIELDS TERMINATED BY 'ab' LINES TERMINATED BY 'xy' (`a`,`b`)"},
{"load data local infile '/tmp/t.csv' into table t character set utf8 fields terminated by 'ab' lines terminated by 'xy' (a,b)", true, "LOAD DATA LOCAL INFILE '/tmp/t.csv' IGNORE INTO TABLE `t` CHARACTER SET utf8 FIELDS TERMINATED BY 'ab' LINES TERMINATED BY 'xy' (`a`,`b`)"},
{"load data local infile '/tmp/t.csv' into table t fields terminated by 'ab' lines terminated by 'xy' (a,b)", true, "LOAD DATA LOCAL INFILE '/tmp/t.csv' IGNORE INTO TABLE `t` FIELDS TERMINATED BY 'ab' LINES TERMINATED BY 'xy' (`a`,`b`)"},
{"load data local infile '/tmp/t.csv' into table t (a,b) fields terminated by 'ab'", false, ""},
{"load data local infile '/tmp/t.csv' into table t ignore 1 lines", true, "LOAD DATA LOCAL INFILE '/tmp/t.csv' IGNORE INTO TABLE `t` IGNORE 1 LINES"},
Expand Down
1 change: 1 addition & 0 deletions planner/core/common_plans.go
Original file line number Diff line number Diff line change
Expand Up @@ -552,6 +552,7 @@ type LoadData struct {
Path string
Format *string
Table *ast.TableName
Charset *string
Columns []*ast.ColumnName
FieldsInfo *ast.FieldsClause
LinesInfo *ast.LinesClause
Expand Down
1 change: 1 addition & 0 deletions planner/core/planbuilder.go
Original file line number Diff line number Diff line change
Expand Up @@ -4224,6 +4224,7 @@ func (b *PlanBuilder) buildLoadData(ctx context.Context, ld *ast.LoadDataStmt) (
Path: ld.Path,
Format: ld.Format,
Table: ld.Table,
Charset: ld.Charset,
Columns: ld.Columns,
FieldsInfo: ld.FieldsInfo,
LinesInfo: ld.LinesInfo,
Expand Down