From 122d26bc8c5f16a34079f14db38390dec263868f Mon Sep 17 00:00:00 2001 From: winkyao Date: Wed, 19 Dec 2018 21:41:39 +0800 Subject: [PATCH] table: check non-BMP characters and return error when the charset is utf8 and sql mode is strict mode (#8738) (#8754) --- executor/statement_context_test.go | 20 ++++++++++++++++++++ table/column.go | 28 +++++++++++++++++++++------- 2 files changed, 41 insertions(+), 7 deletions(-) diff --git a/executor/statement_context_test.go b/executor/statement_context_test.go index df7d86d4115a8..895e7a81e2df6 100644 --- a/executor/statement_context_test.go +++ b/executor/statement_context_test.go @@ -81,4 +81,24 @@ func (s *testSuite) TestStatementContext(c *C) { tk.MustExec("set @@tidb_skip_utf8_check = '0'") runeErrStr := string(utf8.RuneError) tk.MustExec(fmt.Sprintf("insert sc2 values ('%s')", runeErrStr)) + + // Test non-BMP characters. + tk.MustExec(nonStrictModeSQL) + tk.MustExec("drop table if exists t1") + tk.MustExec("create table t1(a varchar(100) charset utf8);") + defer tk.MustExec("drop table if exists t1") + tk.MustExec("insert t1 values (unhex('f09f8c80'))") + c.Assert(tk.Se.GetSessionVars().StmtCtx.WarningCount(), Greater, uint16(0)) + tk.MustQuery("select * from t1").Check(testkit.Rows("")) + tk.MustExec("insert t1 values (unhex('4040f09f8c80'))") + c.Assert(tk.Se.GetSessionVars().StmtCtx.WarningCount(), Greater, uint16(0)) + tk.MustQuery("select * from t1").Check(testkit.Rows("", "@@")) + tk.MustQuery("select length(a) from t1").Check(testkit.Rows("0", "2")) + tk.MustExec(strictModeSQL) + _, err = tk.Exec("insert t1 values (unhex('f09f8c80'))") + c.Assert(err, NotNil) + c.Assert(terror.ErrorEqual(err, table.ErrTruncateWrongValue), IsTrue, Commentf("err %v", err)) + _, err = tk.Exec("insert t1 values (unhex('F0A48BAE'))") + c.Assert(err, NotNil) + c.Assert(terror.ErrorEqual(err, table.ErrTruncateWrongValue), IsTrue, Commentf("err %v", err)) } diff --git a/table/column.go b/table/column.go index bd875505aafc2..5f6c64ffe5238 100644 --- a/table/column.go +++ b/table/column.go @@ -145,6 +145,16 @@ func CastValues(ctx sessionctx.Context, rec []types.Datum, cols []*Column) (err return nil } +func handleWrongUtf8Value(ctx sessionctx.Context, col *model.ColumnInfo, casted *types.Datum, str string, i int) (types.Datum, error) { + sc := ctx.GetSessionVars().StmtCtx + err := ErrTruncateWrongValue.FastGen("incorrect utf8 value %x(%s) for column %s", casted.GetBytes(), str, col.Name) + log.Errorf("con:%d %v", ctx.GetSessionVars().ConnectionID, err) + // Truncate to valid utf8 string. + truncateVal := types.NewStringDatum(str[:i]) + err = sc.HandleTruncate(err) + return truncateVal, err +} + // CastValue casts a value based on column type. func CastValue(ctx sessionctx.Context, val types.Datum, col *model.ColumnInfo) (casted types.Datum, err error) { sc := ctx.GetSessionVars().StmtCtx @@ -166,18 +176,22 @@ func CastValue(ctx sessionctx.Context, val types.Datum, col *model.ColumnInfo) ( return casted, nil } str := casted.GetString() - for i, r := range str { - if r == utf8.RuneError { + utf8Charset := col.Charset == mysql.UTF8Charset + for i, w := 0, 0; i < len(str); i += w { + runeValue, width := utf8.DecodeRuneInString(str[i:]) + if runeValue == utf8.RuneError { if strings.HasPrefix(str[i:], string(utf8.RuneError)) { + w = width continue } - err = ErrTruncateWrongValue.FastGen("incorrect utf8 value %x(%s) for column %s", casted.GetBytes(), str, col.Name) - log.Errorf("con:%d %v", ctx.GetSessionVars().ConnectionID, err) - // Truncate to valid utf8 string. - casted = types.NewStringDatum(str[:i]) - err = sc.HandleTruncate(err) + casted, err = handleWrongUtf8Value(ctx, col, &casted, str, i) + break + } else if width > 3 && utf8Charset { + // Handle non-BMP characters. + casted, err = handleWrongUtf8Value(ctx, col, &casted, str, i) break } + w = width } return casted, errors.Trace(err)