Skip to content

Commit

Permalink
Merge branch 'master' into support_database_placement_option_
Browse files Browse the repository at this point in the history
  • Loading branch information
zhaoxugang authored Sep 17, 2021
2 parents 7c9407f + ac71111 commit 4b866ff
Show file tree
Hide file tree
Showing 9 changed files with 184 additions and 62 deletions.
1 change: 1 addition & 0 deletions ast/ddl.go
Original file line number Diff line number Diff line change
Expand Up @@ -3268,6 +3268,7 @@ var (
ErrSystemVersioningWrongPartitions = terror.ClassDDL.NewStd(mysql.ErrSystemVersioningWrongPartitions)
ErrTooManyValues = terror.ClassDDL.NewStd(mysql.ErrTooManyValues)
ErrWrongPartitionTypeExpectedSystemTime = terror.ClassDDL.NewStd(mysql.ErrWrongPartitionTypeExpectedSystemTime)
ErrUnknownCharacterSet = terror.ClassDDL.NewStd(mysql.ErrUnknownCharacterSet)
)

type SubPartitionDefinition struct {
Expand Down
118 changes: 67 additions & 51 deletions charset/encoding.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,18 +14,20 @@
package charset

import (
"bytes"
"fmt"
"strings"

"github.com/cznic/mathutil"
"github.com/pingcap/parser/mysql"
"github.com/pingcap/parser/terror"
"golang.org/x/text/encoding"
"golang.org/x/text/transform"
)

const (
encodingBufferSizeDefault = 1024
encodingBufferSizeRecycleThreshold = 4 * 1024
const encodingLegacy = "utf-8" // utf-8 encoding is compatible with old default behavior.

encodingDefault = "utf-8"
)
var errInvalidCharacterString = terror.ClassParser.NewStd(mysql.ErrInvalidCharacterString)

type EncodingLabel string

Expand All @@ -44,7 +46,6 @@ type Encoding struct {
enc encoding.Encoding
name string
charLength func([]byte) int
buffer []byte
}

// Enabled indicates whether the non-utf8 encoding is used.
Expand All @@ -58,80 +59,95 @@ func (e *Encoding) Name() string {
}

// NewEncoding creates a new Encoding.
func NewEncoding(label EncodingLabel) *Encoding {
func NewEncoding(label string) *Encoding {
if len(label) == 0 {
return &Encoding{}
}
e, name := lookup(label)
if e != nil && name != encodingDefault {
e, name := Lookup(label)
if e != nil && name != encodingLegacy {
return &Encoding{
enc: e,
name: name,
charLength: FindNextCharacterLength(name),
buffer: make([]byte, encodingBufferSizeDefault),
}
}
return &Encoding{name: name}
}

// UpdateEncoding updates to a new Encoding without changing the buffer.
// UpdateEncoding updates to a new Encoding.
func (e *Encoding) UpdateEncoding(label EncodingLabel) {
enc, name := lookup(label)
e.name = name
if enc != nil && name != encodingDefault {
if enc != nil && name != encodingLegacy {
e.enc = enc
}
if len(e.buffer) == 0 {
e.buffer = make([]byte, encodingBufferSizeDefault)
e.charLength = FindNextCharacterLength(name)
} else {
e.enc = nil
e.charLength = nil
}
}

// Encode encodes the bytes to a string.
func (e *Encoding) Encode(src []byte) (string, bool) {
return e.transform(e.enc.NewEncoder(), src)
// Encode converts bytes from the utf-8 charset to a specific charset.
func (e *Encoding) Encode(dest, src []byte) ([]byte, error) {
return e.transform(e.enc.NewEncoder(), dest, src, false)
}

// Decode decodes the bytes to a string.
func (e *Encoding) Decode(src []byte) (string, bool) {
return e.transform(e.enc.NewDecoder(), src)
// Decode converts bytes from a specific charset to the utf-8 charset.
func (e *Encoding) Decode(dest, src []byte) ([]byte, error) {
return e.transform(e.enc.NewDecoder(), dest, src, true)
}

func (e *Encoding) transform(transformer transform.Transformer, src []byte) (string, bool) {
if len(e.buffer) < len(src) {
e.buffer = make([]byte, len(src)*2)
func (e *Encoding) transform(transformer transform.Transformer, dest, src []byte, isDecoding bool) ([]byte, error) {
if len(dest) < len(src) {
dest = make([]byte, len(src)*2)
}
var destOffset, srcOffset int
ok := true
var encodingErr error
for {
nextLen := 4
if e.charLength != nil {
nextLen = e.charLength(src[srcOffset:])
}
srcEnd := srcOffset + nextLen
if srcEnd > len(src) {
srcEnd = len(src)
srcNextLen := e.nextCharLenInSrc(src[srcOffset:], isDecoding)
srcEnd := mathutil.Min(srcOffset+srcNextLen, len(src))
nDest, nSrc, err := transformer.Transform(dest[destOffset:], src[srcOffset:srcEnd], false)
if err == transform.ErrShortDst {
dest = enlargeCapacity(dest)
} else if err != nil || isDecoding && beginWithReplacementChar(dest[destOffset:destOffset+nDest]) {
if encodingErr == nil {
encodingErr = e.generateErr(src[srcOffset:], srcNextLen)
}
dest[destOffset] = byte('?')
nDest, nSrc = 1, srcNextLen // skip the source bytes that cannot be decoded normally.
}
nDest, nSrc, err := transformer.Transform(e.buffer[destOffset:], src[srcOffset:srcEnd], false)
destOffset += nDest
srcOffset += nSrc
if err == nil {
if srcOffset >= len(src) {
result := string(e.buffer[:destOffset])
if len(e.buffer) > encodingBufferSizeRecycleThreshold {
// This prevents Encoding from holding too much memory.
e.buffer = make([]byte, encodingBufferSizeDefault)
}
return result, ok
}
} else if err == transform.ErrShortDst {
newDest := make([]byte, len(e.buffer)*2)
copy(newDest, e.buffer)
e.buffer = newDest
} else {
e.buffer[destOffset] = byte('?')
destOffset += 1
srcOffset += 1
ok = false
// The source bytes are exhausted.
if srcOffset >= len(src) {
return dest[:destOffset], encodingErr
}
}
}

// nextCharLenInSrc reports how many leading bytes of srcRest form the next
// unit of input. While decoding with a character-length function available,
// that function decides the length; in every other case (encoding, or no
// length function set) the whole remainder is returned as one unit.
func (e *Encoding) nextCharLenInSrc(srcRest []byte, isDecoding bool) int {
	if !isDecoding || e.charLength == nil {
		return len(srcRest)
	}
	return e.charLength(srcRest)
}

// enlargeCapacity returns a freshly allocated slice twice as long as dest
// whose leading bytes are a copy of dest. The input slice is not modified.
//
// An empty (or nil) dest yields a small non-empty slice instead of another
// empty one: doubling zero never grows, so a caller that retries after a
// "destination too short" condition would otherwise make no progress.
func enlargeCapacity(dest []byte) []byte {
	// Minimum size handed back when there is nothing to double.
	const minEnlargedLen = 16

	newLen := len(dest) * 2
	if newLen == 0 {
		newLen = minEnlargedLen
	}
	newDest := make([]byte, newLen)
	copy(newDest, dest)
	return newDest
}

// generateErr builds the "invalid character string" error for this encoding.
// The error arguments are the encoding name and the offending bytes rendered
// as upper-case hex. The offending bytes are the first srcNextLen bytes of
// srcRest, or all of srcRest when it is shorter than srcNextLen.
func (e *Encoding) generateErr(srcRest []byte, srcNextLen int) error {
	offending := srcRest
	if srcNextLen < len(offending) {
		offending = offending[:srcNextLen]
	}
	hexBytes := fmt.Sprintf("%X", string(offending))
	return errInvalidCharacterString.GenWithStackByArgs(e.name, hexBytes)
}

// replacementBytes is the UTF-8 encoding of the replacement rune U+FFFD.
var replacementBytes = []byte{0xEF, 0xBF, 0xBD}

// beginWithReplacementChar reports whether dst starts with the three bytes
// 0xEF 0xBF 0xBD. Inputs shorter than three bytes never match.
func beginWithReplacementChar(dst []byte) bool {
	n := len(replacementBytes)
	if len(dst) < n {
		return false
	}
	return bytes.Equal(dst[:n], replacementBytes)
}
3 changes: 3 additions & 0 deletions charset/encoding_table.go
Original file line number Diff line number Diff line change
Expand Up @@ -290,4 +290,7 @@ var encodingNextCharacterLength = map[string]func([]byte) int{
}
return 4
},
"binary": func(bs []byte) int {
return 1
},
}
76 changes: 76 additions & 0 deletions charset/encoding_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
// Copyright 2021 PingCAP, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// See the License for the specific language governing permissions and
// limitations under the License.

package charset_test

import (
. "github.com/pingcap/check"
"github.com/pingcap/parser/charset"
"golang.org/x/text/transform"
)

var _ = Suite(&testEncodingSuite{})

type testEncodingSuite struct {
}

// TestEncoding checks charset.Encoding against GBK in three steps: the name
// and enabled state after switching labels, an Encode/Decode round trip
// compared with the bytes produced by the x/text GBK encoder, and decoding of
// UTF-8 input as if it were GBK, where some inputs are expected to report an
// error and carry '?' in the output.
func (s *testEncodingSuite) TestEncoding(c *C) {
	enc := charset.NewEncoding("gbk")
	c.Assert(enc.Name(), Equals, "gbk")
	c.Assert(enc.Enabled(), IsTrue)
	enc.UpdateEncoding("utf-8")
	c.Assert(enc.Name(), Equals, "utf-8")
	enc.UpdateEncoding("gbk")
	c.Assert(enc.Name(), Equals, "gbk")
	c.Assert(enc.Enabled(), IsTrue)

	// Reference GBK bytes come straight from the underlying encoder.
	utf8Txt := []byte("一二三四")
	gbk, _ := charset.Lookup("gbk")
	wantGBK, _, err := transform.Bytes(gbk.NewEncoder(), utf8Txt)
	c.Assert(err, IsNil)
	decoded, err := enc.Decode(nil, wantGBK)
	c.Assert(err, IsNil)
	c.Assert(decoded, DeepEquals, utf8Txt)

	// Encode must agree with the reference bytes and decode back to the input.
	gotGBK, err := enc.Encode(nil, utf8Txt)
	c.Assert(err, IsNil)
	c.Assert(wantGBK, DeepEquals, gotGBK)
	decoded, err = enc.Decode(nil, gotGBK)
	c.Assert(err, IsNil)
	c.Assert(decoded, DeepEquals, utf8Txt)

	// Each utf8Str is fed to the GBK decoder as raw bytes; isValid says
	// whether decoding is expected to succeed without an error.
	gbkCases := []struct {
		utf8Str string
		result  string
		isValid bool
	}{
		{"一二三", "涓?簩涓?", false}, // MySQL reports '涓?簩涓'.
		{"一二三123", "涓?簩涓?23", false},
		{"案1案2", "妗?妗?", false},
		{"焊䏷菡釬", "鐒婁彿鑿¢嚞", true},
		{"鞍杏以伊位依", "闉嶆潖浠ヤ紛浣嶄緷", true},
		{"移維緯胃萎衣謂違", "绉荤董绶?儍钀庤。璎傞仌", false},
		{"仆仂仗仞仭仟价伉佚估", "浠嗕粋浠椾粸浠?粺浠蜂級浣氫及", false},
		{"佝佗佇佶侈侏侘佻佩佰侑佯", "浣濅綏浣囦蕉渚堜緩渚樹交浣╀桨渚戜蒋", true},
	}
	for _, tc := range gbkCases {
		cmt := Commentf("%v", tc)
		out, decodeErr := enc.Decode(nil, []byte(tc.utf8Str))
		switch {
		case tc.isValid:
			c.Assert(decodeErr, IsNil, cmt)
		default:
			c.Assert(decodeErr, NotNil, cmt)
		}
		c.Assert(string(out), Equals, tc.result, cmt)
	}
}
8 changes: 4 additions & 4 deletions lexer.go
Original file line number Diff line number Diff line change
Expand Up @@ -147,12 +147,12 @@ func (s *Scanner) tryDecodeToUTF8String(sql string) string {
}
return sql
}
utf8Lit, ok := s.encoding.Decode(Slice(sql))
if !ok {
s.AppendError(errors.Errorf("Cannot convert string '%x' from %s to utf8mb4", sql, s.encoding.Name()))
utf8Lit, err := s.encoding.Decode(nil, Slice(sql))
if err != nil {
s.AppendError(err)
s.lastErrorAsWarn()
}
return utf8Lit
return string(utf8Lit)
}

func (s *Scanner) getNextToken() int {
Expand Down
6 changes: 3 additions & 3 deletions parser.go
Original file line number Diff line number Diff line change
Expand Up @@ -14925,7 +14925,7 @@ yynewstate:
// See https://dev.mysql.com/doc/refman/5.7/en/charset-literal.html
co, err := charset.GetDefaultCollationLegacy(yyS[yypt-1].ident)
if err != nil {
yylex.AppendError(yylex.Errorf("Get collation error for charset: %s", yyS[yypt-1].ident))
yylex.AppendError(ast.ErrUnknownCharacterSet.GenWithStack("Unsupported character introducer: '%-.64s'", yyS[yypt-1].ident))
return 1
}
expr := ast.NewValueExpr(yyS[yypt-0].ident, parser.charset, parser.collation)
Expand All @@ -14949,7 +14949,7 @@ yynewstate:
{
co, err := charset.GetDefaultCollationLegacy(yyS[yypt-1].ident)
if err != nil {
yylex.AppendError(yylex.Errorf("Get collation error for charset: %s", yyS[yypt-1].ident))
yylex.AppendError(ast.ErrUnknownCharacterSet.GenWithStack("Unsupported character introducer: '%-.64s'", yyS[yypt-1].ident))
return 1
}
expr := ast.NewValueExpr(yyS[yypt-0].item, parser.charset, parser.collation)
Expand All @@ -14965,7 +14965,7 @@ yynewstate:
{
co, err := charset.GetDefaultCollationLegacy(yyS[yypt-1].ident)
if err != nil {
yylex.AppendError(yylex.Errorf("Get collation error for charset: %s", yyS[yypt-1].ident))
yylex.AppendError(ast.ErrUnknownCharacterSet.GenWithStack("Unsupported character introducer: '%-.64s'", yyS[yypt-1].ident))
return 1
}
expr := ast.NewValueExpr(yyS[yypt-0].item, parser.charset, parser.collation)
Expand Down
6 changes: 3 additions & 3 deletions parser.y
Original file line number Diff line number Diff line change
Expand Up @@ -6459,7 +6459,7 @@ Literal:
// See https://dev.mysql.com/doc/refman/5.7/en/charset-literal.html
co, err := charset.GetDefaultCollationLegacy($1)
if err != nil {
yylex.AppendError(yylex.Errorf("Get collation error for charset: %s", $1))
yylex.AppendError(ast.ErrUnknownCharacterSet.GenWithStack("Unsupported character introducer: '%-.64s'", $1))
return 1
}
expr := ast.NewValueExpr($2, parser.charset, parser.collation)
Expand All @@ -6483,7 +6483,7 @@ Literal:
{
co, err := charset.GetDefaultCollationLegacy($1)
if err != nil {
yylex.AppendError(yylex.Errorf("Get collation error for charset: %s", $1))
yylex.AppendError(ast.ErrUnknownCharacterSet.GenWithStack("Unsupported character introducer: '%-.64s'", $1))
return 1
}
expr := ast.NewValueExpr($2, parser.charset, parser.collation)
Expand All @@ -6499,7 +6499,7 @@ Literal:
{
co, err := charset.GetDefaultCollationLegacy($1)
if err != nil {
yylex.AppendError(yylex.Errorf("Get collation error for charset: %s", $1))
yylex.AppendError(ast.ErrUnknownCharacterSet.GenWithStack("Unsupported character introducer: '%-.64s'", $1))
return 1
}
expr := ast.NewValueExpr($2, parser.charset, parser.collation)
Expand Down
26 changes: 26 additions & 0 deletions parser_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -6341,6 +6341,32 @@ func (s *testParserSuite) TestPlanRecreator(c *C) {
c.Assert(v.Analyze, IsTrue)
}

// TestCharsetIntroducer verifies how the parser treats the `_gbk` prefix.
// Before a "gbk" charset is registered the statement parses without error;
// after registration, string, hex and bit literals introduced by `_gbk` are
// all rejected with the "Unsupported character introducer" error.
func (s *testParserSuite) TestCharsetIntroducer(c *C) {
	p := parser.New()

	// `_gbk` is treated as an identifier.
	_, _, err := p.Parse("select _gbk 'a';", "", "")
	c.Assert(err, IsNil)

	gbk := &charset.Charset{
		Name:             "gbk",
		DefaultCollation: "gbk_bin",
		Collations:       map[string]*charset.Collation{},
		Desc:             "gbk",
		Maxlen:           2,
	}
	charset.AddCharset(gbk)
	// Unregister on exit so the charset does not stay registered.
	defer charset.RemoveCharset("gbk")

	// `_gbk` is treated as a character set.
	const wantMsg = "[ddl:1115]Unsupported character introducer: 'gbk'"
	statements := []string{
		"select _gbk 'a';",
		"select _gbk 0x1234;",
		"select _gbk 0b101001;",
	}
	for _, sql := range statements {
		_, _, err = p.Parse(sql, "", "")
		c.Assert(err, NotNil)
		c.Assert(err.Error(), Equals, wantMsg)
	}
}

func (s *testParserSuite) TestGBKEncoding(c *C) {
p := parser.New()
gbkEncoding, _ := charset.Lookup("gbk")
Expand Down
2 changes: 1 addition & 1 deletion yy_parser.go
Original file line number Diff line number Diff line change
Expand Up @@ -134,7 +134,7 @@ func (parser *Parser) SetParserConfig(config ParserConfig) {
parser.EnableWindowFunc(config.EnableWindowFunction)
parser.SetStrictDoubleTypeCheck(config.EnableStrictDoubleTypeCheck)
parser.lexer.skipPositionRecording = config.SkipPositionRecording
parser.lexer.encoding = *charset.NewEncoding(charset.Format(config.CharsetClient))
parser.lexer.encoding = *charset.NewEncoding(config.CharsetClient)
}

// Parse parses a query string to raw ast.StmtNode.
Expand Down

0 comments on commit 4b866ff

Please sign in to comment.