Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

parser: add IsValid() to Encoding to speed up string validation for UTF-8 #30937

Merged
merged 4 commits into from
Dec 22, 2021
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion expression/builtin_string.go
Original file line number Diff line number Diff line change
Expand Up @@ -1150,7 +1150,7 @@ func (b *builtinConvertSig) evalString(row chunk.Row) (string, bool, error) {
return string(ret), false, err
}
enc := charset.FindEncoding(resultTp.Charset)
if !charset.IsValidString(enc, expr) {
if !enc.IsValid(hack.Slice(expr)) {
replace, _ := enc.Transform(nil, hack.Slice(expr), charset.OpReplace)
return string(replace), false, nil
}
Expand Down
2 changes: 1 addition & 1 deletion expression/builtin_string_vec.go
Original file line number Diff line number Diff line change
Expand Up @@ -689,7 +689,7 @@ func (b *builtinConvertSig) vecEvalString(input *chunk.Chunk, result *chunk.Colu
continue
}
exprI := expr.GetBytes(i)
if !charset.IsValid(enc, exprI) {
if !enc.IsValid(exprI) {
encBuf, _ = enc.Transform(encBuf, exprI, charset.OpReplace)
result.AppendBytes(encBuf)
} else {
Expand Down
3 changes: 2 additions & 1 deletion expression/collation.go
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ import (
"github.com/pingcap/tidb/types"
"github.com/pingcap/tidb/util/chunk"
"github.com/pingcap/tidb/util/collate"
"github.com/pingcap/tidb/util/hack"
"github.com/pingcap/tidb/util/logutil"
)

Expand Down Expand Up @@ -315,7 +316,7 @@ func safeConvert(ctx sessionctx.Context, ec *ExprCollation, args ...Expression)
if isNull {
continue
}
if !charset.IsValidString(enc, str) {
if !enc.IsValid(hack.Slice(str)) {
return false
}
} else {
Expand Down
17 changes: 2 additions & 15 deletions parser/charset/encoding.go
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,8 @@ type Encoding interface {
Tp() EncodingTp
// Peek returns the next char.
Peek(src []byte) []byte
// IsValid checks whether the UTF-8 bytes can be converted to a valid string in the current encoding.
IsValid(src []byte) bool
// Foreach iterates over the characters in the current encoding.
Foreach(src []byte, op Op, fn func(from, to []byte, ok bool) bool)
// Transform maps the bytes in src to dest according to Op.
Expand Down Expand Up @@ -101,21 +103,6 @@ const (
OpDecodeReplace = opToUTF8 | opTruncateReplace | opCollectTo
)

// IsValid reports whether the bytes in src are valid in the encoding e.
// It walks src with e.Foreach using opFromUTF8 and returns the ok flag handed
// to the most recent callback; if the callback never runs, the result is true.
func IsValid(e Encoding, src []byte) bool {
	valid := true
	visit := func(_, _ []byte, ok bool) bool {
		valid = ok
		// NOTE(review): returning false presumably stops Foreach at the
		// first invalid character — confirm against the Foreach contract.
		return ok
	}
	e.Foreach(src, opFromUTF8, visit)
	return valid
}

// IsValidString is the string counterpart of IsValid: it converts str with
// Slice and passes the result to IsValid.
func IsValidString(e Encoding, str string) bool {
	raw := Slice(str)
	return IsValid(e, raw)
}

// CountValidBytes counts the leading valid bytes in src that
// can be encoded to the current encoding.
func CountValidBytes(e Encoding, src []byte) int {
Expand Down
13 changes: 12 additions & 1 deletion parser/charset/encoding_ascii.go
Original file line number Diff line number Diff line change
Expand Up @@ -49,8 +49,19 @@ func (e *encodingASCII) Peek(src []byte) []byte {
return src[:1]
}

// IsValid implements Encoding interface.
// It reports whether every byte of src is no greater than
// go_unicode.MaxASCII; an empty src is valid.
func (e *encodingASCII) IsValid(src []byte) bool {
	for _, c := range src {
		if c > go_unicode.MaxASCII {
			return false
		}
	}
	return true
}

func (e *encodingASCII) Transform(dest, src []byte, op Op) ([]byte, error) {
if IsValid(e, src) {
if e.IsValid(src) {
return src, nil
}
return e.encodingBase.Transform(dest, src, op)
Expand Down
9 changes: 9 additions & 0 deletions parser/charset/encoding_base.go
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,15 @@ func (b encodingBase) ToLower(src string) string {
return strings.ToLower(src)
}

// IsValid reports whether src is valid according to b.self.
// It walks src via b.self.Foreach with opFromUTF8 and returns the ok flag
// handed to the most recent callback; if the callback never runs, the result
// is true.
func (b encodingBase) IsValid(src []byte) bool {
	valid := true
	check := func(_, _ []byte, ok bool) bool {
		valid = ok
		// NOTE(review): returning false presumably stops Foreach at the
		// first invalid character — confirm against the Foreach contract.
		return ok
	}
	b.self.Foreach(src, opFromUTF8, check)
	return valid
}

func (b encodingBase) Transform(dest, src []byte, op Op) (result []byte, err error) {
if dest == nil {
dest = make([]byte, len(src))
Expand Down
5 changes: 5 additions & 0 deletions parser/charset/encoding_bin.go
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,11 @@ func (e *encodingBin) Peek(src []byte) []byte {
return src[:1]
}

// IsValid implements Encoding interface.
// It returns true unconditionally: src is never inspected.
func (*encodingBin) IsValid(src []byte) bool {
	return true
}

// Foreach implements Encoding interface.
func (e *encodingBin) Foreach(src []byte, op Op, fn func(from, to []byte, ok bool) bool) {
for i := 0; i < len(src); i++ {
Expand Down
5 changes: 5 additions & 0 deletions parser/charset/encoding_latin1.go
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,11 @@ func (e *encodingLatin1) Peek(src []byte) []byte {
return src[:1]
}

// IsValid implements Encoding interface.
// It returns true unconditionally: src is never inspected.
func (*encodingLatin1) IsValid(src []byte) bool {
	return true
}

// Tp implements Encoding interface.
func (e *encodingLatin1) Tp() EncodingTp {
return EncodingTpLatin1
Expand Down
3 changes: 1 addition & 2 deletions parser/charset/encoding_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -133,8 +133,7 @@ func TestEncodingValidate(t *testing.T) {
enc = charset.EncodingUTF8MB3StrictImpl
}
strBytes := []byte(tc.str)
ok := charset.IsValid(enc, strBytes)
require.Equal(t, tc.ok, ok, msg)
require.Equal(t, tc.ok, enc.IsValid(strBytes), msg)
replace, _ := enc.Transform(nil, strBytes, charset.OpReplace)
require.Equal(t, tc.expected, string(replace), msg)
}
Expand Down
17 changes: 15 additions & 2 deletions parser/charset/encoding_utf8.go
Original file line number Diff line number Diff line change
Expand Up @@ -67,9 +67,17 @@ func (e *encodingUTF8) Peek(src []byte) []byte {
return src[:nextLen]
}

// IsValid implements Encoding interface.
// Input accepted by utf8.Valid is reported valid immediately; anything else
// is re-checked by the encodingBase implementation.
func (e *encodingUTF8) IsValid(src []byte) bool {
	return utf8.Valid(src) || e.encodingBase.IsValid(src)
}

// Transform implements Encoding interface.
func (e *encodingUTF8) Transform(dest, src []byte, op Op) ([]byte, error) {
if IsValid(e, src) {
if e.IsValid(src) {
return src, nil
}
return e.encodingBase.Transform(dest, src, op)
Expand All @@ -93,6 +101,11 @@ type encodingUTF8MB3Strict struct {
encodingUTF8
}

// IsValid implements Encoding interface.
// It calls encodingBase.IsValid directly rather than going through the
// utf8.Valid shortcut in encodingUTF8.IsValid.
// NOTE(review): presumably because utf8.Valid also accepts 4-byte sequences
// that this strict 3-byte charset must reject — confirm.
func (e *encodingUTF8MB3Strict) IsValid(src []byte) bool {
	ok := e.encodingBase.IsValid(src)
	return ok
}

// Foreach implements Encoding interface.
func (e *encodingUTF8MB3Strict) Foreach(src []byte, op Op, fn func(srcCh, dstCh []byte, ok bool) bool) {
for i, w := 0, 0; i < len(src); i += w {
Expand All @@ -107,7 +120,7 @@ func (e *encodingUTF8MB3Strict) Foreach(src []byte, op Op, fn func(srcCh, dstCh

// Transform implements Encoding interface.
func (e *encodingUTF8MB3Strict) Transform(dest, src []byte, op Op) ([]byte, error) {
if IsValid(e, src) {
if e.IsValid(src) {
return src, nil
}
return e.encodingBase.Transform(dest, src, op)
Expand Down
2 changes: 1 addition & 1 deletion table/column.go
Original file line number Diff line number Diff line change
Expand Up @@ -372,7 +372,7 @@ func validateStringDatum(ctx sessionctx.Context, origin, casted *types.Datum, co
}
// Check if the string is valid in the given column charset.
str := casted.GetBytes()
if !charset.IsValid(enc, str) {
if !enc.IsValid(str) {
replace, _ := enc.Transform(nil, str, charset.OpReplace)
casted.SetBytesAsString(replace, charset.CollationUTF8MB4, 0)
nSrc := charset.CountValidBytes(enc, str)
Expand Down