Skip to content

Commit

Permalink
expression: support all regexp functions (#37407)
Browse files Browse the repository at this point in the history
close #23881
  • Loading branch information
xzhangxian1008 authored Sep 16, 2022
1 parent c19dc46 commit dcdbb87
Show file tree
Hide file tree
Showing 18 changed files with 2,901 additions and 201 deletions.
36 changes: 36 additions & 0 deletions ddl/db_integration_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -4098,3 +4098,39 @@ func TestDDLLastInfo(t *testing.T) {
tk.MustExec("drop table t, t2")
tk.MustQuery("select json_extract(@@tidb_last_ddl_info, '$.query'), json_extract(@@tidb_last_ddl_info, '$.seq_num')").Check(testkit.Rows(fmt.Sprintf("\"drop table t, t2\" %d", firstSequence+3)))
}

// TestRegexpFunctionsGeneratedColumn checks that regexp_like, regexp_substr,
// regexp_instr and regexp_replace can each be used as the expression of a
// generated column, and that the stored rows read back with the expected
// computed value.
func TestRegexpFunctionsGeneratedColumn(t *testing.T) {
	store := testkit.CreateMockStore(t)
	tk := testkit.NewTestKit(t, store)
	tk.MustExec("use test")

	// test regexp_like
	tk.MustExec("drop table if exists reg_like")
	tk.MustExec("create table reg_like(a varchar(50), b varchar(50), c int generated always as (regexp_like(a, b)))")
	tk.MustExec("insert into reg_like(a, b) values('123', '2')")
	tk.MustExec("insert into reg_like(a, b) values('456', '1')")
	tk.MustQuery("select * from reg_like").Check(testkit.Rows("123 2 1", "456 1 0"))

	// test regexp_substr
	tk.MustExec("drop table if exists reg_sub;")
	tk.MustExec("create table reg_sub(a varchar(50),b varchar(50),c varchar(50) generated always as (regexp_substr(a, b)))")
	tk.MustExec("insert into reg_sub(a, b) values('abcd', 'bc.')")
	tk.MustExec("insert into reg_sub(a, b) values('1234', '23.')")
	tk.MustQuery("select * from reg_sub").Check(testkit.Rows("abcd bc. bcd", "1234 23. 234"))

	// test regexp_instr
	tk.MustExec("drop table if exists reg_instr;")
	tk.MustExec("create table reg_instr(a varchar(50),b varchar(50),c varchar(50) generated always as (regexp_instr(a, b)))")
	tk.MustExec("insert into reg_instr(a, b) values('abcd', 'bc.')")
	tk.MustExec("insert into reg_instr(a, b) values('1234', '23.')")
	tk.MustQuery("select * from reg_instr").Check(testkit.Rows("abcd bc. 2", "1234 23. 2"))

	// test regexp_replace
	tk.MustExec("drop table if exists reg_replace;")
	tk.MustExec("create table reg_replace(a varchar(50),b varchar(50),c varchar(50),d varchar(50) generated always as (regexp_replace(a, b, c)));")
	tk.MustExec("insert into reg_replace(a, b, c) values('abcd', 'bc.', 'xzx')")
	tk.MustExec("insert into reg_replace(a, b, c) values('1234', '23.', 'xzx')")
	tk.MustQuery("select * from reg_replace").Check(testkit.Rows("abcd bc. xzx axzx", "1234 23. xzx 1xzx"))

	// Clean up every table this test created, not only reg_like.
	tk.MustExec("drop table if exists reg_like, reg_sub, reg_instr, reg_replace")
}
4 changes: 2 additions & 2 deletions executor/showtest/show_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -1438,8 +1438,8 @@ func TestShowBuiltin(t *testing.T) {
res := tk.MustQuery("show builtins;")
require.NotNil(t, res)
rows := res.Rows()
const builtinFuncNum = 277
require.Equal(t, len(rows), builtinFuncNum)
const builtinFuncNum = 281
require.Equal(t, builtinFuncNum, len(rows))
require.Equal(t, rows[0][0].(string), "abs")
require.Equal(t, rows[builtinFuncNum-1][0].(string), "yearweek")
}
Expand Down
6 changes: 5 additions & 1 deletion expression/builtin.go
Original file line number Diff line number Diff line change
Expand Up @@ -829,7 +829,11 @@ var funcs = map[string]functionClass{
ast.IsTruthWithNull: &isTrueOrFalseFunctionClass{baseFunctionClass{ast.IsTruthWithNull, 1, 1}, opcode.IsTruth, true},
ast.IsFalsity: &isTrueOrFalseFunctionClass{baseFunctionClass{ast.IsFalsity, 1, 1}, opcode.IsFalsity, false},
ast.Like: &likeFunctionClass{baseFunctionClass{ast.Like, 3, 3}},
ast.Regexp: &regexpFunctionClass{baseFunctionClass{ast.Regexp, 2, 2}},
ast.Regexp: &regexpLikeFunctionClass{baseFunctionClass{ast.Regexp, 2, 2}},
ast.RegexpLike: &regexpLikeFunctionClass{baseFunctionClass{ast.RegexpLike, 2, 3}},
ast.RegexpSubstr: &regexpSubstrFunctionClass{baseFunctionClass{ast.RegexpSubstr, 2, 5}},
ast.RegexpInStr: &regexpInStrFunctionClass{baseFunctionClass{ast.RegexpInStr, 2, 6}},
ast.RegexpReplace: &regexpReplaceFunctionClass{baseFunctionClass{ast.RegexpReplace, 3, 6}},
ast.Case: &caseWhenFunctionClass{baseFunctionClass{ast.Case, 1, -1}},
ast.RowFunc: &rowFunctionClass{baseFunctionClass{ast.RowFunc, 2, -1}},
ast.SetVar: &setVarFunctionClass{baseFunctionClass{ast.SetVar, 2, 2}},
Expand Down
2 changes: 1 addition & 1 deletion expression/builtin_convert_charset.go
Original file line number Diff line number Diff line change
Expand Up @@ -289,7 +289,7 @@ var convertActionMap = map[funcProp][]string{
/* string comparing */
ast.Like, ast.Strcmp,
/* regex */
ast.Regexp,
ast.Regexp, ast.RegexpLike, ast.RegexpInStr, ast.RegexpSubstr, ast.RegexpReplace,
/* math */
ast.CRC32,
},
Expand Down
103 changes: 0 additions & 103 deletions expression/builtin_like.go
Original file line number Diff line number Diff line change
Expand Up @@ -15,10 +15,8 @@
package expression

import (
"regexp"
"sync"

"github.com/pingcap/tidb/parser/charset"
"github.com/pingcap/tidb/sessionctx"
"github.com/pingcap/tidb/types"
"github.com/pingcap/tidb/util/chunk"
Expand All @@ -28,13 +26,10 @@ import (

// Compile-time checks that the function classes declared in this file are
// assignable to functionClass.
var (
	_ functionClass = (*likeFunctionClass)(nil)
	_ functionClass = (*regexpFunctionClass)(nil)
)

// Compile-time checks that the signatures declared in this file are
// assignable to builtinFunc.
var (
	_ builtinFunc = (*builtinLikeSig)(nil)
	_ builtinFunc = (*builtinRegexpSig)(nil)
	_ builtinFunc = (*builtinRegexpUTF8Sig)(nil)
)

type likeFunctionClass struct {
Expand Down Expand Up @@ -108,101 +103,3 @@ func (b *builtinLikeSig) evalInt(row chunk.Row) (int64, bool, error) {
}
return boolToInt64(b.pattern.DoMatch(valStr)), false, nil
}

// regexpFunctionClass is the function class that builds the REGEXP
// signatures defined in this file.
type regexpFunctionClass struct {
	baseFunctionClass
}

// getFunction validates args and returns the REGEXP signature to evaluate
// them with. The result type is an int with flen 1 and both arguments are
// evaluated as strings. A binary collation selects builtinRegexpSig; every
// other collation selects builtinRegexpUTF8Sig.
func (c *regexpFunctionClass) getFunction(ctx sessionctx.Context, args []Expression) (builtinFunc, error) {
	if verifyErr := c.verifyArgs(args); verifyErr != nil {
		return nil, verifyErr
	}

	base, err := newBaseBuiltinFuncWithTp(ctx, c.funcName, args, types.ETInt, types.ETString, types.ETString)
	if err != nil {
		return nil, err
	}
	base.tp.SetFlen(1)

	if base.collation == charset.CollationBin {
		binSig := newBuiltinRegexpSig(base)
		binSig.setPbCode(tipb.ScalarFuncSig_RegexpSig)
		return binSig, nil
	}

	utf8Sig := newBuiltinRegexpUTF8Sig(base)
	utf8Sig.setPbCode(tipb.ScalarFuncSig_RegexpUTF8Sig)
	return utf8Sig, nil
}

// builtinRegexpSharedSig carries the state shared by the binary and UTF-8
// REGEXP signatures.
type builtinRegexpSharedSig struct {
	baseBuiltinFunc
	// compile turns a pattern string into a compiled regexp.
	compile func(string) (*regexp.Regexp, error)
	// memorizedRegexp and memorizedErr cache the outcome of compiling a
	// constant pattern; both stay nil until initMemoizedRegexp has run.
	memorizedRegexp *regexp.Regexp
	memorizedErr    error
}

// clone copies the state of from into b.
//
// The compiled regexp is shared rather than duplicated: a *regexp.Regexp is
// safe for concurrent use, and Regexp.Copy is deprecated because copying has
// not been needed to avoid lock contention since Go 1.12. A separate copy
// would only matter if the clones called Longest independently, which this
// code never does.
func (b *builtinRegexpSharedSig) clone(from *builtinRegexpSharedSig) {
	b.cloneFrom(&from.baseBuiltinFunc)
	b.compile = from.compile
	b.memorizedRegexp = from.memorizedRegexp
	b.memorizedErr = from.memorizedErr
}

// evalInt evaluates `expr REGEXP pat` (also spelled `expr RLIKE pat`) for a
// single row. It yields NULL when either operand is NULL, and wraps a pattern
// compilation failure in ErrRegexp.
// See https://dev.mysql.com/doc/refman/5.7/en/regexp.html#operator_regexp
func (b *builtinRegexpSharedSig) evalInt(row chunk.Row) (int64, bool, error) {
	subject, subjectNull, err := b.args[0].EvalString(b.ctx, row)
	if err != nil || subjectNull {
		return 0, true, err
	}

	pattern, patternNull, err := b.args[1].EvalString(b.ctx, row)
	if err != nil || patternNull {
		return 0, true, err
	}

	compiled, compileErr := b.compile(pattern)
	if compileErr != nil {
		return 0, true, ErrRegexp.GenWithStackByArgs(compileErr.Error())
	}

	matched := compiled.MatchString(subject)
	return boolToInt64(matched), false, nil
}

// builtinRegexpSig is the REGEXP signature that compiles patterns with
// regexp.Compile as-is, without any case-folding prefix.
type builtinRegexpSig struct {
	builtinRegexpSharedSig
}

// newBuiltinRegexpSig wraps bf in a builtinRegexpSig whose compile function
// is regexp.Compile.
func newBuiltinRegexpSig(bf baseBuiltinFunc) *builtinRegexpSig {
	return &builtinRegexpSig{
		builtinRegexpSharedSig: builtinRegexpSharedSig{
			baseBuiltinFunc: bf,
			compile:         regexp.Compile,
		},
	}
}

// Clone returns a new builtinRegexpSig populated from b's shared state.
func (b *builtinRegexpSig) Clone() builtinFunc {
	cloned := new(builtinRegexpSig)
	cloned.clone(&b.builtinRegexpSharedSig)
	return cloned
}

// builtinRegexpUTF8Sig is the REGEXP signature whose pattern compilation
// depends on the collation of the function.
type builtinRegexpUTF8Sig struct {
	builtinRegexpSharedSig
}

// newBuiltinRegexpUTF8Sig wraps bf in a builtinRegexpUTF8Sig. When the
// collation of bf is case-insensitive, every pattern is compiled with a
// "(?i)" prefix; otherwise patterns are compiled unchanged.
func newBuiltinRegexpUTF8Sig(bf baseBuiltinFunc) *builtinRegexpUTF8Sig {
	compile := regexp.Compile
	if collate.IsCICollation(bf.collation) {
		compile = func(pattern string) (*regexp.Regexp, error) {
			return regexp.Compile("(?i)" + pattern)
		}
	}
	return &builtinRegexpUTF8Sig{
		builtinRegexpSharedSig: builtinRegexpSharedSig{
			baseBuiltinFunc: bf,
			compile:         compile,
		},
	}
}

// Clone returns a new builtinRegexpUTF8Sig populated from b's shared state.
func (b *builtinRegexpUTF8Sig) Clone() builtinFunc {
	cloned := new(builtinRegexpUTF8Sig)
	cloned.clone(&b.builtinRegexpSharedSig)
	return cloned
}
84 changes: 0 additions & 84 deletions expression/builtin_like_vec.go
Original file line number Diff line number Diff line change
Expand Up @@ -15,9 +15,6 @@
package expression

import (
"regexp"

"github.com/pingcap/errors"
"github.com/pingcap/tidb/util/chunk"
)

Expand Down Expand Up @@ -71,84 +68,3 @@ func (b *builtinLikeSig) vecEvalInt(input *chunk.Chunk, result *chunk.Column) er

return nil
}

// vectorized always reports true for builtinRegexpSig.
// NOTE(review): presumably this tells the caller that the vecEvalInt path may
// be used for this signature — confirm against the builtinFunc interface.
func (b *builtinRegexpSig) vectorized() bool {
return true
}

// vectorized always reports true for builtinRegexpUTF8Sig.
// NOTE(review): presumably this tells the caller that the vecEvalInt path may
// be used for this signature — confirm against the builtinFunc interface.
func (b *builtinRegexpUTF8Sig) vectorized() bool {
return true
}

// isMemorizedRegexpInitialized reports whether a memoized compilation result
// is present, i.e. whether either the cached regexp or the cached error has
// been set.
func (b *builtinRegexpSharedSig) isMemorizedRegexpInitialized() bool {
	return b.memorizedRegexp != nil || b.memorizedErr != nil
}

// initMemoizedRegexp compiles the first non-NULL pattern among the first n
// rows of patterns and caches the result on b.
//
// Precondition: patterns was produced by a constant expression, so one
// compiled pattern is valid for every row.
//
// With n == 0 nothing is cached, so that an empty input never produces a
// regexp error. If all n rows are NULL, an error is cached instead. Whenever
// an error is cached, the cached regexp is cleared.
func (b *builtinRegexpSharedSig) initMemoizedRegexp(patterns *chunk.Column, n int) {
	if n == 0 {
		return
	}

	for row := 0; row < n; row++ {
		if !patterns.IsNull(row) {
			b.memorizedRegexp, b.memorizedErr = b.compile(patterns.GetString(row))
			break
		}
	}

	if !b.isMemorizedRegexpInitialized() {
		b.memorizedErr = errors.New("No valid regexp pattern found")
	}
	if b.memorizedErr != nil {
		b.memorizedRegexp = nil
	}
}

// vecEvalInt evaluates REGEXP for every row of input and writes 0/1 values
// into result. A row is NULL in result when either the expression or the
// pattern is NULL for that row.
//
// When the pattern argument is a constant item, the pattern is compiled once
// and the memoized result is reused for every row; otherwise the pattern is
// compiled per row. The first compilation error aborts the evaluation and is
// returned as-is.
func (b *builtinRegexpSharedSig) vecEvalInt(input *chunk.Chunk, result *chunk.Column) error {
	rowCount := input.NumRows()

	exprCol, err := b.bufAllocator.get()
	if err != nil {
		return err
	}
	defer b.bufAllocator.put(exprCol)
	if evalErr := b.args[0].VecEvalString(b.ctx, input, exprCol); evalErr != nil {
		return evalErr
	}

	patCol, err := b.bufAllocator.get()
	if err != nil {
		return err
	}
	defer b.bufAllocator.put(patCol)
	if evalErr := b.args[1].VecEvalString(b.ctx, input, patCol); evalErr != nil {
		return evalErr
	}

	// A constant pattern only needs to be compiled once.
	if b.args[1].ConstItem(b.ctx.GetSessionVars().StmtCtx) && !b.isMemorizedRegexpInitialized() {
		b.initMemoizedRegexp(patCol, rowCount)
	}

	result.ResizeInt64(rowCount, false)
	result.MergeNulls(exprCol, patCol)
	matches := result.Int64s()
	for row := 0; row < rowCount; row++ {
		if result.IsNull(row) {
			continue
		}

		pattern := patCol.GetString(row)
		var (
			compiled   *regexp.Regexp
			compileErr error
		)
		if b.isMemorizedRegexpInitialized() {
			compiled, compileErr = b.memorizedRegexp, b.memorizedErr
		} else {
			compiled, compileErr = b.compile(pattern)
		}
		if compileErr != nil {
			return compileErr
		}

		matches[row] = boolToInt64(compiled.MatchString(exprCol.GetString(row)))
	}
	return nil
}
Loading

0 comments on commit dcdbb87

Please sign in to comment.