Skip to content

Commit

Permalink
ruleguard/textmatch: an abstraction on top of regexp for performance (#…
Browse files Browse the repository at this point in the history
…281)

`textmatch.Compile()` takes a regexp pattern and tries to recognize
it, returning a matcher that can match input strings faster than a
real `*regexp.Regexp` would. If it can't recognize the pattern,
it returns a normal `*regexp.Regexp`.

Right now we only optimize the simplest patterns, but it's a
first step to prove that we can still use regexp in ruleguard
rules and avoid big performance losses.

```
name                       old time/op    new time/op    delta
Match/^\p{Lu}_0-8             153ns ± 4%      11ns ± 1%  -92.81%  (p=0.008 n=5+5)
Match/^\p{Lu}_1-8             140ns ± 2%      11ns ± 0%  -92.13%  (p=0.008 n=5+5)
Match/^\p{Ll}_0-8             152ns ± 1%      11ns ± 1%  -92.77%  (p=0.008 n=5+5)
Match/^\p{Ll}_1-8             140ns ± 2%      11ns ± 3%  -92.04%  (p=0.008 n=5+5)
Match/foo$_0-8                174ns ± 1%      13ns ± 1%  -92.26%  (p=0.008 n=5+5)
Match/foo$_1-8               83.4ns ± 2%    13.4ns ± 6%  -83.96%  (p=0.008 n=5+5)
Match/^foo_0-8                135ns ± 0%      10ns ± 1%  -92.33%  (p=0.016 n=4+5)
Match/^foo_1-8                108ns ± 4%      11ns ± 4%  -89.78%  (p=0.008 n=5+5)
Match/simpleIdent_0-8         243ns ± 2%      18ns ± 1%  -92.51%  (p=0.008 n=5+5)
Match/simpleIdent_1-8        92.7ns ± 1%    26.5ns ± 1%  -71.43%  (p=0.008 n=5+5)
Match/.*simpleIdent.*_0-8    1.59µs ± 2%    0.02µs ± 1%  -98.86%  (p=0.008 n=5+5)
Match/.*simpleIdent.*_1-8    1.70µs ± 1%    0.03µs ± 1%  -98.46%  (p=0.008 n=5+5)
Match/simpleIdent_0#01-8      237ns ± 1%      14ns ± 1%  -94.03%  (p=0.008 n=5+5)
Match/simpleIdent_1#01-8      247ns ± 1%      24ns ± 3%  -90.42%  (p=0.008 n=5+5)
[Geo mean]                    211ns           15ns       -93.00%
```
  • Loading branch information
quasilyte authored Oct 14, 2021
1 parent 4b7bdbb commit 7b21d77
Show file tree
Hide file tree
Showing 6 changed files with 356 additions and 6 deletions.
8 changes: 4 additions & 4 deletions ruleguard/filters.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,12 +6,12 @@ import (
"go/token"
"go/types"
"path/filepath"
"regexp"

"github.com/quasilyte/go-ruleguard/internal/gogrep"
"github.com/quasilyte/go-ruleguard/internal/xtypes"
"github.com/quasilyte/go-ruleguard/nodetag"
"github.com/quasilyte/go-ruleguard/ruleguard/quasigo"
"github.com/quasilyte/go-ruleguard/ruleguard/textmatch"
"github.com/quasilyte/go-ruleguard/ruleguard/typematch"
)

Expand Down Expand Up @@ -76,7 +76,7 @@ func makeFileImportsFilter(src, pkgPath string) filterFunc {
}
}

func makeFilePkgPathMatchesFilter(src string, re *regexp.Regexp) filterFunc {
func makeFilePkgPathMatchesFilter(src string, re textmatch.Pattern) filterFunc {
return func(params *filterParams) matchFilterResult {
pkgPath := params.ctx.Pkg.Path()
if re.MatchString(pkgPath) {
Expand All @@ -86,7 +86,7 @@ func makeFilePkgPathMatchesFilter(src string, re *regexp.Regexp) filterFunc {
}
}

func makeFileNameMatchesFilter(src string, re *regexp.Regexp) filterFunc {
func makeFileNameMatchesFilter(src string, re textmatch.Pattern) filterFunc {
return func(params *filterParams) matchFilterResult {
if re.MatchString(filepath.Base(params.filename)) {
return filterSuccess
Expand Down Expand Up @@ -373,7 +373,7 @@ func makeTextFilter(src, varname string, op token.Token, rhsVarname string) filt
}
}

func makeTextMatchesFilter(src, varname string, re *regexp.Regexp) filterFunc {
func makeTextMatchesFilter(src, varname string, re textmatch.Pattern) filterFunc {
// TODO(quasilyte): add variadic support.
return func(params *filterParams) matchFilterResult {
if re.Match(params.nodeText(params.subNode(varname))) {
Expand Down
5 changes: 3 additions & 2 deletions ruleguard/ir_loader.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ import (
"github.com/quasilyte/go-ruleguard/ruleguard/goutil"
"github.com/quasilyte/go-ruleguard/ruleguard/ir"
"github.com/quasilyte/go-ruleguard/ruleguard/quasigo"
"github.com/quasilyte/go-ruleguard/ruleguard/textmatch"
"github.com/quasilyte/go-ruleguard/ruleguard/typematch"
)

Expand Down Expand Up @@ -409,12 +410,12 @@ func (l *irLoader) unwrapInterfaceExpr(filter ir.FilterExpr) (*types.Interface,
return iface, nil
}

func (l *irLoader) unwrapRegexpExpr(filter ir.FilterExpr) (*regexp.Regexp, error) {
func (l *irLoader) unwrapRegexpExpr(filter ir.FilterExpr) (textmatch.Pattern, error) {
patternString := l.unwrapStringExpr(filter)
if patternString == "" {
return nil, l.errorf(filter.Line, nil, "expected a non-empty regexp pattern argument")
}
re, err := regexp.Compile(patternString)
re, err := textmatch.Compile(patternString)
if err != nil {
return nil, l.errorf(filter.Line, err, "compile regexp")
}
Expand Down
84 changes: 84 additions & 0 deletions ruleguard/textmatch/compile.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
package textmatch

import (
"regexp"
"regexp/syntax"
"unicode"
)

// compile builds a Pattern for the regular expression s.
//
// The pattern is first parsed with the Perl syntax flags and handed to
// compileOptimized; if that recognizes the pattern, the specialized matcher
// is returned. Otherwise (including when the parse itself fails) the pattern
// is compiled with regexp.Compile and its error, if any, is returned as is.
//
// On error the returned Pattern is a true nil interface value.
func compile(s string) (Pattern, error) {
	if parsed, err := syntax.Parse(s, syntax.Perl); err == nil {
		if optimized := compileOptimized(s, parsed); optimized != nil {
			return optimized, nil
		}
	}

	re, err := regexp.Compile(s)
	if err != nil {
		// Do not return re here: a nil *regexp.Regexp stored in the
		// Pattern interface would make the interface itself non-nil.
		return nil, err
	}
	return re, nil
}

// compileOptimized tries to recognize re, the parsed form of the pattern
// source s, as one of a few simple shapes and returns a specialized matcher
// for it. It returns nil when the pattern is not recognized.
func compileOptimized(s string, re *syntax.Regexp) Pattern {
	// .*
	isAny := func(re *syntax.Regexp) bool {
		return re.Op == syntax.OpStar && re.Sub[0].Op == syntax.OpAnyCharNotNL
	}
	// "literal"
	//
	// A literal parsed under the case-insensitive flag (e.g. `(?i)foo`)
	// is still an OpLiteral, but it carries syntax.FoldCase. The literal
	// matchers below compare the text exactly, so such literals must not
	// be optimized.
	isLit := func(re *syntax.Regexp) bool {
		return re.Op == syntax.OpLiteral && re.Flags&syntax.FoldCase == 0
	}
	// ^
	isBegin := func(re *syntax.Regexp) bool {
		return re.Op == syntax.OpBeginText
	}
	// $
	isEnd := func(re *syntax.Regexp) bool {
		return re.Op == syntax.OpEndText
	}

	// TODO: analyze what kind of regexps people use in rules
	// more often and optimize those as well.

	// lit => strings.Contains($input, lit)
	if isLit(re) {
		return &containsLiteralMatcher{value: newInputValue(string(re.Rune))}
	}

	if re.Op == syntax.OpConcat {
		switch sub := re.Sub; len(sub) {
		case 2:
			// `^` lit => strings.HasPrefix($input, lit)
			if isBegin(sub[0]) && isLit(sub[1]) {
				return &prefixLiteralMatcher{value: newInputValue(string(sub[1].Rune))}
			}
			// lit `$` => strings.HasSuffix($input, lit)
			if isLit(sub[0]) && isEnd(sub[1]) {
				return &suffixLiteralMatcher{value: newInputValue(string(sub[0].Rune))}
			}
		case 3:
			// `.*` lit `.*` => strings.Contains($input, lit)
			if isAny(sub[0]) && isLit(sub[1]) && isAny(sub[2]) {
				return &containsLiteralMatcher{value: newInputValue(string(sub[1].Rune))}
			}
			// `^` lit `$` => $input == lit
			if isBegin(sub[0]) && isLit(sub[1]) && isEnd(sub[2]) {
				return &eqLiteralMatcher{value: newInputValue(string(sub[1].Rune))}
			}
		}
	}

	// `^\p{Lu}` => prefixRunePredMatcher:unicode.IsUpper
	// `^\p{Ll}` => prefixRunePredMatcher:unicode.IsLower
	switch s {
	case `^\p{Lu}`:
		return &prefixRunePredMatcher{pred: unicode.IsUpper}
	case `^\p{Ll}`:
		return &prefixRunePredMatcher{pred: unicode.IsLower}
	}

	// Can't optimize.
	return nil
}
72 changes: 72 additions & 0 deletions ruleguard/textmatch/matchers.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
package textmatch

import (
"bytes"
"strings"
"unicode/utf8"
)

// inputValue keeps one piece of text in both of its forms: string and []byte.
//
// Storing the two side by side means that neither Match nor MatchString
// has to convert between string and []byte when it runs.
type inputValue struct {
	s string
	b []byte
}

// newInputValue returns an inputValue holding s together with its byte form.
func newInputValue(s string) inputValue {
	var v inputValue
	v.s = s
	v.b = []byte(s)
	return v
}

// containsLiteralMatcher matches inputs that have the stored literal
// somewhere inside them.
type containsLiteralMatcher struct {
	value inputValue
}

// MatchString reports whether s contains the literal.
func (m *containsLiteralMatcher) MatchString(s string) bool {
	needle := m.value.s
	return strings.Contains(s, needle)
}

// Match reports whether b contains the literal.
func (m *containsLiteralMatcher) Match(b []byte) bool {
	needle := m.value.b
	return bytes.Contains(b, needle)
}

// prefixLiteralMatcher matches inputs that begin with the stored literal.
type prefixLiteralMatcher struct {
	value inputValue
}

// MatchString reports whether s starts with the literal.
func (m *prefixLiteralMatcher) MatchString(s string) bool {
	prefix := m.value.s
	return strings.HasPrefix(s, prefix)
}

// Match reports whether b starts with the literal.
func (m *prefixLiteralMatcher) Match(b []byte) bool {
	prefix := m.value.b
	return bytes.HasPrefix(b, prefix)
}

// suffixLiteralMatcher matches inputs that end with the stored literal.
type suffixLiteralMatcher struct {
	value inputValue
}

// MatchString reports whether s ends with the literal.
func (m *suffixLiteralMatcher) MatchString(s string) bool {
	suffix := m.value.s
	return strings.HasSuffix(s, suffix)
}

// Match reports whether b ends with the literal.
func (m *suffixLiteralMatcher) Match(b []byte) bool {
	suffix := m.value.b
	return bytes.HasSuffix(b, suffix)
}

// eqLiteralMatcher matches only inputs that are exactly the stored literal.
type eqLiteralMatcher struct {
	value inputValue
}

// MatchString reports whether s is equal to the literal.
func (m *eqLiteralMatcher) MatchString(s string) bool {
	want := m.value.s
	return s == want
}

// Match reports whether b is byte-for-byte equal to the literal.
func (m *eqLiteralMatcher) Match(b []byte) bool {
	want := m.value.b
	return bytes.Equal(b, want)
}

// prefixRunePredMatcher matches inputs whose first rune satisfies pred.
//
// Empty input never matches: there is no first rune to test, so pred is
// not consulted at all in that case.
type prefixRunePredMatcher struct{ pred func(rune) bool }

// MatchString reports whether s is non-empty and pred accepts its first rune.
func (m *prefixRunePredMatcher) MatchString(s string) bool {
	r, size := utf8.DecodeRuneInString(s)
	// size is 0 only for empty input; the decoder then returns
	// utf8.RuneError, which must not be mistaken for a real first rune.
	return size != 0 && m.pred(r)
}

// Match reports whether b is non-empty and pred accepts its first rune.
func (m *prefixRunePredMatcher) Match(b []byte) bool {
	r, size := utf8.DecodeRune(b)
	// See MatchString: size 0 means there was nothing to decode.
	return size != 0 && m.pred(r)
}
26 changes: 26 additions & 0 deletions ruleguard/textmatch/textmatch.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
package textmatch

import "regexp"

// Pattern is a compiled regular expression that can be matched against
// text given either as a byte slice or as a string.
type Pattern interface {
	// Match reports whether b matches the pattern.
	Match(b []byte) bool

	// MatchString reports whether s matches the pattern.
	MatchString(s string) bool
}

// Compile parses the regular expression re and returns a compiled
// pattern that matches the inputs described by it.
//
// It behaves much like regexp.Compile, except that some common
// patterns are recognized and get a more optimized matcher.
func Compile(re string) (Pattern, error) {
	pattern, err := compile(re)
	return pattern, err
}

// IsRegexp reports whether p is backed by a *regexp.Regexp.
// A false result means that p is one of the optimized matchers instead.
func IsRegexp(p Pattern) bool {
	switch p.(type) {
	case *regexp.Regexp:
		return true
	default:
		return false
	}
}
Loading

0 comments on commit 7b21d77

Please sign in to comment.