ruleguard/textmatch: an abstraction on top of regexp for performance (#…

…281) `textmatch.Compile()` takes a regexp pattern and tries to recognize it, returning the matcher that can match the input strings faster than real `*regexp.Regexp` would. If it can't recognize the pattern, it returns a normal `*regexp.Regexp`. Right now we only optimize the simplest patterns, but it's a first step to prove that we can still use regexp in ruleguard rules and avoid big performance losses. ``` name old time/op new time/op delta Match/^\p{Lu}_0-8 153ns ± 4% 11ns ± 1% -92.81% (p=0.008 n=5+5) Match/^\p{Lu}_1-8 140ns ± 2% 11ns ± 0% -92.13% (p=0.008 n=5+5) Match/^\p{Ll}_0-8 152ns ± 1% 11ns ± 1% -92.77% (p=0.008 n=5+5) Match/^\p{Ll}_1-8 140ns ± 2% 11ns ± 3% -92.04% (p=0.008 n=5+5) Match/foo$_0-8 174ns ± 1% 13ns ± 1% -92.26% (p=0.008 n=5+5) Match/foo$_1-8 83.4ns ± 2% 13.4ns ± 6% -83.96% (p=0.008 n=5+5) Match/^foo_0-8 135ns ± 0% 10ns ± 1% -92.33% (p=0.016 n=4+5) Match/^foo_1-8 108ns ± 4% 11ns ± 4% -89.78% (p=0.008 n=5+5) Match/simpleIdent_0-8 243ns ± 2% 18ns ± 1% -92.51% (p=0.008 n=5+5) Match/simpleIdent_1-8 92.7ns ± 1% 26.5ns ± 1% -71.43% (p=0.008 n=5+5) Match/.*simpleIdent.*_0-8 1.59µs ± 2% 0.02µs ± 1% -98.86% (p=0.008 n=5+5) Match/.*simpleIdent.*_1-8 1.70µs ± 1% 0.03µs ± 1% -98.46% (p=0.008 n=5+5) Match/simpleIdent_0#01-8 237ns ± 1% 14ns ± 1% -94.03% (p=0.008 n=5+5) Match/simpleIdent_1#01-8 247ns ± 1% 24ns ± 3% -90.42% (p=0.008 n=5+5) [Geo mean] 211ns 15ns -93.00% ```
quasilyte · Oct 14, 2021 · 7b21d77 · 7b21d77
1 parent 4b7bdbb
commit 7b21d77
Show file tree

Hide file tree

Showing 6 changed files with 356 additions and 6 deletions.
diff --git a/ruleguard/filters.go b/ruleguard/filters.go
@@ -6,12 +6,12 @@ import (
 	"go/token"
 	"go/types"
 	"path/filepath"
-	"regexp"
 
 	"github.com/quasilyte/go-ruleguard/internal/gogrep"
 	"github.com/quasilyte/go-ruleguard/internal/xtypes"
 	"github.com/quasilyte/go-ruleguard/nodetag"
 	"github.com/quasilyte/go-ruleguard/ruleguard/quasigo"
+	"github.com/quasilyte/go-ruleguard/ruleguard/textmatch"
 	"github.com/quasilyte/go-ruleguard/ruleguard/typematch"
 )
 
@@ -76,7 +76,7 @@ func makeFileImportsFilter(src, pkgPath string) filterFunc {
 	}
 }
 
-func makeFilePkgPathMatchesFilter(src string, re *regexp.Regexp) filterFunc {
+func makeFilePkgPathMatchesFilter(src string, re textmatch.Pattern) filterFunc {
 	return func(params *filterParams) matchFilterResult {
 		pkgPath := params.ctx.Pkg.Path()
 		if re.MatchString(pkgPath) {
@@ -86,7 +86,7 @@ func makeFilePkgPathMatchesFilter(src string, re *regexp.Regexp) filterFunc {
 	}
 }
 
-func makeFileNameMatchesFilter(src string, re *regexp.Regexp) filterFunc {
+func makeFileNameMatchesFilter(src string, re textmatch.Pattern) filterFunc {
 	return func(params *filterParams) matchFilterResult {
 		if re.MatchString(filepath.Base(params.filename)) {
 			return filterSuccess
@@ -373,7 +373,7 @@ func makeTextFilter(src, varname string, op token.Token, rhsVarname string) filt
 	}
 }
 
-func makeTextMatchesFilter(src, varname string, re *regexp.Regexp) filterFunc {
+func makeTextMatchesFilter(src, varname string, re textmatch.Pattern) filterFunc {
 	// TODO(quasilyte): add variadic support.
 	return func(params *filterParams) matchFilterResult {
 		if re.Match(params.nodeText(params.subNode(varname))) {

diff --git a/ruleguard/ir_loader.go b/ruleguard/ir_loader.go
@@ -16,6 +16,7 @@ import (
 	"github.com/quasilyte/go-ruleguard/ruleguard/goutil"
 	"github.com/quasilyte/go-ruleguard/ruleguard/ir"
 	"github.com/quasilyte/go-ruleguard/ruleguard/quasigo"
+	"github.com/quasilyte/go-ruleguard/ruleguard/textmatch"
 	"github.com/quasilyte/go-ruleguard/ruleguard/typematch"
 )
 
@@ -409,12 +410,12 @@ func (l *irLoader) unwrapInterfaceExpr(filter ir.FilterExpr) (*types.Interface,
 	return iface, nil
 }
 
-func (l *irLoader) unwrapRegexpExpr(filter ir.FilterExpr) (*regexp.Regexp, error) {
+func (l *irLoader) unwrapRegexpExpr(filter ir.FilterExpr) (textmatch.Pattern, error) {
 	patternString := l.unwrapStringExpr(filter)
 	if patternString == "" {
 		return nil, l.errorf(filter.Line, nil, "expected a non-empty regexp pattern argument")
 	}
-	re, err := regexp.Compile(patternString)
+	re, err := textmatch.Compile(patternString)
 	if err != nil {
 		return nil, l.errorf(filter.Line, err, "compile regexp")
 	}

diff --git a/ruleguard/textmatch/compile.go b/ruleguard/textmatch/compile.go
@@ -0,0 +1,84 @@
+package textmatch
+
+import (
+	"regexp"
+	"regexp/syntax"
+	"unicode"
+)
+
+// compile builds a Pattern for the regexp source s.
+//
+// It first parses s; if the parsed tree has one of the shapes recognized
+// by compileOptimized, that specialized matcher is returned. Otherwise
+// (including when parsing fails) it falls back to regexp.Compile and
+// returns its result. On failure the returned Pattern is a true nil.
+func compile(s string) (Pattern, error) {
+	reSyntax, err := syntax.Parse(s, syntax.Perl)
+	if err == nil {
+		if optimized := compileOptimized(s, reSyntax); optimized != nil {
+			return optimized, nil
+		}
+	}
+	// Do not `return regexp.Compile(s)` directly: on failure that would
+	// wrap a nil *regexp.Regexp into a non-nil Pattern interface value.
+	compiled, err := regexp.Compile(s)
+	if err != nil {
+		return nil, err
+	}
+	return compiled, nil
+}
+
+// compileOptimized tries to map the parsed regexp re (whose source text
+// is s) onto one of the specialized matchers of this package.
+//
+// It returns nil when the pattern is not one of the recognized shapes;
+// the caller is then expected to fall back to the regexp package.
+func compileOptimized(s string, re *syntax.Regexp) Pattern {
+	// .*
+	isAny := func(re *syntax.Regexp) bool {
+		return re.Op == syntax.OpStar && re.Sub[0].Op == syntax.OpAnyCharNotNL
+	}
+	// "literal"
+	//
+	// A case-insensitive literal (e.g. `(?i)foo`) is parsed as OpLiteral too,
+	// but with the FoldCase flag set. The matchers below compare bytes exactly,
+	// so such literals must be left to the regexp package.
+	isLit := func(re *syntax.Regexp) bool {
+		return re.Op == syntax.OpLiteral && re.Flags&syntax.FoldCase == 0
+	}
+	// ^
+	isBegin := func(re *syntax.Regexp) bool {
+		return re.Op == syntax.OpBeginText
+	}
+	// $
+	isEnd := func(re *syntax.Regexp) bool {
+		return re.Op == syntax.OpEndText
+	}
+	// literal text of a node accepted by isLit.
+	litValue := func(re *syntax.Regexp) inputValue {
+		return newInputValue(string(re.Rune))
+	}
+
+	// TODO: analyze what kind of regexps people use in rules
+	// more often and optimize those as well.
+
+	// lit => strings.Contains($input, lit)
+	if isLit(re) {
+		return &containsLiteralMatcher{value: litValue(re)}
+	}
+
+	if re.Op == syntax.OpConcat {
+		switch len(re.Sub) {
+		case 2:
+			first, second := re.Sub[0], re.Sub[1]
+			// `^` lit => strings.HasPrefix($input, lit)
+			if isBegin(first) && isLit(second) {
+				return &prefixLiteralMatcher{value: litValue(second)}
+			}
+			// lit `$` => strings.HasSuffix($input, lit)
+			if isLit(first) && isEnd(second) {
+				return &suffixLiteralMatcher{value: litValue(first)}
+			}
+		case 3:
+			first, middle, last := re.Sub[0], re.Sub[1], re.Sub[2]
+			// `.*` lit `.*` => strings.Contains($input, lit)
+			if isAny(first) && isLit(middle) && isAny(last) {
+				return &containsLiteralMatcher{value: litValue(middle)}
+			}
+			// `^` lit `$` => $input == lit
+			if isBegin(first) && isLit(middle) && isEnd(last) {
+				return &eqLiteralMatcher{value: litValue(middle)}
+			}
+		}
+	}
+
+	// `^\p{Lu}` => prefixRunePredMatcher:unicode.IsUpper
+	// `^\p{Ll}` => prefixRunePredMatcher:unicode.IsLower
+	switch s {
+	case `^\p{Lu}`:
+		return &prefixRunePredMatcher{pred: unicode.IsUpper}
+	case `^\p{Ll}`:
+		return &prefixRunePredMatcher{pred: unicode.IsLower}
+	}
+
+	// Can't optimize.
+	return nil
+}
diff --git a/ruleguard/textmatch/matchers.go b/ruleguard/textmatch/matchers.go
@@ -0,0 +1,72 @@
+package textmatch
+
+import (
+	"bytes"
+	"strings"
+	"unicode/utf8"
+)
+
+// inputValue is a wrapper for string|[]byte.
+//
+// We hold both values to avoid string->[]byte and vice versa
+// conversions when doing Match and MatchString.
+// Values are built by newInputValue, which fills both fields
+// from the same string.
+type inputValue struct {
+	s string // string form, used by the MatchString methods
+	b []byte // byte-slice form, used by the Match methods
+}
+
+func newInputValue(s string) inputValue {
+	return inputValue{s: s, b: []byte(s)}
+}
+
+// containsLiteralMatcher matches inputs that contain
+// the stored literal at any position.
+type containsLiteralMatcher struct {
+	value inputValue
+}
+
+// MatchString reports whether s contains the literal.
+func (m *containsLiteralMatcher) MatchString(s string) bool {
+	lit := m.value.s
+	return strings.Contains(s, lit)
+}
+
+// Match reports whether b contains the literal.
+func (m *containsLiteralMatcher) Match(b []byte) bool {
+	lit := m.value.b
+	return bytes.Contains(b, lit)
+}
+
+// prefixLiteralMatcher matches inputs that start
+// with the stored literal.
+type prefixLiteralMatcher struct {
+	value inputValue
+}
+
+// MatchString reports whether s begins with the literal.
+func (m *prefixLiteralMatcher) MatchString(s string) bool {
+	lit := m.value.s
+	return strings.HasPrefix(s, lit)
+}
+
+// Match reports whether b begins with the literal.
+func (m *prefixLiteralMatcher) Match(b []byte) bool {
+	lit := m.value.b
+	return bytes.HasPrefix(b, lit)
+}
+
+// suffixLiteralMatcher matches inputs that end
+// with the stored literal.
+type suffixLiteralMatcher struct {
+	value inputValue
+}
+
+// MatchString reports whether s ends with the literal.
+func (m *suffixLiteralMatcher) MatchString(s string) bool {
+	lit := m.value.s
+	return strings.HasSuffix(s, lit)
+}
+
+// Match reports whether b ends with the literal.
+func (m *suffixLiteralMatcher) Match(b []byte) bool {
+	lit := m.value.b
+	return bytes.HasSuffix(b, lit)
+}
+
+// eqLiteralMatcher matches inputs that are exactly
+// equal to the stored literal.
+type eqLiteralMatcher struct {
+	value inputValue
+}
+
+// MatchString reports whether s is equal to the literal.
+func (m *eqLiteralMatcher) MatchString(s string) bool {
+	want := m.value.s
+	return s == want
+}
+
+// Match reports whether b is byte-for-byte equal to the literal.
+func (m *eqLiteralMatcher) Match(b []byte) bool {
+	want := m.value.b
+	return bytes.Equal(want, b)
+}
+
+// prefixRunePredMatcher matches inputs whose first decoded
+// rune satisfies pred.
+type prefixRunePredMatcher struct {
+	pred func(rune) bool
+}
+
+// MatchString decodes the first rune of s and returns pred's verdict on it.
+// The decoded size is not used.
+func (m *prefixRunePredMatcher) MatchString(s string) bool {
+	first, _ := utf8.DecodeRuneInString(s)
+	return m.pred(first)
+}
+
+// Match decodes the first rune of b and returns pred's verdict on it.
+// The decoded size is not used.
+func (m *prefixRunePredMatcher) Match(b []byte) bool {
+	first, _ := utf8.DecodeRune(b)
+	return m.pred(first)
+}
diff --git a/ruleguard/textmatch/textmatch.go b/ruleguard/textmatch/textmatch.go
@@ -0,0 +1,26 @@
+package textmatch
+
+import "regexp"
+
+// Pattern is a compiled regular expression.
+//
+// Both *regexp.Regexp and the optimized matchers of this
+// package implement it.
+type Pattern interface {
+	// MatchString reports whether the string s matches the pattern.
+	MatchString(s string) bool
+	// Match reports whether the byte slice b matches the pattern.
+	Match(b []byte) bool
+}
+
+// Compile parses a regular expression and returns a compiled
+// pattern that can match inputs described by the regexp.
+//
+// It behaves much like regexp.Compile, except that some common
+// pattern shapes are recognized and get a more optimized matcher.
+func Compile(re string) (Pattern, error) {
+	pattern, err := compile(re)
+	return pattern, err
+}
+
+// IsRegexp reports whether p is implemented using regexp.
+// False means that the underlying matcher is something optimized.
+func IsRegexp(p Pattern) bool {
+	_, ok := p.(*regexp.Regexp)
+	return ok
+}