diff --git a/docs/sources/query/query_accceleration.md b/docs/sources/query/query_accceleration.md index 48998fffb9dda..9117ecb209f23 100644 --- a/docs/sources/query/query_accceleration.md +++ b/docs/sources/query/query_accceleration.md @@ -26,6 +26,11 @@ If [bloom filters][] are enabled, you can write LogQL queries using [structured Queries will be accelerated for any [label filter expression][] that satisfies _all_ of the following criteria: * The label filter expression using **string equality**, such as `| key="value"`. + * `or` and `and` operators can be used to match multiple values, such as `| detected_level="error" or detected_level="warn"`. + * _Basic_ regular expressions are automatically simplified into a supported expression: + * `| key=~"value"` is converted to `| key="value"`. + * `| key=~"value1|value2"` is converted to `| key="value1" or key="value2"`. + * `| key=~".+"` checks for the existence of `key`. `.*` is not supported because it also matches an empty value. * The label filter expression is querying for structured metadata and not a stream label. * The label filter expression is placed before any [parser expression][], [labels format expression][], [drop labels expression][], or [keep labels expression][]. 
diff --git a/pkg/bloomgateway/processor_test.go b/pkg/bloomgateway/processor_test.go index f1120fe530a41..58ed638e39a18 100644 --- a/pkg/bloomgateway/processor_test.go +++ b/pkg/bloomgateway/processor_test.go @@ -141,7 +141,7 @@ func TestProcessor(t *testing.T) { } matchers := []v1.LabelMatcher{ - v1.PlainLabelMatcher{ + v1.KeyValueMatcher{ Key: "trace_id", Value: "nomatch", }, @@ -191,7 +191,7 @@ func TestProcessor(t *testing.T) { day: config.NewDayTime(truncateDay(now)), } matchers := []v1.LabelMatcher{ - v1.PlainLabelMatcher{ + v1.KeyValueMatcher{ Key: "trace_id", Value: "nomatch", }, @@ -238,7 +238,7 @@ func TestProcessor(t *testing.T) { day: config.NewDayTime(truncateDay(now)), } matchers := []v1.LabelMatcher{ - v1.PlainLabelMatcher{ + v1.KeyValueMatcher{ Key: "trace_id", Value: "nomatch", }, diff --git a/pkg/storage/bloom/v1/ast_extractor.go b/pkg/storage/bloom/v1/ast_extractor.go index 4c59c93e937fb..b6616e9a357f5 100644 --- a/pkg/storage/bloom/v1/ast_extractor.go +++ b/pkg/storage/bloom/v1/ast_extractor.go @@ -1,12 +1,24 @@ package v1 import ( + regexsyn "github.com/grafana/regexp/syntax" + "github.com/prometheus/prometheus/model/labels" "github.com/grafana/loki/v3/pkg/logql/log" "github.com/grafana/loki/v3/pkg/logql/syntax" + "github.com/grafana/loki/v3/pkg/util" ) +// Simplifiable regexp expressions can quickly expand into very high +// cardinality; we limit the number of matchers to prevent this. However, +// since bloom tests are relatively cheap to run, we can afford to be a little +// generous while still preventing excessive cardinality. +// +// For example, the regex `[0-9]` expands to 10 matchers (0, 1, .., 9), while +// `[0-9][0-9][0-9]` expands to 1000 matchers (000, 001, .., 999). +const maxRegexMatchers = 200 + // LabelMatcher represents bloom tests for key-value pairs, mapped from // LabelFilterExprs from the AST. type LabelMatcher interface{ isLabelMatcher() } @@ -15,9 +27,13 @@ type LabelMatcher interface{ isLabelMatcher() } // mapped. 
Bloom tests for UnsupportedLabelMatchers must always pass. type UnsupportedLabelMatcher struct{} -// PlainLabelMatcher represents a direct key-value matcher. Bloom tests -// must only pass if the key-value pair exists in the bloom. -type PlainLabelMatcher struct{ Key, Value string } +// KeyValueMatcher represents a direct key-value matcher. Bloom tests must only +// pass if the key-value pair exists in the bloom. +type KeyValueMatcher struct{ Key, Value string } + +// KeyMatcher represents a key matcher. Bloom tests must only pass if the key +// exists in the bloom. +type KeyMatcher struct{ Key string } // OrLabelMatcher represents a logical OR test. Bloom tests must only pass if // one of the Left or Right label matcher bloom tests pass. @@ -54,21 +70,27 @@ func buildLabelMatcher(filter log.LabelFilterer) LabelMatcher { switch filter := filter.(type) { case *log.LineFilterLabelFilter: - if filter.Type != labels.MatchEqual { - return UnsupportedLabelMatcher{} + if filter.Type == labels.MatchEqual { + return KeyValueMatcher{ + Key: filter.Name, + Value: filter.Value, + } + } else if filter.Type == labels.MatchRegexp { + reg, err := regexsyn.Parse(filter.Value, regexsyn.Perl) + if err != nil { + return UnsupportedLabelMatcher{} + } + return buildSimplifiedRegexMatcher(filter.Name, reg.Simplify()) } - return PlainLabelMatcher{ - Key: filter.Name, - Value: filter.Value, - } + return UnsupportedLabelMatcher{} case *log.StringLabelFilter: if filter.Type != labels.MatchEqual { return UnsupportedLabelMatcher{} } - return PlainLabelMatcher{ + return KeyValueMatcher{ Key: filter.Name, Value: filter.Value, } @@ -89,11 +111,169 @@ func buildLabelMatcher(filter log.LabelFilterer) LabelMatcher { } } +// buildSimplifiedRegexMatcher builds a simplified label matcher from a regex. +// reg may be mutated. 
+func buildSimplifiedRegexMatcher(key string, reg *regexsyn.Regexp) LabelMatcher { + switch reg.Op { + case regexsyn.OpAlternate: + util.ClearCapture(reg) + + left := buildSimplifiedRegexMatcher(key, reg.Sub[0]) + if len(reg.Sub) == 1 { + // This shouldn't be possible (even `warn|` has two subexpressions, where + // the latter matches an empty string), but we have a length check here + // anyway just to avoid a potential panic. + return left + } + for _, sub := range reg.Sub[1:] { + right := buildSimplifiedRegexMatcher(key, sub) + left = OrLabelMatcher{Left: left, Right: right} + } + return left + + case regexsyn.OpConcat: + // OpConcat checks for the concatenation of two or more subexpressions. For + // example, value1|value2 simplifies to value[12], with the two + // subexpressions value and [12]. + // + // We expand subexpressions back out into full matchers where possible, so + // value[12] becomes value1 OR value2, and value[1-9] becomes value1 OR + // value2 .. OR value9. + util.ClearCapture(reg) + + matchers, ok := expandSubexpr(reg) + if !ok || len(matchers) == 0 { + return UnsupportedLabelMatcher{} + } + + var left LabelMatcher = KeyValueMatcher{Key: key, Value: matchers[0]} + for _, matcher := range matchers[1:] { + right := KeyValueMatcher{Key: key, Value: matcher} + left = OrLabelMatcher{Left: left, Right: right} + } + return left + + case regexsyn.OpCapture: + util.ClearCapture(reg) + return buildSimplifiedRegexMatcher(key, reg) + + case regexsyn.OpLiteral: + return KeyValueMatcher{ + Key: key, + Value: string(reg.Rune), + } + + case regexsyn.OpPlus: + if reg.Sub[0].Op == regexsyn.OpAnyChar || reg.Sub[0].Op == regexsyn.OpAnyCharNotNL { // .+ + return KeyMatcher{Key: key} + } + + return UnsupportedLabelMatcher{} + + default: + return UnsupportedLabelMatcher{} + } +} + +func expandSubexpr(reg *regexsyn.Regexp) (prefixes []string, ok bool) { + switch reg.Op { + case regexsyn.OpAlternate: + util.ClearCapture(reg) + + for _, sub := range reg.Sub { + 
subPrefixes, ok := expandSubexpr(sub) + if !ok { + return nil, false + } else if len(prefixes)+len(subPrefixes) > maxRegexMatchers { + return nil, false + } + prefixes = append(prefixes, subPrefixes...) + } + return prefixes, true + + case regexsyn.OpCharClass: + // OpCharClass stores ranges of characters, so [12] is the range of runes + // []rune('1', '2'), while [15] is represented as []rune('1', '1', '5', + // '5'). + // + // To expand OpCharClass, we iterate over each pair of runes. + if len(reg.Rune)%2 != 0 { + // Invalid regexp; sequences should be even. + return nil, false + } + + for i := 0; i < len(reg.Rune); i += 2 { + start, end := reg.Rune[i+0], reg.Rune[i+1] + for r := start; r <= end; r++ { + prefixes = append(prefixes, string(r)) + if len(prefixes) > maxRegexMatchers { + return nil, false + } + } + } + + return prefixes, true + + case regexsyn.OpConcat: + if len(reg.Sub) == 0 { + return nil, false + } + + // We get the prefixes for each subexpression and then iteratively combine + // them together. + // + // For the regexp [12][34]value (which concatenates [12], [34], and value): + // + // 1. We get the prefixes for [12], which are 1 and 2. + // 2. We get the prefixes for [34], which are 3 and 4. + // 3. We add the prefixes together to get 13, 14, 23, and 24. + // 4. We get the prefixes for value, which is value. + // 5. Finally, we add the prefixes together to get 13value, 14value, 23value, and 24value. 
+ curPrefixes, ok := expandSubexpr(reg.Sub[0]) + if !ok { + return nil, false + } + + for _, sub := range reg.Sub[1:] { + subPrefixes, ok := expandSubexpr(sub) + if !ok { + return nil, false + } else if len(curPrefixes)*len(subPrefixes) > maxRegexMatchers { + return nil, false + } + + newPrefixes := make([]string, 0, len(curPrefixes)*len(subPrefixes)) + + for _, curPrefix := range curPrefixes { + for _, subPrefix := range subPrefixes { + newPrefixes = append(newPrefixes, curPrefix+subPrefix) + } + } + + curPrefixes = newPrefixes + } + + return curPrefixes, true + + case regexsyn.OpCapture: + util.ClearCapture(reg) + return expandSubexpr(reg) + + case regexsyn.OpLiteral: + prefixes = append(prefixes, string(reg.Rune)) + return prefixes, true + + default: + return nil, false + } +} + // // Implement marker types: // func (UnsupportedLabelMatcher) isLabelMatcher() {} -func (PlainLabelMatcher) isLabelMatcher() {} +func (KeyValueMatcher) isLabelMatcher() {} +func (KeyMatcher) isLabelMatcher() {} func (OrLabelMatcher) isLabelMatcher() {} func (AndLabelMatcher) isLabelMatcher() {} diff --git a/pkg/storage/bloom/v1/ast_extractor_test.go b/pkg/storage/bloom/v1/ast_extractor_test.go index 856f0412c8a99..69b5327246b87 100644 --- a/pkg/storage/bloom/v1/ast_extractor_test.go +++ b/pkg/storage/bloom/v1/ast_extractor_test.go @@ -20,7 +20,7 @@ func TestExtractLabelMatchers(t *testing.T) { name: "basic label matcher", input: `{app="foo"} | key="value"`, expect: []v1.LabelMatcher{ - v1.PlainLabelMatcher{Key: "key", Value: "value"}, + v1.KeyValueMatcher{Key: "key", Value: "value"}, }, }, @@ -29,8 +29,8 @@ func TestExtractLabelMatchers(t *testing.T) { input: `{app="foo"} | key1="value1" or key2="value2"`, expect: []v1.LabelMatcher{ v1.OrLabelMatcher{ - Left: v1.PlainLabelMatcher{Key: "key1", Value: "value1"}, - Right: v1.PlainLabelMatcher{Key: "key2", Value: "value2"}, + Left: v1.KeyValueMatcher{Key: "key1", Value: "value1"}, + Right: v1.KeyValueMatcher{Key: "key2", Value: "value2"}, 
}, }, }, @@ -40,8 +40,8 @@ func TestExtractLabelMatchers(t *testing.T) { input: `{app="foo"} | key1="value1" and key2="value2"`, expect: []v1.LabelMatcher{ v1.AndLabelMatcher{ - Left: v1.PlainLabelMatcher{Key: "key1", Value: "value1"}, - Right: v1.PlainLabelMatcher{Key: "key2", Value: "value2"}, + Left: v1.KeyValueMatcher{Key: "key1", Value: "value1"}, + Right: v1.KeyValueMatcher{Key: "key2", Value: "value2"}, }, }, }, @@ -50,14 +50,136 @@ func TestExtractLabelMatchers(t *testing.T) { name: "multiple label matchers", input: `{app="foo"} | key1="value1" | key2="value2"`, expect: []v1.LabelMatcher{ - v1.PlainLabelMatcher{Key: "key1", Value: "value1"}, - v1.PlainLabelMatcher{Key: "key2", Value: "value2"}, + v1.KeyValueMatcher{Key: "key1", Value: "value1"}, + v1.KeyValueMatcher{Key: "key2", Value: "value2"}, }, }, { - name: "unsupported label matchers", + name: "basic regex matcher", input: `{app="foo"} | key1=~"value1"`, + expect: []v1.LabelMatcher{ + v1.KeyValueMatcher{Key: "key1", Value: "value1"}, + }, + }, + + { + name: "regex matcher short", // simplifies to value[15]. 
+ input: `{app="foo"} | key1=~"value1|value5"`, + expect: []v1.LabelMatcher{ + v1.OrLabelMatcher{ + v1.KeyValueMatcher{Key: "key1", Value: "value1"}, + v1.KeyValueMatcher{Key: "key1", Value: "value5"}, + }, + }, + }, + + { + name: "regex matcher range", + input: `{app="foo"} | key1=~"value[0-9]"`, + expect: []v1.LabelMatcher{ + buildOrMatchers( + v1.KeyValueMatcher{Key: "key1", Value: "value0"}, + v1.KeyValueMatcher{Key: "key1", Value: "value1"}, + v1.KeyValueMatcher{Key: "key1", Value: "value2"}, + v1.KeyValueMatcher{Key: "key1", Value: "value3"}, + v1.KeyValueMatcher{Key: "key1", Value: "value4"}, + v1.KeyValueMatcher{Key: "key1", Value: "value5"}, + v1.KeyValueMatcher{Key: "key1", Value: "value6"}, + v1.KeyValueMatcher{Key: "key1", Value: "value7"}, + v1.KeyValueMatcher{Key: "key1", Value: "value8"}, + v1.KeyValueMatcher{Key: "key1", Value: "value9"}, + ), + }, + }, + + { + name: "regex matcher ignore high cardinality", + input: `{app="foo"} | key1=~"value[0-9][0-9][0-9]"`, // This would expand to 1000 matchers. Too many! 
+ expect: []v1.LabelMatcher{ + v1.UnsupportedLabelMatcher{}, + }, + }, + + { + name: "regex matcher", + input: `{app="foo"} | key1=~"value123|value456"`, + expect: []v1.LabelMatcher{ + v1.OrLabelMatcher{ + v1.KeyValueMatcher{Key: "key1", Value: "value123"}, + v1.KeyValueMatcher{Key: "key1", Value: "value456"}, + }, + }, + }, + + { + name: "regex multiple expands", + input: `{app="foo"} | detected_level=~"debug|info|warn|error"`, + expect: []v1.LabelMatcher{ + buildOrMatchers( + v1.KeyValueMatcher{Key: "detected_level", Value: "debug"}, + v1.KeyValueMatcher{Key: "detected_level", Value: "info"}, + v1.KeyValueMatcher{Key: "detected_level", Value: "warn"}, + v1.KeyValueMatcher{Key: "detected_level", Value: "error"}, + ), + }, + }, + + { + name: "regex matcher with ignored capture groups", + input: `{app="foo"} | key1=~"value1|(value2)"`, + expect: []v1.LabelMatcher{ + v1.OrLabelMatcher{ + v1.KeyValueMatcher{Key: "key1", Value: "value1"}, + v1.KeyValueMatcher{Key: "key1", Value: "value2"}, + }, + }, + }, + + { + name: "advanced regex matcher", + input: `{app="foo"} | key1=~"(warn|info[0-3])"`, + expect: []v1.LabelMatcher{ + v1.OrLabelMatcher{ + v1.KeyValueMatcher{Key: "key1", Value: "warn"}, + buildOrMatchers( + v1.KeyValueMatcher{Key: "key1", Value: "info0"}, + v1.KeyValueMatcher{Key: "key1", Value: "info1"}, + v1.KeyValueMatcher{Key: "key1", Value: "info2"}, + v1.KeyValueMatcher{Key: "key1", Value: "info3"}, + ), + }, + }, + }, + + { + name: "regex .+ matcher", + input: `{app="foo"} | key1=~".+"`, + expect: []v1.LabelMatcher{ + v1.KeyMatcher{Key: "key1"}, + }, + }, + + { + // This should also be unsupported for suffix or substring regexes. 
+ name: "regex .+ prefix matcher", + input: `{app="foo"} | key1=~".+foo"`, + expect: []v1.LabelMatcher{ + v1.UnsupportedLabelMatcher{}, + }, + }, + + { + name: "regex .* matcher", + input: `{app="foo"} | key1=~".*"`, + expect: []v1.LabelMatcher{ + v1.UnsupportedLabelMatcher{}, + }, + }, + + { + name: "unsupported label matchers", + input: `{app="foo"} | key1!="value1"`, expect: []v1.LabelMatcher{ v1.UnsupportedLabelMatcher{}, }, @@ -73,6 +195,23 @@ func TestExtractLabelMatchers(t *testing.T) { } } +func buildOrMatchers(matchers ...v1.LabelMatcher) v1.LabelMatcher { + if len(matchers) == 1 { + return matchers[0] + } + + left := matchers[0] + + for _, right := range matchers[1:] { + left = v1.OrLabelMatcher{ + Left: left, + Right: right, + } + } + + return left +} + func TestExtractLabelMatchers_IgnoreAfterParse(t *testing.T) { tt := []struct { name string @@ -92,7 +231,7 @@ func TestExtractLabelMatchers_IgnoreAfterParse(t *testing.T) { t.Run(tc.name, func(t *testing.T) { fullInput := fmt.Sprintf(`{app="foo"} | key1="value1" | %s | key2="value2"`, tc.expr) expect := []v1.LabelMatcher{ - v1.PlainLabelMatcher{Key: "key1", Value: "value1"}, + v1.KeyValueMatcher{Key: "key1", Value: "value1"}, // key2="value2" should be ignored following tc.expr } diff --git a/pkg/storage/bloom/v1/bloom_tester.go b/pkg/storage/bloom/v1/bloom_tester.go index b70c64804118e..1682556bf7d60 100644 --- a/pkg/storage/bloom/v1/bloom_tester.go +++ b/pkg/storage/bloom/v1/bloom_tester.go @@ -119,8 +119,11 @@ func matcherToBloomTest(matcher LabelMatcher) BloomTest { case UnsupportedLabelMatcher: return matchAllTest{} - case PlainLabelMatcher: - return newStringMatcherTest(matcher) + case KeyValueMatcher: + return newKeyValueMatcherTest(matcher) + + case KeyMatcher: + return newKeyMatcherTest(matcher) case OrLabelMatcher: return newOrTest( @@ -140,15 +143,15 @@ func matcherToBloomTest(matcher LabelMatcher) BloomTest { } } -type stringMatcherTest struct { - matcher PlainLabelMatcher +type 
keyValueMatcherTest struct { + matcher KeyValueMatcher } -func newStringMatcherTest(matcher PlainLabelMatcher) stringMatcherTest { - return stringMatcherTest{matcher: matcher} +func newKeyValueMatcherTest(matcher KeyValueMatcher) keyValueMatcherTest { + return keyValueMatcherTest{matcher: matcher} } -func (sm stringMatcherTest) Matches(series labels.Labels, bloom filter.Checker) bool { +func (kvm keyValueMatcherTest) Matches(series labels.Labels, bloom filter.Checker) bool { // TODO(rfratto): reintroduce the use of a shared tokenizer here to avoid // desyncing between how tokens are passed during building vs passed during // querying. @@ -159,24 +162,24 @@ func (sm stringMatcherTest) Matches(series labels.Labels, bloom filter.Checker) // 2. It should be possible to test for just the key var ( - combined = fmt.Sprintf("%s=%s", sm.matcher.Key, sm.matcher.Value) + combined = fmt.Sprintf("%s=%s", kvm.matcher.Key, kvm.matcher.Value) rawCombined = unsafe.Slice(unsafe.StringData(combined), len(combined)) ) - return sm.match(series, bloom, rawCombined) + return kvm.match(series, bloom, rawCombined) } -func (sm stringMatcherTest) MatchesWithPrefixBuf(series labels.Labels, bloom filter.Checker, buf []byte, prefixLen int) bool { +func (kvm keyValueMatcherTest) MatchesWithPrefixBuf(series labels.Labels, bloom filter.Checker, buf []byte, prefixLen int) bool { var ( - combined = fmt.Sprintf("%s=%s", sm.matcher.Key, sm.matcher.Value) + combined = fmt.Sprintf("%s=%s", kvm.matcher.Key, kvm.matcher.Value) prefixedCombined = appendToBuf(buf, prefixLen, combined) ) - return sm.match(series, bloom, prefixedCombined) + return kvm.match(series, bloom, prefixedCombined) } // match returns true if the series matches the matcher or is in the bloom filter. 
-func (sm stringMatcherTest) match(series labels.Labels, bloom filter.Checker, combined []byte) bool { +func (kvm keyValueMatcherTest) match(series labels.Labels, bloom filter.Checker, combined []byte) bool { // If we don't have the series labels, we cannot disambiguate which labels come from the series in which case // we may filter out chunks for queries like `{env="prod"} | env="prod"` if env=prod is not structured metadata if len(series) == 0 { @@ -186,8 +189,8 @@ func (sm stringMatcherTest) match(series labels.Labels, bloom filter.Checker, co // It's in the series if the key is set and has the same value. // By checking val != "" we handle `{env="prod"} | user=""`. - val := series.Get(sm.matcher.Key) - inSeries := val != "" && val == sm.matcher.Value + val := series.Get(kvm.matcher.Key) + inSeries := val != "" && val == kvm.matcher.Value inBloom := bloom.Test(combined) return inSeries || inBloom @@ -199,3 +202,53 @@ func appendToBuf(buf []byte, prefixLen int, str string) []byte { rawString := unsafe.Slice(unsafe.StringData(str), len(str)) return append(buf[:prefixLen], rawString...) } + +type keyMatcherTest struct { + matcher KeyMatcher +} + +func newKeyMatcherTest(matcher KeyMatcher) keyMatcherTest { + return keyMatcherTest{matcher: matcher} +} + +func (km keyMatcherTest) Matches(series labels.Labels, bloom filter.Checker) bool { + // TODO(rfratto): reintroduce the use of a shared tokenizer here to avoid + // desyncing between how tokens are passed during building vs passed during + // querying. + // + // For a shared tokenizer to be ergonomic: + // + // 1. A prefix shouldn't be required until MatchesWithPrefixBuf is called + // 2. 
It should be possible to test for just the key + + var ( + key = km.matcher.Key + rawKey = unsafe.Slice(unsafe.StringData(key), len(key)) + ) + + return km.match(series, bloom, rawKey) +} + +func (km keyMatcherTest) MatchesWithPrefixBuf(series labels.Labels, bloom filter.Checker, buf []byte, prefixLen int) bool { + var ( + key = km.matcher.Key + prefixedKey = appendToBuf(buf, prefixLen, key) + ) + + return km.match(series, bloom, prefixedKey) +} + +// match returns true if the series matches the matcher or is in the bloom +// filter. +func (km keyMatcherTest) match(series labels.Labels, bloom filter.Checker, key []byte) bool { + // If we don't have the series labels, we cannot disambiguate which labels come from the series in which case + // we may filter out chunks for queries like `{env="prod"} | env=~".+"` if env is not structured metadata + if len(series) == 0 { + level.Warn(util_log.Logger).Log("msg", "series has no labels, cannot filter out chunks") + return true + } + + inSeries := series.Get(km.matcher.Key) != "" + inBloom := bloom.Test(key) + return inSeries || inBloom +} diff --git a/pkg/storage/bloom/v1/bloom_tester_test.go b/pkg/storage/bloom/v1/bloom_tester_test.go index 935da19de255a..2c3e67dc224fb 100644 --- a/pkg/storage/bloom/v1/bloom_tester_test.go +++ b/pkg/storage/bloom/v1/bloom_tester_test.go @@ -116,6 +116,16 @@ func TestLabelMatchersToBloomTest(t *testing.T) { query: `{app="fake"} | trace_id="exists_1" and trace_id="noexist"`, match: false, }, + { + name: "presence test pass", + query: `{app="fake"} | trace_id=~".+"`, + match: true, + }, + { + name: "presence test fail", + query: `{app="fake"} | noexist=~".+"`, + match: false, + }, } for _, tc := range tt {