From 9f8be9eebc6c259b82fa88fb2409a2252738f0bd Mon Sep 17 00:00:00 2001
From: Ry Biesemeyer
Date: Tue, 7 Dec 2021 19:32:24 +0000
Subject: [PATCH] fix: correctly handle FieldReference-special characters in
field names.
Keys passed to most methods of `ConvertedMap`, based on `IdentityHashMap`
depend on identity and not equivalence, and therefore rely on the keys being
_interned_ strings. In order to avoid hitting the JVM's global String intern
pool (which can have performance problems), operations to normalize a string
to its interned counterpart have traditionally relied on the behaviour of
`FieldReference#from` returning a likely-cached `FieldReference`, that had
an interned `key` and an empty `path`.
This is problematic on two points.
First, when `ConvertedMap` was given data with keys that _were_ valid string
field references representing a nested field (such as `[host][geo][location]`),
the implementation of `ConvertedMap#put` effectively silently discarded the
path components because it assumed them to be empty, and only the key was
kept (`location`).
Second, when `ConvertedMap` was given a map whose keys contained what the
field reference parser considered special characters but _were NOT_
valid field references, the resulting `FieldReference.IllegalSyntaxException`
caused the operation to abort.
Instead of using the `FieldReference` cache, which sits on top of objects whose
`key` and `path`-components are known to have been interned, we introduce an
internment helper on our `ConvertedMap` that is also backed by the global string
intern pool, and ensure that our field references are primed through this pool.
In addition to fixing the `ConvertedMap#newFromMap` functionality, this has
three net effects:
- Our ConvertedMap operations still use strings
from the global intern pool
- We have a new, smaller cache of individual field
names, improving lookup performance
- Our FieldReference cache no longer is flooded
with fragments and therefore is more likely to
remain performant
NOTE: this does NOT create isolated intern pools, as doing so would require
a careful audit of the possible code-paths to `ConvertedMap#putInterned`.
The new cache is limited to 10k strings, and when more are used only
the FIRST 10k strings will be primed into the cache, leaving the
remainder to always hit the global String intern pool.
NOTE: by fixing this bug, we alow events to be created whose fields _CANNOT_
be referenced with the existing FieldReference implementation.
Resolves: https://github.com/elastic/logstash/issues/13606
Resolves: https://github.com/elastic/logstash/issues/11608
---
logstash-core/spec/logstash/event_spec.rb | 14 ++++++
.../main/java/org/logstash/ConvertedMap.java | 49 ++++++++++++++++---
.../java/org/logstash/FieldReference.java | 26 +++++-----
.../ext/JrubyEventExtLibraryTest.java | 20 +++++---
4 files changed, 84 insertions(+), 25 deletions(-)
diff --git a/logstash-core/spec/logstash/event_spec.rb b/logstash-core/spec/logstash/event_spec.rb
index b130a1c22a6..53f32837aa3 100644
--- a/logstash-core/spec/logstash/event_spec.rb
+++ b/logstash-core/spec/logstash/event_spec.rb
@@ -192,6 +192,15 @@
expect { e.set('[', 'value') }.to raise_exception(::RuntimeError)
end
end
+
+ context 'with map value whose keys have FieldReference-special characters' do
+ let(:event) { LogStash::Event.new }
+ let(:value) { {"this[field]" => "okay"} }
+ it 'sets the value correctly' do
+ event.set('[some][field]', value.dup)
+ expect(event.get('[some][field]')).to eq(value)
+ end
+ end
end
context "timestamp" do
@@ -351,6 +360,11 @@
expect(e.timestamp.to_iso8601).to eq("2016-05-28T23:02:05.350Z")
end
+ it 'accepts maps whose keys contain FieldReference-special characters' do
+ e = LogStash::Event.new({"nested" => {"i][egal" => "okay"}, "n[0]" => "bene"})
+ expect(e.get('nested')).to eq({"i][egal" => "okay"})
+ expect(e.to_hash).to include("n[0]" => "bene")
+ end
end
context "method missing exception messages" do
diff --git a/logstash-core/src/main/java/org/logstash/ConvertedMap.java b/logstash-core/src/main/java/org/logstash/ConvertedMap.java
index a89855dfcd7..c9f0da606e9 100644
--- a/logstash-core/src/main/java/org/logstash/ConvertedMap.java
+++ b/logstash-core/src/main/java/org/logstash/ConvertedMap.java
@@ -24,6 +24,7 @@
import java.util.HashMap;
import java.util.IdentityHashMap;
import java.util.Map;
+import java.util.concurrent.ConcurrentHashMap;
import org.jruby.RubyHash;
import org.jruby.RubyString;
import org.jruby.runtime.ThreadContext;
@@ -35,15 +36,51 @@
* as key.
* The {@code put} method will work with any {@link String} key but is only intended for use in
* situations where {@link ConvertedMap#putInterned(String, Object)} would require manually
- * interning the {@link String} key. This is due to the fact that we use our internal
- * {@link FieldReference} cache to get an interned version of the given key instead of JDKs
- * {@link String#intern()}, which is faster since it works from a much smaller and hotter cache
- * in {@link FieldReference#CACHE} than using String interning directly.
+ * interning the {@link String} key.
+ * This is due to the fact that it is based on {@link IdentityHashMap}, and we rely on the String
+ * intern pool to ensure identity match of equivalent strings.
+ * For performance, we keep a global cache of strings that have been interned for use with {@link ConvertedMap},
+ * and encourage interning through {@link ConvertedMap#internStringForUseAsKey(String)} to avoid
+ * the performance pentalty of the global string intern pool.
*/
public final class ConvertedMap extends IdentityHashMap {
private static final long serialVersionUID = 1L;
+ private static final ConcurrentHashMap KEY_CACHE = new ConcurrentHashMap<>(100, 0.2F, 16);
+
+ /**
+ * Returns an equivalent interned string, possibly avoiding the
+ * global intern pool.
+ *
+ * @param candidate the candidate {@link String}
+ * @return an interned string from the global String intern pool
+ */
+ static String internStringForUseAsKey(final String candidate) {
+ // TODO: replace with LRU cache and/or isolated intern pool
+ final String cached = KEY_CACHE.get(candidate);
+ if (cached != null) { return cached; }
+
+ final String interned = candidate.intern();
+ if (KEY_CACHE.size() <= 10_000 ) {
+ KEY_CACHE.put(interned, interned);
+ }
+ return interned;
+ }
+
+ /**
+ * Ensures that the provided {@code String[]} contains only
+ * instances that have been {@link ConvertedMap::internStringForUseAsKey},
+ * possibly replacing entries with equivalent interned strings.
+ *
+ * @param candidates an array of non-null strings
+ */
+ static void internStringsForUseAsKeys(final String[] candidates) {
+ for (int i = 0; i < candidates.length; i++) {
+ candidates[i] = internStringForUseAsKey(candidates[i]);
+ }
+ }
+
private static final RubyHash.VisitorWithState RUBY_HASH_VISITOR =
new RubyHash.VisitorWithState() {
@Override
@@ -91,7 +128,7 @@ public static ConvertedMap newFromRubyHash(final ThreadContext context, final Ru
@Override
public Object put(final String key, final Object value) {
- return super.put(FieldReference.from(key).getKey(), value);
+ return super.put(internStringForUseAsKey(key), value);
}
/**
@@ -118,6 +155,6 @@ public Object unconvert() {
* @return Interned String
*/
private static String convertKey(final RubyString key) {
- return FieldReference.from(key).getKey();
+ return internStringForUseAsKey(key.asJavaString());
}
}
diff --git a/logstash-core/src/main/java/org/logstash/FieldReference.java b/logstash-core/src/main/java/org/logstash/FieldReference.java
index c0b86d4fb2a..0822e2c26c3 100644
--- a/logstash-core/src/main/java/org/logstash/FieldReference.java
+++ b/logstash-core/src/main/java/org/logstash/FieldReference.java
@@ -27,6 +27,7 @@
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
+
import org.jruby.RubyString;
/**
@@ -75,11 +76,7 @@ public static class IllegalSyntaxException extends RuntimeException {
/**
* Unique {@link FieldReference} pointing at the timestamp field in a {@link Event}.
*/
- public static final FieldReference TIMESTAMP_REFERENCE =
- deduplicate(new FieldReference(EMPTY_STRING_ARRAY, Event.TIMESTAMP, DATA_CHILD));
-
- private static final FieldReference METADATA_PARENT_REFERENCE =
- new FieldReference(EMPTY_STRING_ARRAY, Event.METADATA, META_PARENT);
+ public static final FieldReference TIMESTAMP_REFERENCE = FieldReference.from(Event.TIMESTAMP);
/**
* Cache of all existing {@link FieldReference} by their {@link RubyString} source.
@@ -106,9 +103,10 @@ public static class IllegalSyntaxException extends RuntimeException {
private final int type;
private FieldReference(final String[] path, final String key, final int type) {
- this.key = key;
- this.type = type;
+ this.key = ConvertedMap.internStringForUseAsKey(key);
+ ConvertedMap.internStringsForUseAsKeys(path);
this.path = path;
+ this.type = type;
hash = calculateHash(this.key, this.path, this.type);
}
@@ -221,10 +219,10 @@ private static FieldReference parseToCache(final String reference) {
private static FieldReference parse(final CharSequence reference) {
final List path = TOKENIZER.tokenize(reference);
- final String key = path.remove(path.size() - 1).intern();
+ final String key = path.remove(path.size() - 1);
final boolean empty = path.isEmpty();
if (empty && key.equals(Event.METADATA)) {
- return METADATA_PARENT_REFERENCE;
+ return new FieldReference(EMPTY_STRING_ARRAY, key, META_PARENT);
} else if (!empty && path.get(0).equals(Event.METADATA)) {
return new FieldReference(
path.subList(1, path.size()).toArray(EMPTY_STRING_ARRAY), key, META_CHILD
@@ -240,7 +238,11 @@ private static FieldReference parse(final CharSequence reference) {
**/
private static class StrictTokenizer {
- public List tokenize(CharSequence reference) {
+ /**
+ * @param reference a sequence of characters representing a reference to a field
+ * @return a list of string path fragments.
+ */
+ public List tokenize(final CharSequence reference) {
ArrayList path = new ArrayList<>();
final int length = reference.length();
@@ -286,7 +288,7 @@ public List tokenize(CharSequence reference) {
if (splitPoint < i) {
// if we have something to add, add it.
- path.add(reference.subSequence(splitPoint, i).toString().intern());
+ path.add(reference.subSequence(splitPoint, i).toString());
}
depth--;
@@ -309,7 +311,9 @@ public List tokenize(CharSequence reference) {
// if we saw no brackets, this is a top-level reference that can be emitted as-is without
// further processing
path.add(reference.toString());
+ path.trimToSize();
return path;
+
} else if (depth > 0) {
// when we hit the end-of-input while still in an open bracket, we have an invalid field reference
potentiallyAmbiguousSyntaxDetected = true;
diff --git a/logstash-core/src/test/java/org/logstash/ext/JrubyEventExtLibraryTest.java b/logstash-core/src/test/java/org/logstash/ext/JrubyEventExtLibraryTest.java
index 538d8695715..2bf627c9eb0 100644
--- a/logstash-core/src/test/java/org/logstash/ext/JrubyEventExtLibraryTest.java
+++ b/logstash-core/src/test/java/org/logstash/ext/JrubyEventExtLibraryTest.java
@@ -27,10 +27,12 @@
import java.util.Map;
import org.assertj.core.api.Assertions;
import org.hamcrest.CoreMatchers;
+import org.jruby.RubyBoolean;
import org.jruby.RubyHash;
import org.jruby.RubyString;
import org.jruby.exceptions.RuntimeError;
import org.jruby.javasupport.JavaUtil;
+import org.jruby.runtime.Block;
import org.jruby.runtime.ThreadContext;
import org.jruby.runtime.builtin.IRubyObject;
import org.junit.Assert;
@@ -98,19 +100,21 @@ public void correctlyRaiseRubyRuntimeErrorWhenGivenInvalidFieldReferences() {
}
@Test
- public void correctlyRaiseRubyRuntimeErrorWhenGivenInvalidFieldReferencesInMap() {
+ public void correctlySetsValueWhenGivenMapWithKeysThatHaveFieldReferenceSpecialCharacters() {
final ThreadContext context = RubyUtil.RUBY.getCurrentContext();
final JrubyEventExtLibrary.RubyEvent event =
JrubyEventExtLibrary.RubyEvent.newRubyEvent(context.runtime);
final RubyString key = rubyString("foo");
final RubyHash value = RubyHash.newHash(context.runtime, Collections.singletonMap(rubyString("il[[]]]legal"), rubyString("okay")), context.nil);
- try {
- event.ruby_set_field(context, key, value);
- } catch (RuntimeError rubyRuntimeError) {
- Assert.assertThat(rubyRuntimeError.getLocalizedMessage(), CoreMatchers.containsString("Invalid FieldReference"));
- return;
- }
- Assert.fail("expected ruby RuntimeError was not thrown.");
+
+ event.ruby_set_field(context, key, value);
+ IRubyObject retrievedValue = event.ruby_get_field(context, key);
+ Assert.assertThat(retrievedValue, CoreMatchers.equalTo(value));
+
+ RubyHash eventHash = (RubyHash) event.ruby_to_hash_with_metadata(context);
+ IRubyObject nestedValue = eventHash.dig(context, rubyString("foo"), rubyString("il[[]]]legal"));
+ Assert.assertFalse(nestedValue.isNil());
+ Assert.assertEquals(rubyString("okay"), nestedValue);
}
private static RubyString rubyString(final String java) {