Skip to content

Commit

Permalink
Merge branch 'master' into master
Browse files Browse the repository at this point in the history
  • Loading branch information
jertel authored Jun 6, 2024
2 parents 73972d0 + 1dd39d5 commit 7e89ba2
Show file tree
Hide file tree
Showing 3 changed files with 80 additions and 96 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
## Other changes
- [Docs] Fixed typo in Alerta docs with incorrect number of seconds in a day. - @jertel
- Update GitHub actions to avoid running publish workflows on forked branches. - @jertel
- Rewrite `_find_es_dict_by_key` per [discussion #1450](https://github.com/jertel/elastalert2/discussions/1450) for fieldnames literally ending in `.keyword` [#1459](https://github.com/jertel/elastalert2/pull/1459) - @jmacdone @jertel

# 2.18.0

Expand Down
157 changes: 61 additions & 96 deletions elastalert/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,105 +44,70 @@ def new_get_event_ts(ts_field):
return lambda event: lookup_es_key(event[0], ts_field)


def _find_es_dict_by_key(lookup_dict: dict, term: str, string_multi_field_name: str = "keyword") -> tuple[dict, str]:
    """ Resolve a search term to the container and key that hold its value.

    Performs a divide-and-conquer recursive search to resolve a term string as a
    compatible combination of dictionary keys and list indexes. It resolves the
    ambiguity of '.' and 'keyword' being either literals or delimiters.

    For example
        'my.dotted.name.a_child_field.somelist[4]'
    may be found as
        lookup_dict['my.dotted.name']['a_child_field']['somelist'][4]
    or found as
        lookup_dict['my']['dotted.name']['a_child_field.somelist'][4]

    Rules:
    1. Longer fieldname matches are preferred; when a longer match cannot resolve
       the rest of the term, shorter matches are tried.
    2. No wildcards exist within the provided ES search terms (these are treated
       as string literals).
    3. A trailing 'keyword' is first treated as a literal fieldname, then as a
       multifield subfield specifier that is ignored.
    4. 'name[N]' selects element N of the list stored under 'name'. N may have
       several digits. A value that is not a list, or an index past the end of
       the list, is treated as "not found" rather than raising.
    5. If the value reached is not a dict, a remaining unresolvable suffix is
       ignored and the value reached so far is returned.

    This is necessary to get around inconsistencies in ES data.
    For example:
        {'ad.account_name': 'bob'}
    Or:
        {'csp_report': {'blocked_uri': 'bob.com'}}
    And even:
        {'juniper_duo.geoip': {'country_name': "Democratic People's Republic of Korea"}}
    We want a search term of form "key.subkey.subsubkey" to match in all cases.

    :param lookup_dict: The (possibly nested) document to search.
    :param term: The field reference to resolve.
    :param string_multi_field_name: Name of the multifield subfield that may
        trail a term without being present in the document.
    :returns: A tuple with the first element being the dict (or list) that contains
        the key and the second element being the last subkey (or list index) used to
        access the target specified by the term. ``({}, None)`` is returned if the
        key can not be found.
    """
    subkeys = term.split('.')

    # Longest candidate fieldname first, then progressively shorter ones.
    for i in range(len(subkeys), 0, -1):
        root = ".".join(subkeys[:i])
        value_index = None
        child_components = []

        # Handle array index references
        # Example
        #   foo[3]bar[1]baz is recursively checked as
        #   _find_es_dict_by_key(lookup_dict['foo'][3], 'bar[1]baz')
        m = re.search(r'(.+?)\[(\d+)\](.*)', root)
        if m:
            root = m.group(1)
            value_index = int(m.group(2))
            if m.group(3):
                child_components.append(m.group(3))

        if root not in lookup_dict:
            continue
        child_components.extend(subkeys[i:])

        # container/key is what gets returned if the term ends here;
        # target is the value that any remaining child term is resolved in.
        if value_index is None:
            container, key = lookup_dict, root
        else:
            container, key = lookup_dict[root], value_index
            if not isinstance(container, list) or key >= len(container):
                # Not indexable at this root; a shorter root may still match.
                continue
        target = container[key]

        # Pursue 'keyword' (if present) as a literal required fieldname ...
        child_components_options = [child_components]
        if child_components and child_components[-1] == string_multi_field_name:
            # ... then as a subfield specifier, by ignoring it.
            child_components_options.append(child_components[:-1])

        descended = False
        for child_components_option in child_components_options:
            child = ".".join(child_components_option)
            if not child:
                return container, key
            if isinstance(target, dict):
                descended = True
                found = _find_es_dict_by_key(target, child, string_multi_field_name)
                if found[1] is not None:
                    return found

        if not descended:
            # The value is a leaf, so the remaining suffix cannot be resolved
            # inside it; return the value reached so far.
            return container, key
        # Every descent under this root failed: fall back to a shorter root.

    return {}, None


def set_es_key(lookup_dict, term, value):
Expand Down
18 changes: 18 additions & 0 deletions tests/util_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -135,6 +135,24 @@ def test_looking_up_nested_composite_keys(ea):
assert lookup_es_key(record, 'Fields.ts.value.keyword') == expected


def test_looking_up_nested_composite_keys_with_fieldname_literary_containing_keyword(ea):
    """A nested field literally named 'keyword' is looked up and its value returned."""
    expected = 12467267

    # Build the document inside-out; the innermost dict has a real 'keyword' key.
    timestamp = {'value': {'keyword': expected}}
    fields = {'ts': timestamp, 'severity': 'large', 'user': 'jimmay'}
    record = {'Message': '12345', 'Fields': fields}

    actual = lookup_es_key(record, 'Fields.ts.value.keyword')
    assert actual == expected


def test_looking_up_arrays(ea):
record = {
'flags': [1, 2, 3],
Expand Down

0 comments on commit 7e89ba2

Please sign in to comment.