Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

perf: add query optimizer #829

Merged
merged 5 commits into from
Nov 9, 2021
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
### New Features

- engine: short circuit logic nodes for better performance #824 @williballenthin
- engine: add optimizer the order faster nodes first #829 @williballenthin

### Breaking Changes

Expand Down
70 changes: 70 additions & 0 deletions capa/optimizer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
import logging

import capa.engine as ceng
import capa.features.common

logger = logging.getLogger(__name__)


def get_node_cost(node):
if isinstance(node, (capa.features.common.OS, capa.features.common.Arch, capa.features.common.Format)):
# we assume these are the most restrictive features:
# authors commonly use them at the start of rules to restrict the category of samples to inspect
return 0

# elif "everything else":
# return 1
#
# this should be all hash-lookup features.
# see below.

elif isinstance(node, (capa.features.common.Substring, capa.features.common.Regex)):
# substring and regex features require a full scan of each string
# which we anticipate is more expensive then a hash lookup feature (e.g. mnemonic or count).
#
# TODO: compute the average cost of these feature relative to hash feature
# and adjust the factor accordingly.
return 2

elif isinstance(node, (ceng.Not, ceng.Range)):
# the cost of these nodes are defined by the complexity of their single child.
return get_node_cost(node.child)

elif isinstance(node, (ceng.And, ceng.Or, ceng.Some)):
# the cost of these nodes is the full cost of their children
# as this is the worst-case scenario.
return sum(map(get_node_cost, node.children))

else:
# this should be all hash-lookup features.
# we give this a arbitrary weight of 1.
# the only thing more "important" than this is checking OS/Arch/Format.
return 1


def optimize_statement(statement):
# this routine operates in-place

if isinstance(statement, (ceng.And, ceng.Or, ceng.Some)):
# has .children
statement.children = sorted(statement.children, key=lambda n: get_node_cost(n))
return
elif isinstance(statement, (ceng.Not, ceng.Range)):
# has .child
optimize_statement(statement.child)
return
else:
# appears to be "simple"
return


def optimize_rule(rule):
# this routine operates in-place
optimize_statement(rule.statement)


def optimize_rules(rules):
logger.debug("optimizing %d rules", len(rules))
for rule in rules:
optimize_rule(rule)
return rules
3 changes: 3 additions & 0 deletions capa/rules.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@
import capa.perf
import capa.engine as ceng
import capa.features
import capa.optimizer
import capa.features.file
import capa.features.insn
import capa.features.common
Expand Down Expand Up @@ -961,6 +962,8 @@ def __init__(self, rules: List[Rule]):
if len(rules) == 0:
raise InvalidRuleSet("no rules selected")

rules = capa.optimizer.optimize_rules(rules)

self.file_rules = self._get_rules_for_scope(rules, FILE_SCOPE)
self.function_rules = self._get_rules_for_scope(rules, FUNCTION_SCOPE)
self.basic_block_rules = self._get_rules_for_scope(rules, BASIC_BLOCK_SCOPE)
Expand Down