mandiant · williballenthin · Nov 9, 2021 · Nov 8, 2021 · Nov 8, 2021 · Nov 8, 2021
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -5,6 +5,7 @@
 ### New Features
 
 - engine: short circuit logic nodes for better performance #824 @williballenthin
+- engine: add optimizer to order faster nodes first #829 @williballenthin
 
 ### Breaking Changes
 

diff --git a/capa/optimizer.py b/capa/optimizer.py
@@ -0,0 +1,70 @@
+import logging
+
+import capa.engine as ceng
+import capa.features.common
+
+logger = logging.getLogger(__name__)
+
+
+def get_node_cost(node):  # estimate the relative cost of evaluating `node`, as a number; lower is cheaper
+    if isinstance(node, (capa.features.common.OS, capa.features.common.Arch, capa.features.common.Format)):
+        # we assume these are the most restrictive features:
+        # authors commonly use them at the start of rules to restrict the category of samples to inspect
+        return 0
+
+    # conceptually, the next tier (cost 1) is "everything else":
+    #   return 1
+    #
+    # that should be all hash-lookup features;
+    # it is handled by the final `else` branch below.
+
+    elif isinstance(node, (capa.features.common.Substring, capa.features.common.Regex)):
+        # substring and regex features require a full scan of each string
+        # which we anticipate is more expensive than a hash lookup feature (e.g. mnemonic or count).
+        #
+        # TODO: compute the average cost of these features relative to a hash feature
+        # and adjust the factor accordingly.
+        return 2
+
+    elif isinstance(node, (ceng.Not, ceng.Range)):
+        # the cost of these nodes is defined by the complexity of their single child.
+        return get_node_cost(node.child)
+
+    elif isinstance(node, (ceng.And, ceng.Or, ceng.Some)):
+        # the cost of these nodes is the full cost of their children (computed recursively)
+        # as this is the worst-case scenario.
+        return sum(map(get_node_cost, node.children))
+
+    else:
+        # this should be all hash-lookup features.
+        # we give this an arbitrary weight of 1.
+        # the only thing more "important" than this is checking OS/Arch/Format.
+        return 1
+
+
+def optimize_statement(statement):
+    # reorder the logic tree rooted at `statement` in-place so that lower-cost nodes sort first.
+    # recurses through the whole tree, so nested and/or/some nodes are ordered too. returns nothing.
+    if isinstance(statement, (ceng.And, ceng.Or, ceng.Some)):
+        # has .children: optimize each subtree, then order by cost (stable sort: ties keep rule order).
+        for child in statement.children:
+            optimize_statement(child)
+        statement.children = sorted(statement.children, key=get_node_cost)
+    elif isinstance(statement, (ceng.Not, ceng.Range)):
+        # has .child
+        optimize_statement(statement.child)
+    else:
+        # appears to be "simple": no .child/.children handled here, so nothing to reorder.
+        return
+
+
+def optimize_rule(rule):
+    # this routine operates in-place: it optimizes `rule.statement` and returns nothing.
+    optimize_statement(rule.statement)
+
+
+def optimize_rules(rules):  # optimize every rule in `rules` in-place; `rules` must support len() and iteration
+    logger.debug("optimizing %d rules", len(rules))
+    for rule in rules:
+        optimize_rule(rule)
+    return rules  # the same object that was passed in, so callers can write `rules = optimize_rules(rules)`
diff --git a/capa/rules.py b/capa/rules.py
@@ -30,6 +30,7 @@
 import capa.perf
 import capa.engine as ceng
 import capa.features
+import capa.optimizer
 import capa.features.file
 import capa.features.insn
 import capa.features.common
@@ -961,6 +962,8 @@ def __init__(self, rules: List[Rule]):
         if len(rules) == 0:
             raise InvalidRuleSet("no rules selected")
 
+        rules = capa.optimizer.optimize_rules(rules)
+
         self.file_rules = self._get_rules_for_scope(rules, FILE_SCOPE)
         self.function_rules = self._get_rules_for_scope(rules, FUNCTION_SCOPE)
         self.basic_block_rules = self._get_rules_for_scope(rules, BASIC_BLOCK_SCOPE)