diff --git a/CHANGELOG.md b/CHANGELOG.md
index 665c3c1d8..ccc9840df 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -5,6 +5,7 @@
 ### New Features
 
 - engine: short circuit logic nodes for better performance #824 @williballenthin
+- engine: add optimizer to order faster nodes first #829 @williballenthin
 
 ### Breaking Changes
 
diff --git a/capa/optimizer.py b/capa/optimizer.py
new file mode 100644
index 000000000..9d14c6e65
--- /dev/null
+++ b/capa/optimizer.py
@@ -0,0 +1,105 @@
+import logging
+
+import capa.engine as ceng
+import capa.features.common
+
+logger = logging.getLogger(__name__)
+
+
+def get_node_cost(node):
+    """
+    estimate the relative cost of evaluating the given rule logic node.
+
+    the result is a unitless weight that is only meaningful relative to the cost of other nodes:
+    `optimize_statement` uses it as the sort key so that cheaper siblings are ordered first.
+
+    args:
+      node: a feature or statement instance from a rule's logic tree.
+
+    returns:
+      int: the estimated cost; lower is cheaper.
+    """
+    if isinstance(node, (capa.features.common.OS, capa.features.common.Arch, capa.features.common.Format)):
+        # we assume these are the most restrictive features:
+        # authors commonly use them at the start of rules to restrict the category of samples to inspect
+        return 0
+
+    # elif "everything else":
+    #     return 1
+    #
+    # this should be all hash-lookup features.
+    # see below.
+
+    elif isinstance(node, (capa.features.common.Substring, capa.features.common.Regex, capa.features.common.Bytes)):
+        # substring and regex features require a full scan of each string
+        # which we anticipate is more expensive than a hash lookup feature (e.g. mnemonic or count).
+        # we anticipate the same for bytes features, so they share this weight.
+        #
+        # TODO: compute the average cost of these features relative to hash features
+        # and adjust the factor accordingly.
+        return 2
+
+    elif isinstance(node, (ceng.Not, ceng.Range)):
+        # the cost of these nodes is defined by the complexity of their single child,
+        # plus one for the node itself.
+        return 1 + get_node_cost(node.child)
+
+    elif isinstance(node, (ceng.And, ceng.Or, ceng.Some)):
+        # the cost of these nodes is the full cost of their children
+        # as this is the worst-case scenario,
+        # plus one for the node itself.
+        return 1 + sum(map(get_node_cost, node.children))
+
+    else:
+        # this should be all hash-lookup features.
+        # we give this an arbitrary weight of 1.
+        # the only thing more "important" than this is checking OS/Arch/Format.
+        return 1
+
+
+def optimize_statement(statement):
+    """
+    reorder the children of the given statement, and of its nested statements,
+    so that the nodes with the lowest estimated cost come first.
+
+    this routine operates in-place.
+    the sort is stable, so nodes with equal cost keep their relative order.
+    """
+    if isinstance(statement, (ceng.And, ceng.Or, ceng.Some)):
+        # has .children
+        #
+        # descend first so that nested statements get optimized, too.
+        # the cost of a node doesn't depend on the order of its children,
+        # so this doesn't affect the sort below.
+        for child in statement.children:
+            optimize_statement(child)
+        statement.children = sorted(statement.children, key=get_node_cost)
+        return
+    elif isinstance(statement, (ceng.Not, ceng.Range)):
+        # has .child
+        optimize_statement(statement.child)
+        return
+    else:
+        # appears to be "simple"
+        return
+
+
+def optimize_rule(rule):
+    """
+    optimize the logic tree of the given rule.
+
+    this routine operates in-place.
+    """
+    optimize_statement(rule.statement)
+
+
+def optimize_rules(rules):
+    """
+    optimize the logic tree of each of the given rules.
+
+    this routine operates in-place and returns the collection it was given.
+    """
+    logger.debug("optimizing %d rules", len(rules))
+    for rule in rules:
+        optimize_rule(rule)
+    return rules
diff --git a/capa/rules.py b/capa/rules.py
index 00dc0837c..2d53a0aa8 100644
--- a/capa/rules.py
+++ b/capa/rules.py
@@ -30,6 +30,7 @@
 import capa.perf
 import capa.engine as ceng
 import capa.features
+import capa.optimizer
 import capa.features.file
 import capa.features.insn
 import capa.features.common
@@ -961,6 +962,8 @@ def __init__(self, rules: List[Rule]):
         if len(rules) == 0:
             raise InvalidRuleSet("no rules selected")
 
+        rules = capa.optimizer.optimize_rules(rules)
+
         self.file_rules = self._get_rules_for_scope(rules, FILE_SCOPE)
         self.function_rules = self._get_rules_for_scope(rules, FUNCTION_SCOPE)
         self.basic_block_rules = self._get_rules_for_scope(rules, BASIC_BLOCK_SCOPE)
diff --git a/tests/test_optimizer.py b/tests/test_optimizer.py
new file mode 100644
index 000000000..69a79bd63
--- /dev/null
+++ b/tests/test_optimizer.py
@@ -0,0 +1,97 @@
+# Copyright (C) 2021 FireEye, Inc. All Rights Reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at: [package root]/LICENSE.txt
+# Unless required by applicable law or agreed to in writing, software distributed under the License
+# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and limitations under the License.
+
+import textwrap
+
+import pytest
+
+import capa.rules
+import capa.engine
+import capa.optimizer
+import capa.features.common
+from capa.engine import Or, And
+from capa.features.insn import Mnemonic
+from capa.features.common import Arch, Bytes, Substring
+
+
+def test_optimizer_order():
+    rule = textwrap.dedent(
+        """
+        rule:
+            meta:
+                name: test rule
+                scope: function
+            features:
+                - and:
+                    - substring: "foo"
+                    - arch: amd64
+                    - mnemonic: cmp
+                    - and:
+                        - bytes: 3
+                        - offset: 2
+                    - or:
+                        - number: 1
+                        - offset: 4
+        """
+    )
+    r = capa.rules.Rule.from_yaml(rule)
+
+    # before optimization
+    children = list(r.statement.get_children())
+    assert isinstance(children[0], Substring)
+    assert isinstance(children[1], Arch)
+    assert isinstance(children[2], Mnemonic)
+    assert isinstance(children[3], And)
+    assert isinstance(children[4], Or)
+
+    # after optimization
+    capa.optimizer.optimize_rules([r])
+    children = list(r.statement.get_children())
+
+    # cost: 0
+    assert isinstance(children[0], Arch)
+    # cost: 1
+    assert isinstance(children[1], Mnemonic)
+    # cost: 2
+    assert isinstance(children[2], Substring)
+    # cost: 3 (1 for the or node itself, 1 for number, 1 for offset)
+    assert isinstance(children[3], Or)
+    # cost: 4 (1 for the and node itself, 2 for bytes, 1 for offset)
+    assert isinstance(children[4], And)
+
+
+def test_optimizer_order_nested():
+    rule = textwrap.dedent(
+        """
+        rule:
+            meta:
+                name: test rule
+                scope: function
+            features:
+                - and:
+                    - or:
+                        - substring: "foo"
+                        - arch: amd64
+                    - mnemonic: cmp
+        """
+    )
+    r = capa.rules.Rule.from_yaml(rule)
+    capa.optimizer.optimize_rules([r])
+
+    children = list(r.statement.get_children())
+    # cost: 1
+    assert isinstance(children[0], Mnemonic)
+    # cost: 3 (1 for the or node itself, 2 for substring, 0 for arch)
+    assert isinstance(children[1], Or)
+
+    # nested statements are optimized, too
+    nested = list(children[1].get_children())
+    # cost: 0
+    assert isinstance(nested[0], Arch)
+    # cost: 2
+    assert isinstance(nested[1], Substring)