Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

perf: add query optimizer #829

Merged
merged 5 commits into from
Nov 9, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
### New Features

- engine: short circuit logic nodes for better performance #824 @williballenthin
- engine: add optimizer to order faster nodes first #829 @williballenthin

### Breaking Changes

Expand Down
70 changes: 70 additions & 0 deletions capa/optimizer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
import logging

import capa.engine as ceng
import capa.features.common

# module-level logger named after this module; optimize_rules() emits its debug message through it.
logger = logging.getLogger(__name__)


def get_node_cost(node):
    """
    Estimate the relative cost of evaluating a rule node.

    The value is a unitless heuristic: optimize_statement() uses it as a sort key
    so that cheaper nodes are placed before more expensive ones.

    Args:
      node: a feature or a logic statement (And/Or/Some/Not/Range) from a rule.

    Returns:
      int: the estimated cost; 0 is the cheapest.
        For example, an `or` of two hash-lookup features costs 1 + 1 + 1 = 3,
        and an `and` of a bytes feature and a hash-lookup feature costs 1 + 2 + 1 = 4.
    """
    if isinstance(node, (capa.features.common.OS, capa.features.common.Arch, capa.features.common.Format)):
        # we assume these are the most restrictive features:
        # authors commonly use them at the start of rules to restrict the category of samples to inspect
        return 0

    # elif "everything else":
    #     return 1
    #
    # this should be all hash-lookup features.
    # see the final `else` branch below.

    elif isinstance(node, (capa.features.common.Substring, capa.features.common.Regex, capa.features.common.Bytes)):
        # substring and regex features require a full scan of each string
        # which we anticipate is more expensive than a hash lookup feature (e.g. mnemonic or count).
        #
        # bytes features are weighted the same way.
        # NOTE(review): presumably because they are compared against each extracted byte sequence - confirm.
        #
        # TODO: compute the average cost of these features relative to a hash feature
        # and adjust the factor accordingly.
        return 2

    elif isinstance(node, (ceng.Not, ceng.Range)):
        # the cost of these nodes is defined by the complexity of their single child,
        # plus one for evaluating the node itself.
        return 1 + get_node_cost(node.child)

    elif isinstance(node, (ceng.And, ceng.Or, ceng.Some)):
        # the cost of these nodes is the full cost of their children
        # as this is the worst-case scenario,
        # plus one for evaluating the node itself.
        # the extra one makes a nested statement rank after an equally-priced single feature.
        return 1 + sum(map(get_node_cost, node.children))

    else:
        # this should be all hash-lookup features.
        # we give this an arbitrary weight of 1.
        # the only thing more "important" than this is checking OS/Arch/Format.
        return 1


def optimize_statement(statement):
    """
    Reorder the children of a statement tree so that cheaper nodes come first.

    This routine operates in-place and returns nothing.

    Args:
      statement: the root node to optimize: a logic statement or a plain feature.
        Plain features are left untouched.
    """
    if isinstance(statement, (ceng.And, ceng.Or, ceng.Some)):
        # has .children
        #
        # descend first so that nested statements get reordered, too.
        # the cost of a node does not depend on the order of its children,
        # so optimizing the children does not change the sort keys used below.
        for child in statement.children:
            optimize_statement(child)

        # sorted() is stable: nodes with an equal cost keep the order chosen by the rule author.
        statement.children = sorted(statement.children, key=get_node_cost)
        return
    elif isinstance(statement, (ceng.Not, ceng.Range)):
        # has .child
        optimize_statement(statement.child)
        return
    else:
        # appears to be "simple", so there is nothing to reorder.
        return


def optimize_rule(rule):
    """
    Optimize the statement tree of a single rule.

    The rule is modified in-place; nothing is returned.

    Args:
      rule: an object exposing its root node as `.statement`.
    """
    root = rule.statement
    optimize_statement(root)


def optimize_rules(rules):
    """
    Optimize every rule in the given collection.

    Each rule is modified in-place via optimize_rule().

    Args:
      rules: a sized, iterable collection of rules.

    Returns:
      the same `rules` object that was passed in, for call-chaining convenience.
    """
    rule_count = len(rules)
    logger.debug("optimizing %d rules", rule_count)

    for current in rules:
        optimize_rule(current)

    return rules
3 changes: 3 additions & 0 deletions capa/rules.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@
import capa.perf
import capa.engine as ceng
import capa.features
import capa.optimizer
import capa.features.file
import capa.features.insn
import capa.features.common
Expand Down Expand Up @@ -961,6 +962,8 @@ def __init__(self, rules: List[Rule]):
if len(rules) == 0:
raise InvalidRuleSet("no rules selected")

rules = capa.optimizer.optimize_rules(rules)

self.file_rules = self._get_rules_for_scope(rules, FILE_SCOPE)
self.function_rules = self._get_rules_for_scope(rules, FUNCTION_SCOPE)
self.basic_block_rules = self._get_rules_for_scope(rules, BASIC_BLOCK_SCOPE)
Expand Down
65 changes: 65 additions & 0 deletions tests/test_optimizer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
# Copyright (C) 2021 FireEye, Inc. All Rights Reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at: [package root]/LICENSE.txt
# Unless required by applicable law or agreed to in writing, software distributed under the License
# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and limitations under the License.

import textwrap

import pytest

import capa.rules
import capa.engine
import capa.optimizer
import capa.features.common
from capa.engine import Or, And
from capa.features.insn import Mnemonic
from capa.features.common import Arch, Bytes, Substring


def test_optimizer_order():
    """Check that optimizing a rule reorders its top-level children from cheapest to most expensive."""
    rule_yaml = textwrap.dedent(
        """
        rule:
            meta:
                name: test rule
                scope: function
            features:
                - and:
                    - substring: "foo"
                    - arch: amd64
                    - mnemonic: cmp
                    - and:
                        - bytes: 3
                        - offset: 2
                    - or:
                        - number: 1
                        - offset: 4
        """
    )
    parsed = capa.rules.Rule.from_yaml(rule_yaml)

    def assert_child_types(expected_types):
        # index explicitly, so that a missing child raises rather than being silently skipped.
        nodes = list(parsed.statement.get_children())
        for position, expected in enumerate(expected_types):
            assert isinstance(nodes[position], expected)

    # before optimization: the children appear in the order written in the rule.
    assert_child_types([Substring, Arch, Mnemonic, And, Or])

    capa.optimizer.optimize_rules([parsed])

    # after optimization: the children are ordered by ascending cost.
    assert_child_types(
        [
            Arch,  # cost: 0
            Mnemonic,  # cost: 1
            Substring,  # cost: 2
            Or,  # cost: 3
            And,  # cost: 4
        ]
    )