Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

profile infrastructure #828

Merged
merged 20 commits into from
Nov 9, 2021
Merged
Show file tree
Hide file tree
Changes from 15 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -115,3 +115,6 @@ isort-output.log
black-output.log
rule-linter-output.log
.vscode
scripts/perf/*.txt
scripts/perf/*.svg
scripts/perf/*.zip
16 changes: 16 additions & 0 deletions capa/engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
import collections
from typing import Set, Dict, List, Tuple, Union, Mapping, Iterable

import capa.perf
import capa.rules
import capa.features.common
from capa.features.common import Result, Feature
Expand Down Expand Up @@ -79,6 +80,9 @@ def __init__(self, children, description=None):
self.children = children

def evaluate(self, ctx):
capa.perf.counters["evaluate.feature"] += 1
capa.perf.counters["evaluate.feature.and"] += 1
Comment on lines +83 to +84
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this will have a marginal performance impact itself. we can potentially remove these once our performance work is complete.


results = [child.evaluate(ctx) for child in self.children]
success = all(results)
return Result(success, self, results)
Expand All @@ -92,6 +96,9 @@ def __init__(self, children, description=None):
self.children = children

def evaluate(self, ctx):
capa.perf.counters["evaluate.feature"] += 1
capa.perf.counters["evaluate.feature.or"] += 1

results = [child.evaluate(ctx) for child in self.children]
success = any(results)
return Result(success, self, results)
Expand All @@ -105,6 +112,9 @@ def __init__(self, child, description=None):
self.child = child

def evaluate(self, ctx):
capa.perf.counters["evaluate.feature"] += 1
capa.perf.counters["evaluate.feature.not"] += 1

results = [self.child.evaluate(ctx)]
success = not results[0]
return Result(success, self, results)
Expand All @@ -119,6 +129,9 @@ def __init__(self, count, children, description=None):
self.children = children

def evaluate(self, ctx):
capa.perf.counters["evaluate.feature"] += 1
capa.perf.counters["evaluate.feature.some"] += 1

results = [child.evaluate(ctx) for child in self.children]
# note that here we cast the child result as a bool
# because we've overridden `__bool__` above.
Expand All @@ -138,6 +151,9 @@ def __init__(self, child, min=None, max=None, description=None):
self.max = max if max is not None else (1 << 64 - 1)

def evaluate(self, ctx):
capa.perf.counters["evaluate.feature"] += 1
capa.perf.counters["evaluate.feature.range"] += 1

count = len(ctx.get(self.child, []))
if self.min == 0 and count == 0:
return Result(True, self, [])
Expand Down
12 changes: 12 additions & 0 deletions capa/features/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
# circular import, otherwise
import capa.engine

import capa.perf
import capa.features
import capa.features.extractors.elf

Expand Down Expand Up @@ -146,6 +147,8 @@ def __repr__(self):
return str(self)

def evaluate(self, ctx: Dict["Feature", Set[int]]) -> Result:
capa.perf.counters["evaluate.feature"] += 1
capa.perf.counters["evaluate.feature." + self.name] += 1
return Result(self in ctx, self, [], locations=ctx.get(self, []))

def freeze_serialize(self):
Expand Down Expand Up @@ -190,6 +193,9 @@ def __init__(self, value: str, description=None):
self.value = value

def evaluate(self, ctx):
capa.perf.counters["evaluate.feature"] += 1
capa.perf.counters["evaluate.feature.substring"] += 1

# mapping from string value to list of locations.
# will unique the locations later on.
matches = collections.defaultdict(list)
Expand Down Expand Up @@ -275,6 +281,9 @@ def __init__(self, value: str, description=None):
)

def evaluate(self, ctx):
capa.perf.counters["evaluate.feature"] += 1
capa.perf.counters["evaluate.feature.regex"] += 1

# mapping from string value to list of locations.
# will unique the locations later on.
matches = collections.defaultdict(list)
Expand Down Expand Up @@ -358,6 +367,9 @@ def __init__(self, value: bytes, description=None):
self.value = value

def evaluate(self, ctx):
capa.perf.counters["evaluate.feature"] += 1
capa.perf.counters["evaluate.feature.bytes"] += 1

for feature, locations in ctx.items():
if not isinstance(feature, (Bytes,)):
continue
Expand Down
71 changes: 46 additions & 25 deletions capa/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,13 +10,15 @@
"""
import os
import sys
import time
import hashlib
import logging
import os.path
import argparse
import datetime
import textwrap
import itertools
import contextlib
import collections
from typing import Any, Dict, List, Tuple

Expand All @@ -26,6 +28,7 @@
from pefile import PEFormatError
from elftools.common.exceptions import ELFError

import capa.perf
import capa.rules
import capa.engine
import capa.version
Expand Down Expand Up @@ -65,6 +68,14 @@
logger = logging.getLogger("capa")


@contextlib.contextmanager
def timing(msg: str):
t0 = time.time()
yield
t1 = time.time()
logger.debug("perf: %s: %0.2fs", msg, t1 - t0)


def set_vivisect_log_level(level):
logging.getLogger("vivisect").setLevel(level)
logging.getLogger("vivisect.base").setLevel(level)
Expand Down Expand Up @@ -158,27 +169,28 @@ def find_capabilities(ruleset: RuleSet, extractor: FeatureExtractor, disable_pro
n_funcs = len(functions)

pb = pbar(functions, desc="matching", unit=" functions", postfix="skipped 0 library functions")
for f in pb:
function_address = int(f)

if extractor.is_library_function(function_address):
function_name = extractor.get_function_name(function_address)
logger.debug("skipping library function 0x%x (%s)", function_address, function_name)
meta["library_functions"][function_address] = function_name
n_libs = len(meta["library_functions"])
percentage = 100 * (n_libs / n_funcs)
if isinstance(pb, tqdm.tqdm):
pb.set_postfix_str("skipped %d library functions (%d%%)" % (n_libs, percentage))
continue
with timing("match functions"):
for f in pb:
function_address = int(f)

if extractor.is_library_function(function_address):
function_name = extractor.get_function_name(function_address)
logger.debug("skipping library function 0x%x (%s)", function_address, function_name)
meta["library_functions"][function_address] = function_name
n_libs = len(meta["library_functions"])
percentage = 100 * (n_libs / n_funcs)
if isinstance(pb, tqdm.tqdm):
pb.set_postfix_str("skipped %d library functions (%d%%)" % (n_libs, percentage))
continue

function_matches, bb_matches, feature_count = find_function_capabilities(ruleset, extractor, f)
meta["feature_counts"]["functions"][function_address] = feature_count
logger.debug("analyzed function 0x%x and extracted %d features", function_address, feature_count)
function_matches, bb_matches, feature_count = find_function_capabilities(ruleset, extractor, f)
meta["feature_counts"]["functions"][function_address] = feature_count
logger.debug("analyzed function 0x%x and extracted %d features", function_address, feature_count)

for rule_name, res in function_matches.items():
all_function_matches[rule_name].extend(res)
for rule_name, res in bb_matches.items():
all_bb_matches[rule_name].extend(res)
for rule_name, res in function_matches.items():
all_function_matches[rule_name].extend(res)
for rule_name, res in bb_matches.items():
all_bb_matches[rule_name].extend(res)

# collection of features that captures the rule matches within function and BB scopes.
# mapping from feature (matched rule) to set of addresses at which it matched.
Expand All @@ -188,7 +200,8 @@ def find_capabilities(ruleset: RuleSet, extractor: FeatureExtractor, disable_pro
rule = ruleset[rule_name]
capa.engine.index_rule_matches(function_and_lower_features, rule, locations)

all_file_matches, feature_count = find_file_capabilities(ruleset, extractor, function_and_lower_features)
with timing("match file"):
all_file_matches, feature_count = find_file_capabilities(ruleset, extractor, function_and_lower_features)
meta["feature_counts"]["file"] = feature_count

matches = {
Expand Down Expand Up @@ -400,9 +413,11 @@ def get_workspace(path, format, sigpaths):
else:
raise ValueError("unexpected format: " + format)

viv_utils.flirt.register_flirt_signature_analyzers(vw, sigpaths)
with timing("load FLIRT"):
viv_utils.flirt.register_flirt_signature_analyzers(vw, sigpaths)

vw.analyze()
with timing("viv analyze"):
vw.analyze()

logger.debug("%s", get_meta_str(vw))
return vw
Expand Down Expand Up @@ -890,8 +905,10 @@ def main(argv=None):
return E_MISSING_FILE

try:
rules = get_rules(args.rules, disable_progress=args.quiet)
rules = capa.rules.RuleSet(rules)
with timing("load rules"):
rules = get_rules(args.rules, disable_progress=args.quiet)
rules = capa.rules.RuleSet(rules)

logger.debug(
"successfully loaded %s rules",
# during the load of the RuleSet, we extract subscope statements into their own rules
Expand Down Expand Up @@ -1003,10 +1020,14 @@ def main(argv=None):

meta = collect_metadata(argv, args.sample, args.rules, extractor)

capabilities, counts = find_capabilities(rules, extractor, disable_progress=args.quiet)
with timing("find capabilities"):
capabilities, counts = find_capabilities(rules, extractor, disable_progress=args.quiet)
meta["analysis"].update(counts)
meta["analysis"]["layout"] = compute_layout(rules, extractor, capabilities)

for (counter, count) in capa.perf.counters.most_common():
logger.debug("perf: counter: {:}: {:,}".format(counter, count))

if has_file_limitation(rules, capabilities):
# bail if capa encountered file limitation e.g. a packed binary
# do show the output in verbose mode, though.
Expand Down
9 changes: 9 additions & 0 deletions capa/perf.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
import collections
from typing import Dict

counters: Dict[str, int] = collections.Counter()


def reset():
global counters
counters = collections.Counter()
3 changes: 3 additions & 0 deletions capa/rules.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
import yaml
import ruamel.yaml

import capa.perf
import capa.engine as ceng
import capa.features
import capa.features.file
Expand Down Expand Up @@ -620,6 +621,8 @@ def extract_subscope_rules(self):
yield new_rule

def evaluate(self, features: FeatureSet):
capa.perf.counters["evaluate.feature"] += 1
capa.perf.counters["evaluate.feature.rule"] += 1
return self.statement.evaluate(features)

@classmethod
Expand Down
115 changes: 115 additions & 0 deletions scripts/profile-time.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,115 @@
import sys
williballenthin marked this conversation as resolved.
Show resolved Hide resolved
import timeit
import logging
import argparse
import subprocess

import tqdm
import tabulate

import capa.main
import capa.perf
import capa.rules
import capa.engine
import capa.helpers
import capa.features
import capa.features.common
import capa.features.freeze

logger = logging.getLogger("capa.profile")


def main(argv=None):
if argv is None:
argv = sys.argv[1:]

label = subprocess.run(
"git show --pretty=oneline --abbrev-commit | head -n 1", shell=True, capture_output=True, text=True
).stdout.strip()
is_dirty = (
subprocess.run(
"git status | grep 'modified: ' | grep -v 'rules' | grep -v 'tests/data'",
shell=True,
capture_output=True,
text=True,
).stdout
!= ""
)

if is_dirty:
label += " (dirty)"

parser = argparse.ArgumentParser(description="Profile capa performance")
capa.main.install_common_args(parser, wanted={"format", "sample", "signatures", "rules"})

parser.add_argument("--number", type=int, default=3, help="batch size of profile collection")
parser.add_argument("--repeat", type=int, default=30, help="batch count of profile collection")
parser.add_argument("--label", type=str, default=label, help="description of the profile collection")

args = parser.parse_args(args=argv)
capa.main.handle_common_args(args)

try:
taste = capa.helpers.get_file_taste(args.sample)
except IOError as e:
logger.error("%s", str(e))
return -1

try:
with capa.main.timing("load rules"):
rules = capa.rules.RuleSet(capa.main.get_rules(args.rules, disable_progress=True))
except (IOError) as e:
logger.error("%s", str(e))
return -1

try:
sig_paths = capa.main.get_signatures(args.signatures)
except (IOError) as e:
logger.error("%s", str(e))
return -1

if (args.format == "freeze") or (args.format == "auto" and capa.features.freeze.is_freeze(taste)):
with open(args.sample, "rb") as f:
extractor = capa.features.freeze.load(f.read())
else:
extractor = capa.main.get_extractor(
args.sample, args.format, capa.main.BACKEND_VIV, sig_paths, should_save_workspace=False
)

with tqdm.tqdm(total=args.number * args.repeat) as pbar:

def do_iteration():
capa.perf.reset()
capa.main.find_capabilities(rules, extractor, disable_progress=True)
pbar.update(1)

samples = timeit.repeat(do_iteration, number=args.number, repeat=args.repeat)

logger.debug("perf: find capabilities: min: %0.2fs" % (min(samples) / float(args.number)))
logger.debug("perf: find capabilities: avg: %0.2fs" % (sum(samples) / float(args.repeat) / float(args.number)))
logger.debug("perf: find capabilities: max: %0.2fs" % (max(samples) / float(args.number)))
williballenthin marked this conversation as resolved.
Show resolved Hide resolved

for (counter, count) in capa.perf.counters.most_common():
logger.debug("perf: counter: {:}: {:,}".format(counter, count))

print(
tabulate.tabulate(
[
(
args.label,
"{:,}".format(capa.perf.counters["evaluate.feature"]),
"%0.2fs" % (sum(samples) / float(args.repeat) / float(args.number)),
"%0.2fs" % (min(samples) / float(args.number)),
"%0.2fs" % (max(samples) / float(args.number)),
)
],
headers=["label", "count(evaluations)", "avg(time)", "min(time)", "max(time)"],
tablefmt="github",
)
)

return 0


if __name__ == "__main__":
sys.exit(main())